# Predicting App Ratings

In [33]:
# Package importing
import numpy as np
import pandas as pd
from sklearn import preprocessing

import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

### Data preprocessing (Do not run if using google_reviews_summary.csv)

In [22]:
# Loading the two cleaned CSV exports: app-level summary first, then the per-review table
summary, reviews = (
    pd.read_csv(csv_name)
    for csv_name in ('googlePS_cleaned.csv', 'google_reviews_cleaned.csv')
)

In [23]:
# Integer encoding categorical variables
# A single LabelEncoder is re-fitted for each column in turn, so every column
# gets its own independent integer code space.
label = preprocessing.LabelEncoder()

for column in ['Type', 'Category', 'Content Rating', 'Genres', 'Android Ver']:
    summary[column] = label.fit_transform(summary[column])

# The review sentiment label is encoded last, with the same encoder object
reviews['Sentiment'] = label.fit_transform(reviews['Sentiment'])

# Mapping summary data to text review data
#
# Each review receives the metadata of the app it was written about. Apps are
# matched by name; when `summary` lists an app more than once, its first row is
# used. Reviews whose app does not appear in `summary` are discarded.

## Columns copied from the summary dataset onto every review
info_columns = ['Category', 'Size', 'Type', 'Price', 'Content Rating', 'Genres', 'Android Ver']

## One metadata row per app name (first occurrence wins)
app_info = summary.drop_duplicates(subset='App', keep='first')[['App'] + info_columns]

## Any pre-existing copies of the metadata columns are dropped first so the merge
## does not create suffixed duplicates. The inner join then attaches the metadata
## and removes reviews with no matching app. Note that merge returns a fresh
## RangeIndex rather than the original review index.
reviews = (
    reviews
    .drop(columns=info_columns, errors='ignore')
    .merge(app_info, on='App', how='inner', validate='many_to_one')
)

reviews = reviews.dropna()    # Removes reviews that still have a missing value in any column

In [24]:
# Discarding the app name and the review-text column 'Translated_Review'
reviews = reviews.drop(['App', 'Translated_Review'], axis=1)

In [25]:
# Writing the merged review table to disk (row index is not written)
reviews.to_csv(path_or_buf='google_reviews_summary.csv', index=False)

### Model Creation

In [70]:
# Reading the preprocessed review table produced by the preprocessing section
df = pd.read_csv(filepath_or_buffer='google_reviews_summary.csv')

# Expected: 12 columns with 24,041 entries
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24041 entries, 0 to 24040
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sentiment               24041 non-null  int64  
 1   Sentiment_Polarity      24041 non-null  float64
 2   Sentiment_Subjectivity  24041 non-null  float64
 3   Rating                  24041 non-null  float64
 4   Review                  24041 non-null  object 
 5   Category                24041 non-null  int64  
 6   Size                    24041 non-null  float64
 7   Type                    24041 non-null  int64  
 8   Price                   24041 non-null  float64
 9   Content Rating          24041 non-null  int64  
 10  Genres                  24041 non-null  int64  
 11  Android Ver             24041 non-null  int64  
dtypes: float64(5), int64(6), object(1)
memory usage: 2.2+ MB


In [71]:
# Vectorizing Reviews
vectorizer = TfidfVectorizer()

# The text is read from `df` (loaded from google_reviews_summary.csv), not from
# `reviews`: `reviews` only exists when the optional preprocessing section was
# run in this kernel session, so relying on it breaks a fresh Run All.
# Values are cast to str so any non-string entry is still tokenisable.
vector_r = vectorizer.fit_transform(df['Review'].values.astype('str'))

# (number of reviews, vocabulary size)
print(vector_r.shape)

(24041, 15579)


In [72]:
# The raw review text is not fed to the model, so its column is removed
df = df.drop('Review', axis=1)

In [73]:
# Casting the integer-coded columns to float so every model input shares one dtype
for int_column in ['Sentiment', 'Category', 'Type', 'Content Rating', 'Genres', 'Android Ver']:
    df[int_column] = df[int_column].astype(float)

In [74]:
# Building the 80:20 train/test partition
seed = 684093    # Fixed seed so the split is reproducible

# pop() removes 'Rating' from df in place; what remains in df is the feature matrix
y = df.pop('Rating')
x = df

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=seed,
    test_size=0.2,
)

In [78]:
# Defining our model
# The input width is taken from the training features instead of being
# hard-coded, so the network keeps matching the data if columns change.
n_features = x_train.shape[1]

model = tf.keras.Sequential([
    # First dense layer; input_dim declares how many features each sample has
    tf.keras.layers.Dense(10, input_dim = n_features, activation = 'relu'),
    
    # Hidden Layers
    tf.keras.layers.Dense(400, activation = 'relu'),
    tf.keras.layers.Dense(400, activation = 'relu'),
    tf.keras.layers.Dense(400, activation = 'relu'),
    
    # Output Layer: a single linear unit producing the predicted rating
    tf.keras.layers.Dense(1, activation = 'linear')
])

# Compiling the model
## Stochastic gradient descent optimizer
opt = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
model.compile(optimizer=opt,
              loss='mean_squared_error',    # Predicting a float value, need to use mean squared error
              # Accuracy only makes sense for classification; for a continuous
              # target the mean absolute error is reported instead.
              metrics=['mean_absolute_error'])

In [80]:
# Training on the training partition for 15 epochs
model.fit(x=x_train, y=y_train, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x20381480040>