# Project Prototype 1

In [1]:
# Package importing
import numpy as np
import pandas as pd
from sklearn import preprocessing

import tensorflow as tf

Before we can build a neural network on this data, we have to do a bit more cleaning. The most obvious step is to encode the categorical data. In this case, the columns that contain categorical data are Type, Category, Content Rating, Genres, and Android Ver. We also want to remove 3 columns from the dataset that are irrelevant to our model. The first is the App column, which only contains the names of the apps. A less obvious one is the Current Ver column, which holds the current version of every app; because apps are likely to use different formats for their version strings, it functionally acts as a second name for each app. Finally, Last Updated needs to be removed since TensorFlow cannot use datetime data.

In [2]:
# Preparing data
df = pd.read_csv('googlePS_cleaned.csv')

# Integer encoding categorical variables.
# One encoder is refit on each column in turn, so after the loop `label`
# only remembers the classes of the last column it saw ('Android Ver').
label = preprocessing.LabelEncoder()
categorical_columns = ['Type', 'Category', 'Content Rating', 'Genres', 'Android Ver']
for column in categorical_columns:
    df[column] = label.fit_transform(df[column])

# Removing irrelevant columns
df = df.drop(columns=['App', 'Current Ver', 'Last Updated'])

# Preview the first ten encoded rows as the cell output
df.head(10)

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Android Ver
0,0,4.1,159,19.0,10000,0,0.0,1,9,14
1,0,3.9,967,14.0,500000,0,0.0,1,11,14
2,0,4.7,87510,8.7,5000000,0,0.0,1,9,14
3,0,4.5,215644,25.0,50000000,0,0.0,4,9,17
4,0,4.3,967,2.8,100000,0,0.0,1,10,19
5,0,4.4,167,5.6,50000,0,0.0,1,9,7
6,0,3.8,178,19.0,50000,0,0.0,1,9,14
7,0,4.1,36815,29.0,1000000,0,0.0,1,9,17
8,0,4.4,13791,33.0,1000000,0,0.0,1,9,9
9,0,4.7,121,3.1,10000,0,0.0,1,10,14


### Neural Network

In [17]:
# Making training and test data with 80:20 split
seed = 562764589

# Draw 80% of the rows without replacement; every row that was not drawn
# becomes the test split (rows are matched up by index label).
train_rows = df.sample(frac=0.8, replace=False, random_state=seed)
test_rows = df.drop(index=train_rows.index)

# Separate the 'Rating' target from the remaining feature columns
train_target = train_rows['Rating']
df_train = train_rows.drop(columns='Rating')

test_target = test_rows['Rating']
df_test = test_rows.drop(columns='Rating')

In [70]:
# Defining our model
# Fully connected stack built layer by layer:
# 9 inputs -> Dense(9) -> Dense(81) -> Dense(81) -> 9-unit softmax output.
# NOTE(review): the 'Rating' values in the preview above look continuous
# (e.g. 4.1, 3.9) while this head is a 9-way softmax - confirm that a
# classification output is what is intended here.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(9, input_dim=9, activation='relu'))
model.add(tf.keras.layers.Dense(81, activation='relu'))
model.add(tf.keras.layers.Dense(81, activation='relu'))
model.add(tf.keras.layers.Dense(9, activation='softmax'))

In [71]:
# Compiling the model
# Adam optimizer, sparse categorical cross-entropy loss, accuracy as the
# reported metric.
# NOTE(review): this loss is meant for integer class labels - confirm the
# target passed to fit/evaluate is encoded that way.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [72]:
# Fitting the model
# Train for 15 epochs on the training split; the History object returned by
# fit() is left as the cell's displayed output.
model.fit(x=df_train, y=train_target, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x2b21ba768e0>

In [74]:
# Score the trained model on the held-out split, showing the progress bar.
# The result unpacks into the loss and the one compiled metric (accuracy).
test_loss, test_acc = model.evaluate(x=df_test, y=test_target, verbose=1)

