In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Load the already-cleaned apps dataset (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='cleanedAppsData.csv')

In [None]:
### Feature Engineering ###

# Keep only the variables relevant for modelling: the 'App Name' column is removed.
# The column index is printed before and after so the effect of the drop is visible.
print(df.columns)
df = df.drop(columns=['App Name'])
print(df.columns)

In [None]:
#Reduce number of target classes
# Each coarse install bucket (key) absorbs the finer-grained labels listed as its value.
# The key order is also the category order used when the counts are displayed below.
INSTALL_BUCKETS = {
    '0 - 1,000': ['0 - 100', '100 - 500', '500 - 1,000'],
    '1,000 - 10,000': ['1,000 - 5,000', '5,000 - 10,000'],
    '10,000 - 100,000': ['10,000 - 50,000', '50,000 - 100,000'],
    '100,000 - 1,000,000': ['100,000 - 500,000', '500,000 - 1,000,000'],
    '1,000,000 - 10,000,000': ['1,000,000 - 5,000,000', '5,000,000 - 10,000,000'],
    '10,000,000 - 100,000,000': ['10,000,000 - 50,000,000', '50,000,000 - 100,000,000'],
    '100,000,000 - 1,000,000,000': ['100,000,000 - 500,000,000', '500,000,000 - 1,000,000,000'],
    '1,000,000,000+': ['1,000,000,000 - 5,000,000,000', '5,000,000,000+'],
}
# Inverted lookup: fine-grained label -> coarse bucket
fine_to_coarse = {fine: coarse for coarse, fines in INSTALL_BUCKETS.items() for fine in fines}

# Work on plain Python objects so the cell also behaves when 'Installs' is already
# categorical (i.e. when the cell is re-run).
installs_raw = df['Installs'].astype(object)
# Labels without a mapping entry (already-coarse labels, missing values) pass through unchanged.
installs_coarse = installs_raw.map(lambda label: fine_to_coarse.get(label, label))

# pd.Categorical silently converts any value outside `categories` to NaN, which would
# corrupt the target without notice. Fail loudly instead if such a label is present.
unexpected = set(installs_coarse.dropna().unique()) - set(INSTALL_BUCKETS)
if unexpected:
    raise ValueError(
        "Unmapped 'Installs' labels would be turned into NaN: {}".format(sorted(unexpected, key=str))
    )

df['Installs'] = pd.Categorical(installs_coarse, categories=list(INSTALL_BUCKETS))
print(df.Installs.value_counts().sort_index())
print(df.shape)

In [None]:
# One-hot encode the categorical predictors: per the original author's note, sklearn
# decision trees would otherwise treat a categorical variable as continuous.
# The raw 'Category' labels are captured first because get_dummies replaces that column.
strat = df['Category'].values
df = pd.get_dummies(
    data=df,
    columns=['Content Rating', 'Category', 'Game_genre'],
    drop_first=True,
)
print(df.columns)

# Train/test split - simple random sampling, stratified on the app category (70% / 30%)
X = df.drop(columns=['Installs']).values
y = df['Installs'].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=strat,
    random_state=42,
)
# Sanity check shown as the cell output: features and target have the same row count
len(X) == len(y)

In [None]:
#Decision Tree Classifier Train
# max_features='sqrt' makes every split consider a random subset of the features, so the
# estimator is seeded to keep the fitted tree (and the accuracy reported below)
# reproducible across runs. 42 is the same seed the train/test split uses.
dt = DecisionTreeClassifier(max_depth = 8, max_features = 'sqrt', random_state = 42)
dt.fit(X_train, y_train)
# Accuracy measured on the same rows the tree was fitted on (training accuracy)
y_pred = dt.predict(X_train)
acc = accuracy_score(y_train, y_pred)
print("Decision Tree train data accuracy: {:.2f}".format(acc))


In [None]:
#Decision Tree Classifier - evaluation on the held-out test data
# `dt` is used exactly as it was fitted on (X_train, y_train) in the training cell.
# It is deliberately not fitted again here: re-fitting a tree that samples features
# (max_features='sqrt') can produce a different tree unless it is seeded, which would make
# the train and test accuracies describe two different models.
y_pred = dt.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("Decision Tree Test data accuracy: {:.2f}".format(acc))

In [None]:
# Decision Tree Cross Val
# Mean accuracy over a 10-fold cross-validation run on the full feature matrix and target
a = cross_val_score(dt, X, y, scoring='accuracy', cv=10).mean()
print("Decision Tree cross validation accuracy: {:.2f}".format(a))

# Result noted by the author: the decision tree reached an accuracy of around 0.63

In [None]:
# K Nearest Neighbors with the estimator's default settings
knn = KNeighborsClassifier().fit(X_train, y_train)

# Accuracy on the rows used for fitting
knn_train_acc = knn.score(X_train, y_train)
print('knn train data accuracy', knn_train_acc)

# Accuracy on the held-out rows
knn_test_acc = knn.score(X_test, y_test)
print('knn test data accuracy', knn_test_acc)

# Mean score over a 5-fold cross-validation on the full data
knn_cv_acc = np.mean(cross_val_score(knn, X, y, cv=5))
print('knn cross validation accuracy', knn_cv_acc)

# Result noted by the author: KNN reached an accuracy of 0.66