In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, confusion_matrix, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn import tree
from sklearn import preprocessing
import graphviz

# Read original dataset

In [2]:
data = pd.read_csv("googleplaystore.csv")

# Dropping unnecessary columns and rows

According to James Le (2018), "If you have many independent features that each correlate well with the class, learning is easy. On the other hand, if the class is a very complex function of the features, you may not be able to learn it." 

The dropped columns are too specific to be used for inference.

Source: https://towardsdatascience.com/12-useful-things-to-know-about-machine-learning-487d3104e28?fbclid=IwAR0W1UAr0o07vpLINhWwoEJJqybNmOPEExUruYLiUB9v0q3o5rhkPz2Wkzw

In [3]:
#Drop columns: App, Last Updated, Current Ver, Android Ver, Unnamed: 13
cleaned_data = data.drop(['App', 'Last Updated', 'Current Ver', 'Android Ver', 'Unnamed: 13'], axis=1)

#Drop rows with Rating = NA
cleaned_data = cleaned_data.dropna()

#Drop rows with 'Varies with device'
cleaned_data = cleaned_data[cleaned_data.Size != 'Varies with device']

# Multiply numbers with 1000 if it ends with K or with 100000 if it ends with M

In [4]:
def value_to_float(x):
    if type(x) == float or type(x) == int:
        return x
    if 'K' in x:
        if len(x) > 1:
            return float(x.replace('K', '')) * 1000
        return 1000.0
    if 'M' in x:
        if len(x) > 1:
            return float(x.replace('M', '')) * 1000000
        return 1000000.0
    return 0.0

In [5]:
cleaned_data.Size = cleaned_data.Size.apply(value_to_float).astype(int)

In [6]:
cleaned_data.Installs = cleaned_data.Installs.map(lambda x: x.rstrip('+'))
cleaned_data.Installs = cleaned_data.Installs.str.replace(',', '')

cleaned_data.Price = cleaned_data.Price.str.replace('$', '').astype(float)

In [7]:
cleaned_data.drop_duplicates(subset=None, keep='first', inplace=True)

In [8]:
cleaned_data.shape
def ifef(col):
    if col > 4.5:
        return  'High'
    if col >= 4 and col <= 4.5:
        return 'Average'
    if col < 4:
        return 'Poor'
cleaned_data['Success'] = cleaned_data['Rating'].apply(ifef)

In [9]:
cleaned_data = cleaned_data.drop('Rating', axis=1)

# Writing cleaned data to csv file

In [10]:
cleaned_data.to_csv('googleplaystore_cleaned.csv')

# Read cleaned dataset

In [11]:
cleaned_data = pd.read_csv("googleplaystore_cleaned.csv")
labeled_data = cleaned_data

# Convert categorical data to indexes

In [12]:
# instantiate labelencoder object
le = LabelEncoder()

# apply labelencoder to categorical data
labeled_data['Category'] = le.fit_transform(labeled_data['Category'])
labeled_data['Type'] = le.fit_transform(labeled_data['Type'])
labeled_data['Content Rating'] = le.fit_transform(labeled_data['Content Rating'])
labeled_data['Genres'] = le.fit_transform(labeled_data['Genres'])
labeled_data['Success'] = le.fit_transform(labeled_data['Success'])

# Splitting dataset

In [13]:
X = labeled_data.drop('Success', axis=1)
Y = labeled_data['Success']

In [14]:
t_size = 0.4
r_state = 10

In [15]:
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = t_size, random_state = r_state)

X_train.shape = (5193, 8)

X_test.shape = (2226, 8)

y_train.shape = (5193,)

y_test.shape = (2226,)

# Building the decision tree

In [16]:
clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = r_state, max_depth=3, min_samples_leaf=5)
# clf_gini = tree.DecisionTreeClassifier(random_state=666, min_impurity_decrease=0.009)

Other test values for random state:

    random_state   accuracy_score
    99             0.5658
    15             0.5677
    100            0.5710
    

In [17]:
clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=10,
            splitter='best')

In [18]:
y_pred = clf_gini.predict(X_test)

# Values of Metrics

In [None]:
print('\nAccuracy Score: {0:.4f}'.format(accuracy_score(y_test, y_pred)))
print('\nPrecision Scores for Average, Poor, High:')
for i in precision_score(y_test, y_pred, average=None):
    print('>> {0:.4f}'.format(i))
print('\nRecall Scores for Average, Poor, High:')
for i in recall_score(y_test, y_pred,average=None):
    print('>> {0:.4f}'.format(i))
# print(recall_score(y_test, y_pred,average=None))


Accuracy Score: 0.5815

Precision Scores for Average, Poor, High:
>> 0.5867
>> 0.6272
>> 0.4354

Recall Scores for Average, Poor, High:
>> 0.9390
>> 0.1715
>> 0.0924


In [None]:
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[1556   36   65]
 [ 494  106   18]
 [ 602   27   64]]


# Output decision tree image

In [None]:
dot_data = tree.export_graphviz(clf_gini, out_file=None, feature_names=list(X), class_names=["Average", "Poor", "High"], filled=True, rounded=True, special_characters=True)

In [None]:
graph = graphviz.Source(dot_data)
graph.format = 'png'
graph.render('dec_treev2', view=True)

'dec_treev2.png'

# Get user input

In [None]:
def reviews(level):
    if(level == 1):
        if(review_val <= 1463.5):
            return installs(2)
        else:
            return "Average"

    if(level == 2):
        if(review_val <= 25873.5):
            return installs(3)

def installs(level):
    if(level == 2):
        if(install_val <= 3000):
            return installs(3)
        else:
            return genre()

    if(level == 3):
        if(install_val <= 300):
            return "Poor"
        else:
            return "Average"


def genre():
    genres_dictionary = {
      'Art & Design' : 'Average',
      'Auto & Vehicles' : 'Average',
      'Beauty' : 'Average',
      'Books & Reference' : 'Average',
      'Business' : 'Average',
      'Comics' : "Average",
      'Communication' : "Average",
      'Dating' : "Average",
      'Education' : "Average",
      'Entertainment' : "Average",
      'Events' : "Average",
      'Finance' : "Average",
      'Food & Drink' : "Average",
      'Health & Fitness' : "Average",
      'House & Home' : "Average",
      'Libraries & Demo' : "Average",
      'Lifestyle' : "Average",
      'Adventure' : "Average",
      'Arcade' : "Average",
      'Casual' : "Average",
      'Card' : "Average",
      'Strategy' : "High",
      'Action' : "Average",
      'Puzzle' : "Average",
      'Sports' : "Average",
      'Word' : "High",
      'Racing' : "Average",
      'Simulation' : "Average",
      'Board' : "Average",
      'Trivia' : "High",
      'Role Playing' : "Average",
      'Educational' : "Average",
      'Music' : "Average",
      'Music & Audio' : "Average",
      'Video Players & Editors' : "High",
      'Medical' : "Average",
      'Social' : "Average",
      'Shopping' : "Average",
      'Photography' : "Average",
      'Travel & Local' : "High",
      'Tools' : "High",
      'Personalization' : "Average",
      'Productivity' : "Average",
      'Parenting' : "Average",
      'Weather' : "High",
      'News & Magazines' : "Average",
      'Maps & Navigation' : "Average",
      'Casino' : "Average"
    }

    return genres_dictionary.get(genre_name)

In [None]:
print("Welcome to Google App Success Classifier!")
review_val = float(input("Enter the number of reviews of your app: "))
install_val = float(input("Enter the number of installs of your app: "))
genre_name = input("Enter the genre of your app: ")
print(reviews(1))

Welcome to Google App Success Classifier!
