# Predict App Popularity

Objective: 
- 1. Determine how different features are related to the app popularity
- 2. Explain how different features affect the decision
- 2. Output: app_id/popularity(High or Low)

Files:
- X_le.csv  - data with label encode
- X_ohe.csv - data with one hot encode
- y.csv    - label where 1 = high popularity, and 0 = low popularity 

In [1]:
## import needed libraries/packages
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kurtosis, skew
from IPython.display import display

In [79]:
import sklearn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

In [3]:
## display sklearn version
print(sklearn.__version__)

0.24.2


## Modeling (sklearn) - RandomForestClassifier

### One Hot Encoded Data

In [4]:
X_ohe = pd.read_csv('X_ohe.csv')

In [5]:
y = pd.read_csv('y.csv')
y = y.squeeze()

In [6]:
## Create training and test splits
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_ohe, y, test_size = 0.3, random_state = 101, stratify = y)

In [7]:
X_train1.head()

Unnamed: 0,reviews,review_c,review_log,installs_num,price_num,price_b,size_num,last_update_year,latest_ver_h,x0_Everyone,...,x0_Mature 17+,x0_Teen,x1_FAMILY,x1_FINANCE,x1_GAME,x1_HEALTH_AND_FITNESS,x1_LIFESTYLE,x1_MEDICAL,x1_OTHER,x1_TOOLS
460,8,0,3.0,1000,0.0,0,31000000,2018,1,1,...,0,0,0,1,0,0,0,0,0,0
248,308234,1000,18.233666,10000000,0.0,0,38000000,2018,5,1,...,0,0,0,0,0,0,0,0,1,0
1192,961,10,9.908393,100000,0.0,0,1300000,2018,1,1,...,0,0,0,0,0,1,0,0,0,0
298,1,0,0.0,100,0.0,0,2300000,2017,1,1,...,0,0,0,0,0,0,0,0,1,0
574,395,10,8.625709,100000,0.0,0,2600000,2018,4,1,...,0,0,0,0,0,0,0,0,0,1


In [8]:
X_test1.head()

Unnamed: 0,reviews,review_c,review_log,installs_num,price_num,price_b,size_num,last_update_year,latest_ver_h,x0_Everyone,...,x0_Mature 17+,x0_Teen,x1_FAMILY,x1_FINANCE,x1_GAME,x1_HEALTH_AND_FITNESS,x1_LIFESTYLE,x1_MEDICAL,x1_OTHER,x1_TOOLS
183,227401,1000,17.794879,5000000,0.0,0,91000000,2018,1,1,...,0,0,1,0,0,0,0,0,0,0
1204,23,0,4.523562,10000,0.0,0,17000000,2016,1,1,...,0,0,1,0,0,0,0,0,0,0
877,70,0,6.129283,1000,0.0,0,3200000,2018,1,1,...,0,0,0,0,0,0,0,0,0,1
1948,304,10,8.247928,10000,0.0,0,4700000,2018,1,0,...,0,1,1,0,0,0,0,0,0,0
218,1312936,50000,20.324365,10000000,0.0,0,21000000,2018,3,1,...,0,0,0,0,0,0,0,0,1,0


#### Tuning RandomForest Model with Grid Search

- With One Hot Encoded data
- Best 'max_features': 5
- Best 'n_estimators': 100

In [9]:
rf = RandomForestClassifier(criterion = 'entropy',
                            random_state = 99,
                            bootstrap = True)

In [10]:
param_grid = {
    'n_estimators': [50, 75, 100, 125, 150, 175, 200],
    'max_features': [4, 6, 8, 10],
}

In [11]:
gs1 = GridSearchCV(rf, param_grid, cv = 3)

In [12]:
gs1.fit(X_train1, y_train1)
print("best params:", gs1.best_params_)

best params: {'max_features': 10, 'n_estimators': 125}


In [13]:
gs2 = GridSearchCV(rf, param_grid, cv = 4)

In [14]:
gs2.fit(X_train1, y_train1)
print("best params:", gs2.best_params_)

best params: {'max_features': 4, 'n_estimators': 75}


In [15]:
gs3 = GridSearchCV(rf, param_grid, cv = 5)

In [16]:
gs3.fit(X_train1, y_train1)
print("best params:", gs3.best_params_)

best params: {'max_features': 10, 'n_estimators': 200}


#### Basic RandomForest Model 

In [17]:
rf1 = RandomForestClassifier(criterion = 'entropy',
                             n_estimators = 125,
                             max_features = 10, 
                             random_state = 99,
                             bootstrap = True)

In [18]:
rf1.fit(X_train1, y_train1)

RandomForestClassifier(criterion='entropy', max_features=10, n_estimators=125,
                       random_state=99)

In [19]:
y_pred1 = rf1.predict(X_test1)

In [20]:
print(f"The accuracy of the model is {round(accuracy_score(y_test1, y_pred1),3)*100} %")

The accuracy of the model is 73.0 %


In [21]:
confusion_matrix(y_test1, y_pred1, labels = [1, 0])

array([[370,  53],
       [107,  63]], dtype=int64)

In [22]:
y_test1.value_counts()

1    423
0    170
Name: popularity_b, dtype: int64

In [23]:
y_pred1 = pd.DataFrame(y_pred1)
y_pred1.value_counts()

1    477
0    116
dtype: int64

#### Check Variable Importance

In [24]:
# rf.feature_importances_
ft_imp1 = pd.Series(rf1.feature_importances_, 
                    index = X_train1.columns).sort_values(ascending=False)
print(ft_imp1.head(25))

reviews                  0.194002
review_log               0.190796
size_num                 0.184052
installs_num             0.112889
latest_ver_h             0.073851
last_update_year         0.051488
x1_OTHER                 0.024811
price_num                0.022653
x1_FAMILY                0.021813
review_c                 0.017129
x0_Everyone              0.014919
x1_TOOLS                 0.014181
x1_LIFESTYLE             0.012259
x1_MEDICAL               0.009928
x0_Teen                  0.009865
x1_FINANCE               0.009214
x1_GAME                  0.009159
price_b                  0.008950
x1_HEALTH_AND_FITNESS    0.007255
x0_Mature 17+            0.006407
x0_Everyone 10+          0.004379
dtype: float64


### Label Encode Data

In [25]:
X_le = pd.read_csv('X_le.csv')

In [26]:
## Create training and test splits
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_le, y, test_size = 0.3, random_state = 101, stratify = y)

In [27]:
X_train2.head()

Unnamed: 0,reviews,review_c,review_log,installs_num,price_num,price_b,size_num,last_update_year,latest_ver_h,suitable_for,category_b
460,8,0,3.0,1000,0.0,0,31000000,2018,1,0,1
248,308234,1000,18.233666,10000000,0.0,0,38000000,2018,5,0,6
1192,961,10,9.908393,100000,0.0,0,1300000,2018,1,0,3
298,1,0,0.0,100,0.0,0,2300000,2017,1,0,6
574,395,10,8.625709,100000,0.0,0,2600000,2018,4,0,7


In [28]:
X_test2.head()

Unnamed: 0,reviews,review_c,review_log,installs_num,price_num,price_b,size_num,last_update_year,latest_ver_h,suitable_for,category_b
183,227401,1000,17.794879,5000000,0.0,0,91000000,2018,1,0,0
1204,23,0,4.523562,10000,0.0,0,17000000,2016,1,0,0
877,70,0,6.129283,1000,0.0,0,3200000,2018,1,0,7
1948,304,10,8.247928,10000,0.0,0,4700000,2018,1,3,0
218,1312936,50000,20.324365,10000000,0.0,0,21000000,2018,3,0,6


#### Tuning RandomForest Model(Grid Search) with Encode(pipe)

In [29]:
features_to_encode = list(X_train2.columns)[-2:]
feature_to_scale = list(X_train2.columns)[:-2]

In [30]:
transformer = ColumnTransformer(
    transformers = [
        ("num", StandardScaler(), feature_to_scale), 
        ("cat", OneHotEncoder(), features_to_encode)])

In [31]:
rf = RandomForestClassifier(criterion = 'entropy',
                            random_state = 99,
                            bootstrap = True)

In [32]:
rf_pipe1 = Pipeline(steps = [("prep", transformer),
                            ("rf", rf)])

In [33]:
param_grid = {
    'rf__n_estimators': [50, 75, 100, 125, 150, 175, 200],
    'rf__max_features': [4, 6, 8, 10],
}

In [34]:
gs_pipe1 = GridSearchCV(rf_pipe1, param_grid = param_grid, cv = 3)

In [35]:
gs_pipe1.fit(X_train2, y_train2)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['reviews',
                                                                          'review_c',
                                                                          'review_log',
                                                                          'installs_num',
                                                                          'price_num',
                                                                          'price_b',
                                                                          'size_num',
                                                                          'last_update_year',
                                                             

In [36]:
gs_pipe1.best_params_

{'rf__max_features': 10, 'rf__n_estimators': 100}

In [37]:
y_pred2 = gs_pipe1.predict(X_test2)

In [38]:
print(f"The accuracy of the model is {round(accuracy_score(y_test2, y_pred2),3)*100} %")

The accuracy of the model is 74.0 %


In [39]:
confusion_matrix(y_test2, y_pred2, labels = [1, 0])

array([[371,  52],
       [102,  68]], dtype=int64)

In [40]:
y_test2.value_counts()

1    423
0    170
Name: popularity_b, dtype: int64

In [41]:
pd.DataFrame(y_pred2).value_counts()

1    473
0    120
dtype: int64

#### Basic RandomForest Model with Encode (pipeline) - Approach 1

In [42]:
# OneHotpipe = make_pipeline(OneHotEncoder(sparse = False, handle_unknown = 'ignore'))
# OneHotpipe = make_pipeline(OneHotEncoder())
# OneHotpipe = OneHotEncoder()

In [43]:
# transformer = ColumnTransformer(transformers=[("oh_encode", OneHotpipe, features_to_encode)])
transformer = ColumnTransformer(
    transformers = [
        ("num", StandardScaler(), feature_to_scale), 
        ("cat", OneHotEncoder(), features_to_encode)])

In [44]:
rf_pipe2 = Pipeline(steps = [("prep", transformer),
                            ("rf", rf)])

In [45]:
rf_pipe2.fit(X_train2, y_train2)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['reviews', 'review_c',
                                                   'review_log', 'installs_num',
                                                   'price_num', 'price_b',
                                                   'size_num',
                                                   'last_update_year',
                                                   'latest_ver_h']),
                                                 ('cat', OneHotEncoder(),
                                                  ['suitable_for',
                                                   'category_b'])])),
                ('rf',
                 RandomForestClassifier(criterion='entropy', random_state=99))])

In [46]:
y_pred3 = rf_pipe2.predict(X_test2)

In [47]:
print(f"The accuracy of the model is {round(accuracy_score(y_test2, y_pred3),3)*100} %")

The accuracy of the model is 73.5 %


In [48]:
confusion_matrix(y_test2, y_pred3, labels=[1, 0])

array([[375,  48],
       [109,  61]], dtype=int64)

In [49]:
y_test2.value_counts()

1    423
0    170
Name: popularity_b, dtype: int64

In [50]:
pd.DataFrame(y_pred3).value_counts()

1    484
0    109
dtype: int64

#### Basic RandomForest Model with Encode (pipeline) - Approach 2

In [51]:
## one-hot-encode categorical variables
col_trans = make_column_transformer(
                        (OneHotEncoder(handle_unknown = 'ignore'), features_to_encode),
                        remainder = "passthrough")
# col_trans

In [52]:
rf_pipe3 = make_pipeline(col_trans, rf)
rf_pipe3.fit(X_train2, y_train2)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['suitable_for',
                                                   'category_b'])])),
                ('randomforestclassifier',
                 RandomForestClassifier(criterion='entropy', random_state=99))])

In [53]:
y_pred4 = rf_pipe3.predict(X_test2)

In [54]:
# print(f"The accuracy of the model is {round(accuracy_score(y_test2,y_pred4),3)*100} %")
print(f"The accuracy of the model is {round(rf_pipe3.score(X_test2, y_test2),3)*100} %")

The accuracy of the model is 73.4 %


In [55]:
confusion_matrix(y_test2, y_pred4, labels=[1, 0])

array([[377,  46],
       [112,  58]], dtype=int64)

In [56]:
y_test2.value_counts()

1    423
0    170
Name: popularity_b, dtype: int64

In [57]:
y_pred4 = pd.DataFrame(y_pred4)
y_pred4.value_counts()

1    489
0    104
dtype: int64

#### Check Variable Importance

In [58]:
## Variable Importance
ft_imp2 = pd.Series(rf_pipe3.steps[1][1].feature_importances_ ).sort_values(ascending=False)
print(ft_imp2.head(20))

12    0.195658
14    0.182455
18    0.172168
15    0.110790
20    0.077832
19    0.058280
13    0.031897
10    0.022287
16    0.021488
4     0.019798
0     0.012909
11    0.012630
6     0.011498
8     0.011381
3     0.010436
9     0.010076
17    0.009985
5     0.008377
7     0.007655
2     0.007082
dtype: float64


In [59]:
# rf_pipe3.steps[0][1].transformers_[0][1].get_feature_names(features_to_encode)
lepipe_var_list = rf_pipe3['columntransformer'].transformers_[0][1].get_feature_names(features_to_encode)
rf_pipe3['columntransformer'].transformers_[0][1].get_feature_names(features_to_encode)

array(['suitable_for_0', 'suitable_for_1', 'suitable_for_2',
       'suitable_for_3', 'category_b_0', 'category_b_1', 'category_b_2',
       'category_b_3', 'category_b_4', 'category_b_5', 'category_b_6',
       'category_b_7'], dtype=object)

#### Variable Importance with Variable Names

In [60]:
lepipe_var_list = list(lepipe_var_list)
lepipe_var_list.extend(feature_to_scale)
lepipe_var_list


['suitable_for_0',
 'suitable_for_1',
 'suitable_for_2',
 'suitable_for_3',
 'category_b_0',
 'category_b_1',
 'category_b_2',
 'category_b_3',
 'category_b_4',
 'category_b_5',
 'category_b_6',
 'category_b_7',
 'reviews',
 'review_c',
 'review_log',
 'installs_num',
 'price_num',
 'price_b',
 'size_num',
 'last_update_year',
 'latest_ver_h']

In [61]:
## Variable Importance with variable names
ft_imp3 = pd.Series(rf_pipe3.steps[1][1].feature_importances_, index = lepipe_var_list).sort_values(ascending=False)
print(ft_imp3.head(20))

reviews             0.195658
review_log          0.182455
size_num            0.172168
installs_num        0.110790
latest_ver_h        0.077832
last_update_year    0.058280
review_c            0.031897
category_b_6        0.022287
price_num           0.021488
category_b_0        0.019798
suitable_for_0      0.012909
category_b_7        0.012630
category_b_2        0.011498
category_b_4        0.011381
suitable_for_3      0.010436
category_b_5        0.010076
price_b             0.009985
category_b_1        0.008377
category_b_3        0.007655
suitable_for_2      0.007082
dtype: float64


## Modeling (sklearn) - RandomForestClassifier

In [63]:
# ! pip install xgboost

In [64]:
## load needed packages
import xgboost as xgb
from

In [65]:
## display xgboost version
print(xgb.__version__)

1.6.2


### One Hot Encoded Data

In [71]:
dtrain = xgb.DMatrix(X_train1, label = y_train1)

In [74]:
m = len(X_train1.columns)

21

In [75]:
params = dict(
    objective = "reg:squarederror",
    learning_rate = 0.1,
    num_parallel_tree = 500,
    subsample = 0.7,
    colsample_bynode = int(np.sqrt(m))/m,
    reg_lambda = 0,
    max_depth = 20,
    min_child_weight = 2)

In [76]:
rf_xgb = xgb.train(params, dtrain, num_boost_round = 1)

In [77]:
preds = rf_xgb.predict(xgb.DMatrix(X_test1))
# preds

In [80]:
print(f"RMSE: {mean_squared_error(y_test1, preds):.03f}")

RMSE: 0.234


In [105]:
preds = pd.DataFrame(preds)
y_pred5 = preds[0].apply(lambda x: 1 if x > 0.5 else 0)

In [106]:
print(f"The accuracy of the model is {round(accuracy_score(y_test1, y_pred5),3)*100} %")

The accuracy of the model is 71.3 %
