In [1]:
import numpy as np
import pandas as pd
import warnings; warnings.filterwarnings('ignore')

# Load the Google Play Store listing; the first CSV column is used as the row index
df = pd.read_csv(
    'googleplaystore.csv',
    index_col=0,
    encoding='utf8',
)

In [2]:
# Preview the first three rows of the raw frame
df.head(n=3)

Unnamed: 0_level_0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
App,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
"U Launcher Lite – FREE Live Cool Themes, Hide Apps",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up


# Data cleaning

In [3]:
# Google Play
# Remove unusable rows: missing values, exact duplicates, and apps without a reported size
df = df.dropna()
df = df.drop_duplicates().reset_index(drop=True)
# .copy() gives the filtered frame its own data, so the column assignments below
# write to df itself instead of to a slice of the pre-filter frame
df = df[df['Size'] != 'Varies with device'].copy()


# Convert Size to a single unit (KB)
def _size_to_kb(text):
    """Convert a size string ending in 'M' (megabytes) or 'k' (kilobytes) to KB, rounded to 0.1."""
    if text.endswith("M"):
        return round(float(text.replace("M", "")) * 1024, 1)
    # Every remaining value is expected to carry the 'k' suffix
    return round(float(text.replace("k", "")), 1)


df['Size'] = df['Size'].map(_size_to_kb)

# Clean the numeric variables
df['Reviews'] = pd.to_numeric(df['Reviews'])
# Price is stored as text with a '$' sign, e.g. '$4.99'
df['Price'] = pd.to_numeric(df['Price'].str.strip('$'))
# Installs is stored as text such as '10,000+': drop the '+' and the thousands separators
df['Installs'] = pd.to_numeric(
    df['Installs'].str.strip('+').str.replace(',', '', regex=False)
)

In [4]:
# Rows reporting 'Varies with device' are given the fallback value '4.1 and up'.
# A single .loc assignment is used because chained indexing (df[col][mask] = value)
# writes to an intermediate object and has no effect under pandas Copy-on-Write.
df.loc[df['Android Ver'] == 'Varies with device', 'Android Ver'] = '4.1 and up'
# The first character of the version string is used as the integer major version
df['android_ver_int'] = df['Android Ver'].str[0:1].astype(int)

In [5]:
from datetime import datetime,date
# Number of whole days between each app's last update and the day this cell is run.
# NOTE: the result depends on the run date, so values change when re-run on another day.
temp=pd.to_datetime(df['Last Updated'])
# Take the reference date once so every row is measured against the same day
today = pd.Timestamp(date.today())
df['last_updated_days'] = (today - temp).dt.days

# Variable coding

In [6]:
# import numpy as np
# np.corrcoef(df['Reviews_log'],df['Installs_log'])

In [7]:
# Numerical predictors used by the models ('Installs' is deliberately left out)
features = [
    'Reviews',
    'Size',
    'Price',
    'android_ver_int',
    'last_updated_days',
]

In [8]:
# select categorical features
# Dummy column names, in order of first appearance of each category; the 'cat_' prefix
# matches the names produced by get_dummies(prefix='cat') below
category_list = ['cat_' + word for word in df['Category'].unique().tolist()]
# Make the cell safe to re-run: drop dummy columns left over from a previous execution,
# otherwise df would carry duplicate column names and df[features] would return them twice
df = df.drop(columns=[col for col in category_list if col in df.columns])
df = pd.concat([df, pd.get_dummies(df['Category'], prefix='cat')], axis=1) # drop_first=True
# Only register names that are not already in the feature list (again for re-runs)
features.extend(col for col in category_list if col not in features)

In [9]:
# Binarise the dependent variable around the median rating (4.3)
bins = [0, 4.3, 5.1]
bin_labels = [1, 2]
dv = df['Rating']
# right=False makes the intervals left-closed: [0, 4.3) -> 1 and [4.3, 5.1) -> 2
dv_binned = pd.cut(dv, bins=bins, labels=bin_labels, right=False)
df['Rating_binned'] = dv_binned

In [10]:
# Class balance of the binned target
# unsuccessful: 0-4.2   successful: 4.3-5.0
rating_groups = df['Rating_binned']
print('Original group ratio:')
print(rating_groups.value_counts() / len(rating_groups))

Original group ratio:
2    0.531149
1    0.468851
Name: Rating_binned, dtype: float64


# Classification Model (10-fold cross-validation)

In [11]:
# Keep only the model input columns (numerical features plus category dummies)
df_clean = df.loc[:, features]
df_clean.head(n=3)

Unnamed: 0,Reviews,Size,Price,android_ver_int,last_updated_days,cat_ART_AND_DESIGN,cat_AUTO_AND_VEHICLES,cat_BEAUTY,cat_BOOKS_AND_REFERENCE,cat_BUSINESS,...,cat_SPORTS,cat_TRAVEL_AND_LOCAL,cat_TOOLS,cat_PERSONALIZATION,cat_PRODUCTIVITY,cat_PARENTING,cat_WEATHER,cat_VIDEO_PLAYERS,cat_NEWS_AND_MAGAZINES,cat_MAPS_AND_NAVIGATION
0,159,19456.0,0.0,4,474,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,967,14336.0,0.0,4,466,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,87510,8908.8,0.0,4,268,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Model inputs: X is the feature frame, y the binned rating as a NumPy array
y = np.asarray(df['Rating_binned'])
X = df_clean

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

# `base_estimator=None` and `algorithm='SAMME.R'` only restated library defaults and are
# omitted: `base_estimator` was renamed to `estimator` in scikit-learn 1.2 and has since
# been removed, and 'SAMME.R' was deprecated in 1.4, so passing them fails on current releases.
clf = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=None)

# Mean accuracy over 10 cross-validation folds
print(np.mean(cross_val_score(clf, X, y, cv=10)))

0.5579995358694465


In [14]:
from xgboost import XGBClassifier

# XGBoost with default hyper-parameters, scored by mean accuracy over 10 folds
clf = XGBClassifier()
cv_scores = cross_val_score(clf, X, y, cv=10)

print(np.mean(cv_scores))

0.5697295168844325


In [15]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with logistic activation trained by SGD, scored over 10 folds
clf = MLPClassifier(solver='sgd', activation='logistic')
cv_scores = cross_val_score(clf, X, y, cv=10)

print(np.mean(cv_scores))

0.5394980660571524


In [16]:
from sklearn.linear_model import LogisticRegression

# `multi_class='ovr'` is omitted: the target has only two classes, for which one-vs-rest is
# what the default already does, and the parameter is deprecated since scikit-learn 1.5.
clf = LogisticRegression(solver='lbfgs')

# Mean accuracy over 10 cross-validation folds
print(np.mean(cross_val_score(clf, X, y, cv=10)))

0.5601158242171151


In [17]:
from lightgbm import LGBMClassifier

# LightGBM with default hyper-parameters, scored by mean accuracy over 10 folds
clf = LGBMClassifier()
cv_scores = cross_val_score(clf, X, y, cv=10)

print(np.mean(cv_scores))

0.5571991043391241


In [18]:
from sklearn.ensemble import RandomForestClassifier

# Random forest splitting on information gain (entropy), scored over 10 folds
clf = RandomForestClassifier(criterion='entropy')
cv_scores = cross_val_score(clf, X, y, cv=10)

print(np.mean(cv_scores))

0.5484229780704458


# XGBClassifier (train vs. test)

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [20]:
# Fit a default XGBoost model on the training split and report its training accuracy
clf = XGBClassifier()
clf.fit(X_train, y_train)
train_accuracy = clf.score(X_train, y_train)
print(train_accuracy)

0.6658799730276467


In [21]:
# Accuracy of the fitted model on the held-out test split
test_accuracy = clf.score(X_test, y_test)
print(test_accuracy)

0.6307277628032345
