In [1]:
import numpy as np
import pandas as pd
import warnings; warnings.filterwarnings('ignore')

# Load AppleStore.csv; the file's first column becomes the row index
df = pd.read_csv('AppleStore.csv', index_col=0, encoding='utf8')

# Data cleaning

In [2]:
# iOS
# Drop rows with missing values and duplicate rows, then remove apps whose user rating is 0
df = df.dropna().drop_duplicates()
df = df.loc[df['user_rating'] != 0]

In [3]:
# Use the app name as the row index. The guard keeps this cell re-runnable:
# once 'track_name' has become the index it is no longer a column, so calling
# set_index a second time would raise KeyError.
if df.index.name != 'track_name':
    df = df.set_index('track_name')

In [4]:
# Preview the first three rows of the current frame
df.head(n=3)

Unnamed: 0_level_0,id,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
track_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
PAC-MAN Premium,281656475,100788224,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1
Evernote - stay organized,281796108,158578688,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1
"WeatherBug - Local Weather, Radar, Maps, Alerts",281940292,100524032,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1


# Dummy coding

In [5]:
# Numeric columns that will be fed to the models
features = [
    'rating_count_tot',
    'price',
    'size_bytes',
    'sup_devices.num',
    'ipadSc_urls.num',
    'lang.num',
]

In [6]:
# One-hot encode the app genre and register the resulting columns as features.
# Dummy column names are listed in the order in which genres first appear in the data.
# (drop_first=True was left disabled, so every genre keeps its own column.)
category_list = ['cat_' + genre for genre in df['prime_genre'].unique().tolist()]
genre_dummies = pd.get_dummies(df['prime_genre'], prefix='cat')

# Remove dummy columns left over from an earlier run of this cell before
# concatenating, so re-running never produces duplicated column names.
df = pd.concat(
    [df.drop(columns=genre_dummies.columns, errors='ignore'), genre_dummies],
    axis=1,
)

# Append only names that are not registered yet; a plain extend() would add the
# whole list again on every re-run and duplicate columns in df[features].
features.extend(name for name in category_list if name not in features)

In [7]:
# Turn the rating into a two-class target split at 4.5 (noted as the median):
#   label 1 -> ratings in [0, 4.5), label 2 -> ratings in [4.5, 5.1)
dv = df['user_rating']
bins = [0, 4.5, 5.1]
bin_labels = [1, 2]
# right=False closes each interval on the left, so a rating of exactly 4.5 gets label 2
dv_binned = pd.cut(dv, bins=bins, labels=bin_labels, right=False)
df['user_rating_binned'] = dv_binned

In [8]:
# Share of apps per class (1 = unsuccessful: 0-4.4, 2 = successful: 4.5-5.0)
binned_ratings = df['user_rating_binned']
print('Original group ratio:')
print(binned_ratings.value_counts() / len(binned_ratings))

Original group ratio:
2    0.50335
1    0.49665
Name: user_rating_binned, dtype: float64


# Classification models (10-fold cross-validation)

In [9]:
# Keep only the model input columns and preview the first three rows
df_clean = df.loc[:, features]
df_clean.head(n=3)

Unnamed: 0_level_0,rating_count_tot,price,size_bytes,sup_devices.num,ipadSc_urls.num,lang.num,cat_Games,cat_Productivity,cat_Weather,cat_Shopping,...,cat_Entertainment,cat_Photo & Video,cat_Navigation,cat_Education,cat_Lifestyle,cat_Food & Drink,cat_News,cat_Book,cat_Medical,cat_Catalogs
track_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PAC-MAN Premium,21292,3.99,100788224,38,5,10,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Evernote - stay organized,161065,0.0,158578688,37,5,23,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
"WeatherBug - Local Weather, Radar, Maps, Alerts",188583,0.0,100524032,37,5,3,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Model inputs: X is the feature frame, y the binned rating as a NumPy array
X, y = df_clean, np.array(df['user_rating_binned'])

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Baseline: one-vs-rest logistic regression, score averaged over 10 folds
clf = LogisticRegression(solver='lbfgs', multi_class='ovr')
fold_scores = cross_val_score(clf, X, y, cv=10)
print(fold_scores.mean())

0.5327200985985483


In [12]:
from lightgbm import LGBMClassifier

# LightGBM with default settings, score averaged over 10 folds
clf = LGBMClassifier()
fold_scores = cross_val_score(clf, X, y, cv=10)
print(fold_scores.mean())

0.6445556405457239


In [13]:
from xgboost import XGBClassifier

# XGBoost with default settings, score averaged over 10 folds
clf = XGBClassifier()
fold_scores = cross_val_score(clf, X, y, cv=10)
print(fold_scores.mean())

0.6440761452406247


In [14]:
from sklearn.ensemble import RandomForestClassifier

# Random forest using entropy as the split criterion, score averaged over 10 folds.
# A random forest bootstraps rows and samples features at random, so without a
# seed the score changes on every run; random_state pins it. The value 10 matches
# the seed used for train_test_split in this notebook.
clf = RandomForestClassifier(criterion='entropy', random_state=10)

print(np.mean(cross_val_score(clf, X, y, cv=10)))

0.6169364019955337


# LGBMClassifier (train vs. test)

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

In [16]:
# Fit LightGBM on the training split and report its score on that same split
clf = LGBMClassifier()
clf.fit(X_train, y_train)
train_accuracy = clf.score(X_train, y_train)
print(train_accuracy)

0.8049461507778221


In [17]:
# Score of the fitted model on the held-out test split
test_accuracy = clf.score(X_test, y_test)
print(test_accuracy)

0.6706539074960127


# Interactive demo: describe an app and classify it with the fitted `clf`.
# NOTE: input() blocks until answered, so this cell stalls a non-interactive "Run All".
rating_count = float(input('How many people have rated the app?   '))
price = float(input('What is the price of the app? (USD)   '))
size = float(input('What is the size of the app? (kb)   '))
category = input('What is the category of the app?   ')

# Answers keyed by the training column they correspond to. `clf` was fitted on
# the raw columns (no log transform) and on the size in bytes, so the size
# entered in kb is converted here (assuming 1 kb = 1024 bytes).
user_values = {
    'rating_count_tot': rating_count,
    'price': price,
    'size_bytes': size * 1024,
}

genre_column = 'cat_' + category
if genre_column not in category_list:
    # Same outcome as before (every genre indicator is 0), but no longer silent.
    print('Unknown category: ' + category + ' (all genre indicators set to 0)')

# Build a single row with exactly the columns `clf` was trained on, in the same
# order; predict() needs the same feature set that fit() received.
row = {}
for column in X_train.columns:
    if column in user_values:
        row[column] = user_values[column]
    elif column in category_list:
        row[column] = int(column == genre_column)
    else:
        # Training columns the user is not asked about fall back to the training median.
        row[column] = X_train[column].median()
testx = pd.DataFrame([row], columns=X_train.columns)

# Label 2 is the "successful" bin (rating of 4.5 or higher); label 1 is the rest.
prediction = clf.predict(testx)
if prediction[0] == 2:
    print('Successful!')
else:
    print('Unsuccessful:(')