In [39]:
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as skl
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report 
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RandomizedSearchCV

<h2>Load data from csv</h2>

In [2]:
# load training and testing data
training_data = pd.read_csv("train.csv").set_index("id")
prediction_data = pd.read_csv("test.csv").set_index("id")

In [3]:
training_data["playtime_forever"].describe()

count    357.000000
mean       3.119234
std       11.213114
min        0.000000
25%        0.000000
50%        0.083333
75%        1.616667
max      113.800000
Name: playtime_forever, dtype: float64

In [4]:
combined_data = training_data.append(prediction_data, ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [5]:
combined_data.tail()

Unnamed: 0,categories,genres,is_free,playtime_forever,price,purchase_date,release_date,tags,total_negative_reviews,total_positive_reviews
442,"Single-player,Steam Achievements,Full controll...",Action,False,,5000.0,"Mar 23, 2018",22-May-13,"Western,FPS,Action,Story Rich,Bullet Time,Shoo...",516.0,9334.0
443,Single-player,"Action,Adventure",False,,3600.0,"Mar 3, 2018",14-Sep-09,"Adventure,Action,Ninja,Stealth,Cute,Singleplay...",176.0,1245.0
444,"Single-player,Multi-player,Cross-Platform Mult...","Simulation,Strategy",False,,11200.0,"Feb 18, 2018",13-Aug-13,"Grand Strategy,Strategy,Historical,Simulation,...",6350.0,33128.0
445,"Single-player,Multi-player,Co-op,Online Co-op,...","Action,Adventure,RPG,Simulation,Sports,Strategy",True,,0.0,"Nov 19, 2017",19-Jun-17,"VR,RPG,Action,Sports,Simulation,Adventure,Stra...",0.0,5.0
446,"Single-player,Multi-player,Steam Achievements,...","Action,Strategy",False,,26800.0,"Nov 25, 2018",28-Sep-17,"Strategy,Fantasy,Turn-Based Strategy,RTS,Grand...",2594.0,17777.0


<h2>Replace Null Value</h2>

In [6]:
combined_data.isnull().sum(axis=0)

categories                 0
genres                     0
is_free                    0
playtime_forever          90
price                      0
purchase_date              4
release_date               0
tags                       0
total_negative_reviews     4
total_positive_reviews     4
dtype: int64

In [7]:
combined_data = combined_data.fillna(value={
    "total_positive_reviews": combined_data["total_positive_reviews"].mean(),
    "total_negative_reviews": combined_data["total_negative_reviews"].mean(),
    "purchase_date": combined_data["purchase_date"].mode()[0]})

#training_data.dropna(how='any', axis=0, inplace=True)
combined_data.isnull().sum(axis=0)

categories                 0
genres                     0
is_free                    0
playtime_forever          90
price                      0
purchase_date              0
release_date               0
tags                       0
total_negative_reviews     0
total_positive_reviews     0
dtype: int64

In [8]:
combined_data.drop(["is_free"], axis=1, inplace=True)

<h2>One Hot Encoding</h2>

In [9]:
dummie_genres = combined_data["genres"].str.get_dummies(sep=',')
dummie_categories = combined_data["categories"].str.get_dummies(sep=',')
dummie_tags = combined_data["tags"].str.get_dummies(sep=',')

In [10]:
combined_data.drop(["genres", "categories", "tags"], axis=1, inplace=True)
combined_data = pd.concat([combined_data, dummie_genres, dummie_categories, dummie_tags], axis=1)

<h2>Date Convertion</h2>

<h3>To Datetime</h3>

In [11]:
purchase_date = pd.to_datetime(combined_data["purchase_date"])
release_date = pd.to_datetime(combined_data["release_date"])

combined_data.drop(["purchase_date", "release_date"], axis=1, inplace=True)

combined_data = pd.concat([combined_data, purchase_date, release_date], axis=1)

In [12]:
combined_data["purchase_date"].describe()

count                     447
unique                    204
top       2019-06-27 00:00:00
freq                       16
first     2015-08-03 00:00:00
last      2019-09-07 00:00:00
Name: purchase_date, dtype: object

In [13]:
combined_data["release_date"].describe()

count                     447
unique                    379
top       2016-04-05 00:00:00
freq                        4
first     2006-08-23 00:00:00
last      2019-07-30 00:00:00
Name: release_date, dtype: object

<h3>Calculate number of days difference</h3>

In [14]:
combined_data["purchase_release_difference"] = (combined_data["purchase_date"] - combined_data["release_date"]).dt.days

In [15]:
NOW=dt.datetime(2019,10,31) 
combined_data["release_days"] = (NOW - combined_data["release_date"]).dt.days
combined_data["purchase_days"] = (NOW - combined_data["purchase_date"]).dt.days

In [16]:
combined_data

Unnamed: 0,playtime_forever,price,total_negative_reviews,total_positive_reviews,Action,Adventure,Animation & Modeling,Audio Production,Casual,Design & Illustration,...,Western,World War I,World War II,Zombies,eSports,purchase_date,release_date,purchase_release_difference,release_days,purchase_days
0,0.000000,3700.0,96.000000,372.000000,0,1,0,0,1,0,...,0,0,0,0,0,2018-07-02,2013-12-10,1665,2151,486
1,0.016667,0.0,0.000000,23.000000,0,0,0,0,0,0,...,0,0,0,0,0,2016-11-26,2015-08-12,472,1541,1069
2,0.000000,5000.0,663.000000,3018.000000,0,1,0,0,1,0,...,0,0,0,0,0,2018-07-02,2014-01-28,1616,2102,486
3,1.533333,9900.0,1746.000000,63078.000000,1,0,0,0,0,0,...,0,0,0,0,0,2016-11-28,2010-03-31,2434,3501,1067
4,22.333333,4800.0,523.000000,8841.000000,1,0,0,0,0,0,...,0,0,0,0,0,2018-03-04,2012-07-30,2043,2649,606
5,2.933333,25800.0,3012.805869,14519.939052,0,0,0,0,0,0,...,0,0,0,0,0,2019-06-27,2019-03-26,93,219,126
6,2.616667,3600.0,1035.000000,39776.000000,1,0,0,0,0,0,...,0,0,0,0,0,2018-02-16,2012-10-23,1942,2564,622
7,0.150000,0.0,3.000000,33.000000,0,0,0,0,0,0,...,0,0,0,0,0,2018-01-29,2016-04-28,641,1281,640
8,0.016667,4000.0,39.000000,1875.000000,0,1,0,0,0,0,...,0,0,0,0,0,2018-11-23,2014-02-25,1732,2074,342
9,0.016667,8800.0,11.000000,47.000000,1,1,0,0,0,0,...,0,0,0,0,0,2017-12-22,2017-10-17,66,744,678


<h2> Normalization </h2>

<h3>Total number of reviews</h3>

In [17]:
combined_data["total_positive_reviews"] = combined_data["total_positive_reviews"]/combined_data["release_days"]
combined_data["total_negative_reviews"] = combined_data["total_negative_reviews"]/combined_data["release_days"]

<h3>All features - mean / std</h3>

In [18]:
combined_data.drop(["release_date", "purchase_date"], axis=1, inplace=True)

In [19]:
features = combined_data.loc[:, combined_data.columns != 'playtime_forever'] 
combined_data.loc[:, combined_data.columns != 'playtime_forever'] = (features-features.mean())/features.std() 

In [20]:
combined_data

Unnamed: 0,playtime_forever,price,total_negative_reviews,total_positive_reviews,Action,Adventure,Animation & Modeling,Audio Production,Casual,Design & Illustration,...,Wargame,Warhammer 40K,Western,World War I,World War II,Zombies,eSports,purchase_release_difference,release_days,purchase_days
0,0.000000,-0.065995,-0.105052,-0.298073,-0.974595,0.936048,-0.047298,-0.047298,2.814647,-0.066965,...,-0.047298,-0.066965,-0.082107,-0.066965,-0.134842,-0.21046,-0.066965,0.719975,0.581348,-0.356158
1,0.016667,-0.070480,-0.106476,-0.301859,-0.974595,-1.065931,-0.047298,-0.047298,-0.354490,-0.066965,...,-0.047298,-0.066965,-0.082107,-0.066965,-0.134842,-0.21046,-0.066965,-0.589557,-0.060054,1.659373
2,0.000000,-0.064419,-0.096409,-0.267810,-0.974595,0.936048,-0.047298,-0.047298,2.814647,-0.066965,...,-0.047298,-0.066965,-0.082107,-0.066965,-0.134842,-0.21046,-0.066965,0.666189,0.529826,-0.356158
3,1.533333,-0.058479,-0.090558,0.129543,1.023772,-1.065931,-0.047298,-0.047298,-0.354490,-0.066965,...,-0.047298,-0.066965,-0.082107,-0.066965,-0.134842,-0.21046,-0.066965,1.564091,2.000846,1.652459
4,22.333333,-0.064661,-0.100174,-0.222238,1.023772,-1.065931,-0.047298,-0.047298,-0.354490,-0.066965,...,-0.047298,-0.066965,-0.082107,-0.066965,-0.134842,-0.21046,-0.066965,1.134898,1.104985,0.058702
5,2.933333,-0.039205,0.332630,1.286614,-0.974595,-1.065931,-0.047298,-0.047298,-0.354490,-0.066965,...,-0.047298,-0.066965,-0.082107,-0.066965,-0.134842,-0.21046,-0.066965,-1.005578,-1.450111,-1.600740
6,2.616667,-0.066116,-0.093592,0.069541,1.023772,-1.065931,-0.047298,-0.047298,-0.354490,-0.066965,...,-0.047298,-0.066965,-0.082107,-0.066965,-0.134842,-0.21046,-0.066965,1.024032,1.015610,0.114017
7,0.150000,-0.070480,-0.106401,-0.301600,-0.974595,-1.065931,-0.047298,-0.047298,-0.354490,-0.066965,...,-0.047298,-0.066965,-0.082107,-0.066965,-0.134842,-0.21046,-0.066965,-0.404049,-0.333439,0.176246
8,0.016667,-0.065631,-0.105876,-0.280552,-0.974595,0.936048,-0.047298,-0.047298,-0.354490,-0.066965,...,-0.047298,-0.066965,-0.082107,-0.066965,-0.134842,-0.21046,-0.066965,0.793519,0.500384,-0.853991
9,0.016667,-0.059812,-0.106004,-0.300703,1.023772,0.936048,-0.047298,-0.047298,-0.354490,-0.066965,...,-0.047298,-0.066965,-0.082107,-0.066965,-0.134842,-0.21046,-0.066965,-1.035215,-0.898084,0.307619


<h2>Pre-training spliting data</h2>

In [21]:
training_data = combined_data.loc[:356, :]
prediction_data = combined_data.loc[357:, combined_data.columns != 'playtime_forever']
prediction_data.reset_index(drop=True, inplace=True)

In [22]:
X = training_data.drop(["playtime_forever"], axis=1)
y = training_data["playtime_forever"]

In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
print(len(X_train))
print(len(X_test))

285
72


In [24]:
X.columns
#print(len(X.columns))

Index(['price', 'total_negative_reviews', 'total_positive_reviews', 'Action',
       'Adventure', 'Animation & Modeling', 'Audio Production', 'Casual',
       'Design & Illustration', 'Early Access',
       ...
       'Wargame', 'Warhammer 40K', 'Western', 'World War I', 'World War II',
       'Zombies', 'eSports', 'purchase_release_difference', 'release_days',
       'purchase_days'],
      dtype='object', length=372)

<h2> Training and testing</h2>

<h3>Linear Regression</h3>

In [29]:
model = skl.LinearRegression()
model.fit(X_train, y_train)

predict_y = model.predict(X_train).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y_train, predict_y)))

predict_y = model.predict(X_test).clip(min=0)
print("testing error:", np.sqrt(mean_squared_error(y_test, predict_y)))

training error: 1.2511019232597893e-13
testing error: 78.24439741364478


In [33]:
selector = RFE(model, 200, step=1)
selector.fit(X_train, y_train)
#print(selector.support_ )
#print(selector.ranking_ )
predict_y = selector.predict(X_train).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y_train, predict_y)))

predict_y = selector.predict(X_test).clip(min=0)
print("testing error:", np.sqrt(mean_squared_error(y_test, predict_y)))

training error: 2.8996260741919686
testing error: 12846048488725.22


In [34]:
# predict_submission = model.predict(prediction_data).clip(min=0)
# submission = pd.read_csv("samplesubmission.csv")
# submission["playtime_forever"] = predict_submission
# submission.to_csv("submission_linear.csv", index=False)

<h3>SGD Regressor</h3>

In [37]:
model = skl.SGDRegressor()
model.fit(X_train, y_train)

predict_y = model.predict(X_train).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y_train, predict_y)))

predict_y = model.predict(X_test).clip(min=0)
print("testing error:", np.sqrt(mean_squared_error(y_test, predict_y)))

training error: 374.98773597113586
testing error: 124.82870297725573


<h3>Random forest</h3>

In [38]:
model = RandomForestRegressor(max_depth=10, max_leaf_nodes=200, n_estimators=10) #, max_features=len(X.columns))
model.fit(X_train, y_train)

predict_y = model.predict(X_train).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y_train, predict_y)))

predict_y = model.predict(X_test).clip(min=0)
print("testing error:", np.sqrt(mean_squared_error(y_test, predict_y)))

training error: 4.0153650711083015
testing error: 15.179608670482018


In [57]:
model2 = RandomForestRegressor(max_depth=10, max_leaf_nodes=200, n_estimators=10) #, max_features=len(X.columns))
model2.fit(X, y)

predict_y = model2.predict(X).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y, predict_y)))

training error: 4.4975013540295254


In [58]:
predict_submission = model2.predict(prediction_data).clip(min=0)
submission = pd.read_csv("samplesubmission.csv")
submission["playtime_forever"] = predict_submission
submission.to_csv("submission_random_forest2.csv", index=False)

In [40]:
predict_submission = model.predict(prediction_data).clip(min=0)
submission = pd.read_csv("samplesubmission.csv")
submission["playtime_forever"] = predict_submission
submission.to_csv("submission_random_forest.csv", index=False)

In [25]:
cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

array([ -76.42402015,  -37.16199391, -145.19753735, -295.89364315,
        -43.46165917])

In [26]:
model.get_params().keys()

dict_keys(['bootstrap', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [41]:
random_grid = {'n_estimators': [10, 50, 100, 200, 600, 1000, 1400, 1800],
               'max_features': ['auto', 'sqrt', 'log2'],
               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4],
               'bootstrap': [True, False]}

rf_random = RandomizedSearchCV(estimator = RandomForestRegressor(), param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

predict_y = rf_random.predict(X_train).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y_train, predict_y)))

predict_y = rf_random.predict(X_test).clip(min=0)
print("testing error:", np.sqrt(mean_squared_error(y_test, predict_y)))

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.6min finished


training error: 3.6141676501464044
testing error: 23.095461059734934


In [61]:
rf_random = RandomizedSearchCV(estimator = RandomForestRegressor(), param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X, y)

predict_y = rf_random.predict(X).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y, predict_y)))

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 167 tasks      | elapsed:  2.4min


training error: 7.3177682647751805


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.5min finished


In [62]:
print(rf_random.best_params_,"\n")
print(rf_random.best_score_ ,"\n")

{'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True} 

0.1471401365868333 



In [49]:
tuned_parameters = [{'n_estimators': [100, 800, 1000, 1600, 1700, 1800, 2000, 2200],
                   'max_features': ['auto'],
                   'max_depth': [10, 20, 50, 80, 100, None],
                   'min_samples_split': [8, 10, 12],
                   'min_samples_leaf': [1, 2, 3, 5],
                   'bootstrap': [False]}]
tuned_model = GridSearchCV(RandomForestRegressor(), tuned_parameters, cv=5,
                   scoring='neg_mean_squared_error', n_jobs = -1, verbose = 2)

tuned_model.fit(X, y)
predict_y = tuned_model.predict(X_train).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y_train, predict_y)))

predict_y = tuned_model.predict(X_test).clip(min=0)
print("testing error:", np.sqrt(mean_squared_error(y_test, predict_y)))

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 60.9min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 91.5min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 130.2min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 175.8min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed: 193.6min finished


training error: 3.6141207892788723
testing error: 23.09592097521368


In [50]:
print(tuned_model.best_params_,"\n")
print(tuned_model.best_score_ ,"\n")

{'bootstrap': False, 'max_depth': 100, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100} 

-113.49669574433786 



In [51]:
another_model = RandomForestRegressor(bootstrap=False, max_depth=100, max_features='auto', max_leaf_nodes=100, min_samples_leaf=2, min_samples_split=10, n_estimators=100) #, max_features=len(X.columns))

selector = RFECV(estimator=another_model, scoring='neg_mean_squared_error')
selector.fit(X_train, y_train)
#print(selector.support_ )
#print(selector.ranking_ )
print("Optimal number of features : %d" % selector.n_features_)

predict_y = selector.predict(X_train).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y_train, predict_y)))

predict_y = selector.predict(X_test).clip(min=0)
print("testing error:", np.sqrt(mean_squared_error(y_test, predict_y)))



Optimal number of features : 7
training error: 3.672088442590179
testing error: 23.166036865686998


In [52]:
predict_submission = tuned_model.predict(prediction_data).clip(min=0)
submission = pd.read_csv("samplesubmission.csv")
submission["playtime_forever"] = predict_submission
submission.to_csv("submission_random_forest_tuned.csv", index=False)

In [53]:
# predict_submission = selector.predict(prediction_data).clip(min=0)
# submission = pd.read_csv("samplesubmission.csv")
# submission["playtime_forever"] = predict_submission
# submission.to_csv("submission_random_forest_selector.csv", index=False)

<h3>Ada Boosting</h3>

In [54]:
model = AdaBoostRegressor()
model.fit(X_train, y_train)

predict_y = model.predict(X_train).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y_train, predict_y)))

predict_y = model.predict(X_test).clip(min=0)
print("testing error:", np.sqrt(mean_squared_error(y_test, predict_y)))

training error: 5.693738641077654
testing error: 19.525774118626327


In [55]:
predict_y

array([ 6.32663551, 10.82205589,  6.4162037 ,  8.09358178, 16.13099415,
        4.35149051,  7.55892857,  7.9016835 ,  4.72316017,  6.4162037 ,
        6.32663551,  3.77725989,  6.32663551,  5.2031746 ,  5.12361111,
       97.925     ,  4.39434251, 19.82857143,  1.64679089,  6.56369048,
        4.99218213,  4.39434251,  5.2031746 ,  6.32663551,  8.39530516,
        7.3962963 ,  6.4162037 ,  6.32663551,  6.32663551, 10.74097222,
        5.2031746 ,  9.76526104,  7.956621  ,  6.32663551,  8.39530516,
        6.32663551, 10.74097222,  3.77725989, 31.50576923,  6.65921053,
        5.12361111,  6.4162037 ,  6.32663551,  6.56369048,  8.09358178,
        7.62682648,  6.32663551, 72.74      ,  6.56369048,  6.65921053,
       71.82307692,  8.39530516,  6.32663551,  7.62682648,  4.39434251,
        7.956621  ,  8.39530516,  7.62682648,  8.09358178, 11.55866667,
        5.2031746 ,  6.65921053,  5.2031746 ,  6.65921053,  6.91142857,
        5.12361111,  5.2031746 ,  5.2031746 ,  5.2031746 , 79.72

In [56]:
selector = RFE(model, 200, step=1)
selector.fit(X_train, y_train)
#print(selector.support_ )
#print(selector.ranking_ )
predict_y = selector.predict(X_train).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y_train, predict_y)))

predict_y = selector.predict(X_test).clip(min=0)
print("testing error:", np.sqrt(mean_squared_error(y_test, predict_y)))

training error: 5.08739314909401
testing error: 20.66800305146721


In [270]:
# predict_submission = model.predict(prediction_data).clip(min=0)
# submission = pd.read_csv("samplesubmission.csv")
# submission["playtime_forever"] = predict_submission
# submission.to_csv("submission_adaboost.csv", index=False)