In [1]:
import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor

# Persistence settings for this notebook.
# save_directory: folder that pickled artifacts are written into.
# save_encoder / save_models: set to True to pickle the encoders / the trained models.
save_directory = "models_and_encoder"
save_encoder = False
save_models = False

In [2]:
# Load the preprocessed sales table and display it.
# The file carries two extra index-like columns ('Unnamed: 0.1', 'Unnamed: 0'),
# presumably left over from earlier CSV exports written with the index included.
df = pd.read_csv('preprocess_data.csv')
df

Unnamed: 0.1,Unnamed: 0,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,0,Console,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,1,Console,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,2,Console,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,3,Console,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,4,Handheld,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...
8103,8103,Handheld,2014.0,Action,Namco Bandai Games,0.00,0.00,0.01,0.00,0.01
8104,8104,Handheld,2006.0,Puzzle,Sony Computer Entertainment,0.00,0.00,0.01,0.00,0.01
8105,8105,Console,2003.0,Puzzle,Ubisoft,0.01,0.00,0.00,0.00,0.01
8106,8106,PC,2004.0,Adventure,Ubisoft,0.01,0.00,0.00,0.00,0.01


In [3]:
# Removes outliers: most sales values sit close to zero, so a handful of
# extreme values would otherwise skew the model results.
def remove_outliers(col_name, df, n_std=2):
    """Drop rows whose value in ``col_name`` lies far from that column's mean.

    A row is kept only when its value is strictly inside
    ``(mean - n_std * std, mean + n_std * std)``; ``mean`` and ``std`` are
    computed from ``df[col_name]`` exactly as passed in. Rows whose value is
    NaN fail both comparisons and are therefore dropped too.

    Parameters
    ----------
    col_name : label of the numeric column to filter on.
    df : pandas.DataFrame to filter; it is not modified.
    n_std : half-width of the kept band, in standard deviations.
        Defaults to 2, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame containing the surviving rows, with their original
    index labels.
    """
    values = df[col_name]
    center = values.mean()
    cut_off = values.std() * n_std
    lower, upper = center - cut_off, center + cut_off

    # Strict inequalities: values exactly on a bound are removed.
    return df[(values < upper) & (values > lower)]

# Sales columns to trim. Global_Sales is not in this list, so it is never
# filtered on directly.
prices = ['NA_Sales','EU_Sales','JP_Sales','Other_Sales']

# Filters are applied one after another: each pass recomputes the mean/std on
# the rows that survived the previous columns. `df` is rebound here, so
# re-running this cell without reloading the CSV trims the data again.
for col in prices:
    df = remove_outliers(col,df)

df.shape

(7171, 10)

In [4]:
# Get categorical data that will be encoded: only the object-dtype columns are
# selected, so numeric columns (Year, the sales figures, the index-like
# columns) are left out of the feature set built from this frame.
categorical_df = df.select_dtypes('O')
categorical_df

Unnamed: 0,Platform,Genre,Publisher
283,Console,Action,Nintendo
336,Console,Fighting,Nintendo
346,Console,Racing,Nintendo
365,Console,Fighting,Nintendo
377,Console,Racing,Nintendo
...,...,...,...
8103,Handheld,Action,Namco Bandai Games
8104,Handheld,Puzzle,Sony Computer Entertainment
8105,Console,Puzzle,Ubisoft
8106,PC,Adventure,Ubisoft


In [5]:
# Label-encode every categorical column. The defaultdict builds one
# LabelEncoder per column name on first access, so `d` ends up holding the
# fitted encoder for each feature.
d = defaultdict(LabelEncoder)
X = categorical_df.apply(lambda column: d[column.name].fit_transform(column))

# Record the distinct raw values observed for each feature
feature_values = {
    feature: set(categorical_df[feature])
    for feature in ('Platform', 'Genre', 'Publisher')
}
X

Unnamed: 0,Platform,Genre,Publisher
283,0,0,4
336,0,2,4
346,0,6,4
365,0,2,4
377,0,6,4
...,...,...,...
8103,1,0,3
8104,1,5,6
8105,0,5,9
8106,2,1,9


In [6]:
# Persist the fitted encoders and the per-feature value sets.
# Paths are built from `save_directory` rather than repeating the folder name,
# and the folder is created first because open(..., "wb") does not create
# missing directories.
if save_encoder:
    os.makedirs(save_directory, exist_ok=True)

    with open(os.path.join(save_directory, "encoder.pkl"), "wb") as f:
        pickle.dump(d, f)

    with open(os.path.join(save_directory, "feature_values.pkl"), "wb") as f:
        pickle.dump(feature_values, f)

In [7]:
# Multi-output target for the models trained on this split: the NA, EU and JP
# sales columns plus Global_Sales (Other_Sales is not used as a target).
y = df[['NA_Sales','EU_Sales','JP_Sales','Global_Sales']]

# Split data into training and testing sets (80% / 20%, fixed random_state so
# the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.2,train_size=.8,random_state=10)

In [8]:
# Rescale the label-encoded features with MinMaxScaler. The scaler is fitted
# on the training split only and then reused for the test split.
# (MinMaxScaler is imported in the notebook's top import cell.)
# NOTE(review): models fitted on this X_train expect inputs that went through
# the same scaler; the visible save cells do not pickle it -- confirm whether
# it needs to be persisted alongside the models.
scaling = MinMaxScaler()
X_train = scaling.fit_transform(X_train)
X_test = scaling.transform(X_test)

In [10]:
# Random Forest Regressor, fitted on all four target columns at once
rf = RandomForestRegressor(min_samples_leaf=10, random_state=10).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# .score() on a regressor is R^2, so the "Accuracy" line is R^2 as a percentage
rf_score_pct = rf.score(X_test, y_test) * 100
rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

print('Accuracy of RF:', rf_score_pct)
print('MSE of RF:', rf_mse)
print('R2 Score of RF:', rf_r2)

Accuracy of RF: 19.0425356575713
MSE of RF: 0.06466577300523042
R2 Score of RF: 0.190425356575713


In [11]:
# K Nearest Neighbors Regressor, fitted on all four target columns at once
knn = KNeighborsRegressor(n_neighbors=10, leaf_size=50).fit(X_train, y_train)
knn_pred = knn.predict(X_test)

# .score() on a regressor is R^2, so the "Accuracy" line is R^2 as a percentage
knn_score_pct = knn.score(X_test, y_test) * 100
knn_mse = mean_squared_error(y_test, knn_pred)
knn_r2 = r2_score(y_test, knn_pred)

print('Accuracy of KNN:', knn_score_pct)
print('MSE of KNN:', knn_mse)
print('R2 Score of KNN:', knn_r2)

Accuracy of KNN: 13.307646205838946
MSE of KNN: 0.0690608038327526
R2 Score of KNN: 0.13307646205838947


In [12]:
# Decision Tree Regressor, fitted on all four target columns at once
dt = DecisionTreeRegressor(min_samples_split=30,random_state=10)
dt.fit(X_train,y_train)
dt_pred = dt.predict(X_test)

# dt.score() is R^2 for regressors; it is printed here as a fraction, not
# multiplied by 100 as in the RF and KNN cells.
print('Accuracy of DT:',dt.score(X_test, y_test))
print('MSE of DT:',mean_squared_error(y_test, dt_pred))
# Label corrected: this line reports the decision tree, not KNN.
print('R2 Score of DT:', r2_score(y_test,dt_pred))

Accuracy of DT: 0.18505796845931582
MSE of DT: 0.0649894250004955
R2 Score of KNN: 0.18505796845931582


In [38]:
# Get regional train/test data for the per-target gradient-boosting models
# (one single-column target per model).
y_na = df['NA_Sales']
y_eu = df['EU_Sales']
y_jp = df['JP_Sales']
y_global = df['Global_Sales']
# NOTE(review): the splits are not uniform -- NA and EU hold out 15% (with
# random_state 5 and 10 respectively) while JP and Global hold out 20%
# (random_state 10), so the four models are evaluated on different test rows.
# Confirm this is intentional.
X_train_na, X_test_na, y_train_na, y_test_na = train_test_split(X, y_na,test_size=.15,train_size=.85,random_state=5)
X_train_eu, X_test_eu, y_train_eu, y_test_eu = train_test_split(X, y_eu,test_size=.15,train_size=.85,random_state=10)
X_train_jp, X_test_jp, y_train_jp, y_test_jp = train_test_split(X, y_jp,test_size=.2,train_size=.8,random_state=10)
X_train_global, X_test_global, y_train_global, y_test_global = train_test_split(X, y_global,test_size=.2,train_size=.8,random_state=10)

In [39]:
# Scale the NA split only; the EU, JP and Global splits are passed to their
# models unscaled. (MinMaxScaler is imported in the notebook's top import cell.)
# NOTE(review): this rebinds `scaling`, replacing the scaler that was fitted
# for the RF/KNN/DT features -- use a separate name if that one is needed later.
scaling = MinMaxScaler()
X_train_na = scaling.fit_transform(X_train_na)
X_test_na = scaling.transform(X_test_na)

In [40]:
# Gradient boosting for NA sales. This is scikit-learn's
# GradientBoostingRegressor; the printed labels say "XGBoost" but the xgboost
# library is not used.
xg_na = GradientBoostingRegressor(random_state=8).fit(X_train_na, y_train_na)
xg_pred_na = xg_na.predict(X_test_na)

# .score() on a regressor is R^2, the same quantity as r2_score below
na_score = xg_na.score(X_test_na, y_test_na)
na_mse = mean_squared_error(y_test_na, xg_pred_na)
na_r2 = r2_score(y_test_na, xg_pred_na)

print('Accuracy of XGBoost:', na_score)
print('MSE of XGBoost:', na_mse)
print('R2 Score of XGBoost:', na_r2)

Accuracy of XGBoost: 0.17629263569439513
MSE of XGBoost: 0.09157864523770697
R2 Score of XGBoost: 0.17629263569439513


In [28]:
# Gradient boosting for EU sales. This is scikit-learn's
# GradientBoostingRegressor; the printed labels say "XGBoost" but the xgboost
# library is not used.
xg_eu = GradientBoostingRegressor(random_state=10).fit(X_train_eu, y_train_eu)
xg_pred_eu = xg_eu.predict(X_test_eu)

# .score() on a regressor is R^2, the same quantity as r2_score below
eu_score = xg_eu.score(X_test_eu, y_test_eu)
eu_mse = mean_squared_error(y_test_eu, xg_pred_eu)
eu_r2 = r2_score(y_test_eu, xg_pred_eu)

print('Accuracy of XGBoost:', eu_score)
print('MSE of XGBoost:', eu_mse)
print('R2 Score of XGBoost:', eu_r2)

Accuracy of XGBoost: 0.07956606626101814
MSE of XGBoost: 0.01965501689353684
R2 Score of XGBoost: 0.07956606626101814


In [22]:
# Gradient boosting for JP sales. This is scikit-learn's
# GradientBoostingRegressor; the printed labels say "XGBoost" but the xgboost
# library is not used.
xg_jp = GradientBoostingRegressor(random_state=10).fit(X_train_jp, y_train_jp)
xg_pred_jp = xg_jp.predict(X_test_jp)

# .score() on a regressor is R^2, the same quantity as r2_score below
jp_score = xg_jp.score(X_test_jp, y_test_jp)
jp_mse = mean_squared_error(y_test_jp, xg_pred_jp)
jp_r2 = r2_score(y_test_jp, xg_pred_jp)

print('Accuracy of XGBoost:', jp_score)
print('MSE of XGBoost:', jp_mse)
print('R2 Score of XGBoost:', jp_r2)

Accuracy of XGBoost: 0.31614926783669883
MSE of XGBoost: 0.0047113310922964466
R2 Score of XGBoost: 0.31614926783669883


In [23]:
# Gradient boosting for global sales. This is scikit-learn's
# GradientBoostingRegressor; the printed labels say "XGBoost" but the xgboost
# library is not used.
xg_global = GradientBoostingRegressor(random_state=10).fit(X_train_global, y_train_global)
xg_pred_global = xg_global.predict(X_test_global)

# .score() on a regressor is R^2, the same quantity as r2_score below
global_score = xg_global.score(X_test_global, y_test_global)
global_mse = mean_squared_error(y_test_global, xg_pred_global)
global_r2 = r2_score(y_test_global, xg_pred_global)

print('Accuracy of XGBoost:', global_score)
print('MSE of XGBoost:', global_mse)
print('R2 Score of XGBoost:', global_r2)

Accuracy of XGBoost: 0.14789493453491231
MSE of XGBoost: 0.16356000052851144
R2 Score of XGBoost: 0.14789493453491231


In [17]:
# Save models: pickle every fitted estimator into `save_directory`.
# NOTE(review): rf, knn, dt and xg_na were fitted on MinMax-scaled features,
# but no scaler is written here -- confirm whether the consumer of these
# pickles needs it.
if save_models:
    # open(..., "wb") does not create missing folders, so make sure it exists.
    os.makedirs(save_directory, exist_ok=True)

    # Output file name -> fitted estimator (same file names as before).
    models_to_save = {
        "rf_model.pkl": rf,
        "knn_model.pkl": knn,
        "dt_model.pkl": dt,
        "xg_model_na.pkl": xg_na,
        "xg_model_eu.pkl": xg_eu,
        "xg_model_jp.pkl": xg_jp,
        "xg_model_global.pkl": xg_global,
    }

    for model_file, model in models_to_save.items():
        with open(os.path.join(save_directory, model_file), "wb") as f:
            pickle.dump(model, f)