In [1]:
import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Persistence switches: flip a flag to True to pickle the encoder / the models.
save_encoder = False
save_models = False
# Folder that the pickled artifacts are written into.
save_directory = "models_and_encoder"

In [2]:
# Read the sales table from the CSV next to the notebook and display it
df = pd.read_csv(filepath_or_buffer='vgsales.csv')
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [3]:
# Discard rows containing any NaN, then drop the two columns that are used
# neither as features nor as targets.
df = df.dropna().drop(columns=['Year', 'Name'])

In [4]:
# Removes outliers (many values are very close to zero and affect model results)
def remove_outliers(col_name, df, n_std=2):
    """Return the rows of ``df`` whose ``col_name`` value lies near the column mean.

    A row is kept only when its value is strictly between
    ``mean - n_std * std`` and ``mean + n_std * std``, where ``mean`` and
    ``std`` are computed from ``df[col_name]`` itself. Values exactly on a
    bound are dropped, and so are NaN values (both comparisons are False).

    Parameters
    ----------
    col_name : str
        Name of the numeric column used to decide which rows are outliers.
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    n_std : float, default 2
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with their original index labels.
    """
    values = df[col_name]
    center = values.mean()
    half_width = values.std() * n_std
    lower, upper = center - half_width, center + half_width

    # Strict inequalities on both sides, matching the original cutoff rule.
    keep = (values < upper) & (values > lower)
    return df[keep]

# Sales columns to trim. The filter is applied one column after another, so
# each pass computes its mean/std on the rows left by the previous pass.
prices = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']

for col in prices:
    df = remove_outliers(col_name=col, df=df)

df.shape

(14555, 9)

In [5]:
# Keep only the object-dtype columns; these are the ones that get encoded
categorical_df = df.select_dtypes(include='O')
categorical_df

Unnamed: 0,Platform,Genre,Publisher
551,GBA,Action,Nintendo
588,GEN,Action,Sega
601,GC,Platform,Sega
617,XB,Action,Take-Two Interactive
619,PS2,Fighting,Midway Games
...,...,...,...
16593,GBA,Platform,Kemco
16594,GC,Shooter,Infogrames
16595,PS2,Racing,Activision
16596,DS,Puzzle,7G//AMES


In [6]:
# One LabelEncoder per column, created on first access and stored in `d`
# under the column name, so each fitted encoder stays available afterwards.
d = defaultdict(LabelEncoder)
X = categorical_df.apply(lambda column: d[column.name].fit_transform(column))

# Record the distinct values observed for each categorical feature
feature_values = {
    feature: set(categorical_df[feature])
    for feature in ('Platform', 'Genre', 'Publisher')
}
X

Unnamed: 0,Platform,Genre,Publisher
551,6,0,358
588,8,0,441
601,7,4,441
617,29,0,489
619,16,2,324
...,...,...,...
16593,6,4,268
16594,7,8,240
16595,16,6,21
16596,4,5,8


In [7]:
# Pickle the fitted encoders and the observed feature values.
if save_encoder:
    # Use the configured save_directory (same folder the models are written
    # to) instead of a hard-coded path, and create it if it is missing.
    os.makedirs(save_directory, exist_ok=True)

    with open(os.path.join(save_directory, "encoder.pkl"), "wb") as f:
        pickle.dump(d, f)

    with open(os.path.join(save_directory, "feature_values.pkl"), "wb") as f:
        pickle.dump(feature_values, f)

In [8]:
# Multi-output target: the regional and global sales columns
y = df[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Global_Sales']]

# 80/20 train/test split with a fixed seed so it is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=.8,
    test_size=.2,
    random_state=10,
)

In [9]:
# Standardize the encoded features. The scaler is fitted on the training split
# only and then applied unchanged to the test split. Only X_train / X_test are
# replaced here; X itself stays unscaled.
# (StandardScaler is imported with the other sklearn imports at the top.)
scaling = StandardScaler()
X_train = scaling.fit_transform(X_train)
X_test = scaling.transform(X_test)

In [10]:
# Random Forest Regressor, fitted on all target columns at once
rf = RandomForestRegressor(random_state=10, min_samples_leaf=10).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# A regressor's .score() is R^2, so the "Accuracy" line is R^2 as a percentage
rf_score_pct = rf.score(X_test, y_test) * 100
rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

print('Accuracy of RF:', rf_score_pct)
print('MSE of RF:', rf_mse)
print('R2 Score of RF:', rf_r2)

Accuracy of RF: 25.005639421879657
MSE of RF: 0.03310370100088697
R2 Score of RF: 0.25005639421879655


In [11]:
# K Nearest Neighbors Regressor, fitted on all target columns at once
knn = KNeighborsRegressor(leaf_size=50, n_neighbors=10).fit(X_train, y_train)
knn_pred = knn.predict(X_test)

# A regressor's .score() is R^2, so the "Accuracy" line is R^2 as a percentage
knn_score_pct = knn.score(X_test, y_test) * 100
knn_mse = mean_squared_error(y_test, knn_pred)
knn_r2 = r2_score(y_test, knn_pred)

print('Accuracy of KNN:', knn_score_pct)
print('MSE of KNN:', knn_mse)
print('R2 Score of KNN:', knn_r2)

Accuracy of KNN: 17.285721314028002
MSE of KNN: 0.036476521727928535
R2 Score of KNN: 0.17285721314028002


In [12]:
# Decision Tree Regressor, fitted on all target columns at once
dt = DecisionTreeRegressor(min_samples_split=30, random_state=10)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)

# A regressor's .score() is R^2, so the "Accuracy" line repeats the R2 value
print('Accuracy of DT:', dt.score(X_test, y_test))
print('MSE of DT:', mean_squared_error(y_test, dt_pred))
# Label corrected: this is the decision tree's R2 (it was printed as "KNN")
print('R2 Score of DT:', r2_score(y_test, dt_pred))

Accuracy of DT: 0.19981430129778108
MSE of DT: 0.03498803056714065
R2 Score of KNN: 0.19981430129778108


In [13]:
# Per-target train/test data for the single-output gradient boosting models.
# These splits start from X (the unscaled encoded features) and all use the
# same sizes and seed.
y_na, y_eu, y_jp, y_global = (
    df[target] for target in ('NA_Sales', 'EU_Sales', 'JP_Sales', 'Global_Sales')
)

split_options = dict(test_size=.2, train_size=.8, random_state=10)
X_train_na, X_test_na, y_train_na, y_test_na = train_test_split(X, y_na, **split_options)
X_train_eu, X_test_eu, y_train_eu, y_test_eu = train_test_split(X, y_eu, **split_options)
X_train_jp, X_test_jp, y_train_jp, y_test_jp = train_test_split(X, y_jp, **split_options)
X_train_global, X_test_global, y_train_global, y_test_global = train_test_split(X, y_global, **split_options)

In [14]:
# Gradient boosting model for NA sales
# (scikit-learn's GradientBoostingRegressor; the printed labels say "XGBoost")
xg_na = GradientBoostingRegressor(random_state=10).fit(X_train_na, y_train_na)
xg_pred_na = xg_na.predict(X_test_na)

xg_na_score = xg_na.score(X_test_na, y_test_na)
xg_na_mse = mean_squared_error(y_test_na, xg_pred_na)
xg_na_r2 = r2_score(y_test_na, xg_pred_na)

print('Accuracy of XGBoost:', xg_na_score)
print('MSE of XGBoost:', xg_na_mse)
print('R2 Score of XGBoost:', xg_na_r2)

Accuracy of XGBoost: 0.2236652921867298
MSE of XGBoost: 0.039307805623470006
R2 Score of XGBoost: 0.2236652921867298


In [15]:
# Gradient boosting model for EU sales
# (scikit-learn's GradientBoostingRegressor; the printed labels say "XGBoost")
xg_eu = GradientBoostingRegressor(random_state=10).fit(X_train_eu, y_train_eu)
xg_pred_eu = xg_eu.predict(X_test_eu)

xg_eu_score = xg_eu.score(X_test_eu, y_test_eu)
xg_eu_mse = mean_squared_error(y_test_eu, xg_pred_eu)
xg_eu_r2 = r2_score(y_test_eu, xg_pred_eu)

print('Accuracy of XGBoost:', xg_eu_score)
print('MSE of XGBoost:', xg_eu_mse)
print('R2 Score of XGBoost:', xg_eu_r2)

Accuracy of XGBoost: 0.16444270528181149
MSE of XGBoost: 0.009085044713735467
R2 Score of XGBoost: 0.16444270528181149


In [16]:
# Gradient boosting model for JP sales
# (scikit-learn's GradientBoostingRegressor; the printed labels say "XGBoost")
xg_jp = GradientBoostingRegressor(random_state=10).fit(X_train_jp, y_train_jp)
xg_pred_jp = xg_jp.predict(X_test_jp)

xg_jp_score = xg_jp.score(X_test_jp, y_test_jp)
xg_jp_mse = mean_squared_error(y_test_jp, xg_pred_jp)
xg_jp_r2 = r2_score(y_test_jp, xg_pred_jp)

print('Accuracy of XGBoost:', xg_jp_score)
print('MSE of XGBoost:', xg_jp_mse)
print('R2 Score of XGBoost:', xg_jp_r2)

Accuracy of XGBoost: 0.35173016344444563
MSE of XGBoost: 0.003592520731990601
R2 Score of XGBoost: 0.35173016344444563


In [17]:
# Gradient boosting model for global sales
# (scikit-learn's GradientBoostingRegressor; the printed labels say "XGBoost")
xg_global = GradientBoostingRegressor(random_state=10).fit(X_train_global, y_train_global)
xg_pred_global = xg_global.predict(X_test_global)

xg_global_score = xg_global.score(X_test_global, y_test_global)
xg_global_mse = mean_squared_error(y_test_global, xg_pred_global)
xg_global_r2 = r2_score(y_test_global, xg_pred_global)

print('Accuracy of XGBoost:', xg_global_score)
print('MSE of XGBoost:', xg_global_mse)
print('R2 Score of XGBoost:', xg_global_r2)

Accuracy of XGBoost: 0.19588541435189688
MSE of XGBoost: 0.0840506302973495
R2 Score of XGBoost: 0.19588541435189688


In [18]:
# Save models
if save_models:
    # Create the target folder if needed so open(..., "wb") cannot fail on a
    # missing directory.
    os.makedirs(save_directory, exist_ok=True)

    # File name -> fitted object to pickle.
    artifacts = {
        "rf_model.pkl": rf,
        "knn_model.pkl": knn,
        "dt_model.pkl": dt,
        "xg_model_na.pkl": xg_na,
        "xg_model_eu.pkl": xg_eu,
        "xg_model_jp.pkl": xg_jp,
        "xg_model_global.pkl": xg_global,
        # rf, knn and dt were fitted on features transformed by `scaling`,
        # so the fitted scaler is saved too; without it their inputs cannot
        # be prepared the same way at prediction time.
        "scaler.pkl": scaling,
    }

    for file_name, artifact in artifacts.items():
        with open(os.path.join(save_directory, file_name), "wb") as f:
            pickle.dump(artifact, f)