In [162]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score

from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from joblib import Parallel, delayed

from scipy.stats import randint, uniform, mode

from util import as_numpy

pd.options.display.max_rows = 100

In [5]:
sales = pd.read_csv('data/vgsales.csv')
sales.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [6]:
sales.shape

(16598, 11)

In [7]:
sales.isna().sum()

Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

In [8]:
sales.isna().sum()

Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

In [9]:
sales['Publisher'].value_counts().head(30)

Electronic Arts                           1351
Activision                                 975
Namco Bandai Games                         932
Ubisoft                                    921
Konami Digital Entertainment               832
THQ                                        715
Nintendo                                   703
Sony Computer Entertainment                683
Sega                                       639
Take-Two Interactive                       413
Capcom                                     381
Atari                                      363
Tecmo Koei                                 338
Square Enix                                233
Warner Bros. Interactive Entertainment     232
Disney Interactive Studios                 218
Unknown                                    203
Eidos Interactive                          198
Midway Games                               198
505 Games                                  192
Microsoft Game Studios                     189
D3Publisher  

In [136]:
sales['Genre'].uqn()


Action          3316
Sports          2346
Misc            1739
Role-Playing    1488
Shooter         1310
Adventure       1286
Racing          1249
Platform         886
Simulation       867
Fighting         848
Strategy         681
Puzzle           582
Name: Genre, dtype: int64

In [137]:
3316 / sales.shape[0]

0.19978310639836125

In [11]:
def sales_preprocess(df):
    df = df.dropna()
    df.loc[df['Publisher'].value_counts()[df['Publisher']].values < 100, 'Publisher'] = "Other"
    df.loc[df['Platform'].value_counts()[df['Platform']].values < 80, 'Platform'] = "Other"
    df.drop(['Name', 'Rank'], inplace=True, axis=1)
    df = pd.get_dummies(df, columns=['Publisher', 'Platform'])
    return df
sales_cleaned = sales_preprocess(sales)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [12]:
sales_cleaned.sample(10)

Unnamed: 0,Year,Genre,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Publisher_505 Games,Publisher_Acclaim Entertainment,Publisher_Activision,...,Platform_PS4,Platform_PSP,Platform_PSV,Platform_SAT,Platform_SNES,Platform_Wii,Platform_WiiU,Platform_X360,Platform_XB,Platform_XOne
9329,2009.0,Misc,0.13,0.0,0.0,0.01,0.13,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11444,2008.0,Shooter,0.07,0.0,0.0,0.01,0.08,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1921,2010.0,Action,0.57,0.39,0.02,0.09,1.07,0,0,0,...,0,0,0,0,0,0,0,1,0,0
601,2001.0,Platform,1.7,0.59,0.21,0.07,2.56,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14693,2011.0,Action,0.0,0.0,0.03,0.0,0.03,0,0,0,...,0,0,0,0,0,0,0,1,0,0
14821,2009.0,Strategy,0.0,0.02,0.0,0.0,0.03,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15516,2009.0,Role-Playing,0.0,0.0,0.02,0.0,0.02,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3353,2009.0,Platform,0.13,0.18,0.0,0.29,0.6,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1378,2004.0,Racing,0.69,0.54,0.0,0.18,1.41,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8375,1996.0,Action,0.09,0.06,0.0,0.01,0.17,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
X = sales_cleaned.drop(['Genre'], axis=1)
y = sales_cleaned['Genre']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=420)

In [58]:
clf = DecisionTreeClassifier()

scores = cross_val_score(clf, X_train, y_train, scoring='accuracy')

In [59]:
scores

array([0.25469889, 0.27157652, 0.25978511, 0.26247122, 0.26515733])

In [62]:
distributions = dict(min_samples_split=uniform(0, 1), min_samples_leaf=randint(1,19), )
clf = RandomizedSearchCV(DecisionTreeClassifier(), distributions, n_iter=40)
clf = clf.fit(X_train, y_train)

In [77]:
print(clf.best_score_)
clf.best_params_

0.29465875126327634


{'min_samples_leaf': 9, 'min_samples_split': 0.03772507105613432}

In [78]:
params = [
    dict(kernel=['poly'], C=[1, 10, 100, 1000]),
    dict(kernel=['rbf'], C=[1,10,100,1000], gamma=[0.001,0.01,0.1])
]
clf = GridSearchCV(SVC(max_iter=1000), params, verbose=10, n_jobs=4, cv=3)
clf.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   47.6s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  3.8min
[Parallel(n_jobs=4)]: Done  46 out of  48 | elapsed:  5.1min remaining:   13.4s
[Parallel(n_jobs=4)]: Done  48 out of  48 | elapsed:  5.3min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=1000,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=4,
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['poly']},
                         {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.01, 0.1],
                          'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=10)

In [79]:
print(clf.best_params_)
clf.best_score_

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}


0.3065531000613874

In [72]:
vc = VotingClassifier([
    ('dc', DecisionTreeClassifier(min_samples_leaf=9, min_samples_split=0.0377)),
    ('svc', SVC(C=1, gamma=0.1, kernel='rbf')),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])

scores = cross_val_score(vc, X_train, y_train, scoring='accuracy')

In [73]:
scores

array([0.30878404, 0.30724971, 0.29163469, 0.31312356, 0.29969302])

## Bagging

In [14]:
single_clf = DecisionTreeClassifier(min_samples_leaf=9, min_samples_split=0.0377)

scores = cross_val_score(single_clf, X_train, y_train, scoring='accuracy')

In [15]:
scores

array([0.29344074, 0.3030303 , 0.28933231, 0.30046048, 0.28702993])

In [138]:
class BaggingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, estimator, n_estimators=10, n_jobs=4):
        self.base_estimator_ = estimator
        self.n_estimators_ = n_estimators
        self.n_jobs_ = n_jobs
        self.estimators_ = []
    
    def train_model(self, X, y):
        # Bootstrap a new dataset
        X = as_numpy(X)
        y = as_numpy(y)
        else:
            X = X
            y = y
        
        items = np.random.choice(len(y), len(y), replace=True)
        
        X = X[items]
        y = y[items]
        
        est = clone(self.base_estimator_)
        est = est.fit(X, y)
        
        return est
        
    def fit(self, X, y):
        self.estimators_ = Parallel(n_jobs=self.n_jobs_)(delayed(self.train_model)(X, y) for est_idx in range(self.n_estimators_))
        return self
    
    def predict(self, X):
        result = np.array(Parallel(n_jobs=self.n_jobs_)(delayed(est.predict)(X) for est in self.estimators_))
        #print(result.shape)
        result = np.array(mode(result)[0].ravel())
        #print(result.shape)
        #print(result.shape)
        return result
    
    def score(self, X, y):
        predictions = self.predict(X)
        return accuracy_score(y, predictions)

In [128]:
arr = np.array([[1, 1], [2, 2], [3, 3]])
arr[[2, 1, 0]]

array([[3, 3],
       [2, 2],
       [1, 1]])

In [133]:
bclf = BaggingClassifier(single_clf)

In [134]:
bclf.fit(X_train, y_train)

BaggingClassifier(estimator=None, n_estimators=None, n_jobs=None)

In [135]:
bclf.score(X_train, y_train)

0.32397176181706566

## Comparison of Ensemble Methods

- Single tree: Already done
- Voting classifier: Already done
- Bagging: Already done
- Unbagged random forest in sklearn
- Bagged random forest in sklearn
- Hyperparameter-tuned random forest in sklearn

In [159]:
ub_rf = cross_val_score(RandomForestClassifier(n_estimators=100, bootstrap=False).fit(X_train, y_train), X_train, y_train, scoring='accuracy')
ub_rf

array([0.29305715, 0.28807058, 0.29316961, 0.29854183, 0.29585572])

In [161]:
b_rf = cross_val_score(RandomForestClassifier(n_estimators=100, bootstrap=True).fit(X_train, y_train), X_train, y_train, scoring='accuracy')
b_rf

array([0.29305715, 0.28691983, 0.30698388, 0.30161167, 0.30698388])

In [156]:
rf_gs = RandomizedSearchCV(RandomForestClassifier(), 
    dict(bootstrap=[True, False], n_estimators=[100, 1000], min_samples_split=uniform(0, 1), min_samples_leaf=randint(1, 29), max_features=randint(3, 57)),
verbose=10, cv=3, n_jobs=4, n_iter=40, scoring='accuracy').fit(X_train, y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   10.8s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   33.6s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   39.8s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   52.0s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   56.3s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 120 out of 120 | elapsed:  2.4min finished


In [157]:
rf_gs.best_params_

{'bootstrap': True,
 'max_features': 22,
 'min_samples_leaf': 7,
 'min_samples_split': 0.01561094131911589,
 'n_estimators': 1000}

In [158]:
rf_gs.best_score_

0.3271178637200736