In [597]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 1000)

In [598]:
df = pd.read_csv('../saves/PokemonsForModel.csv')
df2 = pd.read_csv('../saves/ModelDataset.csv')

In [599]:
X = df2.drop('Unnamed: 0', axis=1)
y = df['Catch Rate']

In [600]:
lvl = pd.get_dummies(df['Levelling Rate'], dtype=float)

In [601]:
X = pd.concat([X, lvl], axis=1)

In [602]:
from sklearn.model_selection import train_test_split

In [603]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [604]:
from sklearn.svm import SVR
svr = SVR()
svr.fit(X_train, y_train)
svr.get_params()

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [605]:
from sklearn.model_selection import GridSearchCV

In [606]:
param = {'C' : [1, 5, 10, 20, 50], 'coef0' : [0.001, 0.01,10,0.5],'gamma' : ('auto','scale')}

In [607]:
gs = GridSearchCV(estimator=svr, param_grid=param, verbose=3)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END .....C=1, coef0=0.001, gamma=auto;, score=-0.201 total time=   0.0s
[CV 2/5] END .....C=1, coef0=0.001, gamma=auto;, score=-0.285 total time=   0.0s
[CV 3/5] END .....C=1, coef0=0.001, gamma=auto;, score=-0.215 total time=   0.0s
[CV 4/5] END .....C=1, coef0=0.001, gamma=auto;, score=-0.171 total time=   0.1s
[CV 5/5] END .....C=1, coef0=0.001, gamma=auto;, score=-0.193 total time=   0.0s
[CV 1/5] END .....C=1, coef0=0.001, gamma=scale;, score=0.369 total time=   0.0s
[CV 2/5] END .....C=1, coef0=0.001, gamma=scale;, score=0.272 total time=   0.0s
[CV 3/5] END .....C=1, coef0=0.001, gamma=scale;, score=0.338 total time=   0.0s
[CV 4/5] END .....C=1, coef0=0.001, gamma=scale;, score=0.370 total time=   0.0s
[CV 5/5] END .....C=1, coef0=0.001, gamma=scale;, score=0.367 total time=   0.0s


[CV 1/5] END ......C=1, coef0=0.01, gamma=auto;, score=-0.201 total time=   0.0s
[CV 2/5] END ......C=1, coef0=0.01, gamma=auto;, score=-0.285 total time=   0.0s
[CV 3/5] END ......C=1, coef0=0.01, gamma=auto;, score=-0.215 total time=   0.0s
[CV 4/5] END ......C=1, coef0=0.01, gamma=auto;, score=-0.171 total time=   0.0s
[CV 5/5] END ......C=1, coef0=0.01, gamma=auto;, score=-0.193 total time=   0.0s
[CV 1/5] END ......C=1, coef0=0.01, gamma=scale;, score=0.369 total time=   0.0s
[CV 2/5] END ......C=1, coef0=0.01, gamma=scale;, score=0.272 total time=   0.0s
[CV 3/5] END ......C=1, coef0=0.01, gamma=scale;, score=0.338 total time=   0.0s
[CV 4/5] END ......C=1, coef0=0.01, gamma=scale;, score=0.370 total time=   0.0s
[CV 5/5] END ......C=1, coef0=0.01, gamma=scale;, score=0.367 total time=   0.0s
[CV 1/5] END ........C=1, coef0=10, gamma=auto;, score=-0.201 total time=   0.0s
[CV 2/5] END ........C=1, coef0=10, gamma=auto;, score=-0.285 total time=   0.0s
[CV 3/5] END ........C=1, co

In [608]:
print(gs.best_params_)
gpred = gs.predict(X_test)


{'C': 20, 'coef0': 0.001, 'gamma': 'scale'}


In [609]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error

In [610]:
print(mean_absolute_error(y_test, gpred))
print(mean_squared_error(y_test, gpred))
print(root_mean_squared_error(y_test, gpred))

30.220542317055013
2145.9489267893805
46.324388034699176


In [611]:
from sklearn.ensemble import RandomForestRegressor

In [612]:
rfr = RandomForestRegressor(n_estimators=1000)

In [613]:
rfr.fit(X_train, y_train)

In [614]:
rfr.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 1000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [615]:
pred = rfr.predict(X_test)

In [616]:
print(mean_absolute_error(y_test, pred))
print(mean_squared_error(y_test, pred))
print(root_mean_squared_error(y_test, pred))


20.984181818181817
1312.1316930129872
36.22335838948381


In [617]:
ydf = pd.DataFrame(y_test, dtype=float)
ydf['Preds'] = pred
ydf['diff'] = abs(ydf['Catch Rate'] - ydf['Preds'])
ydf[:30]

Unnamed: 0,Catch Rate,Preds,diff
90,60.0,49.738,10.262
295,180.0,180.538,0.538
572,60.0,75.583,15.583
382,3.0,3.421,0.421
939,180.0,236.665,56.665
239,45.0,85.216,40.216
584,190.0,181.42,8.58
274,45.0,54.609,9.609
912,45.0,45.427,0.427
20,255.0,249.305,5.695


In [618]:
#Random Forest Regressor has much better performance compared to SVR