In [5]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# # Ignore warnings
# import warnings
# warnings.filterwarnings('ignore')

# Import the dataframe
df = pd.read_csv('cacao_engineered.csv')

# Remove unnamed columns and target column
df.drop(df.columns[df.columns.str.contains('unnamed', case = False)] , axis = 1, inplace = True)
X = df.drop(['Rating'], axis=1)

y = df['Rating']

# Clean up columns of X so it works with the model
X.columns = [col.replace(" ", "_").replace(":", "_") for col in X.columns]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print("X: ", X)
print("Y: ", y)

X:         REF  Review_Date  Cocoa_Percent  Company_Location_AF  \
0     1876         2016           63.0                  0.0   
1     1676         2015           70.0                  0.0   
2     1676         2015           70.0                  0.0   
3     1680         2015           70.0                  0.0   
4     1704         2015           70.0                  0.0   
...    ...          ...            ...                  ...   
1790   647         2011           70.0                  0.0   
1791   749         2011           65.0                  0.0   
1792   749         2011           65.0                  0.0   
1793   781         2011           62.0                  0.0   
1794   486         2010           65.0                  0.0   

      Company_Location_AS  Company_Location_CA  Company_Location_CEU  \
0                     0.0                  0.0                   0.0   
1                     0.0                  0.0                   0.0   
2                     0

In [17]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [10, 50, 100], 'epsilon': [0.01, 0.1], 'kernel': ['poly', 'rbf']}

grid_search = GridSearchCV(SVR(), param_grid, cv = 5, scoring = 'r2', refit = True, verbose = 3, n_jobs = 1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

# svr = SVR(kernel = 'rbf')
# svr.fit(X_train, y_train)
# y_pred = svr.predict(X_test)

# print("x: ", X_test)
# print("y_pred: ", y_pred)
# print("y: ", y_test)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END ...C=10, epsilon=0.01, kernel=poly;, score=0.005 total time=   0.0s
[CV 2/5] END ...C=10, epsilon=0.01, kernel=poly;, score=0.003 total time=   0.0s
[CV 3/5] END ...C=10, epsilon=0.01, kernel=poly;, score=0.012 total time=   0.0s
[CV 4/5] END ...C=10, epsilon=0.01, kernel=poly;, score=0.020 total time=   0.0s
[CV 5/5] END ...C=10, epsilon=0.01, kernel=poly;, score=0.014 total time=   0.0s
[CV 1/5] END ....C=10, epsilon=0.01, kernel=rbf;, score=0.007 total time=   0.1s
[CV 2/5] END ....C=10, epsilon=0.01, kernel=rbf;, score=0.002 total time=   0.1s
[CV 3/5] END ....C=10, epsilon=0.01, kernel=rbf;, score=0.012 total time=   0.1s
[CV 4/5] END ....C=10, epsilon=0.01, kernel=rbf;, score=0.022 total time=   0.1s
[CV 5/5] END ....C=10, epsilon=0.01, kernel=rbf;, score=0.012 total time=   0.1s
[CV 1/5] END ...C=10, epsilon=0.1, kernel=poly;, score=-0.002 total time=   0.0s
[CV 2/5] END ....C=10, epsilon=0.1, kernel=poly;

In [29]:
from sklearn.metrics import classification_report

print("Best Param: ", best_params)
print("Best Train Score: ", best_score)

score = grid_search.score(X_test, y_test)

print("score: ", score)

Best Param:  {'C': 100, 'epsilon': 0.01, 'kernel': 'rbf'}
Best Train Score:  0.02359259033992791
score:  -0.00022435678615195798
