In [57]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# # Ignore warnings
# import warnings
# warnings.filterwarnings('ignore')

# Import the dataframe
df = pd.read_csv('cacao_engineered.csv')

# Remove unnamed columns and target column
#df.drop(df.columns[df.columns.str.contains('unnamed', case = False)] , axis = 1, inplace = True)
#X = df.drop(['Rating'], axis=1)
X = df.drop(['Rating', 'Unnamed: 0'], axis=1)
y = df['Rating']

# Clean up columns of X so it works with the model
X.columns = [col.replace(" ", "_").replace(":", "_") for col in X.columns]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print("X: ", X_train)
print("Y: ", y_train)

X:         REF  Review Date  Cocoa Percent  Company Location_AF  \
1023  1089         2013           72.0                  0.0   
1732  1756         2016           70.0                  0.0   
1555  1395         2014           70.0                  0.0   
1640   967         2012           70.0                  0.0   
175   1768         2016           83.0                  0.0   
...    ...          ...            ...                  ...   
1130   887         2012           75.0                  0.0   
1294    93         2006           72.0                  0.0   
860    733         2011           72.0                  0.0   
1459   717         2011           67.0                  0.0   
1126  1538         2015           70.0                  0.0   

      Company Location_AS  Company Location_CA  Company Location_CEU  \
1023                  0.0                  0.0                   0.0   
1732                  0.0                  0.0                   0.0   
1555                  0

In [58]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 10, 100], 'epsilon': [0.01, 10, 100], 'kernel': ['poly', 'rbf']}

#'gamma': [0.01, 1, 100],
#'epsilon': [0.01, 1],

grid_search = GridSearchCV(SVR(), param_grid, cv = 5, scoring = 'r2', verbose = 3, n_jobs = 1)
grid_search.fit(X_train, y_train)

# svr = SVR(kernel = 'rbf')
# svr.fit(X_train, y_train)
# y_pred = svr.predict(X_test)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END .C=0.01, epsilon=0.01, kernel=poly;, score=0.003 total time=   0.0s
[CV 2/5] END C=0.01, epsilon=0.01, kernel=poly;, score=-0.007 total time=   0.0s
[CV 3/5] END C=0.01, epsilon=0.01, kernel=poly;, score=-0.015 total time=   0.0s
[CV 4/5] END C=0.01, epsilon=0.01, kernel=poly;, score=-0.004 total time=   0.0s
[CV 5/5] END C=0.01, epsilon=0.01, kernel=poly;, score=-0.001 total time=   0.0s
[CV 1/5] END ..C=0.01, epsilon=0.01, kernel=rbf;, score=0.006 total time=   0.0s
[CV 2/5] END .C=0.01, epsilon=0.01, kernel=rbf;, score=-0.003 total time=   0.0s
[CV 3/5] END .C=0.01, epsilon=0.01, kernel=rbf;, score=-0.011 total time=   0.0s
[CV 4/5] END .C=0.01, epsilon=0.01, kernel=rbf;, score=-0.000 total time=   0.0s
[CV 5/5] END ..C=0.01, epsilon=0.01, kernel=rbf;, score=0.001 total time=   0.0s
[CV 1/5] END ..C=0.01, epsilon=10, kernel=poly;, score=-2.104 total time=   0.0s
[CV 2/5] END ..C=0.01, epsilon=10, kernel=poly;,

In [59]:
from sklearn.metrics import classification_report

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Param: ", best_params)
print("Best Train Score: ", best_score)

score = model.score(X_test, y_test)

print("score: ", score)

Best Param:  {'C': 100, 'epsilon': 0.01, 'kernel': 'rbf'}
Best Train Score:  0.02359259033992791


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Broad Bean Origin_AF
- Broad Bean Origin_AS
- Broad Bean Origin_CA
- Broad Bean Origin_CR
- Broad Bean Origin_NA
- ...
Feature names seen at fit time, yet now missing:
- Broad_Bean_Origin_AF
- Broad_Bean_Origin_AS
- Broad_Bean_Origin_CA
- Broad_Bean_Origin_CR
- Broad_Bean_Origin_NA
- ...


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

model= grid_search.best_estimator_
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# evaluate model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error", mse)
rmse = mse**0.5
print("Root Mean Squared Error", rmse)
r2 = r2_score(y_test, y_pred)

#adjusted rscore is just rscore but the rscore only improves if additional features actually improves the model's performance. 
#Basically it negates the weakness of rscore which is that the score increases as the number of features increase
adjusted_r2score = 1- ((1 - r2) * ((df.shape[0] - 1) / (df.shape[0] - df.shape[1] - 1)))
print("Adjusted R Squared Score: ", adjusted_r2score)

mae = (1/ df.shape[0]) * sum(abs(y_test - y_pred))
print(f"Mean Absolute Error: {mae}")
errors = abs(y_test - y_pred)
mape = (1/len(y_test)) * sum(errors / y_test) * 100
print(f"Mean Absolute Percentage Error: {mape}%")

In [None]:
import joblib

joblib.dump(model, 'SVR_model.pkl')
