In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import numpy as np

df = pd.read_csv('cacao_engineered.csv')
print(f"shape of dataset: {df.shape}")
#split data 75:25
x = df.loc[:, df.columns != 'Rating']
y = df['Rating']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75)

#Random Forest regression with default values
model = RandomForestRegressor()

#fit model
model.fit(x_train, y_train)

#predict on model
y_pred = model.predict(x_test)

#evaluate model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error", mse)

rmse = mse**0.5
print("Root Mean Squared Error", rmse)

r2score = r2_score(y_test, y_pred)
#adjusted rscore is just rscore but the rscore can increase with additional features even though no improvement occurs for model's performance. 
#Basically adjusted negates the weakness of rscore
adjusted_r2score = 1- ((1 - r2score) * ((df.shape[0] - 1) / (df.shape[0] - df.shape[1] - 1)))
print("Adjusted R Squared Score: ", adjusted_r2score)

mae = (1/ df.shape[0]) * sum(abs(y_test - y_pred))
print(f"Mean Absolute Error: {mae}")
errors = abs(y_test - y_pred)
mape = (1/len(y_test)) * sum(errors / y_test) * 100
print(f"Mean Absolute Percentage Error (Accuracy): {mape}%")








shape of dataset: (1795, 23)
Mean Squared Error 0.19062630846325165
Root Mean Squared Error 0.43660772835951
Adjusted R Squared Score:  0.15955653902878442
Mean Absolute Error: 0.08771030640668517
Mean Absolute Percentage Error (Accuracy): 11.54331612151769%


Above output is from the Random Forest Regression with the default settings. Currently, according to the Adjusted R Squared Score, the model is not performing well since it has a score close to 0. In other words, the features currently do not predict the ratings very well. This is further supported by the low accuracy. The Mean Squared Error, Root Mean Squared Error, and Mean Absolute Error currently don't have much meaning since we don't have other measurements to compare them to. So let's try using grid search to see parameters work best for Random Forest Regression. Let's first see what range of parameter works best for our model using Random Search.

In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

df = pd.read_csv('cacao_engineered.csv')
print(f"shape of dataset: {df.shape}")
#split data 75:25
x = df.loc[:, df.columns != 'Rating']
y = df['Rating']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75)


model = RandomForestRegressor()
print(model.get_params())
param_grid = {
    'max_samples': [None, 5 ,20, 50 , 80, 100, 200],
    'max_depth': [None, 10,20, 30, 40, 50, 60, 70, 80, 90, 100],
    'max_features': ['sqrt', 'log2', 1],
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'min_samples_split': [2,3,5, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4 , 5, 6, 7,8,9, 10]
}
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=200, cv=5, error_score='raise', verbose=3, n_jobs=-1)
random_search.fit(x_train, y_train)
print('\n')
print(f"Best parameters from Random Search: {random_search.best_params_}")

model_rand_params = random_search.best_estimator_
model_rand_params.predict(x_test)
#evaluate model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error", mse)

rmse = mse**0.5
print("Root Mean Squared Error", rmse)

r2score = r2_score(y_test, y_pred)
#adjusted rscore is just rscore but the rscore can increase with additional features even though no improvement occurs for model's performance. 
#Basically adjusted negates the weakness of rscore
adjusted_r2score = 1- ((1 - r2score) * ((df.shape[0] - 1) / (df.shape[0] - df.shape[1] - 1)))
print("Adjusted R Squared Score: ", adjusted_r2score)

errors = abs(y_test - y_pred)
mape = (1/len(y_test)) * sum(errors / y_test) * 100
print(f"Mean Absolute Percentage Error (Accuracy): {mape}%")

shape of dataset: (1795, 23)
{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


Best parameters from Random Search: {'n_estimators': 600, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_samples': None, 'max_features': 'sqrt', 'max_depth': 40}
Mean Squared Error 0.29704891425389757
Root Mean Squared Error 0.5450219392408874
Adjusted R Squared Score:  -0.2495004381924424
Mean Absolute Percentage Error (Accuracy): 14.374450580352605%


With the best parameters returned, we can use these values to explore nearby values using Grid Search. First, let's examine the performance evaluation for the model using these parameters. Comparing to the default mode, MSE has approximately doubled in value while RMSE has only increase by roughly 0.1. This either means that the predictions have further strayed from the actual values or the change in parameters have exacerbated the outliers in the dataset. Adjusted R Square Score has further worsen now exhibiting a negative value. This means that the model fits the test data horribly, shwoing that the model doesn't capture the test data's pattern at all. Despite the worsening of these metrics, The MAPE shows a small boost. This seems contradictory since an increase an error should indicate a decrease inaccuracy. Let's see if we can gain more insights through Grid Search.

In [None]:

# importants = list(model.feature_importances_)
# important_features = []
# df_features = list(x.columns)
# for feature, important in zip(df_features, importants):
#     important_features.append((feature, important))
# important_features = sorted(important_features, key = lambda tuple: tuple[1], reverse=True)
# for tuple in important_features:
#     print(f"Variable: {tuple[0]}       Importance:{tuple[1]}")

# #bar graph of the important features
# x_values = list(range(len(importants)))
# plt.bar(x_values, importants, orientation='vertical') 
# plt.xticks(x_values, df_features, rotation=45)
# plt.xlabel('features')
# plt.ylabel('Importance')
# plt.title('Feature importances')
# plt.show()

In [62]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV


df = pd.read_csv('cacao_engineered.csv')
print(f"shape of dataset: {df.shape}")
#split data 75:25
x = df.loc[:, df.columns != 'Rating']
y = df['Rating']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75)

#Random Forest regression with gridsearch values
model_grid = RandomForestRegressor()

#trying a range of values close to the Random Search parameters

param_grid = {
    'max_samples': [None, 100, 500, 1000],
    'max_depth': [30, 40, 50],
    'max_features': ['sqrt'],
    'n_estimators': [500, 600, 700],
    'min_samples_split': [7,8,9],
    'min_samples_leaf': [1, 2]
}
grid_search = GridSearchCV(estimator = model_grid, param_grid = param_grid, verbose= 4, cv=6, error_score='raise', n_jobs = -1)
grid_result = grid_search.fit(x_train, y_train)
print(f"best params for the model are {grid_search.best_params_}")

#predict on model with best parameters from grid search
model_grid_params = grid_search.best_estimator_
y_pred = model_grid_params.predict(x_test)
#evaluate model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error", mse)
rmse = mse**0.5
print("Root Mean Squared Error", rmse)
r2score = r2_score(y_test, y_pred)
#adjusted rscore is just rscore but the rscore only improves if additional features actually improves the model's performance. 
#Basically it negates the weakness of rscore which is that the score increases as the number of features increase
adjusted_r2score = 1- ((1 - r2score) * ((df.shape[0] - 1) / (df.shape[0] - df.shape[1] - 1)))
print("Adjusted R Squared Score: ", adjusted_r2score)

errors = abs(y_test - y_pred)
mape = (1/len(y_test)) * sum(errors / y_test) * 100
print(f"Mean Absolute Percentage Error (Accuracy): {mape}%")


shape of dataset: (1795, 23)
Fitting 6 folds for each of 216 candidates, totalling 1296 fits
best params for the model are {'max_depth': 30, 'max_features': 'sqrt', 'max_samples': None, 'min_samples_leaf': 1, 'min_samples_split': 7, 'n_estimators': 600}
Mean Squared Error 0.17514193129114258
Root Mean Squared Error 0.4184996192246088
Adjusted R Squared Score:  0.1557685169406693
Mean Absolute Percentage Error (Accuracy): 10.952308218649097%
