In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold 
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score


import warnings 
warnings.filterwarnings('ignore')

In [2]:
X = pd.read_csv('cereal.csv')
X.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
calories,77.0,106.883117,19.484119,50.0,100.0,110.0,110.0,160.0
protein,77.0,2.545455,1.09479,1.0,2.0,3.0,3.0,6.0
fat,77.0,1.012987,1.006473,0.0,0.0,1.0,2.0,5.0
sodium,77.0,159.675325,83.832295,0.0,130.0,180.0,210.0,320.0
fiber,77.0,2.151948,2.383364,0.0,1.0,2.0,3.0,14.0
carbo,77.0,14.597403,4.278956,-1.0,12.0,14.0,17.0,23.0
sugars,77.0,6.922078,4.444885,-1.0,3.0,7.0,11.0,15.0
potass,77.0,96.077922,71.286813,-1.0,40.0,90.0,120.0,330.0
vitamins,77.0,28.246753,22.342523,0.0,25.0,25.0,25.0,100.0
shelf,77.0,2.207792,0.832524,1.0,1.0,2.0,3.0,3.0


In [3]:
from sklearn.preprocessing import MaxAbsScaler
SELECTED_NORMALIZATION_SCALER = MaxAbsScaler()

In [4]:
y = X.rating #Target 
X.drop(['rating'], axis = 1, inplace = True) #Drop the target from the dataset.
X.drop(['type'], axis = 1, inplace = True) #Drop the type column too. Holds no predictive power. can make the model biased 
X.drop(['name'], axis = 1, inplace = True) 

In [5]:
X.head()

Unnamed: 0,mfr,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups
0,N,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33
1,Q,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0
2,K,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33
3,K,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5
4,R,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75


In [6]:
#Fully write out the manufacturers names in order to fully understand the dataset
manufacturers_name = {'A': 'American Home Food Products',
    'G': 'General Mills',
    'K': 'Kelloggs',
    'N': 'Nabisco',
    'P': 'Post',
    'Q': 'Quaker Oats',
    'R': 'Ralston Purina'

}

X['mfr'] = X['mfr'].map(manufacturers_name)

In [7]:
#cat col to onehotencode and concantenate back into the feature dataset 
cat_col = X[['mfr']]

my_encoder= OneHotEncoder(handle_unknown= 'ignore', sparse= False)
OH_col = pd.DataFrame(my_encoder.fit_transform(cat_col))

#Add index removed by one hot encoding 
OH_col.index = X.index

#Drop the former categorical column 
num_X = X.drop('mfr', axis = 1)

#concatenate the new encoded col into the feature dataframe
new_X = pd.concat([num_X, OH_col], axis =1)

OH_col.columns = OH_col.columns.astype(str)



In [8]:
new_X.head()

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,0,1,2,3,4,5,6
0,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [15]:
#Train the model in a pipeline using a regression or classfication algorithm and use cross validation to estimate which performs best. 
#We won't won't split our dataset into training and validation sets because the dataset is too small. 77 entries. 
#In the pipeline, we will normalize the feature before training 

#my_pipeline = Pipeline(steps= [('SELECTED_NORMALIZATION_SCALER', MaxAbsScaler()),
                                #('model', RandomForestRegressor(n_estimators=50, random_state=0))]) 


# Define your pipeline
my_pipeline = Pipeline(steps=[
    ('SELECTED_NORMALIZATION_SCALER', MaxAbsScaler()),
    ('model', RandomForestRegressor(n_estimators=50, random_state=0))
])

from sklearn.model_selection import cross_val_score
# Perform cross-validation and get scores
cv_scores = cross_val_score(my_pipeline, new_X, y, cv=7)  # Change cv to the desired number of folds

print("Cross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())



Cross-validation scores: [0.68201164 0.73474955 0.76347665 0.8365353  0.80658893 0.69430895
 0.78521718]
Average CV score: 0.757555457974963


In [16]:
#Trying out other algorithm to see which performs better 
from sklearn.tree import DecisionTreeRegressor

# Define your pipeline
dt_pipeline = Pipeline(steps=[
    ('SELECTED_NORMALIZATION_SCALER', MaxAbsScaler()),
    ('model', DecisionTreeRegressor(random_state=0))
])

# Perform cross-validation and get scores
dt_cv_scores = cross_val_score(dt_pipeline, new_X, y, cv=7)
print("Decision Tree Cross-validation scores:", dt_cv_scores)
print("Decision Tree Average CV score:", dt_cv_scores.mean())


Decision Tree Cross-validation scores: [0.53025928 0.50163433 0.14590096 0.67125786 0.57414729 0.77691114
 0.47551863]
Decision Tree Average CV score: 0.5250899273667223


In [17]:
from sklearn.svm import SVR

# Define your pipeline
svr_pipeline = Pipeline(steps=[
    ('SELECTED_NORMALIZATION_SCALER', MaxAbsScaler()),
    ('model', SVR())
])

# Perform cross-validation and get scores
svr_cv_scores = cross_val_score(svr_pipeline, new_X, y, cv=7)
print("Support Vector Regressor Cross-validation scores:", svr_cv_scores)
print("Support Vector Regressor Average CV score:", svr_cv_scores.mean())


Support Vector Regressor Cross-validation scores: [ 0.03394065  0.36411486  0.2360401   0.25106784  0.25901638 -0.84272378
  0.31574276]
Support Vector Regressor Average CV score: 0.08817125865555238


In [18]:
from sklearn.ensemble import GradientBoostingRegressor

# Define your pipeline
gb_pipeline = Pipeline(steps=[
    ('SELECTED_NORMALIZATION_SCALER', MaxAbsScaler()),
    ('model', GradientBoostingRegressor(random_state=0))
])

# Perform cross-validation and get scores
gb_cv_scores = cross_val_score(gb_pipeline, new_X, y, cv=7)
print("Gradient Boosting Regressor Cross-validation scores:", gb_cv_scores)
print("Gradient Boosting Regressor Average CV score:", gb_cv_scores.mean())


Gradient Boosting Regressor Cross-validation scores: [0.70562019 0.81452813 0.77899923 0.91810081 0.8398238  0.85365015
 0.87045888]
Gradient Boosting Regressor Average CV score: 0.8258830265854239


In [None]:
Support Vector machine is performing way better than the other algorithms. I would have gone ahead to make predictions but there is no test data avaliable