In [183]:
 import pandas as pd
from sklearn.model_selection import train_test_split as tts
import numpy as np
from sklearn.metrics import root_mean_squared_error

This system recommends Van Leeuwen Ice Cream flavors to tasters.

• Find a dataset, or build out your own toy dataset. As a minimum requirement for complexity,
please include numeric ratings for at least five users, across at least five items, with some missing
data.

• Load your data into (for example) an R or pandas dataframe, a Python dictionary or list of lists, (or
another data structure of your choosing). From there, create a user-item matrix.

In [184]:
# Load the wide ratings table: one row per taster, one column per flavor.
df = pd.read_csv('VanLeeuwen.csv')

In [185]:
# Show the whole (small) table; blank cells in the output are missing ratings.
df

Unnamed: 0,Tasters,Black Cherry Chip,Cookies & Cream,Marionberry Cheesecake,Honeycomb,Mint Chip
0,Sam,5,,4.0,,4
1,Nina,4,3.0,5.0,3.0,4
2,Nancy,4,2.0,,,3
3,Aditi,2,2.0,3.0,1.0,2
4,Yerlene,4,,5.0,4.0,5
5,Jose,4,2.0,5.0,4.0,4


In [186]:
# Cast the two flavor columns that were displayed as whole numbers to float,
# so every rating column shares the float64 dtype.
df = df.astype({'Mint Chip': float, 'Black Cherry Chip': float})

In [187]:
# Confirm the column dtypes after the float casts.
df.dtypes

Tasters                    object
Black Cherry Chip         float64
Cookies & Cream           float64
Marionberry Cheesecake    float64
Honeycomb                 float64
Mint Chip                 float64
dtype: object

In [314]:
# Tidy the wide table into long form: one row per (taster, flavor) pair, with
# the flavor name in `variable` and the rating in `value`.
user_item_matrix = df.melt(
    id_vars=['Tasters'],
    value_vars=[
        'Black Cherry Chip',
        'Cookies & Cream',
        'Marionberry Cheesecake',
        'Honeycomb',
        'Mint Chip',
    ],
)
user_item_matrix.head()

Unnamed: 0,Tasters,variable,value
0,Sam,Black Cherry Chip,5.0
1,Nina,Black Cherry Chip,4.0
2,Nancy,Black Cherry Chip,4.0
3,Aditi,Black Cherry Chip,2.0
4,Yerlene,Black Cherry Chip,4.0


• Break your ratings into separate training and test datasets.

In [301]:
# Hold out 20% of the (taster, flavor) rows for testing. The splitter is
# imported at the top of the notebook under the alias `tts`, so it must be
# called by that name; `train_test_split` is never bound on a fresh kernel.
# The fixed random_state keeps the split reproducible across runs.
train_data, test_data = tts(user_item_matrix, test_size=0.2, random_state=30)

• Using your training data, calculate the raw average (mean) rating for every user-item combination.

In [302]:
# Peek at the training split; it can include rows whose rating is missing.
train_data.head()

Unnamed: 0,Tasters,variable,value
8,Nancy,Cookies & Cream,2.0
6,Sam,Cookies & Cream,
25,Nina,Mint Chip,4.0
4,Yerlene,Black Cherry Chip,4.0
11,Jose,Cookies & Cream,2.0


In [303]:
# Global (raw) average of the training ratings; Series.mean() skips missing values.
raw_avg = train_data['value'].mean()
raw_avg

np.float64(3.4)

In [304]:
# One raw-average prediction for each training row that actually has a rating.
raw_avg_train = np.full(train_data['value'].notna().sum(), raw_avg, dtype='float')

In [305]:
# One raw-average prediction for each test row that actually has a rating.
# The average itself still comes from the training split only.
raw_avg_test = np.full(test_data['value'].notna().sum(), raw_avg, dtype='float')

In [306]:
# Constant raw-average prediction, one entry per rated training row.
raw_avg_train

array([3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 3.4,
       3.4, 3.4, 3.4, 3.4, 3.4, 3.4, 3.4])

In [307]:
# Constant raw-average prediction, one entry per rated test row.
raw_avg_test

array([3.4, 3.4, 3.4, 3.4, 3.4])

• Calculate the RMSE for raw average for both your training data and your test data.

In [308]:
# RMSE of the raw-average prediction against the rated training rows.
root_mean_squared_error(train_data['value'].dropna(), raw_avg_train)

1.1575836902790226

In [309]:
# RMSE of the raw-average prediction against the rated test rows.
root_mean_squared_error(test_data['value'].dropna(), raw_avg_test)

1.077032961426901

• Using your training data, calculate the bias for each user and each item.

In [310]:
#taster bias
# Each taster's mean training rating minus the raw average: positive values
# mean the taster rates above average, negative values below.
taster_bias = (
    train_data.groupby('Tasters')['value']
    .mean()
    .sub(raw_avg)
    .to_frame()
)
taster_bias

Unnamed: 0_level_0,value
Tasters,Unnamed: 1_level_1
Aditi,-1.65
Jose,0.4
Nancy,-0.4
Nina,0.4
Sam,0.6
Yerlene,0.933333


In [311]:
#flavor bias
# Each flavor's mean training rating minus the raw average: positive values
# mean the flavor is rated above average, negative values below.
flavor_bias = (
    train_data.groupby('variable')['value']
    .mean()
    .sub(raw_avg)
    .to_frame()
)
flavor_bias

Unnamed: 0_level_0,value
variable,Unnamed: 1_level_1
Black Cherry Chip,0.2
Cookies & Cream,-1.15
Honeycomb,-0.4
Marionberry Cheesecake,1.35
Mint Chip,-0.066667


• From the raw average, and the appropriate user and item biases, calculate the baseline predictors
for every user-item combination.

In [346]:
#append avg
# Start from every (taster, flavor) pair and attach the same training raw
# average to each row; assign() returns a new frame, so user_item_matrix is
# left untouched.
base_line_predictors = user_item_matrix.assign(raw_avg=raw_avg)
base_line_predictors

Unnamed: 0,Tasters,variable,value,raw_avg
0,Sam,Black Cherry Chip,5.0,3.4
1,Nina,Black Cherry Chip,4.0,3.4
2,Nancy,Black Cherry Chip,4.0,3.4
3,Aditi,Black Cherry Chip,2.0,3.4
4,Yerlene,Black Cherry Chip,4.0,3.4
5,Jose,Black Cherry Chip,4.0,3.4
6,Sam,Cookies & Cream,,3.4
7,Nina,Cookies & Cream,3.0,3.4
8,Nancy,Cookies & Cream,2.0,3.4
9,Aditi,Cookies & Cream,2.0,3.4


In [347]:
#append biases
# Use left joins so every (taster, flavor) row is kept, in its original order,
# even when a taster or flavor has no row in the training split (the default
# inner join would silently drop those rows). Both bias frames also hold a
# column named `value`; the suffixes rename the incoming one so the rating
# column keeps its name.
base_line_predictors = base_line_predictors.merge(
    flavor_bias, how='left', left_on='variable', right_on='variable',
    suffixes=(None, '_flavor_bias'),
)
base_line_predictors = base_line_predictors.merge(
    taster_bias, how='left', left_on='Tasters', right_on='Tasters',
    suffixes=(None, '_taster_bias'),
)
# A bias is unknown when the taster/flavor was absent from training or had
# only missing ratings there. Fall back to 0 so the baseline prediction
# degrades to the remaining terms instead of becoming NaN.
base_line_predictors = base_line_predictors.fillna(
    {'value_flavor_bias': 0, 'value_taster_bias': 0}
)

In [348]:
#create predicted
# Baseline predictor = raw average + flavor bias + taster bias.
base_line_predictors['predicted'] = (
    base_line_predictors['raw_avg']
    .add(base_line_predictors['value_flavor_bias'])
    .add(base_line_predictors['value_taster_bias'])
)

In [352]:
# Predictions above 5 or below 1 are pulled back to the nearest bound (5 or 1).
base_line_predictors['predicted'] = base_line_predictors['predicted'].clip(lower=1, upper=5)

In [353]:
# Inspect the ratings next to the bias terms and the clipped prediction.
base_line_predictors

Unnamed: 0,Tasters,variable,value,raw_avg,value_flavor_bias,value_taster_bias,predicted
0,Sam,Black Cherry Chip,5.0,3.4,0.2,0.6,4.2
1,Nina,Black Cherry Chip,4.0,3.4,0.2,0.4,4.0
2,Nancy,Black Cherry Chip,4.0,3.4,0.2,-0.4,3.2
3,Aditi,Black Cherry Chip,2.0,3.4,0.2,-1.65,1.95
4,Yerlene,Black Cherry Chip,4.0,3.4,0.2,0.933333,4.533333
5,Jose,Black Cherry Chip,4.0,3.4,0.2,0.4,4.0
6,Sam,Cookies & Cream,,3.4,-1.15,0.6,2.85
7,Nina,Cookies & Cream,3.0,3.4,-1.15,0.4,2.65
8,Nancy,Cookies & Cream,2.0,3.4,-1.15,-0.4,1.85
9,Aditi,Cookies & Cream,2.0,3.4,-1.15,-1.65,1.0


• Calculate the RMSE for the baseline predictors for both your training data and your test data.

In [368]:
# Re-split the prediction table with the same test_size and random_state as
# the first split so the same rows land in train and test. The splitter is
# imported under the alias `tts`; `train_test_split` is never bound on a fresh
# kernel.
train_data1, test_data1 = tts(base_line_predictors, test_size=0.2, random_state=30)

# The biases were fitted on train_data, so this split must contain exactly the
# same (taster, flavor) pairs in the same order; otherwise the "test" RMSE
# would be scored on rows that informed the biases.
assert list(train_data1['Tasters']) == list(train_data['Tasters']), "train split differs from the original"
assert list(train_data1['variable']) == list(train_data['variable']), "train split differs from the original"
assert list(test_data1['Tasters']) == list(test_data['Tasters']), "test split differs from the original"
assert list(test_data1['variable']) == list(test_data['variable']), "test split differs from the original"

In [375]:
# Baseline predictions for the training rows that have an actual rating to score against.
predicted_train1 = train_data1.loc[train_data1['value'].notna(), 'predicted'].copy()


In [376]:
# Baseline predictions for the test rows that have an actual rating to score against.
predicted_test1 = test_data1.loc[test_data1['value'].notna(), 'predicted'].copy()

In [380]:
# RMSE of the baseline predictor over the rated training rows.
RMSE_train = root_mean_squared_error(train_data1['value'].dropna(), predicted_train1)
RMSE_train

0.468182063351902

In [381]:
# RMSE of the baseline predictor over the rated test rows.
RMSE_test = root_mean_squared_error(test_data1['value'].dropna(), predicted_test1)
RMSE_test

0.4892170615721954

• Summarize your results.
I created a recommender for ice cream flavors. The average rating in the training data was 3.4. Yerlene was our most positive taster with a +0.933333 bias. Aditi was our most negative taster with a bias of -1.650000. Marionberry Cheesecake seems like an excellent flavor with a +1.350000 bias. Cookies & Cream is least loved with a bias of -1.150000. The RMSE of our baseline predictor against the actual ratings was 0.47 in the train set and 0.49 in the test set, compared with 1.16 and 1.08 when predicting the raw average alone, so the predicted ratings were relatively close to the actual ratings.