In [1]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import cross_val_score
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import time
%matplotlib inline

In [2]:
df = pd.read_csv('epi_r.csv')

In [3]:
display(df.head(2))
print(df.shape)

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(20052, 680)


In [4]:
X = df[list(set(list(df.columns)) - set(['rating', 'title']))]
y = df['rating']

In [5]:
# Classifies high ratings (>4) and low raings (<4)
y_class = y.copy()
y_class[y_class < 4] = 0
y_class[y_class > 4] = 1
y_class.value_counts()

1.0    10738
0.0     9314
Name: rating, dtype: int64

In [6]:
X.describe()

Unnamed: 0,cocktail party,sesame oil,créme de cacao,providence,kid-friendly,dill,calvados,kumquat,mustard greens,smoker,...,grill/barbecue,pan-fry,no sugar added,drink,meatball,cognac/armagnac,lemon,north carolina,coffee grinder,chill
count,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,...,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0
mean,0.05755,0.002095,0.001047,5e-05,0.089318,0.013515,0.000997,0.002144,0.001795,0.000349,...,0.055057,0.012418,0.156194,0.054209,0.000399,0.004538,0.055057,0.000848,0.000848,0.056154
std,0.232897,0.045719,0.032346,0.007062,0.285209,0.115468,0.031567,0.046259,0.042334,0.018681,...,0.228097,0.110744,0.363048,0.226435,0.019971,0.067215,0.228097,0.029105,0.029105,0.230225
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Drop NAN columns
X_null = X.isnull().sum()
null_cols = list(X_null[X_null > 0].index)
print ("Dropping {} columns due to NaNs\n{}".format(len(null_cols), null_cols))
X_feat = X.drop(columns = null_cols)

Dropping 4 columns due to NaNs
['fat', 'sodium', 'protein', 'calories']


In [8]:
# Drop heavily skewed means
low_means = []
for col in X_feat.columns:
    if X_feat[col].mean() < 0.1:
        low_means.append(col)
print ("Dropping {} columns due to insignificant mean".format(len(low_means)))
X_feat = X_feat.drop(columns = low_means)

Dropping 648 columns due to insignificant mean


In [9]:
# Feature list
print ("Number of features: {}".format(len(X_feat.columns)))

Number of features: 26


In [10]:
#SV Classifier
start = time.time()
svc_model = svm.SVC()
fit = svc_model.fit(X_feat, y_class)
y_pred = svc_model.predict(X_feat)
print ("Runtime: %0.2f seconds" % (time.time() - start))



Runtime: 25.96 seconds


In [11]:
start = time.time()
svc_score = svc_model.score(X_feat, y_class)
print ("Runtime: %0.2f seconds" % (time.time() - start))
print ("Baseline Score: %0.3f" % (svc_score))

Runtime: 10.14 seconds
Baseline Score: 0.579


In [12]:
#Improving

start = time.time()
svc_iter_model = svm.SVC(C = 100)
print ("Runtime: %0.2f seconds" % (time.time() - start))

Runtime: 0.00 seconds


In [13]:
# Add nutritional information, imputing nulls with median
X_nut = df[list(set(list(df.columns)) - set(['rating', 'title']))]
X_nut.drop(columns = low_means, inplace = True)
X_nut = X_nut.fillna(X_nut.median())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [None]:

start = time.time()
svc_iter_cross_val_scores = cross_val_score(svc_iter_model, X_nut, y_class, cv=5, gamma=auto)
print ("Runtime: %0.2f seconds" % (time.time() - start))
print ("Mean Accuracy: %0.3f (+/- %0.3f)" % (svc_iter_cross_val_scores.mean(), svc_iter_cross_val_scores.std()))