# Challenge


Transform this regression problem into a binary classifier and clean up the feature set. You can choose whether or not to include nutritional information, but try to cut your feature set down to the 30 most valuable features.

In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
food = pd.read_csv('C:\\Users\\User\\Documents\\Python_scripts\\Thinkful\\epi_r.csv')
food.head()

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Looking at rating - the dependent variable
print(food.rating.value_counts())
print(food.rating[food['rating']>3.75].value_counts().sum()/len(food.rating))

# Transforming rating
food['good'] = np.where(food.rating > 3.75, 1, 0)

4.375    8019
3.750    5169
5.000    2719
0.000    1836
3.125    1489
2.500     532
1.250     164
1.875     124
Name: rating, dtype: int64
0.5355076800319171


In [17]:
from operator import itemgetter

# Trimming the features

# Drop features with na (harsh but following lecture approach)
X = food.drop(['title', 'calories', 'protein', 'fat', 'sodium'], 1)

# Trimming the lazy way

colnames = list(X.columns)
corrlist = []
# Note absolute values to include negative correlation too 
for name in colnames:
    corrlist.append((name, abs(X[name].corr(X['rating']))))

corrlist.sort(key=itemgetter(1), reverse=True)

# Drink and alcoholic, don't mind if I do...
# But what is weelicious?!?
print(corrlist[0:30])

# Getting name from tuple
keeplist=[]
for name in corrlist[0:32]:
    keeplist.append(name[0])

# Keeping top 32 - including rating and good 
X = X.loc[:, X.columns.isin(keeplist)]

# Setting y
y = X.good

# Dropping outcome variables
X.drop(['good', 'rating'], axis=1, inplace=True)

X.head()

[('rating', 1.0), ('good', 0.65569821821827734), ('drink', 0.241388312792117), ('alcoholic', 0.22476703847405771), ('house & garden', 0.21791106216741776), ('gin', 0.19139968509613606), ('spirit', 0.14662455068006544), ('cocktail', 0.14072263069990826), ('cocktail party', 0.14048242075003328), ('bitters', 0.13736794163364138), ('bon appétit', 0.12454881508695716), ('harpercollins', 0.1050941013381619), ('peanut free', 0.096087105738034334), ('soy free', 0.092964693060387155), ('liqueur', 0.090756673725032863), ('rum', 0.084689060660530543), ('tree nut free', 0.081346797380706423), ('non-alcoholic', 0.077475735133518089), ('bake', 0.076977707698064265), ('condiment', 0.076294684804823681), ('roast', 0.073498662658567457), ('fall', 0.067342526942713157), ('sauté', 0.066552670284387494), ('brandy', 0.065481083725895409), ('fortified wine', 0.063814314960384186), ('pernod', 0.062767982349555357), ('créme de cacao', 0.061670117147416575), ('dinner', 0.060957281550180702), ('weelicious', 0.0

Unnamed: 0,alcoholic,bake,bitters,bon appétit,brandy,chartreuse,cocktail,cocktail party,condiment,créme de cacao,...,non-alcoholic,peanut free,pernod,roast,rum,sauté,soy free,spirit,tree nut free,weelicious
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
from sklearn import model_selection

# Split on X and y
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)

from sklearn.svm import SVC

# Creating SVC instance
svc = SVC()

# Gridsearch to find optimal values of C - cost function for soft margins - and kernel  - sets the margin
parameters = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}
grid = model_selection.GridSearchCV(svc, parameters)
#Fit on entire dataset (using CV which splits already)
grid.fit(X, y)
print(grid.cv_results_)



{'split0_test_score': array([ 0.55183246,  0.54779357,  0.55213164,  0.56020942,  0.55272999,
        0.55811518,  0.55272999,  0.56215408]), 'split1_test_score': array([ 0.55580491,  0.55101735,  0.55774985,  0.56223818,  0.55774985,
        0.57405745,  0.55774985,  0.5774985 ]), 'split2_test_score': array([ 0.55753404,  0.54900494,  0.56067634,  0.55858148,  0.56067634,
        0.56411791,  0.56067634,  0.57519078]), 'mean_test_score': array([ 0.55505685,  0.54927189,  0.55685218,  0.56034311,  0.55705167,
        0.56542988,  0.55705167,  0.5716138 ]), 'std_test_score': array([ 0.00238701,  0.00132962,  0.00354564,  0.00149578,  0.00328143,
        0.00657441,  0.00328143,  0.0067558 ]), 'rank_test_score': array([7, 8, 6, 3, 4, 2, 4, 1]), 'split0_train_score': array([ 0.56018553,  0.55016084,  0.56108326,  0.5720805 ,  0.56100845,
        0.5788135 ,  0.56100845,  0.5848732 ]), 'split1_train_score': array([ 0.55535607,  0.54877319,  0.55752543,  0.56276182,  0.55752543,
        0.5

In [24]:
# Fitting the model with C and kernel
svc = SVC(C= , kernel='rbf')
svr.fit(X_train,y_train)
accuracy_train = svr.score(X_train, y_train)
print("Accuracy on training set: ", accuracy_train)

# Predict
y_hat = svr.predict(X_test)
accuracy_test = svr.score(X_test, y_test)

print(pd.crosstab( y_test, y_hat))
print(accuracy_test)

0.57091561939
