In [190]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  #this silences future warnings, like the one on Logistic Regression solver
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [161]:


df = pd.read_csv('https://raw.githubusercontent.com/fivethirtyeight/data/master/thanksgiving-2015/thanksgiving-2015-poll-data.csv')



df.columns = ['ID', 'Celebrate', 'Main_Dish', 'Main_Dish_Other', 'Main_Dish_Cooked', 'Main_Dish_Cooked_Other', 'Stuffing',
              'Stuffing_Other', 'Cranberry_Sauce', 'Cranberry_Sauce_Other', 'Gravy', 'Brussel_Sprouts', 'Carrots', 'Cauliflower',
              'Corn', 'Cornbread', 'Fruit_Salad', 'Green_Beans', 'Mac_and_Cheese', 'Mashed_Potatoes', 'Rolls_Biscuits', 'Squash',
              'Salad', 'Sweet_Potatoes', 'Side_Dish_Other1', 'Side_Dish_Other2', 'Apple_Pie', 'Buttermilk_Pie', 'Cherry_Pie', 
              'Chocolate_Pie', 'Coconut_Cream_Pie', 'Key_Lime_Pie', 'Peach_Pie', 'Pecan_Pie', 'Pumpkin_Pie', 'Sweet_Potato_Pie', 
              'No_Pie', 'Other_Pie1', 'Other_Pie2', 'Apple_Cobbler', 'Blondies', 'Brownies', 'Carrot_Cake', 'Cheesecake', 'Cookies',
              'Fudge', 'Ice_Cream', 'Peach_Cobbler', 'No_Dessert', 'Other_Dessert1', 'Other_Dessert2', 'Prayer', 'Travel_Distance', 'Parade',
              'Kids_Table_Age', 'Old_Friends', 'Friendsgiving', 'Black_Friday_Shopper', 'Retail_Worker', 'Black_Friday_Worker', 'Neighborhood_Type',
              'Age', 'Gender', 'Household_Earnings', 'US_Region']

In [162]:
side_dishes = ['Stuffing',
              'Stuffing_Other', 'Cranberry_Sauce', 'Cranberry_Sauce_Other', 'Gravy', 'Brussel_Sprouts', 'Carrots', 'Cauliflower',
              'Corn', 'Cornbread', 'Fruit_Salad', 'Green_Beans', 'Mac_and_Cheese', 'Mashed_Potatoes', 'Rolls_Biscuits', 'Squash',
              'Salad', 'Sweet_Potatoes', 'Side_Dish_Other1', 'Side_Dish_Other2', 'Apple_Pie', 'Buttermilk_Pie', 'Cherry_Pie', 
              'Chocolate_Pie', 'Coconut_Cream_Pie', 'Key_Lime_Pie', 'Peach_Pie', 'Pecan_Pie', 'Pumpkin_Pie', 'Sweet_Potato_Pie', 
              'No_Pie', 'Other_Pie1', 'Other_Pie2', 'Apple_Cobbler', 'Blondies', 'Brownies', 'Carrot_Cake', 'Cheesecake', 'Cookies',
              'Fudge', 'Ice_Cream', 'Peach_Cobbler', 'No_Dessert', 'Other_Dessert1', 'Other_Dessert2', 'Casserole', 'Meat_Pie', 'Chess_Pie']

vegetables = ['Brussel_Sprouts', 'Carrots', 'Cauliflower', 'Corn', 'Green_Beans',
              'Squash', 'Salad']
starches = ['Stuffing', 'Cornbread', 'Mac_and_Cheese', 'Mashed_Potatoes', 'Rolls_Biscuits',
            'Sweet_Potatoes']
sweet_sides = ['Cranberry_Sauce', 'Fruit_Salad']
desserts = ['Apple_Pie', 'Buttermilk_Pie', 'Cherry_Pie', 
              'Chocolate_Pie', 'Coconut_Cream_Pie', 'Key_Lime_Pie', 'Peach_Pie', 
               'Pecan_Pie', 'Pumpkin_Pie', 'Sweet_Potato_Pie',
            'Apple_Cobbler', 
               'Blondies', 'Brownies', 'Carrot_Cake', 'Cheesecake', 'Cookies',
              'Fudge', 'Ice_Cream', 'Peach_Cobbler', 'Chess_Pie']
pies = ['Apple_Pie', 'Buttermilk_Pie', 'Cherry_Pie', 
              'Chocolate_Pie', 'Coconut_Cream_Pie', 'Key_Lime_Pie', 'Peach_Pie', 
               'Pecan_Pie', 'Pumpkin_Pie', 'Sweet_Potato_Pie', 'Chess_Pie', 'Meat_Pie']
food = ['Stuffing', 'Cranberry_Sauce', 'Gravy', 'Brussel_Sprouts', 'Carrots', 'Cauliflower',
       'Corn', 'Cornbread', 'Fruit_Salad', 'Green_Beans', 'Mac_and_Cheese',
       'Mashed_Potatoes', 'Rolls_Biscuits', 'Squash', 'Salad',
       'Sweet_Potatoes', 'Apple_Pie', 'Buttermilk_Pie', 'Cherry_Pie',
       'Chocolate_Pie', 'Coconut_Cream_Pie', 'Key_Lime_Pie', 'Peach_Pie',
       'Pecan_Pie', 'Pumpkin_Pie', 'Sweet_Potato_Pie',
       'Apple_Cobbler', 'Blondies', 'Brownies', 'Carrot_Cake', 'Cheesecake',
       'Cookies', 'Fudge', 'Ice_Cream', 'Peach_Cobbler', 'Homemade_Cranberry_Sauce', 'Roast_Turkey', 'Smoked_Turkey', 
       'Rice_Stuffing', 'Fried_Turkey', 'Meat_Pie', 'Chess_Pie', 'Casserole', 'Broccoli', 'Cornbread_Stuffing',
       'Canned_Cranberry_Sauce']

In [163]:
#create regional groups

northeast = ['Squash', 'Apple_Pie', 'Homemade_Cranberry_Sauce', 'Cauliflower', 'Corn', 'Brussel_Sprouts', 'Roast_Turkey']
southern = ['Mac_and_Cheese', 'Cornbread', 'Sweet_Potato_Pie', 'Pecan_Pie', 'Fried_Turkey', 'Sweet_Potatoes',
           'Cornbread_Stuffing', 'Chess_Pie', 'Peach_Pie', 'Key_Lime_Pie', 'Canned_Cranberry_Sauce']
midwest_and_plains = ['Cherry_Pie', 'Pumpkin_Pie', 'Rolls_Biscuits', 'Mashed_Potatoes', 'Rice', 'Rice_Stuffing',
                      'Green_Beans', 'Casserole']
west = ['Fruit_Salad', 'Salad', 'Brussel_Sprouts', 'Apple_Pie', 'Cherry_Pie', 'Canned_Cranberry_Sauce']
hawaii = ['Smoked_Turkey', 'Rolls_Biscuits', 'Salad', 'Rice_Stuffing']

In [164]:
def wrangle(df):
    #remove any rows where they do not celebrate
    df = df.drop(df[df.Celebrate == 'No'].index)
    df = df.drop('Celebrate', axis = 1)
    #remove earnings, is creating a map of earning potential by region rather than anything to do with food
    df = df.drop('Household_Earnings', axis=1)
    
    #many columns formatted so that a column has the name of the dish if true and NaN if false
#so replace all the NaN with No and all values with Yes
#also for Parade column
    for column in df.columns:
        if 'Other' in column:
            df[column] = df[column].fillna('No')
        elif (column in side_dishes) & (column != 'Cranberry_Sauce') & (column != 'Stuffing'):
            df[column] = df[column].replace(r'.*', 'Yes', regex=True)
            df[column] = df[column].fillna('No')
        elif (column == 'Cranberry_Sauce') | (column == 'Stuffing'):
            df[column] = df[column].fillna('None')
        elif column == 'Parade':
            df[column] = df[column].replace(r'.*', 'Yes', regex=True)
            df[column] = df[column].fillna('No')
    df = df.replace('YesYes', 'Yes')
    df['Cranberry_Sauce'] = np.where((df['Cranberry_Sauce_Other'].str.contains('homemade', case=False) 
                                      & df['Cranberry_Sauce_Other'].str.contains('canned', case=False)),
                                     'Both', df['Cranberry_Sauce'])
    df['Salad'] = np.where(df['Side_Dish_Other2'].str.contains('salad', case=False), 'Yes', df['Salad'])
    df['Casserole'] = 'No'
    df['Casserole'] = np.where(df['Side_Dish_Other2'].str.contains('casserole', case=False), 'Yes', df['Casserole'])
    df['Stuffing'] = np.where((df['Stuffing_Other'].str.contains('cornbread', case=False) 
                                      | df['Stuffing_Other'].str.contains('corn bread', case=False)), 
                                        'Cornbread', df['Stuffing'])
    df['Meat_Pie'] = df['Other_Pie2'].apply(lambda x: 'Yes' if 'meat' in x.lower() else 'No')
    df['Chess_Pie'] = df['Other_Pie2'].apply(lambda x: 'Yes' if ('chess' in x.lower()) | ('cornmeal' in x.lower()) else 'No')
    df['Main_Dish_Cooked'] = np.where(df['Main_Dish_Cooked_Other'].str.contains('grilled', case=False), 'Grilled', df['Main_Dish_Cooked'])
    df['Main_Dish'] = np.where(df['Main_Dish_Other'].str.contains('turkey', case=False), 'Turkey', df['Main_Dish'])
    df['Main_Dish_Cooked'] = np.where(df['Main_Dish_Cooked_Other'].str.contains('smoked', case=False), 'Smoked', df['Main_Dish_Cooked'])
    df['Broccoli'] = df['Side_Dish_Other2'].apply(lambda x: 'Yes' if 'broccoli' in x.lower() else 'No')
    df['Pumpkin_Pie'] = np.where(df['Other_Dessert2'].str.contains('pumpkin', case=False), 'Yes', df['Pumpkin_Pie'])
    df['Rice'] = df['Side_Dish_Other2'].apply(lambda x: 'Yes' if 'rice' in x.lower() else 'No')
    for column in df.columns:
        if 'Other' in column:
            df = df.drop(column, axis=1)
#simplify main dish options
    main_dishes = ('Turkey', 'Ham/Pork', 'Tofurkey', 'Chicken')
    df['Main_Dish'] = df['Main_Dish'].apply(lambda x : x if x in main_dishes else 'Other')

#remove the ID column, it's meaningless
    df = df.drop('ID', axis=1)
#replace empty values in Black_Friday_Worker with 'No' if person isn't in retail
    df['Black_Friday_Worker'] = df['Retail_Worker'].apply(lambda x : 'No' if x == 'No' else x)

#fix NaN values in other columns
    df['Kids_Table_Age'] = df['Kids_Table_Age'].fillna('No Kids Table')
    mode = df['Age'].mode()[0]
    df['Age'] = df['Age'].fillna(mode)
    mode = df['Travel_Distance'].mode()[0]
    df['Travel_Distance'] = df['Travel_Distance'].fillna(mode)
    df = df.fillna('Other')
    
    df['Kids_Table'] = df['Kids_Table_Age'].apply(lambda x: 'No' if x == 'No Kids Table' else 'Yes')
    
    df['Fried_Turkey'] = df['Main_Dish_Cooked'].apply(lambda x: 'Yes' if x == 'Fried' else 'No')
    df['Homemade_Cranberry_Sauce'] = df['Cranberry_Sauce'].apply(lambda x: 'Yes' if (x == 'Homemade') | (x == 'Both') else 'No') 
    df['Cornbread_Stuffing'] = df['Stuffing'].apply(lambda x: 'Yes' if x == 'Cornbread' else 'No')
    df['Canned_Cranberry_Sauce'] = df['Cranberry_Sauce'].apply(lambda x: 'Yes' if x == 'Canned' else 'No')
    df['Smoked_Turkey'] = df['Main_Dish_Cooked'].apply(lambda x: 'Yes' if x == 'Smoked' else 'No')
    df['Rice_Stuffing'] = df['Stuffing'].apply(lambda x: 'Yes' if 'rice' in x.lower() else 'No')
    df['Roast_Turkey'] = df['Main_Dish_Cooked'].apply(lambda x: 'Yes' if x == 'Roasted' else 'No')
    
    northeast_df = df[df[northeast] == 'Yes']
    southern_df = df[df[southern] == 'Yes']
    flyover_df = df[df[midwest_and_plains] == 'Yes']
    west_df = df[df[west] == 'Yes']
    hawaiian_df = df[df[hawaii] == 'Yes']
        
    for column in df.columns:
        if column not in food:
            df = df.drop(column, axis=1)
    
    northeast_df['NE_Total'] = northeast_df.count(axis=1)
    southern_df['South_Total'] = southern_df.count(axis=1)
    flyover_df['MW_Total'] = flyover_df.count(axis=1)
    west_df['W_Total'] = west_df.count(axis=1)
    hawaiian_df['H_total'] = hawaiian_df.count(axis=1)
    df['NE_Total'] = northeast_df['NE_Total']
    df['South_Total'] = southern_df['South_Total']
    df['MW_Total'] = flyover_df['MW_Total']
    df['W_Total'] = west_df['W_Total']
    df['H_Total'] = hawaiian_df['H_total']
    
    totals = ['NE_Total', 'South_Total', 'MW_Total', 'W_Total', 'H_Total']
    df['Type_of_Thanksgiving'] = df[totals].idxmax(axis=1)
    
    df['Type_of_Thanksgiving'] = df['Type_of_Thanksgiving'].str.replace('_Total', '')
    df['Type_of_Thanksgiving'] = df['Type_of_Thanksgiving'].replace(['MW', 'South', 'NE', 'W', 'H'], ['Midwestern', 'Southern', 'New Englander', 'Californian/Pacific Coast', 'Hawaiian'])
    
    for column in totals:
        df = df.drop(column, axis=1)
    return df

In [165]:
wrangled = wrangle(df)

In [166]:
wrangled.head()

Unnamed: 0,Stuffing,Cranberry_Sauce,Gravy,Brussel_Sprouts,Carrots,Cauliflower,Corn,Cornbread,Fruit_Salad,Green_Beans,...,Chess_Pie,Broccoli,Fried_Turkey,Homemade_Cranberry_Sauce,Cornbread_Stuffing,Canned_Cranberry_Sauce,Smoked_Turkey,Rice_Stuffing,Roast_Turkey,Type_of_Thanksgiving
0,Bread-based,,Yes,No,Yes,No,No,No,No,Yes,...,No,No,No,No,No,No,No,No,No,Southern
1,Bread-based,Other (please specify),Yes,No,No,No,Yes,No,No,Yes,...,No,No,No,No,No,No,No,No,No,Midwestern
2,Rice-based,Homemade,Yes,Yes,Yes,Yes,Yes,Yes,No,No,...,No,No,No,Yes,No,No,No,Yes,Yes,New Englander
3,Bread-based,Homemade,Yes,Yes,No,No,No,Yes,No,No,...,No,No,No,Yes,No,No,No,No,No,Southern
4,Bread-based,Canned,Yes,Yes,No,No,No,Yes,No,No,...,No,No,No,No,No,Yes,No,No,No,Californian/Pacific Coast


In [167]:
train, test = train_test_split(df, random_state=42)
val, test = train_test_split(test, random_state=42)
train = wrangle(train)
test = wrangle(test)
val = wrangle(val)

In [168]:
target = 'Type_of_Thanksgiving'

X_train = train.drop(target, axis=1)
y_train = train[target]

X_val = val.drop(target, axis=1)
y_val = val[target]

X_test = test.drop(target, axis=1)
y_test = test[target]

In [169]:
y_train.value_counts(normalize=True)

Midwestern                   0.495251
New Englander                0.242877
Southern                     0.240163
Californian/Pacific Coast    0.021710
Name: Type_of_Thanksgiving, dtype: float64

So my baseline to beat is 49.5%, for Midwest

In [196]:
pipeline = make_pipeline(OrdinalEncoder(), StandardScaler(), LogisticRegression())

In [171]:
pipeline

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(categories='auto',
                                dtype=<class 'numpy.float64'>)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [197]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(categories='auto',
                                dtype=<class 'numpy.float64'>)),
                ('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [198]:
pipeline.score(X_val, y_val)

0.9405405405405406

In [193]:
pipeline = make_pipeline(OrdinalEncoder(), StandardScaler(), RandomForestClassifier())

In [194]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(categories='auto',
                                dtype=<class 'numpy.float64'>)),
                ('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=10, n_jobs=None,
                                        oob_score

In [195]:
pipeline.score(X_val, y_val)

0.7567567567567568

In [199]:
encoder = OrdinalEncoder()
X_train_encoded = encoder.fit_transform(X_train)

X_train_encoded = pd.DataFrame(X_train_encoded, columns=X_train.columns)

model = LogisticRegression()
model.fit(X_train_encoded, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [200]:
import eli5
from eli5.sklearn import PermutationImportance


perm = PermutationImportance(model).fit(X_train_encoded, y_train)
eli5.show_weights(perm)

Weight,Feature
0.0955  ± 0.0185,x24
0.0944  ± 0.0176,x6
0.0890  ± 0.0259,x40
0.0868  ± 0.0113,x16
0.0847  ± 0.0101,x45
0.0825  ± 0.0079,x12
0.0779  ± 0.0126,x9
0.0746  ± 0.0150,x10
0.0735  ± 0.0125,x11
0.0687  ± 0.0041,x13


In [217]:
from sklearn.model_selection import GridSearchCV

# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(results['mean_test_score'][candidate], results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            


In [215]:
classifier = RandomForestClassifier()

classifier.fit(X_train_encoded, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [219]:
#Grid Search for hyperparameters

param_grid = {"max_depth": range(30, 50, 2),
              "n_estimators": range(100, 500, 10),
              "min_samples_leaf": range(2, 10)
              }

# run grid search
grid_search = GridSearchCV(classifier, param_grid=param_grid, cv=3, iid=False, n_jobs=-1)
grid_search.fit(X_train_encoded, y_train)
report(grid_search.cv_results_)

Model with rank: 1
Mean validation score: 0.826 (std: 0.016)
Parameters: {'max_depth': 48, 'min_samples_leaf': 2, 'n_estimators': 280}

Model with rank: 2
Mean validation score: 0.822 (std: 0.021)
Parameters: {'max_depth': 36, 'min_samples_leaf': 2, 'n_estimators': 490}

Model with rank: 3
Mean validation score: 0.822 (std: 0.013)
Parameters: {'max_depth': 40, 'min_samples_leaf': 2, 'n_estimators': 450}



In [226]:
pipeline = make_pipeline(OrdinalEncoder(), RandomForestClassifier(
    n_estimators=280, max_depth=48, min_samples_leaf = 2))

In [227]:
pipeline.fit(X_train, y_train)

pipeline.score(X_val, y_val)

0.8216216216216217