In [1]:
import pandas as pd
import numpy as np
import utils, plot_help
import matplotlib.pyplot as plt
from collections import OrderedDict

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, precision_score


In [2]:
df = utils.chunk_loader('data/cleaned/business_merge_feats.csv', read_limit=-1)

df_num = utils.make_num_df(df)
df_num.head()

Unnamed: 0,review_count,stars,road_type,GoodForKids,RestaurantsReservations,Caters,RestaurantsTableService,RestaurantsTakeOut,RestaurantsPriceRange2,OutdoorSeating,...,Health,Hair,cool_change,funny_change,stars_change,useful,avg_month_checkin,span_checkin,median_income,is_open
0,5,3.0,1.0,0,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,1.777778,5323,3.5,0
1,128,2.5,8.0,1,1,1,1,1,2,0,...,0,0,-0.042484,-0.04902,-0.075163,-0.156863,36.083333,15143,3.0,1
2,170,4.0,6.0,1,1,0,1,1,2,0,...,0,0,-0.11,-0.19,0.055,-0.215,57.083333,58518,3.5,1
3,3,5.0,1.0,0,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,-0.5,1.222222,8464,3.5,1
4,3,2.5,6.0,0,0,0,0,0,2,0,...,0,0,0.0,0.0,0.0,0.0,1.0,2971,3.0,1


In [3]:
X_train, X_test, y_train, y_test = utils.train_test_scale(df_num, 
                                                           'is_open', 
                                                           random_state=None)

In [4]:
df_num['is_open'].mean()

0.8116033755274261

In [5]:
def shuffle_col (X, loc):
    #create a copy of original array to break referencing
    X_shuffle = np.copy(X)
    #shuffle in place
    np.random.shuffle(X_shuffle[:,loc])
    return X_shuffle


In [6]:
gbt = GradientBoostingClassifier()
gbt.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [7]:
y_pred =gbt.predict(X_test)

In [8]:
precision_score(y_test, y_pred, average=None, labels=[0, 1])


array([0.76437848, 0.84203822])

In [9]:
f1_base = f1_score(y_test, y_pred, average=None)[0]
f1_base

0.3214982442450254

In [10]:
shuff_dict = OrderedDict()

for i in range(X_train.shape[1]):
    #shuffle at locaiton i
    X_shuff = shuffle_col(X_train, i)
    #instantiate new classifier
    gbt_shuff = GradientBoostingClassifier().fit(X_shuff, y_train)
    #make prediction
    y_shuff =gbt_shuff.predict(X_test)
    #get f1 score
    f1_shuff = f1_score(y_test, y_shuff, average=None)[0]
    #compre MSE
    mse = (f1_shuff - f1_base)**2
    #make sure to delete
    del gbt_shuff
    
    #add to dict
    shuff_dict[i] = mse
    
df_shuff = pd.DataFrame.from_dict(shuff_dict, orient='index')

df_shuff.head()

Unnamed: 0,0
0,5.212823e-05
1,2.152227e-06
2,0.0
3,1.905189e-07
4,4.161827e-05


In [12]:
df_shuff.columns = ['mse']
df_shuff_order = df_shuff.reset_index(drop=False)
df_shuff_order = df_shuff_order.sort_values(by='mse',ascending=False)
df_shuff_order.head()

Unnamed: 0,index,mse
10,10,0.000436
31,31,0.000191
77,77,0.000179
30,30,0.000158
37,37,9.1e-05


In [14]:
df_shuff_order.to_csv('data/cleaned/bus_shuffle_imp.csv')

In [19]:
no_imp = df_shuff_order[df_shuff_order['mse']==0]['index'].tolist()

In [23]:
df_num.columns[no_imp]

Index(['gluten_free', 'dairy_free', 'Spas', 'Hair', 'road_type', 'asian',
       'perms', 'vegan', 'Health', 'kosher', 'halal', 'soy_free', 'vegetarian',
       'AgesAllowed', 'Beauty', 'curly', 'kids', 'thursday', 'africanamerican',
       'dessert', 'intimate', 'romantic', 'valet', 'validated', 'HasTV',
       'brunch', 'RestaurantsCounterService', 'hipster', 'Open24Hours', 'BYOB',
       'Corkage', 'DogsAllowed', 'BusinessAcceptsBitcoin', 'CoatCheck',
       'classy', 'divey', 'extensions', 'monday', 'coloring', 'straightperms',
       'sunday', 'GoodForDancing', 'wednesday', 'tuesday', 'karaoke',
       'touristy', 'video', 'jukebox', 'background_music', 'dj', 'upscale',
       'trendy', 'no_music'],
      dtype='object')