In [1]:
import pandas as pd
import numpy as np
import utils, plot_help
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression


In [2]:
df = utils.chunk_loader('data/cleaned/business_merge_feats.csv', read_limit=-1)


df = df.drop(columns=['latitude', 'longitude', 'postal_code'])

#select only numerical columns
df_num = df.select_dtypes(include=['int', 'float64'])
df_num.shape

(161160, 102)

In [3]:
print(*df_num.columns, sep=', ')

review_count, stars, road_type, GoodForKids, RestaurantsReservations, Caters, RestaurantsTableService, RestaurantsTakeOut, RestaurantsPriceRange2, OutdoorSeating, BikeParking, HasTV, RestaurantsGoodForGroups, RestaurantsDelivery, BusinessAcceptsCreditCards, BusinessAcceptsBitcoin, ByAppointmentOnly, AcceptsInsurance, GoodForDancing, CoatCheck, HappyHour, WheelchairAccessible, DogsAllowed, DriveThru, Corkage, BYOB, Open24Hours, RestaurantsCounterService, dessert, latenight, lunch, dinner, brunch, breakfast, garage, street, validated, lot, valet, romantic, intimate, classy, hipster, divey, touristy, trendy, upscale, casual, dj, background_music, no_music, jukebox, live, video, karaoke, monday, tuesday, friday, wednesday, thursday, sunday, saturday, straightperms, coloring, extensions, africanamerican, curly, kids, perms, asian, dairy_free, gluten_free, vegan, kosher, halal, soy_free, vegetarian, NoiseLevel, WiFi, Alcohol, RestaurantsAttire, BYOBCorkage, Smoking, AgesAllowed, Restaurants,

In [4]:
X_train, X_test, y_train, y_test = utils.train_test_scale(df_num, 
                                                           'is_open', 
                                                           random_state=None)

  return self.partial_fit(X, y)


In [5]:
#instantiate with l1 penalty
logreg = LogisticRegression(penalty='l1', solver='liblinear')

logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [6]:
logreg_coef = logreg.coef_
print(*logreg_coef, sep='/n')

[ 2.94257048e+01  8.16592441e-02 -6.32978041e-03 -5.52960766e-02
 -1.29164208e-01 -9.12696663e-02 -7.27747787e-02  3.85624970e-01
 -9.52923319e-01 -3.64044514e-01  7.63240165e-01 -6.44688340e-02
 -1.22144567e-01  8.41900317e-02 -1.81079149e-01 -1.59557259e-01
  1.99522352e-01  1.84746187e+00  4.41655055e-02  3.74627735e-01
  8.23477942e-01  3.31257016e-01  1.17917831e-01  1.79545223e+00
 -3.60895346e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  2.61882911e-01  5.51341345e-01  8.77810035e-01  1.29397247e+00
 -5.32569167e-02  1.03696452e+00 -3.45204103e-01 -5.68195851e-01
  1.98869537e-01 -4.84561655e-01 -1.42818177e-01  3.34028488e-02
  4.83418084e-02 -3.92891628e-02 -3.39091683e-01 -4.34810247e-01
  7.19810490e-02 -3.70798559e-01  5.62373594e-01 -3.20399156e-01
 -9.26935759e-02 -3.50495743e-03  0.00000000e+00  6.16551147e-01
  5.16258020e-01 -3.23451043e-01  0.00000000e+00  2.77193271e-02
  1.79868537e-02  2.72352332e-01  2.96158480e-01  3.64196648e-01
  4.01997405e-01 -4.88880

In [7]:
logreg_coef_std = np.ravel(np.std(X_train, 0)*logreg_coef)

print(logreg_coef_std, sep='/n')

[ 4.26564711e-01  1.93802824e-02 -2.14970577e-03 -2.58566063e-02
 -4.24554926e-02 -3.04513976e-02 -1.81647852e-02  1.83350743e-01
 -2.43373297e-01 -1.23856636e-01  3.73604393e-01 -2.35352138e-02
 -5.54695730e-02  2.31701509e-02 -8.92696609e-02 -6.92048323e-03
  5.96875469e-02  3.08527371e-01  3.29124165e-03  1.88429428e-02
  1.27094829e-01  9.82162097e-02  1.38277368e-02  1.36631749e-01
 -1.00470640e-02  0.00000000e+00  0.00000000e+00  0.00000000e+00
  2.96115197e-02  6.09619064e-02  2.64322403e-01  3.63393090e-01
 -7.89698467e-03  1.49981520e-01 -6.23848375e-02 -1.81592370e-01
  1.19980414e-02 -2.09825691e-01 -1.53249911e-02  2.04905012e-03
  3.21558026e-03 -3.26752989e-03 -2.81697610e-02 -3.61814983e-02
  3.01769133e-03 -4.66273972e-02  3.28342095e-02 -1.11583054e-01
 -5.46061597e-03 -2.61732186e-04  0.00000000e+00  3.00501742e-02
  3.51362901e-02 -5.01491483e-03  0.00000000e+00  1.36412264e-03
  9.18392218e-04  3.28008506e-02  1.79053578e-02  3.15190958e-02
  2.80542675e-02 -5.93046

In [8]:
#get feats with 0 importance
coef_zero_arg = np.argwhere(logreg_coef_std==0)
#flatten
coef_zero_arg = np.ravel(coef_zero_arg)

print(list(coef_zero_arg))

[25, 26, 27, 50, 54, 62, 67, 70, 72, 73, 75, 88, 92, 98]


In [9]:
#features deemed unimportant
df_num.columns[coef_zero_arg]

Index(['BYOB', 'Open24Hours', 'RestaurantsCounterService', 'no_music',
       'karaoke', 'straightperms', 'kids', 'dairy_free', 'vegan', 'kosher',
       'soy_free', 'Spas', 'Health', 'avg_month_checkin'],
      dtype='object')

In [10]:
#get descending order by absolute value
feat_max_desc = np.argsort(np.abs(logreg_coef_std))[::-1]

In [11]:
#top k most important factors
df_num.columns[feat_max_desc[:30]]

Index(['review_count', 'BikeParking', 'dinner', 'Restaurants',
       'AcceptsInsurance', 'lunch', 'RestaurantsPriceRange2',
       'RestaurantsAttire', 'lot', 'RestaurantsTakeOut', 'street', 'Home',
       'breakfast', 'DriveThru', 'HappyHour', 'Alcohol', 'OutdoorSeating',
       'casual', 'WheelchairAccessible', 'WiFi', 'BusinessAcceptsCreditCards',
       'Medical', 'stars_change', 'cool_change', 'garage', 'latenight',
       'ByAppointmentOnly', 'BYOBCorkage', 'RestaurantsGoodForGroups', 'Bars'],
      dtype='object')