In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix,recall_score,accuracy_score,precision_score,f1_score
from imblearn.over_sampling import SMOTE

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.decomposition import PCA

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw restaurant table that the rest of the notebook filters and
# turns into features.
df = pd.read_csv(filepath_or_buffer='./restaurant_star_prediction_to_use.csv')

In [3]:
# Keep only the rows whose `review_year_max` is 2016 or later.
df_2018_o_c = df.loc[df['review_year_max'] >= 2016]

In [4]:
# Column groups used to assemble the closure-prediction feature matrix.

# Identifier columns.
restaurant_4_closure = ['business_id', 'name']

# All location columns available in the table.
location_4_closure = ['state', 'city', 'longitude', 'latitude', 'neighborhood']

# Numeric attribute columns: positions 15..31 of the filtered frame, minus the
# four attributes listed in the exclusion tuple.
restaurant_attribute_num_4_closure = [
    column
    for column in df_2018_o_c.columns[15:32]
    if column not in ('BikeParking', 'GoodForKids', 'RestaurantsDelivery', 'RestaurantsGoodForGroups')
]

# Review-year columns.
restaurant_life_4_closure = ['review_year_min', 'review_year_max', 'review_year_diff']

# Free-text columns that are vectorized further down.
restaurant_attribute_cat_4_closure = ['attribute']
restaurant_category_cat_4_closure = ['categories_clean_short']

# Prediction target.
restaurant_target_4_closure = ['is_open']

In [5]:
# Per-city density feature: count the non-null `Unnamed: 0` values in each
# city and attach that count to every row as `restaurant_count`.
# NOTE(review): the merge adds the count column again on each execution, so
# re-running this cell without rebuilding `df_2018_o_c` is not idempotent.
df_city_rest_density = (
    df_2018_o_c
    .groupby('city')[['Unnamed: 0']]
    .count()
    .reset_index()
)
df_city_rest_density.columns = ['city', 'restaurant_count']
df_2018_o_c = df_2018_o_c.merge(
    df_city_rest_density,
    left_on='city',
    right_on='city',
)

In [6]:
# Location columns that actually enter the model (the per-city count replaces
# the raw city / coordinate columns).
location_4_closure_to_use = [
    'state',
    'neighborhood',
    'restaurant_count',
]

In [7]:
# One-hot encoding of the full location column group (first level of each
# encoded column dropped).
# NOTE(review): nothing in the visible part of the notebook reads this frame.
X_base_location_only_4_closure = pd.get_dummies(
    data=df_2018_o_c[location_4_closure],
    drop_first=True,
)

In [8]:
# Mark missing numeric attributes with the sentinel -1 so that no NaN reaches
# the estimators.
df_2018_o_c[restaurant_attribute_num_4_closure] = (
    df_2018_o_c[restaurant_attribute_num_4_closure].fillna(value=-1)
)

In [9]:
# Drop the rows whose Tuesday check-in count carries the -1 sentinel written by
# the fill step, and work on an independent copy from here on.
df_2018_o_c_dropna = df_2018_o_c.loc[
    df_2018_o_c['Tuesday_num_of_chkins'] != -1
].copy()

In [10]:
# Renumber the rows 0..n-1 after filtering, so that the later axis=1
# concatenations with freshly built frames (which get a default integer index)
# line up row by row.
df_2018_o_c_dropna.reset_index(drop=True,inplace=True)

In [15]:
# Sanity check: per numeric attribute column, is any value still missing?
(
    df_2018_o_c_dropna[restaurant_attribute_num_4_closure]
    .isnull()
    .any()
)

Caters                     False
HasTV                      False
OutdoorSeating             False
RestaurantsPriceRange2     False
RestaurantsReservations    False
RestaurantsTableService    False
Friday_num_of_chkins       False
Monday_num_of_chkins       False
Saturday_num_of_chkins     False
Sunday_num_of_chkins       False
Thursday_num_of_chkins     False
Tuesday_num_of_chkins      False
Wednesday_num_of_chkins    False
dtype: bool

In [16]:
# Location block: one-hot encode the location columns; the numeric
# `restaurant_count` column passes through get_dummies unchanged.
X_location_4_closure = pd.get_dummies(
    data=df_2018_o_c_dropna[location_4_closure_to_use],
    drop_first=True,
)

# Numeric attribute block.
X_attr_num_4_closure = df_2018_o_c_dropna[restaurant_attribute_num_4_closure]

# Feature set 2 = location + numeric attributes; the target is `is_open`.
X_2_4_closure = pd.concat(
    objs=[X_location_4_closure, X_attr_num_4_closure],
    axis=1,
)
y_2_4_closure = df_2018_o_c_dropna[restaurant_target_4_closure]

In [19]:
# Feature set 3 = feature set 2 plus the `review_year_diff` column.
X_3_4_closure = pd.concat(
    objs=[X_2_4_closure, df_2018_o_c_dropna['review_year_diff']],
    axis=1,
)

In [20]:
# Turn every check-in count into a per-year rate by dividing it by
# `review_year_diff`. A span of 0 is treated as 1 to avoid division by zero.
# The divisor does not depend on the column being processed, so it is built
# once here instead of once per loop iteration.
# NOTE(review): the division happens in place, so running this cell twice
# divides the counts twice.
years_active = X_3_4_closure['review_year_diff'].map(lambda x: 1 if x == 0 else x)
for col in [x for x in X_3_4_closure.columns if 'chkins' in x]:
    X_3_4_closure[col] = X_3_4_closure[col] / years_active

In [22]:
# Bag-of-words counts over the free-text `attribute` column
# (unigrams, English stop words removed).
# NOTE(review): get_feature_names() is the legacy spelling; newer scikit-learn
# releases expose get_feature_names_out() instead — confirm against the
# pinned version before upgrading.
cv_4_4_closure = CountVectorizer(stop_words='english', ngram_range=(1, 1))
vect_4_4_closure = cv_4_4_closure.fit_transform(
    df_2018_o_c_dropna[restaurant_attribute_cat_4_closure]['attribute']
)
df_attr_4_4_closure = pd.DataFrame(
    data=vect_4_4_closure.toarray(),
    columns=cv_4_4_closure.get_feature_names(),
)

In [23]:
# Feature set 4 = feature set 3 plus the attribute word counts.
X_4_4_closure = pd.concat(
    objs=[X_3_4_closure, df_attr_4_4_closure],
    axis=1,
)

In [32]:
# TF-IDF weights over the same `attribute` text, with the same tokenisation
# settings as the count vectorizer.
# NOTE(review): because text and settings match, the column labels produced
# here presumably repeat those of the count features — verify before selecting
# columns by name downstream.
cv_5_4_closure_tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
vect_5_4_closure_tfidf = cv_5_4_closure_tfidf.fit_transform(
    df_2018_o_c_dropna[restaurant_attribute_cat_4_closure]['attribute']
)
df_attr_5_4_closure_tfidf = pd.DataFrame(
    data=vect_5_4_closure_tfidf.toarray(),
    columns=cv_5_4_closure_tfidf.get_feature_names(),
)

In [33]:
# Feature set 5 = feature set 4 plus the attribute TF-IDF weights.
X_5_4_closure_tfidf = pd.concat(
    objs=[X_4_4_closure, df_attr_5_4_closure_tfidf],
    axis=1,
)

In [34]:
def standardize_text(df, text_field):
    """Normalise the free-text column ``text_field`` of ``df`` in place.

    Steps, applied in order:

    1. remove URLs (``http`` followed by non-whitespace);
    2. remove any leftover literal ``http``;
    3. remove ``@mentions`` (``@`` followed by non-whitespace);
    4. replace every character outside the allowed set with a space;
    5. spell a remaining ``@`` as ``at``;
    6. lower-case the text.

    The regular-expression steps pass ``regex=True`` explicitly: with
    pandas >= 2.0 ``Series.str.replace`` treats the pattern as a literal
    string by default, which would silently turn those steps into no-ops.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. It is modified in place.
    text_field : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    text = df[text_field]
    text = text.str.replace(r"http\S+", "", regex=True)
    text = text.str.replace(r"http", "", regex=False)
    text = text.str.replace(r"@\S+", "", regex=True)
    text = text.str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", regex=True)
    text = text.str.replace(r"@", "at", regex=False)
    df[text_field] = text.str.lower()
    return df

In [31]:
# Clean the review text. standardize_text edits the frame it is given and
# returns that same object, so `df_closure_review` and `df_2018_o_c_dropna`
# refer to one frame afterwards.
df_closure_review = standardize_text(
    df=df_2018_o_c_dropna,
    text_field='review_text',
)

In [35]:
# TF-IDF over the cleaned review text, capped at 500 terms, appended to
# feature set 5 to form the final feature matrix.
cv_6_4_closure_review = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    max_features=500,
)
vect_6_4_closure_review = cv_6_4_closure_review.fit_transform(
    df_closure_review['review_text']
)
df_6_4_closure_review = pd.DataFrame(
    data=vect_6_4_closure_review.toarray(),
    columns=cv_6_4_closure_review.get_feature_names(),
)
X_6_cv_4_closure = pd.concat(
    objs=[X_5_4_closure_tfidf, df_6_4_closure_review],
    axis=1,
)

In [38]:
# Target for the final feature matrix: a one-column frame holding `is_open`.
y_6_cv_4_closure = df_2018_o_c_dropna.loc[:, restaurant_target_4_closure]

In [40]:
# SMOTE oversampler with a fixed seed; the same instance is reused for both
# resampling steps further down.
sm = SMOTE(random_state=2)

In [41]:
# Hold out 20% of the rows for testing, then oversample on the training part
# only, so the sampler never sees the test rows.
(
    X_6_cv_4_closure_train,
    X_6_cv_4_closure_test,
    y_6_cv_4_closure_train,
    y_6_cv_4_closure_test,
) = train_test_split(X_6_cv_4_closure, y_6_cv_4_closure, test_size=0.2, random_state=5)

# The target is a one-column DataFrame. Flatten it to 1-D before resampling so
# the sampler is not handed a column vector, which makes it emit a conversion
# warning.
# NOTE(review): fit_sample is the legacy name of fit_resample in
# imbalanced-learn — confirm against the pinned version before upgrading.
X_train_res, y_train_res = sm.fit_sample(
    X_6_cv_4_closure_train,
    y_6_cv_4_closure_train.values.ravel(),
)

  y = column_or_1d(y, warn=True)


In [42]:
# Reduce the resampled training features to 320 principal components and apply
# the same projection to the untouched test split.
# `random_state` is fixed so the projection is reproducible when scikit-learn
# picks its randomized SVD solver for a matrix of this size.
pca = PCA(n_components=320, random_state=5)
pca.fit(X_train_res)
X_pca_train = pca.transform(X_train_res)
X_pca_test = pca.transform(X_6_cv_4_closure_test)

In [43]:
# Base learner 1: L2-regularised logistic regression on the PCA components.
# (The estimator is LogisticRegression despite the `_lsvc` suffix in the names.)
clf_lr_6_cv_4_closure_lsvc = LogisticRegression(C=10, penalty='l2')
clf_lr_6_cv_4_closure_lsvc.fit(X_pca_train, y_train_res)
clf_lr_6_cv_4_closure_lsvc_predict = clf_lr_6_cv_4_closure_lsvc.predict(X_pca_test)

# |prediction - truth| is 1 for a misclassified test row and 0 otherwise, so
# the value counts give correct vs. wrong and the ratio is the test error rate.
df_clf_6_cv_4_closure_lsvc = pd.DataFrame(
    {'diff': np.abs(clf_lr_6_cv_4_closure_lsvc_predict - y_6_cv_4_closure_test.values.ravel())}
)
print(df_clf_6_cv_4_closure_lsvc['diff'].value_counts())
print(df_clf_6_cv_4_closure_lsvc['diff'].sum() / len(df_clf_6_cv_4_closure_lsvc['diff']))



0    3781
1    1582
Name: diff, dtype: int64
0.29498415066194295


In [44]:
# Test-set confusion matrix for the logistic regression
# (rows: true class, columns: predicted class).
confusion_matrix(
    y_true=y_6_cv_4_closure_test,
    y_pred=clf_lr_6_cv_4_closure_lsvc_predict,
)

array([[ 461,  225],
       [1357, 3320]], dtype=int64)

In [45]:
# Gaussian NB is trained in the same 320-component space as the logistic
# regression. Reuse the already-fitted `pca` rather than fitting a second PCA
# that is never applied: the stacking step later scores both base models on
# `pca.transform(...)` output, so they must share one projection.
pca_nb = pca
X_pca_nb_train = pca_nb.transform(X_train_res)
X_pca_nb_test = pca_nb.transform(X_6_cv_4_closure_test)

In [46]:
# Base learner 2: Gaussian naive Bayes on the PCA components.
# (The estimator is GaussianNB despite the `clf_lr_` prefix in the names.)
clf_lr_6_cv_4_closure = GaussianNB()
clf_lr_6_cv_4_closure.fit(X_pca_nb_train, y_train_res)
clf_lr_6_cv_4_closure_predict = clf_lr_6_cv_4_closure.predict(X_pca_nb_test)

# Same error summary as for the logistic regression: 1 marks a misclassified
# test row, and the printed ratio is the test error rate.
df_clf_6_cv_4_closure = pd.DataFrame(
    {'diff': np.abs(clf_lr_6_cv_4_closure_predict - y_6_cv_4_closure_test.values.ravel())}
)
print(df_clf_6_cv_4_closure['diff'].value_counts())
print(df_clf_6_cv_4_closure['diff'].sum() / len(df_clf_6_cv_4_closure['diff']))

1    3662
0    1701
Name: diff, dtype: int64
0.6828267760581764


In [47]:
# Test-set confusion matrix for the naive Bayes model
# (rows: true class, columns: predicted class).
confusion_matrix(
    y_true=y_6_cv_4_closure_test,
    y_pred=clf_lr_6_cv_4_closure_predict,
)

array([[ 503,  183],
       [3479, 1198]], dtype=int64)

In [48]:
# Project the whole feature matrix with `pca`, the projection fitted on the
# resampled training split, because that is the space both base classifiers
# were trained in.
# No separate PCA is fitted on the full data here: its result was never
# applied, and `pca_stack` is defined (with 95 components) right before its
# first real use further down.
array_pca_stack_full = pca.transform(X_6_cv_4_closure)

In [49]:
# Wrap the projected array in a frame (default integer row and column labels).
df_pca_stack_full = pd.DataFrame(data=array_pca_stack_full)

In [51]:
# Class probabilities from the logistic regression for every row.
# NOTE(review): this includes the rows the model was trained on, so their
# probabilities are in-sample.
df_temp_lr_prob = pd.DataFrame(
    data=clf_lr_6_cv_4_closure_lsvc.predict_proba(X=df_pca_stack_full)
)

In [52]:
# Class probabilities from the naive Bayes model for every row.
# NOTE(review): this includes the rows the model was trained on, so their
# probabilities are in-sample.
df_temp_nb_prob = pd.DataFrame(
    data=clf_lr_6_cv_4_closure.predict_proba(X=df_pca_stack_full)
)

In [53]:
# Start the stacked feature matrix from an independent copy of the final
# feature matrix, so the added columns do not leak back into it.
X_pca_stack_full = X_6_cv_4_closure.copy(deep=True)

In [54]:
# Stacking feature 1: column 0 of the logistic regression's probabilities.
X_pca_stack_full['lr_prediction_0_prob'] = df_temp_lr_prob.loc[:, 0]

In [55]:
# Stacking feature 2: column 0 of the naive Bayes model's probabilities.
X_pca_stack_full['nb_prediction_0_prob'] = df_temp_nb_prob.loc[:, 0]

In [56]:
# Split the stacked features with the same test fraction and seed as the
# base-model split.
(
    X_stack_train,
    X_stack_test,
    y_stack_train,
    y_stack_test,
) = train_test_split(
    X_pca_stack_full,
    y_6_cv_4_closure,
    test_size=0.2,
    random_state=5,
)

In [58]:
# Oversample the stacked training split. The target is a one-column
# DataFrame; flatten it to 1-D first so the sampler is not handed a column
# vector, which makes it emit a conversion warning.
X_train_res_stack, y_train_res_stack = sm.fit_sample(
    X_stack_train,
    y_stack_train.values.ravel(),
)

  y = column_or_1d(y, warn=True)


In [59]:
# Reduce the stacked features to 95 principal components, fitted on the
# resampled training split and then applied to the test split.
# `random_state` is fixed so the projection (and therefore the saved model's
# input space) is reproducible when the randomized SVD solver is used.
pca_stack = PCA(n_components=95, random_state=5)
pca_stack.fit(X_train_res_stack)
X_pca_stack_train = pca_stack.transform(X_train_res_stack)
X_pca_stack_test = pca_stack.transform(X_stack_test)

In [60]:
# Final (stacked) model: weakly regularised logistic regression on the 95
# components of the stacked feature matrix.
clf_lr_6_cv_4_closure_stack = LogisticRegression(C=1000, penalty='l2', max_iter=5000)
clf_lr_6_cv_4_closure_stack.fit(X_pca_stack_train, y_train_res_stack)
clf_lr_6_cv_4_closure_stack_predict = clf_lr_6_cv_4_closure_stack.predict(X_pca_stack_test)

# 1 marks a misclassified test row; the printed ratio is the test error rate.
df_clf_6_cv_4_closure_stack = pd.DataFrame(
    {'diff': np.abs(clf_lr_6_cv_4_closure_stack_predict - y_stack_test.values.ravel())}
)
print(df_clf_6_cv_4_closure_stack['diff'].value_counts())
print(df_clf_6_cv_4_closure_stack['diff'].sum() / len(df_clf_6_cv_4_closure_stack['diff']))



0    3783
1    1580
Name: diff, dtype: int64
0.2946112250606004


In [61]:
# Test-set confusion matrix for the stacked model
# (rows: true class, columns: predicted class).
confusion_matrix(
    y_true=y_stack_test,
    y_pred=clf_lr_6_cv_4_closure_stack_predict,
)

array([[ 467,  219],
       [1361, 3316]], dtype=int64)

In [13]:
from sklearn.externals import joblib
import pickle

In [63]:
# Persist the stacked logistic regression. Only this estimator is written; the
# vectorizers, PCA objects and base models are not saved by this cell.
joblib.dump(
    value=clf_lr_6_cv_4_closure_stack,
    filename='restaurant_closure_prediction_model.pkl',
)

['restaurant_closure_prediction_model.pkl']

In [67]:
# Model-ready inputs for every row: the stacked features projected with the
# 95-component PCA the final model was trained on.
X_closure_prediction = pca_stack.transform(X=X_pca_stack_full)

In [68]:
# Persist the projected features for every row, so the saved model can be
# applied later without re-running the feature pipeline.
with open('restaurant_closure_X.pkl','wb') as file:
    pickle.dump(X_closure_prediction,file)

In [15]:
# Persist the human-readable columns (same rows and row order as the feature
# matrix, since both come from `df_2018_o_c_dropna`) to show next to predictions.
with open('restaurant_closure_to_display.pkl','wb') as file:
    pickle.dump(df_2018_o_c_dropna[['name','address','categories','state','city','categories_clean_short','is_open']],file)

In [75]:
# Round-trip check: load the persisted model back from disk.
model = joblib.load(filename='restaurant_closure_prediction_model.pkl')

In [76]:
# Load the persisted feature array back from disk.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook or another trusted source.
with open('restaurant_closure_X.pkl','rb') as file:
    data=pickle.load(file)

In [77]:
# Predictions of the reloaded model on the reloaded features.
model.predict(X=data)

array([1, 1, 0, ..., 0, 0, 1], dtype=int64)

In [78]:
# Confusion matrix over the full data set (rows: true class, columns:
# predicted class).
# NOTE(review): the full set includes the rows used for training, so these
# figures are more optimistic than the held-out test results above.
confusion_matrix(
    y_true=y_6_cv_4_closure,
    y_pred=model.predict(data),
)

array([[ 2503,  1006],
       [ 6397, 16908]], dtype=int64)

In [79]:
# Recall for class 0 on the full data set: correctly predicted 0s divided by
# all true 0s (first row of the confusion matrix). Computed from the live
# objects instead of hand-copied matrix cells, so the figure cannot go stale
# when the notebook is re-run.
recall_score(y_6_cv_4_closure.values.ravel(), model.predict(data), pos_label=0)

0.713308634938729

In [80]:
# Recall for class 1 on the full data set: correctly predicted 1s divided by
# all true 1s (second row of the confusion matrix). Computed from the live
# objects instead of hand-copied matrix cells, so the figure cannot go stale
# when the notebook is re-run.
recall_score(y_6_cv_4_closure.values.ravel(), model.predict(data), pos_label=1)

0.7255095473074448