In [1]:
hygiene_text_path= "./Hygiene/hygiene.dat"
hygiene_labels_path= "./Hygiene/hygiene.dat.labels"
hygiene_additional_path= "./Hygiene/hygiene.dat.additional"

In [2]:
import pandas as pd
import numpy as np
import multiprocessing
import gensim
import nltk
import spacy

from UtilWordEmbedding import MeanEmbeddingVectorizer

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin


from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_numeric, strip_short
from gensim.parsing.preprocessing import strip_multiple_whitespaces, strip_non_alphanum, remove_stopwords, stem_text
from gensim.models.word2vec import Word2Vec
from nltk.stem import WordNetLemmatizer, SnowballStemmer

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import DictVectorizer

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn import svm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, GridSearchCV
from tqdm import tqdm


from nltk.corpus import stopwords 
STOP_WORDS = set(stopwords.words('english'))


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
SEED=30

In [3]:
# tokenize and preprocess
# https://radimrehurek.com/gensim/parsing/preprocessing.html
FILTERS_LIST = [lambda x: x.lower(), # lowercase  
                strip_tags, # remove tags
                strip_punctuation, # replace punctuation characters with spaces
                strip_multiple_whitespaces, # remove repeating whitespaces
                # strip_numeric, # remove numbers
                gensim.parsing.preprocessing.remove_stopwords, # remove stopwords
                strip_short, # remove words less than minsize=3 characters long]
                stem_text]
def preprocess(text):
    """
    strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, 
    """
    result_stemmed = []
    for token in gensim.parsing.preprocessing.preprocess_string(text, FILTERS_LIST):
        result_stemmed.append(WordNetLemmatizer().lemmatize(token))
    return result_stemmed

In [4]:
%%time
texts = []
preprocessed_texts = []

with open(hygiene_text_path) as f:
    texts = f.readlines()
    
for _text in tqdm(texts):
    result_stemmed = preprocess(_text)
    preprocessed_texts.append(result_stemmed)
    
all_preprocessed_texts = [" ".join(_text) for _text in preprocessed_texts]

100%|███████████████████████████████████████████████████████████████████████████| 13299/13299 [01:37<00:00, 136.52it/s]


Wall time: 1min 37s


In [49]:
N = 546

# labels 
with open(hygiene_labels_path, 'r') as f:
    labels = [l.rstrip() for l in f]

# texts = []
# with open(hygiene_text_path, 'r') as f:
#     texts = f.read().splitlines(True)


df = pd.DataFrame({"label":labels, "text": texts, 
                   "preprocessed_texts": all_preprocessed_texts,
                   "tokenized_texts": preprocessed_texts})
hygiene_additional = pd.read_csv(hygiene_additional_path,  
                                 names=["cuisines_offered", "zipcode", "num_reviews", "avg_rating"],
                                 dtype={"cuisines_offered": str, 
                                        "zipcode": str,
                                        "num_reviews": str})
df = df.join(hygiene_additional)
df['avg_rating'] = df['avg_rating'].apply(lambda x: str(int(round(x, 0))))

print(df.info())
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13299 entries, 0 to 13298
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   label               13299 non-null  object
 1   text                13299 non-null  object
 2   preprocessed_texts  13299 non-null  object
 3   tokenized_texts     13299 non-null  object
 4   cuisines_offered    13299 non-null  object
 5   zipcode             13299 non-null  object
 6   num_reviews         13299 non-null  object
 7   avg_rating          13299 non-null  object
dtypes: object(8)
memory usage: 831.3+ KB
None


Unnamed: 0,label,text,preprocessed_texts,tokenized_texts,cuisines_offered,zipcode,num_reviews,avg_rating
0,1,"The baguettes and rolls are excellent, and alt...",baguett roll excel haven tri excit dozen plu t...,"[baguett, roll, excel, haven, tri, excit, doze...","['Vietnamese', 'Sandwiches', 'Restaurants']",98118,4,4
1,1,I live up the street from Betty. &#160;When my...,live street betti 160 sister town spring break...,"[live, street, betti, 160, sister, town, sprin...","['American (New)', 'Restaurants']",98109,21,4
2,1,I'm worried about how I will review this place...,worri review place strongli think bad night pl...,"[worri, review, place, strongli, think, bad, n...","['Mexican', 'Restaurants']",98103,14,3
3,0,Why can't you access them on Google street vie...,access googl street view like medina yarrow po...,"[access, googl, street, view, like, medina, ya...","['Mexican', 'Tex-Mex', 'Restaurants']",98112,42,4
4,0,Things to like about this place: homemade guac...,thing like place homemad guacamol varieti tast...,"[thing, like, place, homemad, guacamol, variet...","['Mexican', 'Restaurants']",98102,12,3


In [6]:
train_df = df[df["label"] != "[None]"]
test_df = df[df["label"] == "[None]"]

additional_feats = ["cuisines_offered", "zipcode", "num_reviews", "avg_rating"]

train = train_df[["text"] + additional_feats]
train_preprocessed = train_df[["preprocessed_texts"] + additional_feats]
train_tokenized = train_df[["tokenized_texts"] + additional_feats]
train_labels = train_df["label"].astype(int) # needed by sklearn

test = test_df[["text"] + additional_feats]
test_preprocessed = test_df[["preprocessed_texts"] + additional_feats]
test_tokenized = test_df[["tokenized_texts"] + additional_feats]
test_labels = test_df["label"]

print(train.shape, train_preprocessed.shape, train_tokenized.shape, train_labels.shape)
print(test.shape, test_preprocessed.shape, test_tokenized.shape, test_labels.shape)
print(train.dtypes, train_preprocessed.dtypes, train_tokenized.dtypes)

(546, 5) (546, 5) (546, 5) (546,)
(12753, 5) (12753, 5) (12753, 5) (12753,)
text                object
cuisines_offered    object
zipcode             object
num_reviews         object
avg_rating          object
dtype: object preprocessed_texts    object
cuisines_offered      object
zipcode               object
num_reviews           object
avg_rating            object
dtype: object tokenized_texts     object
cuisines_offered    object
zipcode             object
num_reviews         object
avg_rating          object
dtype: object


In [7]:
display(train.head())
display(train_preprocessed.head())
display(train_tokenized.head())

Unnamed: 0,text,cuisines_offered,zipcode,num_reviews,avg_rating
0,"The baguettes and rolls are excellent, and alt...","['Vietnamese', 'Sandwiches', 'Restaurants']",98118,4,4
1,I live up the street from Betty. &#160;When my...,"['American (New)', 'Restaurants']",98109,21,4
2,I'm worried about how I will review this place...,"['Mexican', 'Restaurants']",98103,14,3
3,Why can't you access them on Google street vie...,"['Mexican', 'Tex-Mex', 'Restaurants']",98112,42,4
4,Things to like about this place: homemade guac...,"['Mexican', 'Restaurants']",98102,12,3


Unnamed: 0,preprocessed_texts,cuisines_offered,zipcode,num_reviews,avg_rating
0,baguett roll excel haven tri excit dozen plu t...,"['Vietnamese', 'Sandwiches', 'Restaurants']",98118,4,4
1,live street betti 160 sister town spring break...,"['American (New)', 'Restaurants']",98109,21,4
2,worri review place strongli think bad night pl...,"['Mexican', 'Restaurants']",98103,14,3
3,access googl street view like medina yarrow po...,"['Mexican', 'Tex-Mex', 'Restaurants']",98112,42,4
4,thing like place homemad guacamol varieti tast...,"['Mexican', 'Restaurants']",98102,12,3


Unnamed: 0,tokenized_texts,cuisines_offered,zipcode,num_reviews,avg_rating
0,"[baguett, roll, excel, haven, tri, excit, doze...","['Vietnamese', 'Sandwiches', 'Restaurants']",98118,4,4
1,"[live, street, betti, 160, sister, town, sprin...","['American (New)', 'Restaurants']",98109,21,4
2,"[worri, review, place, strongli, think, bad, n...","['Mexican', 'Restaurants']",98103,14,3
3,"[access, googl, street, view, like, medina, ya...","['Mexican', 'Tex-Mex', 'Restaurants']",98112,42,4
4,"[thing, like, place, homemad, guacamol, variet...","['Mexican', 'Restaurants']",98102,12,3


In [8]:
from sklearn import preprocessing

pipeline = Pipeline([
    ('preprocess', ColumnTransformer(
        [('cuisines_offered', CountVectorizer(min_df=10), 'cuisines_offered'),
         ('zipcode', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['zipcode']),
         ('num_reviews', CountVectorizer(max_df=7, token_pattern='\d+'), 'num_reviews'),
         ('avg_rating', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['avg_rating']),
         ('text', TfidfVectorizer(
                    stop_words='english',
                    strip_accents='unicode',
                    min_df=3,
                    max_df=0.5,
                    ngram_range=(1, 3),
                    max_features=500), 'preprocessed_texts')],
        remainder='passthrough',
    )),
    ('clf', MultinomialNB())
], verbose=False)

# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)
# scores = metrics.f1_score(y_test, y_pred)
scores = cross_val_score(pipeline, train_preprocessed, train_labels, cv=5, scoring= 'f1_macro')
print(scores)
print("Average F1-Score: %0.5f" % np.average(scores))

[0.62650104 0.72474747 0.668357   0.68783693 0.60507246]
Average F1-Score: 0.66250


In [9]:
from sklearn import preprocessing

pipeline = Pipeline([
    ('preprocess', ColumnTransformer(
        [('cuisines_offered', CountVectorizer(min_df=10), 'cuisines_offered'),
         ('zipcode', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['zipcode']),
         ('num_reviews', CountVectorizer(max_df=7, token_pattern='\d+'), 'num_reviews'),
         ('avg_rating', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['avg_rating']),
         ('text', TfidfVectorizer(
                    stop_words='english',
                    strip_accents='unicode',
                    min_df=3,
                    max_df=0.5,
                    ngram_range=(1, 3),
                    max_features=500), 'text')],
        remainder='passthrough',
    )),
    ('clf', MultinomialNB())
], verbose=False)

# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)
# scores = metrics.f1_score(y_test, y_pred)
scores = cross_val_score(pipeline, train, train_labels, cv=5, scoring= 'f1')
print(scores)
print("Average F1-Score: %0.5f" % np.average(scores))

[0.63461538 0.72222222 0.68965517 0.66037736 0.54347826]
Average F1-Score: 0.65007


In [10]:
print(len(df['num_reviews'].value_counts()))
df['num_reviews'].value_counts()

158


1      2193
2      1480
3      1126
4       934
5       767
6       675
7       600
8       484
9       458
10      403
11      352
12      315
13      268
14      246
16      200
15      199
17      185
18      160
19      131
20      125
22      124
21      124
23      107
24      101
25       97
28       86
26       81
27       77
29       68
30       60
32       52
33       51
36       44
37       44
34       43
31       43
35       42
39       39
44       37
38       33
43       30
47       28
42       27
40       26
45       25
46       25
41       23
54       22
59       20
55       15
49       15
52       15
63       15
51       14
48       14
62       14
73       13
57       12
56       11
50       11
61       11
53       10
58       10
66        9
78        9
83        8
67        8
89        8
60        8
65        7
77        6
93        6
70        6
64        6
69        6
84        5
76        5
112       5
85        5
100       4
68        4
74        4
80        4
91  

In [11]:
print(len(df['cuisines_offered'].value_counts()))
df['cuisines_offered'].value_counts()

388


['Thai', 'Restaurants']                                                          640
['American (New)', 'Restaurants']                                                596
['American (Traditional)', 'Restaurants']                                        589
['Mexican', 'Restaurants']                                                       572
['Pizza', 'Restaurants']                                                         524
['Vietnamese', 'Restaurants']                                                    465
['Japanese', 'Sushi Bars', 'Restaurants']                                        459
['Sandwiches', 'Restaurants']                                                    430
['Chinese', 'Restaurants']                                                       394
['Italian', 'Pizza', 'Restaurants']                                              327
['Japanese', 'Restaurants']                                                      282
['Italian', 'Restaurants']                                       

In [12]:

print(len(df['avg_rating'].value_counts()))
print(len(df['zipcode'].value_counts()))

5
30


In [13]:
pipeline = Pipeline([
    ('preprocess', ColumnTransformer(
        [('cuisines_offered', CountVectorizer(), 'cuisines_offered'),
         ('zipcode', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['zipcode']),
         ('num_reviews', CountVectorizer(token_pattern='\d+'), 'num_reviews'),
         ('avg_rating', CountVectorizer(token_pattern='\d+'), 'avg_rating'),
         ('text', TfidfVectorizer(
                    stop_words='english',
                    strip_accents='unicode',
                    min_df=3,
                    max_df=0.5,
                    ngram_range=(1, 3),
                    max_features=500), 'preprocessed_texts')],
        remainder='passthrough',
    )),
    ('clf', MultinomialNB())
], verbose=False)

# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)
# scores = metrics.f1_score(y_test, y_pred)
scores = cross_val_score(pipeline, train_preprocessed, train_labels, cv=5, scoring= 'f1')
print(scores)
print("Average F1-Score: %0.5f" % np.average(scores))

[0.63551402 0.7027027  0.65486726 0.63636364 0.51685393]
Average F1-Score: 0.62926


In [14]:
pipeline = Pipeline([
    ('union', ColumnTransformer(
        [('cuisines_offered', CountVectorizer(), 'cuisines_offered'),
         ('zipcode', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['zipcode']),
         ('num_reviews', CountVectorizer(token_pattern='\d+'), 'num_reviews'),
         ('avg_rating', CountVectorizer(token_pattern='\d+'), 'avg_rating'),
         ('text', TfidfVectorizer(
                stop_words='english',
                strip_accents='unicode',
                min_df=15,
                max_df=0.5,
                ngram_range=(1, 3),
                max_features=500), 'text')],
        remainder='passthrough',
    )),
    ('clf', MultinomialNB())
], verbose=False)

# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)
# score = metrics.f1_score(y_test, y_pred)
scores = cross_val_score(pipeline, train, train_labels, cv=5, scoring= 'f1')
print(scores)
print("Average F1-Score: %0.5f" % np.average(scores))

[0.63551402 0.69090909 0.65486726 0.64220183 0.50574713]
Average F1-Score: 0.62585


In [20]:
def test_classifier(clf, X, y, vectorizer, text_col='text'):
    pipeline = Pipeline([
        ('union', ColumnTransformer(
        [('cuisines_offered', CountVectorizer(min_df=10), 'cuisines_offered'),
         ('zipcode', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['zipcode']),
         ('num_reviews', CountVectorizer(max_df=7, token_pattern='\d+'), 'num_reviews'),
         ('avg_rating', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['avg_rating']),
         ('text', vectorizer, text_col)],
        remainder='passthrough',
    )),
        ('clf', clf)
    ], verbose=False)
    scores = cross_val_score(pipeline, X, y, cv=5, scoring= 'f1_macro')
    print(clf)
    print(scores)
    cv_score = np.average(scores)
    return cv_score

In [53]:
classifiers = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=SEED, n_estimators=500, n_jobs=-1),
    'Support Vector Machine': svm.SVC(),
    'XGBoost': XGBClassifier(n_estimators=500, 
                            max_depth=5, 
                            learning_rate=0.2, 
                            objective='binary:logistic',
                            scale_pos_weight=2,
                            n_jobs=-1,
                            random_state=SEED),
    'Gradient Boosting': GradientBoostingClassifier()
}

tfidf = TfidfVectorizer(
                    stop_words='english',
                    strip_accents='unicode',
                    min_df=3,
                    max_df=0.5,
                    ngram_range=(1, 3),
                    max_features=500)
bow = CountVectorizer(
                stop_words=STOP_WORDS,
                strip_accents='unicode',
                min_df=5,
                max_df=0.6,
                ngram_range=(1, 3))

In [54]:
for clf_name, clf in classifiers.items():
    cv_score = test_classifier(clf, train, train_labels, 
                               vectorizer=bow, text_col='text')
    print('{}: {}'.format(clf_name, cv_score))

MultinomialNB()
[0.60828157 0.66969697 0.68804714 0.61464646 0.61308316]
Naive Bayes: 0.638751062037948


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression()
[0.63636364 0.52192982 0.5962963  0.64220183 0.65485665]
Logistic Regression: 0.6103296491826835
RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=30)
[0.66338599 0.61438679 0.58659924 0.64026403 0.668357  ]
Random Forest: 0.6345986095745193
SVC()
[0.63061152 0.61330795 0.5968172  0.60260417 0.54821774]
Support Vector Machine: 0.5983117172144025
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.2, max_delta_step=None, max_depth=5,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=-1, num_parallel_tree=None,
              random_state=30, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=2, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=N

In [55]:
for clf_name, clf in classifiers.items():
    cv_score = test_classifier(clf, train_preprocessed, train_labels, 
                               vectorizer=bow, text_col='preprocessed_texts')
    print('{}: {}'.format(clf_name, cv_score))

MultinomialNB()
[0.59880637 0.72474747 0.68783693 0.62385321 0.60336803]
Naive Bayes: 0.6477224016277434


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression()
[0.54530423 0.58023107 0.62372653 0.6513468  0.6761735 ]
Logistic Regression: 0.6153564259182163
RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=30)
[0.63443004 0.59547908 0.58590122 0.61308316 0.63050847]
Random Forest: 0.6118803977093185
SVC()
[0.56785714 0.61074819 0.5968172  0.59444493 0.51601323]
Support Vector Machine: 0.5771761382483058
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.2, max_delta_step=None, max_depth=5,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=-1, num_parallel_tree=None,
              random_state=30, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=2, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=N

In [56]:
for clf_name, clf in classifiers.items():
    cv_score = test_classifier(clf, train, train_labels, 
                               vectorizer=tfidf, text_col='text')
    print('{}: {}'.format(clf_name, cv_score))

MultinomialNB()
[0.65351459 0.72474747 0.668357   0.66947439 0.60507246]
Naive Bayes: 0.6642331837755213
LogisticRegression()
[0.63588216 0.69714574 0.66902834 0.62372653 0.63916476]
Logistic Regression: 0.6529895058523685
RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=30)
[0.64542524 0.68741565 0.63225371 0.6513468  0.6292517 ]
Random Forest: 0.6491386219009174
SVC()
[0.63527851 0.72474747 0.66902834 0.62271    0.62594372]
Support Vector Machine: 0.6555416107222085
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.2, max_delta_step=None, max_depth=5,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=-1, num_parallel_tree=None,
              random_state=30, reg_alpha=None, reg_lambda=None,
              

In [57]:
for clf_name, clf in classifiers.items():
    cv_score = test_classifier(clf, train_preprocessed, train_labels, 
                               vectorizer=tfidf, text_col='preprocessed_texts')
    print('{}: {}'.format(clf_name, cv_score))

MultinomialNB()
[0.62650104 0.72474747 0.668357   0.68783693 0.60507246]
Naive Bayes: 0.6625029797815201
LogisticRegression()
[0.64519064 0.72456199 0.69683944 0.59547908 0.61754386]
Logistic Regression: 0.6559230035971904
RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=30)
[0.6090586  0.63274933 0.6138664  0.64993239 0.63150778]
Random Forest: 0.6274228970125558
SVC()
[0.61704244 0.72474747 0.65951878 0.59547908 0.62594372]
Support Vector Machine: 0.6445463003313365
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.2, max_delta_step=None, max_depth=5,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=-1, num_parallel_tree=None,
              random_state=30, reg_alpha=None, reg_lambda=None,
              

In [27]:
def create_submission(y_pred, filepath):
    with open(filepath, 'w') as f:
        f.write('yinanhu3\n')
        for label in y_pred:
            f.write(str(int(label)) + '\n')

In [98]:
pipeline = Pipeline([
    ('preprocess', ColumnTransformer(
        [('cuisines_offered', CountVectorizer(min_df=10), 'cuisines_offered'),
         ('zipcode', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['zipcode']),
         ('num_reviews', CountVectorizer(max_df=7, token_pattern='\d+'), 'num_reviews'),
         ('avg_rating', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['avg_rating']),
         ('text', TfidfVectorizer(
                    stop_words='english',
                    strip_accents='unicode',
                    min_df=2,
                    max_df=0.4,
                    ngram_range=(1, 3),
                    max_features=2000), 'preprocessed_texts')],
        remainder='passthrough',
    )),
    ('clf', MultinomialNB())
], verbose=False)

pipeline.fit(train_preprocessed, train_labels)
y_pred = pipeline.predict(test_preprocessed)

In [99]:
submit_path ='submission7_yinanhu3.txt'
create_submission(y_pred, submit_path)

In [42]:

submit_path

'submission7_yinanhu3.txt'

In [101]:

!python submit.py yinanhu3 {submit_path}

Submission completed successfully!
