In [1]:
import os
import pandas as pd
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report, f1_score
from keras.models import Sequential
from keras.layers import Dense, Dropout

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
pd.set_option('display.max_colwidth',-1)

  """Entry point for launching an IPython kernel.


In [3]:
df = pd.read_csv('data_cleaned.csv',index_col=False)
df.head()

Unnamed: 0,review_number,facility_ind,security_ind,pricing_ind,location_ind,fb_ind,housekeep_ind,frontoff_ind,Others,cons
0,0,1,0,0,0,0,0,0,0,bedroom bland
1,0,1,0,0,0,0,0,0,0,soft furnish
2,0,1,0,0,0,0,0,0,0,littl furnitur big space
3,0,1,0,0,0,0,0,0,0,uncomfort couch area need colour invent design make want spend time bedroom
4,0,0,0,0,0,1,0,0,0,hotel could resid bar relax drink dinner


In [4]:
# Convert all rows to string
df.cons = df.cons.astype(str)

# col with mixed types use astype() first: https://stackoverflow.com/questions/56243441/check-if-values-in-pandas-dataframe-column-is-integer-and-write-it-to-a-list-if
#df = df[~df.cons.astype(str).str.isdigit()] # removed int rows but cannot remove datetime.time row

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5788 entries, 0 to 5787
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   review_number  5788 non-null   int64 
 1   facility_ind   5788 non-null   int64 
 2   security_ind   5788 non-null   int64 
 3   pricing_ind    5788 non-null   int64 
 4   location_ind   5788 non-null   int64 
 5   fb_ind         5788 non-null   int64 
 6   housekeep_ind  5788 non-null   int64 
 7   frontoff_ind   5788 non-null   int64 
 8   Others         5788 non-null   int64 
 9   cons           5788 non-null   object
dtypes: int64(9), object(1)
memory usage: 452.3+ KB


In [6]:
cols = ['facility_ind','security_ind','pricing_ind','location_ind','fb_ind','housekeep_ind','frontoff_ind','Others']
print('There are',len(df),'rows in total.')
print(len(df[df[cols].sum(axis=1)>1]),'rows involve more than 1 departments.')
print(len(df[df[cols].sum(axis=1)==1]),'rows involve only 1 departments.')
print(len(df[df[cols].sum(axis=1)==0]),'rows are considered no value.')
print('')
print('Feedback to each department:')
print(df[cols].sum())

There are 5788 rows in total.
423 rows involve more than 1 departments.
5365 rows involve only 1 departments.
0 rows are considered no value.

Feedback to each department:
facility_ind     1988
security_ind     65  
pricing_ind      1273
location_ind     66  
fb_ind           408 
housekeep_ind    180 
frontoff_ind     970 
Others           1296
dtype: int64


In [7]:
Y = df[cols]
X = df['cons']
print(len(Y),len(X))

5788 5788


In [8]:
max_f = 5000

cv_set = StratifiedKFold(n_splits=3, shuffle=True, random_state=111)

def reshape_for_resampling(X_train):
    return X_train.values.reshape(-1,1)

ros = RandomOverSampler()
rus = RandomUnderSampler()

def ravel_resampled_data(X_train):
    return X_train.ravel()

f_reshape = FunctionTransformer(reshape_for_resampling)
f_ravel = FunctionTransformer(ravel_resampled_data)

cv = CountVectorizer(max_features=max_f, ngram_range=(1,2))

tfidf = TfidfTransformer()

In [9]:
from keras.wrappers.scikit_learn import KerasClassifier

nparam=5
dropout=0.5
opt='adam'

def create_model(nparam=nparam, input_dim=(max_f,), dropout=dropout, optimizer=opt):
    model = Sequential()
    model.add(Dense(nparam, activation='relu',input_shape=input_dim))
    model.add(Dropout(dropout))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
    return model

model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=16, verbose=0)

### Fitting category: facility

#### Finding best resampling method

In [10]:
y = Y.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=111)
print(len(X_train), len(X_test), len(y_train), len(y_test))

print('-------------Fitting category',y.name,'...-----------')

ppl = Pipeline([('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (no resampling):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (no resampling):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (no resampling)',accuracy_score(y_pred, y_test))
print('Test f1 (no resampling)',f1_score(y_pred, y_test))

ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (ROS):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (ROS):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (ROS)',accuracy_score(y_pred, y_test))
print('Test f1 (ROS)',f1_score(y_pred, y_test))

ppl = Pipeline([('reshape',f_reshape),('resample',rus),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (RUS):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (RUS):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (RUS)',accuracy_score(y_pred, y_test))
print('Test f1 (RUS)',f1_score(y_pred, y_test))

4630 1158 4630 1158
-------------Fitting category facility_ind ...-----------
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Mean cross validation accuracy (no resampling): 0.8190066957914567
Mean cross validation f1 (no resampling): 0.7136906230653275
Test accuracy (no resampling) 0.8393782383419689
Test f1 (no resampling) 0.7430939226519336
Mean cross validation accuracy (ROS): 0.8155523244425491
Mean cross validation f1 (ROS): 0.7081918602955994
Test accuracy (ROS) 0.8411053540587219
Test f1 (ROS) 0.7653061224489796
Mean cross validation accuracy (RUS): 0.801295364770645
Mean cross validation f1 (RUS): 0.7346569633342783
Test accuracy (RUS) 0.8238341968911918
Test f1 (RUS) 0.7605633802816901


#### Grid Search using the pipeline giving the highest score

In [11]:
ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])

param_grid = {
    'countVec__ngram_range': [(1,2), (1,3), (1,4)],
    'countVec__analyzer': ['word','char_wb','char'],
    'countVec__stop_words':[None, 'english'],
    'tfidf__use_idf': [True, False],
    #'model__epochs': [10, ],
    #'model__nparam': [5,10],
    #'model__init': [ 'uniform', 'zeros', 'normal', ], 
    #'model__batch_size':[2, 16, 32],
    #'model__optimizer':['Adam', 'sgd'],
    'model__dropout': [0.5, 0.3, 0]
}

grid = GridSearchCV(estimator=ppl,  
                    n_jobs=-1, 
                    verbose=1,
                    return_train_score=True,
                    cv=cv_set,
                    param_grid=param_grid,
                    scoring='f1')
#grid.get_params

grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best score: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 11.1min finished


Best score: 0.752848 using {'countVec__analyzer': 'char_wb', 'countVec__ngram_range': (1, 4), 'countVec__stop_words': 'english', 'model__dropout': 0, 'tfidf__use_idf': False}
0.724206 (0.008158) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': True}
0.734771 (0.016197) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': False}
0.705150 (0.007593) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': True}
0.726164 (0.012207) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': False}
0.708281 (0.006243) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0, 'tfidf__use_id

#### Final prediction

In [48]:
# Definition of the best model
cv = CountVectorizer(max_features=max_f,analyzer='char_wb',ngram_range=(1,4),stop_words='english')
tfidf = TfidfTransformer(use_idf=False)
ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
model__dropout=0
model__epochs=10
model__nparam=5
model__optimizer='Adam'

In [49]:
print('-------------Predicting category',y.name,'...-----------')
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy',accuracy_score(y_pred, y_test))
print(classification_report(y_pred, y_test))
print('Test f1 score:',f1_score(y_pred, y_test))
print('')

fac_pred = pd.Series(y_pred.reshape(-1,), name=y.name+'_pred')
display(fac_pred)

-------------Predicting category facility_ind ...-----------
Test accuracy 0.8082901554404145
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       694
           1       0.80      0.69      0.74       464

    accuracy                           0.81      1158
   macro avg       0.81      0.79      0.79      1158
weighted avg       0.81      0.81      0.81      1158

Test f1 score: 0.7424593967517403



0       0
1       1
2       1
3       0
4       0
       ..
1153    1
1154    1
1155    0
1156    0
1157    1
Name: facility_ind_pred, Length: 1158, dtype: int64

### Fitting category: security

#### Finding best resampling method

In [14]:
y = Y.iloc[:,1]
train_index = X_train.index
test_index = X_test.index
y_train = Y.loc[train_index,y.name]
y_test = Y.loc[test_index,y.name]
print(len(X_train), len(X_test), len(y_train), len(y_test))

print('-------------Fitting category',y.name,'...-----------')

ppl = Pipeline([('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (no resampling):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (no resampling):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (no resampling)',accuracy_score(y_pred, y_test))
print('Test f1 (no resampling)',f1_score(y_pred, y_test))

ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (ROS):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (ROS):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (ROS)',accuracy_score(y_pred, y_test))
print('Test f1 (ROS)',f1_score(y_pred, y_test))

4630 1158 4630 1158
-------------Fitting category security_ind ...-----------
Mean cross validation accuracy (no resampling): 0.9889848801820468
Mean cross validation f1 (no resampling): 0.0
Test accuracy (no resampling) 0.9879101899827288
Test f1 (no resampling) 0.0
Mean cross validation accuracy (ROS): 0.9803459436286444
Mean cross validation f1 (ROS): 0.16464646464646465
Test accuracy (ROS) 0.9784110535405872
Test f1 (ROS) 0.3243243243243243


#### Grid Search using the pipeline giving the highest score

In [15]:
ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])

param_grid = {
    'countVec__ngram_range': [(1,2), (1,3), (1,4)],
    'countVec__analyzer': ['word','char_wb','char'],
    'countVec__stop_words':[None, 'english'],
    'tfidf__use_idf': [True, False],
    #'model__epochs': [10, ],
    #'model__nparam': [5,10],
    #'model__init': [ 'uniform', 'zeros', 'normal', ], 
    #'model__batch_size':[2, 16, 32],
    #'model__optimizer':['Adam', 'sgd'],
    'model__dropout': [0.5, 0.3, 0]
}

grid = GridSearchCV(estimator=ppl,  
                    n_jobs=-1, 
                    verbose=1,
                    return_train_score=True,
                    cv=cv_set,
                    param_grid=param_grid,
                    scoring='f1')
#grid.get_params

grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best score: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 15.9min finished


Best score: 0.278761 using {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 3), 'countVec__stop_words': 'english', 'model__dropout': 0.3, 'tfidf__use_idf': False}
0.076190 (0.107750) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': True}
0.142715 (0.135916) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': False}
0.060606 (0.085710) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': True}
0.178127 (0.060247) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': False}
0.079125 (0.076075) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0, 'tfidf__use_idf

#### Final prediction

In [18]:
# Definition of the best model
cv = CountVectorizer(max_features=max_f, analyzer='word',ngram_range=(1,3),stop_words='english')
tfidf = TfidfTransformer(use_idf=False)
ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
model__dropout=0.3
model__epochs=10
model__nparam=5
model__optimizer='Adam'

In [20]:
print('-------------Predicting category',y.name,'...-----------')
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy',accuracy_score(y_pred, y_test))
print(classification_report(y_pred, y_test))
print('Test f1 score:',f1_score(y_pred, y_test))
print('')

sec_pred = pd.Series(y_pred.reshape(-1,), name=y.name+'_pred')
display(sec_pred)

-------------Predicting category security_ind ...-----------
Test accuracy 0.9784110535405872
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1141
           1       0.21      0.18      0.19        17

    accuracy                           0.98      1158
   macro avg       0.60      0.58      0.59      1158
weighted avg       0.98      0.98      0.98      1158

Test f1 score: 0.1935483870967742



0       0
1       0
2       0
3       0
4       0
       ..
1153    0
1154    0
1155    0
1156    0
1157    0
Name: security_ind_pred, Length: 1158, dtype: int64

### Fitting category: pricing

#### Finding best resampling method

In [21]:
y = Y.iloc[:,2]
train_index = X_train.index
test_index = X_test.index
y_train = Y.loc[train_index,y.name]
y_test = Y.loc[test_index,y.name]
print(len(X_train), len(X_test), len(y_train), len(y_test))

print('-------------Fitting category',y.name,'...-----------')

ppl = Pipeline([('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (no resampling):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (no resampling):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (no resampling)',accuracy_score(y_pred, y_test))
print('Test f1 (no resampling)',f1_score(y_pred, y_test))

ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (ROS):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (ROS):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (ROS)',accuracy_score(y_pred, y_test))
print('Test f1 (ROS)',f1_score(y_pred, y_test))

4630 1158 4630 1158
-------------Fitting category pricing_ind ...-----------
Mean cross validation accuracy (no resampling): 0.9246223963142924
Mean cross validation f1 (no resampling): 0.8111783779171468
Test accuracy (no resampling) 0.9265975820379966
Test f1 (no resampling) 0.8179871520342613
Mean cross validation accuracy (ROS): 0.9302379009555662
Mean cross validation f1 (ROS): 0.8320852836320208
Test accuracy (ROS) 0.9222797927461139
Test f1 (ROS) 0.8185483870967741


#### Grid Search using the pipeline giving the highest score

In [22]:
ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])

param_grid = {
    'countVec__ngram_range': [(1,2), (1,3), (1,4)],
    'countVec__analyzer': ['word','char_wb','char'],
    'countVec__stop_words':[None, 'english'],
    'tfidf__use_idf': [True, False],
    #'model__epochs': [10, ],
    #'model__nparam': [5,10],
    #'model__init': [ 'uniform', 'zeros', 'normal', ], 
    #'model__batch_size':[2, 16, 32],
    #'model__optimizer':['Adam', 'sgd'],
    'model__dropout': [0.5, 0.3, 0]
}

grid = GridSearchCV(estimator=ppl,  
                    n_jobs=-1, 
                    verbose=1,
                    return_train_score=True,
                    cv=cv_set,
                    param_grid=param_grid,
                    scoring='f1')
#grid.get_params

grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best score: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 12.4min finished


Best score: 0.839882 using {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': False}
0.821799 (0.008813) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': True}
0.839882 (0.005792) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': False}
0.817676 (0.002911) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': True}
0.822763 (0.009244) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': False}
0.797545 (0.009231) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0, 'tfidf__use_idf': Tr

#### Final prediction

In [50]:
# Definition of the best model
cv = CountVectorizer(max_features=max_f, analyzer='word',ngram_range=(1,2))
tfidf = TfidfTransformer(use_idf=False)
ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
model__dropout=0.5
model__epochs=10
model__nparam=5
model__optimizer='Adam'

In [51]:
print('-------------Predicting category',y.name,'...-----------')
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy',accuracy_score(y_pred, y_test))
print(classification_report(y_pred, y_test))
print('Test f1 score:',f1_score(y_pred, y_test))
print('')

pri_pred = pd.Series(y_pred.reshape(-1,), name=y.name+'_pred')
display(pri_pred)

-------------Predicting category pricing_ind ...-----------
Test accuracy 0.9317789291882557
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       913
           1       0.83      0.85      0.84       245

    accuracy                           0.93      1158
   macro avg       0.89      0.90      0.90      1158
weighted avg       0.93      0.93      0.93      1158

Test f1 score: 0.8410462776659959



0       1
1       0
2       0
3       0
4       1
       ..
1153    0
1154    0
1155    0
1156    0
1157    0
Name: pricing_ind_pred, Length: 1158, dtype: int64

### Fitting category: location

#### Finding best resampling method

In [26]:
y = Y.iloc[:,3]
train_index = X_train.index
test_index = X_test.index
y_train = Y.loc[train_index,y.name]
y_test = Y.loc[test_index,y.name]
print(len(X_train), len(X_test), len(y_train), len(y_test))

print('-------------Fitting category',y.name,'...-----------')

ppl = Pipeline([('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (no resampling):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (no resampling):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (no resampling)',accuracy_score(y_pred, y_test))
print('Test f1 (no resampling)',f1_score(y_pred, y_test))

ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (ROS):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (ROS):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (ROS)',accuracy_score(y_pred, y_test))
print('Test f1 (ROS)',f1_score(y_pred, y_test))

4630 1158 4630 1158
-------------Fitting category location_ind ...-----------
Mean cross validation accuracy (no resampling): 0.9883369319574612
Mean cross validation f1 (no resampling): 0.0
Test accuracy (no resampling) 0.9896373056994818
Test f1 (no resampling) 0.0
Mean cross validation accuracy (ROS): 0.9870411754236917
Mean cross validation f1 (ROS): 0.26859219962668235
Test accuracy (ROS) 0.9905008635578584
Test f1 (ROS) 0.4761904761904762


#### Grid Search using the pipeline giving the highest score

In [27]:
ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])

param_grid = {
    'countVec__ngram_range': [(1,2), (1,3), (1,4)],
    'countVec__analyzer': ['word','char_wb','char'],
    'countVec__stop_words':[None, 'english'],
    'tfidf__use_idf': [True, False],
    #'model__epochs': [10, ],
    #'model__nparam': [5,10],
    #'model__init': [ 'uniform', 'zeros', 'normal', ], 
    #'model__batch_size':[2, 16, 32],
    #'model__optimizer':['Adam', 'sgd'],
    'model__dropout': [0.5, 0.3, 0]
}

grid = GridSearchCV(estimator=ppl,  
                    n_jobs=-1, 
                    verbose=1,
                    return_train_score=True,
                    cv=cv_set,
                    param_grid=param_grid,
                    scoring='f1')
#grid.get_params

grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best score: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 18.0min finished


Best score: 0.365538 using {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': 'english', 'model__dropout': 0.5, 'tfidf__use_idf': False}
0.245174 (0.173581) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': True}
0.319359 (0.151399) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': False}
0.254603 (0.078444) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': True}
0.316285 (0.148616) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': False}
0.217094 (0.121542) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0, 'tfidf__use_idf

#### Final prediction

In [52]:
# Definition of the best model
cv = CountVectorizer(max_features=max_f, analyzer='word',ngram_range=(1,2),stop_words='english')
tfidf = TfidfTransformer(use_idf=False)
ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
model__dropout=0.5
model__epochs=10
model__nparam=5
model__optimizer='Adam'

In [53]:
print('-------------Predicting category',y.name,'...-----------')
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy',accuracy_score(y_pred, y_test))
print(classification_report(y_pred, y_test))
print('Test f1 score:',f1_score(y_pred, y_test))
print('')

loc_pred = pd.Series(y_pred.reshape(-1,), name=y.name+'_pred')
display(loc_pred)

-------------Predicting category location_ind ...-----------
Test accuracy 0.9887737478411054
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1149
           1       0.33      0.44      0.38         9

    accuracy                           0.99      1158
   macro avg       0.66      0.72      0.69      1158
weighted avg       0.99      0.99      0.99      1158

Test f1 score: 0.380952380952381



0       0
1       0
2       0
3       0
4       0
       ..
1153    0
1154    0
1155    0
1156    0
1157    1
Name: location_ind_pred, Length: 1158, dtype: int64

### Fitting category: F&B

#### Finding best resampling method

In [30]:
y = Y.iloc[:,4]
train_index = X_train.index
test_index = X_test.index
y_train = Y.loc[train_index,y.name]
y_test = Y.loc[test_index,y.name]
print(len(X_train), len(X_test), len(y_train), len(y_test))

print('-------------Fitting category',y.name,'...-----------')

ppl = Pipeline([('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (no resampling):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (no resampling):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (no resampling)',accuracy_score(y_pred, y_test))
print('Test f1 (no resampling)',f1_score(y_pred, y_test))

ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (ROS):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (ROS):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (ROS)',accuracy_score(y_pred, y_test))
print('Test f1 (ROS)',f1_score(y_pred, y_test))

ppl = Pipeline([('reshape',f_reshape),('resample',rus),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (RUS):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (RUS):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (RUS)',accuracy_score(y_pred, y_test))
print('Test f1 (RUS)',f1_score(y_pred, y_test))

4630 1158 4630 1158
-------------Fitting category fb_ind ...-----------
Mean cross validation accuracy (no resampling): 0.9349888683306525
Mean cross validation f1 (no resampling): 0.17740753596359285
Test accuracy (no resampling) 0.9455958549222798
Test f1 (no resampling) 0.47058823529411764
Mean cross validation accuracy (ROS): 0.9356379358784505
Mean cross validation f1 (ROS): 0.4330462669290212
Test accuracy (ROS) 0.9464594127806563
Test f1 (ROS) 0.5079365079365079
Mean cross validation accuracy (RUS): 0.7840284050651615
Mean cross validation f1 (RUS): 0.37678672923683126
Test accuracy (RUS) 0.9196891191709845
Test f1 (RUS) 0.541871921182266


#### Grid Search using the pipeline giving the highest score

In [31]:
ppl = Pipeline([('reshape',f_reshape),('resample',rus),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])

param_grid = {
    'countVec__ngram_range': [(1,2), (1,3), (1,4)],
    'countVec__analyzer': ['word','char_wb','char'],
    'countVec__stop_words':[None, 'english'],
    'tfidf__use_idf': [True, False],
    #'model__epochs': [10, ],
    #'model__nparam': [5,10],
    #'model__init': [ 'uniform', 'zeros', 'normal', ], 
    #'model__batch_size':[2, 16, 32],
    #'model__optimizer':['Adam', 'sgd'],
    'model__dropout': [0.5, 0.3, 0]
}

grid = GridSearchCV(estimator=ppl,  
                    n_jobs=-1, 
                    verbose=1,
                    return_train_score=True,
                    cv=cv_set,
                    param_grid=param_grid,
                    scoring='f1')
#grid.get_params

grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best score: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  3.8min finished


Best score: 0.466154 using {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 3), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': False}
nan (nan) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': True}
nan (nan) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': False}
nan (nan) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': True}
nan (nan) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': False}
nan (nan) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0, 'tfidf__use_idf': True}
nan (nan) with: {'countVec__analyzer': 'word',

#### Final prediction

In [54]:
# Definition of the best model
cv = CountVectorizer(max_features=max_f, analyzer='word',ngram_range=(1,3))
tfidf = TfidfTransformer(use_idf=False)
ppl = Pipeline([('reshape',f_reshape),('resample',rus),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
model__dropout=0.3
model__epochs=10
model__nparam=5
model__optimizer='Adam'

In [55]:
print('-------------Predicting category',y.name,'...-----------')
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy',accuracy_score(y_pred, y_test))
print(classification_report(y_pred, y_test))
print('Test f1 score:',f1_score(y_pred, y_test))
print('')

fnb_pred = pd.Series(y_pred.reshape(-1,), name=y.name+'_pred')
display(fnb_pred)

-------------Predicting category fb_ind ...-----------
Test accuracy 0.7841105354058722
              precision    recall  f1-score   support

           0       0.78      0.99      0.87       848
           1       0.87      0.23      0.36       310

    accuracy                           0.78      1158
   macro avg       0.82      0.61      0.62      1158
weighted avg       0.80      0.78      0.73      1158

Test f1 score: 0.36224489795918363



0       1
1       0
2       0
3       1
4       0
       ..
1153    0
1154    0
1155    0
1156    0
1157    0
Name: fb_ind_pred, Length: 1158, dtype: int64

### Fitting category: housekeeping

#### Finding best resampling method

In [34]:
y = Y.iloc[:,5]
train_index = X_train.index
test_index = X_test.index
y_train = Y.loc[train_index,y.name]
y_test = Y.loc[test_index,y.name]
print(len(X_train), len(X_test), len(y_train), len(y_test))

print('-------------Fitting category',y.name,'...-----------')

ppl = Pipeline([('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (no resampling):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (no resampling):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (no resampling)',accuracy_score(y_pred, y_test))
print('Test f1 (no resampling)',f1_score(y_pred, y_test))

ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (ROS):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (ROS):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (ROS)',accuracy_score(y_pred, y_test))
print('Test f1 (ROS)',f1_score(y_pred, y_test))

ppl = Pipeline([('reshape',f_reshape),('resample',rus),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (RUS):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (RUS):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (RUS)',accuracy_score(y_pred, y_test))
print('Test f1 (RUS)',f1_score(y_pred, y_test))

4630 1158 4630 1158
-------------Fitting category housekeep_ind ...-----------
Mean cross validation accuracy (no resampling): 0.9686825957553026
Mean cross validation f1 (no resampling): 0.026143790849673196
Test accuracy (no resampling) 0.9715025906735751
Test f1 (no resampling) 0.0
Mean cross validation accuracy (ROS): 0.975378107381153
Mean cross validation f1 (ROS): 0.4441400304414003
Test accuracy (ROS) 0.9810017271157168
Test f1 (ROS) 0.5


ValueError: Error when checking input: expected dense_161_input to have shape (5000,) but got array with shape (3571,)

ValueError: Error when checking input: expected dense_163_input to have shape (5000,) but got array with shape (3293,)

ValueError: Error when checking input: expected dense_165_input to have shape (5000,) but got array with shape (3688,)



Mean cross validation accuracy (RUS): nan
Mean cross validation f1 (RUS): nan
Test accuracy (RUS) 0.5777202072538861
Test f1 (RUS) 0.11252268602540834


#### Grid Search using the pipeline giving the highest score

In [35]:
ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])

param_grid = {
    'countVec__ngram_range': [(1,2), (1,3), (1,4)],
    'countVec__analyzer': ['word','char_wb','char'],
    'countVec__stop_words':[None, 'english'],
    'tfidf__use_idf': [True, False],
    #'model__epochs': [10, ],
    #'model__nparam': [5,10],
    #'model__init': [ 'uniform', 'zeros', 'normal', ], 
    #'model__batch_size':[2, 16, 32],
    #'model__optimizer':['Adam', 'sgd'],
    'model__dropout': [0.5, 0.3, 0]
}

grid = GridSearchCV(estimator=ppl,  
                    n_jobs=-1, 
                    verbose=1,
                    return_train_score=True,
                    cv=cv_set,
                    param_grid=param_grid,
                    scoring='f1')
#grid.get_params

grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best score: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 18.5min finished


Best score: 0.540873 using {'countVec__analyzer': 'char', 'countVec__ngram_range': (1, 4), 'countVec__stop_words': 'english', 'model__dropout': 0.5, 'tfidf__use_idf': False}
0.383559 (0.047498) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': True}
0.502347 (0.003320) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': False}
0.420447 (0.090873) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': True}
0.530074 (0.030256) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': False}
0.367915 (0.043204) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0, 'tfidf__use_idf

#### Final prediction

In [56]:
# Definition of the best model
cv = CountVectorizer(max_features=max_f, analyzer='char',ngram_range=(1,4),stop_words='english')
tfidf = TfidfTransformer(use_idf=False)
ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
model__dropout=0.5
model__epochs=10
model__nparam=5
model__optimizer='Adam'

In [57]:
print('-------------Predicting category',y.name,'...-----------')
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy',accuracy_score(y_pred, y_test))
print(classification_report(y_pred, y_test))
print('Test f1 score:',f1_score(y_pred, y_test))
print('')

hsk_pred = pd.Series(y_pred.reshape(-1,), name=y.name+'_pred')
display(hsk_pred)

-------------Predicting category housekeep_ind ...-----------
Test accuracy 0.9732297063903281
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1126
           1       0.52      0.53      0.52        32

    accuracy                           0.97      1158
   macro avg       0.75      0.76      0.75      1158
weighted avg       0.97      0.97      0.97      1158

Test f1 score: 0.5230769230769231



0       0
1       0
2       0
3       0
4       0
       ..
1153    1
1154    0
1155    0
1156    0
1157    0
Name: housekeep_ind_pred, Length: 1158, dtype: int64

### Fitting category: front office

#### Finding best resampling method

In [38]:
y = Y.iloc[:,6]
train_index = X_train.index
test_index = X_test.index
y_train = Y.loc[train_index,y.name]
y_test = Y.loc[test_index,y.name]
print(len(X_train), len(X_test), len(y_train), len(y_test))

print('-------------Fitting category',y.name,'...-----------')

ppl = Pipeline([('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (no resampling):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (no resampling):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (no resampling)',accuracy_score(y_pred, y_test))
print('Test f1 (no resampling)',f1_score(y_pred, y_test))

ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (ROS):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (ROS):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (ROS)',accuracy_score(y_pred, y_test))
print('Test f1 (ROS)',f1_score(y_pred, y_test))

4630 1158 4630 1158
-------------Fitting category frontoff_ind ...-----------
Mean cross validation accuracy (no resampling): 0.9153341123822892
Mean cross validation f1 (no resampling): 0.7004559735303569
Test accuracy (no resampling) 0.9222797927461139
Test f1 (no resampling) 0.7204968944099378
Mean cross validation accuracy (ROS): 0.9187900228006138
Mean cross validation f1 (ROS): 0.7368633886006998
Test accuracy (ROS) 0.917098445595855
Test f1 (ROS) 0.7272727272727272


#### Grid Search using the pipeline giving the highest score

In [39]:
ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])

param_grid = {
    'countVec__ngram_range': [(1,2), (1,3), (1,4)],
    'countVec__analyzer': ['word','char_wb','char'],
    'countVec__stop_words':[None, 'english'],
    'tfidf__use_idf': [True, False],
    #'model__epochs': [10, ],
    #'model__nparam': [5,10],
    #'model__init': [ 'uniform', 'zeros', 'normal', ], 
    #'model__batch_size':[2, 16, 32],
    #'model__optimizer':['Adam', 'sgd'],
    'model__dropout': [0.5, 0.3, 0]
}

grid = GridSearchCV(estimator=ppl,  
                    n_jobs=-1, 
                    verbose=1,
                    return_train_score=True,
                    cv=cv_set,
                    param_grid=param_grid,
                    scoring='f1')
#grid.get_params

grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best score: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 14.0min finished


Best score: 0.746159 using {'countVec__analyzer': 'char', 'countVec__ngram_range': (1, 4), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': False}
0.671070 (0.013089) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': True}
0.705822 (0.022883) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': False}
0.699471 (0.003628) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': True}
0.723001 (0.014675) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': False}
0.653675 (0.041956) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0, 'tfidf__use_idf': Tr

#### Final prediction

In [40]:
# Definition of the best model
cv = CountVectorizer(max_features=max_f, analyzer='char',ngram_range=(1,4))
tfidf = TfidfTransformer(use_idf=False)
ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
model__dropout=0.5
model__epochs=10
model__nparam=5
model__optimizer='Adam'

In [41]:
print('-------------Predicting category',y.name,'...-----------')
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy',accuracy_score(y_pred, y_test))
print(classification_report(y_pred, y_test))
print('Test f1 score:',f1_score(y_pred, y_test))
print('')

fro_pred = pd.Series(y_pred.reshape(-1,), name=y.name+'_pred')
display(fro_pred)

-------------Predicting category frontoff_ind ...-----------
Test accuracy 0.9101899827288429
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       952
           1       0.77      0.71      0.74       206

    accuracy                           0.91      1158
   macro avg       0.85      0.83      0.84      1158
weighted avg       0.91      0.91      0.91      1158

Test f1 score: 0.7386934673366835



0       0
1       0
2       0
3       0
4       0
       ..
1153    0
1154    0
1155    0
1156    1
1157    0
Name: frontoff_ind_pred, Length: 1158, dtype: int64

### Fitting category: others

#### Finding best resampling method

In [42]:
y = Y.iloc[:,7]
train_index = X_train.index
test_index = X_test.index
y_train = Y.loc[train_index,y.name]
y_test = Y.loc[test_index,y.name]
print(len(X_train), len(X_test), len(y_train), len(y_test))

print('-------------Fitting category',y.name,'...-----------')

ppl = Pipeline([('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (no resampling):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (no resampling):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (no resampling)',accuracy_score(y_pred, y_test))
print('Test f1 (no resampling)',f1_score(y_pred, y_test))

ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (ROS):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (ROS):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (ROS)',accuracy_score(y_pred, y_test))
print('Test f1 (ROS)',f1_score(y_pred, y_test))

ppl = Pipeline([('reshape',f_reshape),('resample',rus),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
val_score = cross_validate(ppl, X_train, y_train, cv=cv_set, scoring=['accuracy','f1'])
print('Mean cross validation accuracy (RUS):', val_score['test_accuracy'].mean())
print('Mean cross validation f1 (RUS):', val_score['test_f1'].mean())
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy (RUS)',accuracy_score(y_pred, y_test))
print('Test f1 (RUS)',f1_score(y_pred, y_test))

4630 1158 4630 1158
-------------Fitting category Others ...-----------
Mean cross validation accuracy (no resampling): 0.8332616966477389
Mean cross validation f1 (no resampling): 0.5026292363329401
Test accuracy (no resampling) 0.8341968911917098
Test f1 (no resampling) 0.5126903553299492
Mean cross validation accuracy (ROS): 0.7984841845226702
Mean cross validation f1 (ROS): 0.5892902460143453
Test accuracy (ROS) 0.7918825561312608
Test f1 (ROS) 0.5512104283054003
Mean cross validation accuracy (RUS): 0.752265090435719
Mean cross validation f1 (RUS): 0.5686930411923621
Test accuracy (RUS) 0.7469775474956822
Test f1 (RUS) 0.5784172661870505


#### Grid Search using the pipeline giving the highest score

In [43]:
ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])

param_grid = {
    'countVec__ngram_range': [(1,2), (1,3), (1,4)],
    'countVec__analyzer': ['word','char_wb','char'],
    'countVec__stop_words':[None, 'english'],
    'tfidf__use_idf': [True, False],
    #'model__epochs': [10, ],
    #'model__nparam': [5,10],
    #'model__init': [ 'uniform', 'zeros', 'normal', ], 
    #'model__batch_size':[2, 16, 32],
    #'model__optimizer':['Adam', 'sgd'],
    'model__dropout': [0.5, 0.3, 0]
}

grid = GridSearchCV(estimator=ppl,  
                    n_jobs=-1, 
                    verbose=1,
                    return_train_score=True,
                    cv=cv_set,
                    param_grid=param_grid,
                    scoring='f1')
#grid.get_params

grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best score: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 12.5min finished


Best score: 0.595358 using {'countVec__analyzer': 'char_wb', 'countVec__ngram_range': (1, 4), 'countVec__stop_words': 'english', 'model__dropout': 0, 'tfidf__use_idf': False}
0.535658 (0.041585) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': True}
0.567991 (0.027723) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.5, 'tfidf__use_idf': False}
0.555181 (0.033671) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': True}
0.554091 (0.026849) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0.3, 'tfidf__use_idf': False}
0.522520 (0.039842) with: {'countVec__analyzer': 'word', 'countVec__ngram_range': (1, 2), 'countVec__stop_words': None, 'model__dropout': 0, 'tfidf__use_id

#### Final prediction

In [58]:
# Definition of the best model
cv = CountVectorizer(max_features=max_f, analyzer='char_wb',ngram_range=(1,4),stop_words='english')
tfidf = TfidfTransformer(use_idf=False)
ppl = Pipeline([('reshape',f_reshape),('resample',ros),('ravel',f_ravel),('countVec',cv),('tfidf',tfidf),('model',model)])
model__dropout=0
model__epochs=10
model__nparam=5
model__optimizer='Adam'

In [59]:
print('-------------Predicting category',y.name,'...-----------')
ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)
print('Test accuracy',accuracy_score(y_pred, y_test))
print(classification_report(y_pred, y_test))
print('Test f1 score:',f1_score(y_pred, y_test))
print('')

oth_pred = pd.Series(y_pred.reshape(-1,), name=y.name+'_pred')
display(oth_pred)

-------------Predicting category Others ...-----------
Test accuracy 0.7832469775474957
              precision    recall  f1-score   support

           0       0.80      0.91      0.85       785
           1       0.74      0.51      0.60       373

    accuracy                           0.78      1158
   macro avg       0.77      0.71      0.73      1158
weighted avg       0.78      0.78      0.77      1158

Test f1 score: 0.6022187004754359



0       0
1       0
2       1
3       0
4       0
       ..
1153    0
1154    1
1155    1
1156    0
1157    1
Name: Others_pred, Length: 1158, dtype: int64

### Combine predictions

In [60]:
df_pred = pd.concat([X_test.reset_index(),fac_pred,sec_pred,pri_pred,loc_pred,fnb_pred,hsk_pred,fro_pred,oth_pred], axis=1)
display(df_pred)
df_pred.to_csv('df_pred.csv')

Unnamed: 0,index,cons,facility_ind_pred,security_ind_pred,pricing_ind_pred,location_ind_pred,fb_ind_pred,housekeep_ind_pred,frontoff_ind_pred,Others_pred
0,5600,breakfast wasnt includ room,0,0,1,0,1,0,0,0
1,3127,delux room fruit snack crowd fridg automat sensor u touch one item pay even freez item,1,0,0,0,0,0,0,0
2,4930,infin pool alway jampack,1,0,0,0,0,0,0,1
3,863,quick learn noth marina bay sand either free inexpens,0,0,0,0,1,0,0,0
4,2997,want use pay extra,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1153,5038,open room got room smell cigarett,1,0,0,0,0,1,0,0
1154,5142,swim pool crowd seem public pool esclus one,1,0,0,0,0,0,0,1
1155,1319,place massiv lot walk room end corridor,0,0,0,0,0,0,0,1
1156,3397,check long queue long flight tiresom,0,0,0,0,0,0,1,0
