In [21]:
import pandas as pd
import re
from sklearn import model_selection, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from imblearn.over_sampling import RandomOverSampler
import numpy as np
from sklearn import ensemble
from numpy import arange

In [22]:
# Load the cleaned review table and replace every missing cell with 0.
data = pd.read_csv('/Users/visenze/Desktop/data_cleaned.csv').fillna(0)

# One binary indicator column per department; each becomes a classification target.
departments = [
    'facility_ind', 'security_ind', 'pricing_ind', 'location_ind',
    'fb_ind', 'housekeep_ind', 'frontoff_ind', 'Others',
]
data = data.astype(dict.fromkeys(departments, int))

# The vectorizers need text input, so every entry of `cons` is coerced to str.
# NOTE(review): because of the fillna(0) above, a missing review ends up as the
# literal text "0" here -- confirm that is intended.
data['cons'] = data['cons'].map(str)

In [23]:
# split data
def _split_reviews(label_column):
    """Return (train_x, test_x, train_y, test_y) for one department indicator.

    The review text in ``data['cons']`` is always the feature column; 20% of
    the rows are held out and the shuffle is seeded with 42, exactly the same
    arguments for every department.
    """
    return model_selection.train_test_split(
        data['cons'],
        data[label_column],
        test_size=0.2,
        random_state=42,
    )


train_x_fac, test_x_fac, train_y_fac, test_y_fac = _split_reviews('facility_ind')
train_x_sec, test_x_sec, train_y_sec, test_y_sec = _split_reviews('security_ind')
train_x_pri, test_x_pri, train_y_pri, test_y_pri = _split_reviews('pricing_ind')
train_x_loc, test_x_loc, train_y_loc, test_y_loc = _split_reviews('location_ind')
train_x_fb, test_x_fb, train_y_fb, test_y_fb = _split_reviews('fb_ind')
train_x_hk, test_x_hk, train_y_hk, test_y_hk = _split_reviews('housekeep_ind')
train_x_ff, test_x_ff, train_y_ff, test_y_ff = _split_reviews('frontoff_ind')
train_x_oth, test_x_oth, train_y_oth, test_y_oth = _split_reviews('Others')

In [24]:
# Bag-of-words counts: one shared CountVectorizer, re-fitted on each
# department's training text before its test text is transformed.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')


def _count_features(train_text, test_text):
    """Fit ``count_vect`` on ``train_text``; return (train_matrix, test_matrix)."""
    train_matrix = count_vect.fit_transform(train_text)
    test_matrix = count_vect.transform(test_text)
    return train_matrix, test_matrix


xtrain_fac_count, xtest_fac_count = _count_features(train_x_fac, test_x_fac)
xtrain_sec_count, xtest_sec_count = _count_features(train_x_sec, test_x_sec)
xtrain_pri_count, xtest_pri_count = _count_features(train_x_pri, test_x_pri)
xtrain_loc_count, xtest_loc_count = _count_features(train_x_loc, test_x_loc)
xtrain_fb_count, xtest_fb_count = _count_features(train_x_fb, test_x_fb)
xtrain_hk_count, xtest_hk_count = _count_features(train_x_hk, test_x_hk)
xtrain_ff_count, xtest_ff_count = _count_features(train_x_ff, test_x_ff)
xtrain_oth_count, xtest_oth_count = _count_features(train_x_oth, test_x_oth)

In [25]:
# Word-level tf-idf: one shared TfidfVectorizer (float32 output, sublinear tf),
# re-fitted on each department's training text before its test text is transformed.
tfidf_vect = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    dtype=np.float32,
    sublinear_tf=True,
)


def _tfidf_features(train_text, test_text):
    """Fit ``tfidf_vect`` on ``train_text``; return (train_matrix, test_matrix)."""
    train_matrix = tfidf_vect.fit_transform(train_text)
    test_matrix = tfidf_vect.transform(test_text)
    return train_matrix, test_matrix


xtrain_fac_tfidf, xtest_fac_tfidf = _tfidf_features(train_x_fac, test_x_fac)
xtrain_sec_tfidf, xtest_sec_tfidf = _tfidf_features(train_x_sec, test_x_sec)
xtrain_pri_tfidf, xtest_pri_tfidf = _tfidf_features(train_x_pri, test_x_pri)
xtrain_loc_tfidf, xtest_loc_tfidf = _tfidf_features(train_x_loc, test_x_loc)
xtrain_fb_tfidf, xtest_fb_tfidf = _tfidf_features(train_x_fb, test_x_fb)
xtrain_hk_tfidf, xtest_hk_tfidf = _tfidf_features(train_x_hk, test_x_hk)
xtrain_ff_tfidf, xtest_ff_tfidf = _tfidf_features(train_x_ff, test_x_ff)
xtrain_oth_tfidf, xtest_oth_tfidf = _tfidf_features(train_x_oth, test_x_oth)

In [29]:
# Balance the facility training data for both word-level representations.
#
# Both matrices are rebuilt from the raw text split and resampled against the
# raw training labels. Previously the first fit_resample call rebound
# `train_y_fac` to the (longer) oversampled labels and the second call then
# paired them with the un-resampled tf-idf matrix, so the row counts differed.
# Working from the raw inputs also makes this cell safe to re-run.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)

# The split keeps the original row index, so the untouched labels can be
# looked up in `data` even if `train_y_fac` was overwritten by an earlier run.
fac_labels_raw = data.loc[train_x_fac.index, 'facility_ind']

xtrain_fac_count, train_y_fac = oversample.fit_resample(
    count_vect.fit_transform(train_x_fac), fac_labels_raw)
xtrain_fac_tfidf, fac_labels_tfidf = oversample.fit_resample(
    tfidf_vect.fit_transform(train_x_fac), fac_labels_raw)

# `train_y_fac` is used with both matrices downstream; the fixed seed is what
# is expected to make the two resampled label vectors identical -- verify it.
assert np.array_equal(np.asarray(train_y_fac), np.asarray(fac_labels_tfidf))

In [64]:
# facility
# Every representation is rebuilt from the raw text split and oversampled
# against the *original* training labels. The labels are looked up in `data`
# through the split's index rather than read from `train_y_fac`: that name used
# to be overwritten by each fit_resample call, and the resulting longer label
# vector no longer matches the row count of a freshly vectorised matrix.
# Nothing the cell reads is overwritten, so it can be re-run safely.
fac_labels_raw = data.loc[train_x_fac.index, 'facility_ind']
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)

tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), dtype=np.float32, sublinear_tf=True)
# token_pattern only applies to analyzer='word', so it is not passed here.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', strip_accents='unicode', ngram_range=(1,6), dtype=np.float32, sublinear_tf=True)

# feature name -> (balanced train matrix, balanced train labels, test matrix)
fac_features = {}
for feature_name, vectorizer in [
    ('count', count_vect),
    ('tfidf', tfidf_vect),
    ('tfidf_ngram', tfidf_vect_ngram),
    ('tfidf_ngram_chars', tfidf_vect_ngram_chars),
]:
    train_matrix = vectorizer.fit_transform(train_x_fac)
    test_matrix = vectorizer.transform(test_x_fac)
    balanced_matrix, balanced_labels = oversample.fit_resample(train_matrix, fac_labels_raw)
    fac_features[feature_name] = (balanced_matrix, balanced_labels, test_matrix)

# (report label, feature set, classifier) -- hyper-parameters as tuned per model.
fac_experiments = [
    ("NB, Count Vectors, accuracy: ", 'count', naive_bayes.MultinomialNB(alpha=3)),
    ("NB, WordLevel TF-IDF: ", 'tfidf', naive_bayes.MultinomialNB(alpha=3)),
    ("NB, N-Gram Vectors: ", 'tfidf_ngram', naive_bayes.MultinomialNB()),
    ("NB, CharLevel Vectors: ", 'tfidf_ngram_chars', naive_bayes.MultinomialNB()),
    ("RF, Count Vectors, accuracy: ", 'count', ensemble.RandomForestClassifier(n_estimators=300,random_state=42)),
    ("RF, WordLevel TF-IDF: ", 'tfidf', ensemble.RandomForestClassifier(n_estimators=300,random_state=42)),
]
for report_label, feature_name, classifier in fac_experiments:
    balanced_matrix, balanced_labels, test_matrix = fac_features[feature_name]
    classifier.fit(balanced_matrix, balanced_labels)
    predictions = classifier.predict(test_matrix)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    accuracy = metrics.accuracy_score(test_y_fac, predictions)
    recall = metrics.recall_score(test_y_fac, predictions)
    precision = metrics.precision_score(test_y_fac, predictions)
    f1 = metrics.f1_score(test_y_fac, predictions)
    print(report_label, accuracy, 'recall: ', recall,'precision: ',precision,'f1',f1)

NB, Count Vectors, accuracy:  0.7884283246977547 recall:  0.82 precision:  0.654690618762475 f1 0.728079911209767
NB, WordLevel TF-IDF:  0.7970639032815199 recall:  0.795 precision:  0.6751592356687898 f1 0.730195177956372
NB, N-Gram Vectors:  0.7763385146804835 recall:  0.405 precision:  0.8852459016393442 f1 0.5557461406518012
NB, CharLevel Vectors:  0.7806563039723662 recall:  0.825 precision:  0.642023346303502 f1 0.7221006564551422
RF, Count Vectors, accuracy:  0.8393782383419689 recall:  0.7625 precision:  0.7702020202020202 f1 0.7663316582914572
RF, WordLevel TF-IDF:  0.8350604490500864 recall:  0.7725 precision:  0.7555012224938875 f1 0.7639060568603213


In [45]:
# security
# Every representation is rebuilt from the raw text split and oversampled
# against the *original* training labels. The labels are looked up in `data`
# through the split's index rather than read from `train_y_sec`: that name used
# to be overwritten by each fit_resample call, and the resulting longer label
# vector no longer matches the row count of a freshly vectorised matrix.
# Nothing the cell reads is overwritten, so it can be re-run safely.
sec_labels_raw = data.loc[train_x_sec.index, 'security_ind']
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)

tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), dtype=np.float32, sublinear_tf=True)
# token_pattern only applies to analyzer='word', so it is not passed here.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', strip_accents='unicode', ngram_range=(1,3), dtype=np.float32, sublinear_tf=True)

# feature name -> (balanced train matrix, balanced train labels, test matrix)
sec_features = {}
for feature_name, vectorizer in [
    ('count', count_vect),
    ('tfidf', tfidf_vect),
    ('tfidf_ngram', tfidf_vect_ngram),
    ('tfidf_ngram_chars', tfidf_vect_ngram_chars),
]:
    train_matrix = vectorizer.fit_transform(train_x_sec)
    test_matrix = vectorizer.transform(test_x_sec)
    balanced_matrix, balanced_labels = oversample.fit_resample(train_matrix, sec_labels_raw)
    sec_features[feature_name] = (balanced_matrix, balanced_labels, test_matrix)

# (report label, feature set, classifier) -- hyper-parameters as tuned per model.
sec_experiments = [
    ("NB, Count Vectors, accuracy: ", 'count', naive_bayes.MultinomialNB()),
    ("NB, WordLevel TF-IDF: ", 'tfidf', naive_bayes.MultinomialNB()),
    ("NB, N-Gram Vectors: ", 'tfidf_ngram', naive_bayes.MultinomialNB()),
    ("NB, CharLevel Vectors: ", 'tfidf_ngram_chars', naive_bayes.MultinomialNB()),
    ("RF, Count Vectors, accuracy: ", 'count', ensemble.RandomForestClassifier(n_estimators=400,random_state=42)),
    ("RF, WordLevel TF-IDF: ", 'tfidf', ensemble.RandomForestClassifier(n_estimators=300,random_state=42)),
]
for report_label, feature_name, classifier in sec_experiments:
    balanced_matrix, balanced_labels, test_matrix = sec_features[feature_name]
    classifier.fit(balanced_matrix, balanced_labels)
    predictions = classifier.predict(test_matrix)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    accuracy = metrics.accuracy_score(test_y_sec, predictions)
    recall = metrics.recall_score(test_y_sec, predictions)
    precision = metrics.precision_score(test_y_sec, predictions)
    f1 = metrics.f1_score(test_y_sec, predictions)
    print(report_label, accuracy, 'recall: ', recall,'precision: ',precision,'f1',f1)

NB, Count Vectors, accuracy:  0.9438687392055267 recall:  0.6470588235294118 precision:  0.15714285714285714 f1 0.2528735632183908
NB, WordLevel TF-IDF:  0.9412780656303973 recall:  0.7647058823529411 precision:  0.16883116883116883 f1 0.2765957446808511


  _warn_prf(average, modifier, msg_start, len(result))


NB, N-Gram Vectors:  0.9853195164075993 recall:  0.0 precision:  0.0 f1 0.0
NB, CharLevel Vectors:  0.9404145077720207 recall:  0.5882352941176471 precision:  0.1388888888888889 f1 0.22471910112359547
RF, Count Vectors, accuracy:  0.9758203799654577 recall:  0.17647058823529413 precision:  0.17647058823529413 f1 0.17647058823529413
RF, WordLevel TF-IDF:  0.9758203799654577 recall:  0.058823529411764705 precision:  0.07692307692307693 f1 0.06666666666666667


In [69]:
# price
# Every representation is rebuilt from the raw text split and oversampled
# against the *original* training labels. The labels are looked up in `data`
# through the split's index rather than read from `train_y_pri`: that name used
# to be overwritten by each fit_resample call, and the resulting longer label
# vector no longer matches the row count of a freshly vectorised matrix.
# Nothing the cell reads is overwritten, so it can be re-run safely.
pri_labels_raw = data.loc[train_x_pri.index, 'pricing_ind']
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)

tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), dtype=np.float32, sublinear_tf=True)
# token_pattern only applies to analyzer='word', so it is not passed here.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', strip_accents='unicode', ngram_range=(1,7), dtype=np.float32, sublinear_tf=True)

# feature name -> (balanced train matrix, balanced train labels, test matrix)
pri_features = {}
for feature_name, vectorizer in [
    ('count', count_vect),
    ('tfidf', tfidf_vect),
    ('tfidf_ngram', tfidf_vect_ngram),
    ('tfidf_ngram_chars', tfidf_vect_ngram_chars),
]:
    train_matrix = vectorizer.fit_transform(train_x_pri)
    test_matrix = vectorizer.transform(test_x_pri)
    balanced_matrix, balanced_labels = oversample.fit_resample(train_matrix, pri_labels_raw)
    pri_features[feature_name] = (balanced_matrix, balanced_labels, test_matrix)

# (report label, feature set, classifier) -- hyper-parameters as tuned per model.
pri_experiments = [
    ("NB, Count Vectors, accuracy: ", 'count', naive_bayes.MultinomialNB()),
    ("NB, WordLevel TF-IDF: ", 'tfidf', naive_bayes.MultinomialNB()),
    ("NB, N-Gram Vectors: ", 'tfidf_ngram', naive_bayes.MultinomialNB()),
    ("NB, CharLevel Vectors: ", 'tfidf_ngram_chars', naive_bayes.MultinomialNB()),
    ("RF, Count Vectors, accuracy: ", 'count', ensemble.RandomForestClassifier(n_estimators=150,random_state=42)),
    ("RF, WordLevel TF-IDF: ", 'tfidf', ensemble.RandomForestClassifier(n_estimators=300,random_state=42)),
]
for report_label, feature_name, classifier in pri_experiments:
    balanced_matrix, balanced_labels, test_matrix = pri_features[feature_name]
    classifier.fit(balanced_matrix, balanced_labels)
    predictions = classifier.predict(test_matrix)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    accuracy = metrics.accuracy_score(test_y_pri, predictions)
    recall = metrics.recall_score(test_y_pri, predictions)
    precision = metrics.precision_score(test_y_pri, predictions)
    f1 = metrics.f1_score(test_y_pri, predictions)
    print(report_label, accuracy, 'recall: ', recall,'precision: ',precision,'f1',f1)

NB, Count Vectors, accuracy:  0.8765112262521589 recall:  0.9109311740890689 precision:  0.6502890173410405 f1 0.7588532883642497
NB, WordLevel TF-IDF:  0.8842832469775475 recall:  0.902834008097166 precision:  0.6696696696696697 f1 0.7689655172413793
NB, N-Gram Vectors:  0.8920552677029361 recall:  0.4979757085020243 precision:  0.9919354838709677 f1 0.6630727762803235
NB, CharLevel Vectors:  0.886873920552677 recall:  0.9271255060728745 precision:  0.6695906432748538 f1 0.7775891341256368
RF, Count Vectors, accuracy:  0.9395509499136442 recall:  0.8947368421052632 precision:  0.8339622641509434 f1 0.86328125
RF, WordLevel TF-IDF:  0.9473229706390328 recall:  0.9068825910931174 precision:  0.8549618320610687 f1 0.8801571709233792


In [77]:
# location
# Every representation is rebuilt from the raw text split and oversampled
# against the *original* training labels. The labels are looked up in `data`
# through the split's index rather than read from `train_y_loc`: that name used
# to be overwritten by each fit_resample call, and the resulting longer label
# vector no longer matches the row count of a freshly vectorised matrix.
# Nothing the cell reads is overwritten, so it can be re-run safely.
loc_labels_raw = data.loc[train_x_loc.index, 'location_ind']
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)

tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), dtype=np.float32, sublinear_tf=True)
# token_pattern only applies to analyzer='word', so it is not passed here.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', strip_accents='unicode', ngram_range=(1,4), dtype=np.float32, sublinear_tf=True)

# feature name -> (balanced train matrix, balanced train labels, test matrix)
loc_features = {}
for feature_name, vectorizer in [
    ('count', count_vect),
    ('tfidf', tfidf_vect),
    ('tfidf_ngram', tfidf_vect_ngram),
    ('tfidf_ngram_chars', tfidf_vect_ngram_chars),
]:
    train_matrix = vectorizer.fit_transform(train_x_loc)
    test_matrix = vectorizer.transform(test_x_loc)
    balanced_matrix, balanced_labels = oversample.fit_resample(train_matrix, loc_labels_raw)
    loc_features[feature_name] = (balanced_matrix, balanced_labels, test_matrix)

# (report label, feature set, classifier) -- hyper-parameters as tuned per model.
loc_experiments = [
    ("NB, Count Vectors, accuracy: ", 'count', naive_bayes.MultinomialNB()),
    ("NB, WordLevel TF-IDF: ", 'tfidf', naive_bayes.MultinomialNB()),
    ("NB, N-Gram Vectors: ", 'tfidf_ngram', naive_bayes.MultinomialNB()),
    ("NB, CharLevel Vectors: ", 'tfidf_ngram_chars', naive_bayes.MultinomialNB()),
    ("RF, Count Vectors, accuracy: ", 'count', ensemble.RandomForestClassifier(n_estimators=300,random_state=42)),
    ("RF, WordLevel TF-IDF: ", 'tfidf', ensemble.RandomForestClassifier(n_estimators=100,random_state=42)),
]
for report_label, feature_name, classifier in loc_experiments:
    balanced_matrix, balanced_labels, test_matrix = loc_features[feature_name]
    classifier.fit(balanced_matrix, balanced_labels)
    predictions = classifier.predict(test_matrix)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    accuracy = metrics.accuracy_score(test_y_loc, predictions)
    recall = metrics.recall_score(test_y_loc, predictions)
    precision = metrics.precision_score(test_y_loc, predictions)
    f1 = metrics.f1_score(test_y_loc, predictions)
    print(report_label, accuracy, 'recall: ', recall,'precision: ',precision,'f1',f1)

NB, Count Vectors, accuracy:  0.9395509499136442 recall:  0.75 precision:  0.08108108108108109 f1 0.14634146341463414
NB, WordLevel TF-IDF:  0.9516407599309153 recall:  0.75 precision:  0.1 f1 0.17647058823529416


  _warn_prf(average, modifier, msg_start, len(result))


NB, N-Gram Vectors:  0.9930915371329879 recall:  0.0 precision:  0.0 f1 0.0
NB, CharLevel Vectors:  0.9620034542314335 recall:  0.75 precision:  0.125 f1 0.21428571428571427
RF, Count Vectors, accuracy:  0.9939550949913645 recall:  0.25 precision:  0.6666666666666666 f1 0.36363636363636365
RF, WordLevel TF-IDF:  0.9939550949913645 recall:  0.25 precision:  0.6666666666666666 f1 0.36363636363636365


In [83]:
# food and beverage
# Every representation is rebuilt from the raw text split and oversampled
# against the *original* training labels. The labels are looked up in `data`
# through the split's index rather than read from `train_y_fb`: that name used
# to be overwritten by each fit_resample call, and the resulting longer label
# vector no longer matches the row count of a freshly vectorised matrix.
# Nothing the cell reads is overwritten, so it can be re-run safely.
fb_labels_raw = data.loc[train_x_fb.index, 'fb_ind']
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)

tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), dtype=np.float32, sublinear_tf=True)
# token_pattern only applies to analyzer='word', so it is not passed here.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', strip_accents='unicode', ngram_range=(1,4), dtype=np.float32, sublinear_tf=True)

# feature name -> (balanced train matrix, balanced train labels, test matrix)
fb_features = {}
for feature_name, vectorizer in [
    ('count', count_vect),
    ('tfidf', tfidf_vect),
    ('tfidf_ngram', tfidf_vect_ngram),
    ('tfidf_ngram_chars', tfidf_vect_ngram_chars),
]:
    train_matrix = vectorizer.fit_transform(train_x_fb)
    test_matrix = vectorizer.transform(test_x_fb)
    balanced_matrix, balanced_labels = oversample.fit_resample(train_matrix, fb_labels_raw)
    fb_features[feature_name] = (balanced_matrix, balanced_labels, test_matrix)

# (report label, feature set, classifier) -- hyper-parameters as tuned per model.
fb_experiments = [
    ("NB, Count Vectors, accuracy: ", 'count', naive_bayes.MultinomialNB()),
    ("NB, WordLevel TF-IDF: ", 'tfidf', naive_bayes.MultinomialNB(alpha=3)),
    ("NB, N-Gram Vectors: ", 'tfidf_ngram', naive_bayes.MultinomialNB()),
    ("NB, CharLevel Vectors: ", 'tfidf_ngram_chars', naive_bayes.MultinomialNB(alpha=4)),
    ("RF, Count Vectors, accuracy: ", 'count', ensemble.RandomForestClassifier(n_estimators=200,random_state=42)),
    ("RF, WordLevel TF-IDF: ", 'tfidf', ensemble.RandomForestClassifier(n_estimators=200,random_state=42)),
]
for report_label, feature_name, classifier in fb_experiments:
    balanced_matrix, balanced_labels, test_matrix = fb_features[feature_name]
    classifier.fit(balanced_matrix, balanced_labels)
    predictions = classifier.predict(test_matrix)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    accuracy = metrics.accuracy_score(test_y_fb, predictions)
    recall = metrics.recall_score(test_y_fb, predictions)
    precision = metrics.precision_score(test_y_fb, predictions)
    f1 = metrics.f1_score(test_y_fb, predictions)
    print(report_label, accuracy, 'recall: ', recall,'precision: ',precision,'f1',f1)

NB, Count Vectors, accuracy:  0.8549222797927462 recall:  0.7319587628865979 precision:  0.3333333333333333 f1 0.4580645161290322
NB, WordLevel TF-IDF:  0.8609671848013817 recall:  0.7731958762886598 precision:  0.35046728971962615 f1 0.48231511254019294
NB, N-Gram Vectors:  0.9162348877374784 recall:  0.0 precision:  0.0 f1 0.0


  _warn_prf(average, modifier, msg_start, len(result))


NB, CharLevel Vectors:  0.8557858376511226 recall:  0.8762886597938144 precision:  0.3541666666666667 f1 0.5044510385756676
RF, Count Vectors, accuracy:  0.9335060449050087 recall:  0.44329896907216493 precision:  0.6515151515151515 f1 0.52760736196319
RF, WordLevel TF-IDF:  0.9369602763385146 recall:  0.4639175257731959 precision:  0.6818181818181818 f1 0.5521472392638036


In [90]:
# housekeeping
# oversampling
oversample = RandomOverSampler(sampling_strategy='minority',random_state=42)
xtrain_hk_count, train_y_hk = oversample.fit_resample(xtrain_hk_count, train_y_hk)
xtrain_hk_tfidf, train_y_hk = oversample.fit_resample(xtrain_hk_tfidf, train_y_hk)

# Naive Bayes on Count Vectors
classifier = naive_bayes.MultinomialNB()
classifier.fit(xtrain_hk_count, train_y_hk)
predictions = classifier.predict(xtest_hk_count)
accuracy = metrics.accuracy_score(predictions, test_y_hk)
recall = metrics.recall_score(test_y_hk, predictions)
precision = metrics.precision_score(test_y_hk, predictions)
f1 = metrics.f1_score(test_y_hk, predictions)
print ("NB, Count Vectors, accuracy: ", accuracy, 'recall: ', recall,'precision: ',precision,'f1',f1)

# Naive Bayes on Word Level TF IDF Vectors
classifier = naive_bayes.MultinomialNB(alpha=4)
classifier.fit(xtrain_hk_tfidf, train_y_hk)
predictions = classifier.predict(xtest_hk_tfidf)
accuracy = metrics.accuracy_score(predictions, test_y_hk)
recall = metrics.recall_score(test_y_hk, predictions)
precision = metrics.precision_score(test_y_hk, predictions)
f1 = metrics.f1_score(test_y_hk, predictions)
print ("NB, WordLevel TF-IDF: ", accuracy, 'recall: ', recall,'precision: ',precision,'f1',f1)

# Naive Bayes on Ngram Level TF IDF Vectors
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), dtype=np.float32, sublinear_tf=True)
xtrain_hk_tfidf_ngram =  tfidf_vect_ngram.fit_transform(train_x_hk)
xtest_hk_tfidf_ngram =  tfidf_vect_ngram.transform(test_x_hk)
xtrain_hk_tfidf_ngram, train_y_hk = oversample.fit_resample(xtrain_hk_tfidf_ngram, train_y_hk)

classifier = naive_bayes.MultinomialNB()
classifier.fit(xtrain_hk_tfidf_ngram, train_y_hk)
predictions = classifier.predict(xtest_hk_tfidf_ngram)
accuracy = metrics.accuracy_score(predictions, test_y_hk)
recall = metrics.recall_score(test_y_hk, predictions)
precision = metrics.precision_score(test_y_hk, predictions)
f1 = metrics.f1_score(test_y_hk, predictions)
print ("NB, N-Gram Vectors: ", accuracy, 'recall: ', recall,'precision: ',precision,'f1',f1)

# Naive Bayes on Character Level TF IDF Vectors
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char',strip_accents='unicode',token_pattern=r'\w{1,}', ngram_range=(1,6),dtype=np.float32, sublinear_tf=True)
xtrain_hk_tfidf_ngram_chars =  tfidf_vect_ngram_chars.fit_transform(train_x_hk) 
xtest_hk_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_x_hk) 
xtrain_hk_tfidf_ngram_chars, train_y_hk = oversample.fit_resample(xtrain_hk_tfidf_ngram_chars, train_y_hk)

classifier = naive_bayes.MultinomialNB()
classifier.fit(xtrain_hk_tfidf_ngram_chars, train_y_hk)
predictions = classifier.predict(xtest_hk_tfidf_ngram_chars)
accuracy = metrics.accuracy_score(predictions, test_y_hk)
recall = metrics.recall_score(test_y_hk, predictions)
precision = metrics.precision_score(test_y_hk, predictions)
f1 = metrics.f1_score(test_y_hk, predictions)
print ("NB, CharLevel Vectors: ", accuracy, 'recall: ', recall,'precision: ',precision,'f1',f1)

# RF on Count Vectors
classifier = ensemble.RandomForestClassifier(n_estimators=100,random_state=42)
classifier.fit(xtrain_hk_count, train_y_hk)
predictions = classifier.predict(xtest_hk_count)
accuracy = metrics.accuracy_score(predictions, test_y_hk)
recall = metrics.recall_score(test_y_hk, predictions)
precision = metrics.precision_score(test_y_hk, predictions)
f1 = metrics.f1_score(test_y_hk, predictions)
print ("RF, Count Vectors, accuracy: ", accuracy, 'recall: ', recall,'precision: ',precision,'f1',f1)

# Random forest (100 trees, fixed seed) on the word-level TF-IDF features.
classifier = ensemble.RandomForestClassifier(
    random_state=42,
    n_estimators=100,
).fit(xtrain_hk_tfidf, train_y_hk)
predictions = classifier.predict(xtest_hk_tfidf)

# Score the held-out split and print the metrics on one line.
accuracy = metrics.accuracy_score(predictions, test_y_hk)
recall, precision, f1 = (
    scorer(test_y_hk, predictions)
    for scorer in (metrics.recall_score, metrics.precision_score, metrics.f1_score)
)
print("RF, WordLevel TF-IDF: ", accuracy, 'recall: ', recall, 'precision: ', precision, 'f1', f1)

NB, Count Vectors, accuracy:  0.9291882556131261 recall:  0.6904761904761905 precision:  0.29591836734693877 f1 0.41428571428571426
NB, WordLevel TF-IDF:  0.885146804835924 recall:  0.7619047619047619 precision:  0.2064516129032258 f1 0.32487309644670054
NB, N-Gram Vectors:  0.9637305699481865 recall:  0.0 precision:  0.0 f1 0.0


  _warn_prf(average, modifier, msg_start, len(result))


NB, CharLevel Vectors:  0.8972366148531952 recall:  0.7380952380952381 precision:  0.22302158273381295 f1 0.3425414364640884
RF, Count Vectors, accuracy:  0.9749568221070811 recall:  0.35714285714285715 precision:  0.8823529411764706 f1 0.5084745762711864
RF, WordLevel TF-IDF:  0.9732297063903281 recall:  0.2857142857142857 precision:  0.9230769230769231 f1 0.43636363636363634


In [95]:
# frontoff
# Compare Naive Bayes and Random Forest classifiers for the `frontoff_ind`
# label on four text representations, after randomly oversampling the minority
# class of the training split.
#
# Every feature matrix is resampled against the ORIGINAL training labels and
# the result is stored under new `*_bal` names. Previously `train_y_ff` was
# overwritten by each fit_resample call, so later calls paired un-resampled
# matrices with already-resampled labels (a length mismatch), and re-running
# the cell resampled data that was already resampled.
# NOTE(review): any later cell that expected `xtrain_ff_count`,
# `xtrain_ff_tfidf` or `train_y_ff` to be oversampled must use the `*_bal`
# variables instead.

# oversampling
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
xtrain_ff_count_bal, train_y_ff_count_bal = oversample.fit_resample(xtrain_ff_count, train_y_ff)
xtrain_ff_tfidf_bal, train_y_ff_tfidf_bal = oversample.fit_resample(xtrain_ff_tfidf, train_y_ff)

# Word n-gram (1-2) TF-IDF features
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), dtype=np.float32, sublinear_tf=True)
xtrain_ff_tfidf_ngram = tfidf_vect_ngram.fit_transform(train_x_ff)
xtest_ff_tfidf_ngram = tfidf_vect_ngram.transform(test_x_ff)
xtrain_ff_tfidf_ngram_bal, train_y_ff_ngram_bal = oversample.fit_resample(xtrain_ff_tfidf_ngram, train_y_ff)

# Character n-gram (1-5) TF-IDF features.
# token_pattern is omitted: scikit-learn only uses it when analyzer == 'word'.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', strip_accents='unicode', ngram_range=(1,5), dtype=np.float32, sublinear_tf=True)
xtrain_ff_tfidf_ngram_chars = tfidf_vect_ngram_chars.fit_transform(train_x_ff)
xtest_ff_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(test_x_ff)
xtrain_ff_tfidf_ngram_chars_bal, train_y_ff_chars_bal = oversample.fit_resample(xtrain_ff_tfidf_ngram_chars, train_y_ff)

# (printed label, estimator, training features, training labels, test features)
# Hyper-parameters are the ones previously hard-coded in each stanza.
experiments_ff = [
    ("NB, Count Vectors, accuracy: ", naive_bayes.MultinomialNB(alpha=8),
     xtrain_ff_count_bal, train_y_ff_count_bal, xtest_ff_count),
    ("NB, WordLevel TF-IDF: ", naive_bayes.MultinomialNB(alpha=4),
     xtrain_ff_tfidf_bal, train_y_ff_tfidf_bal, xtest_ff_tfidf),
    ("NB, N-Gram Vectors: ", naive_bayes.MultinomialNB(),
     xtrain_ff_tfidf_ngram_bal, train_y_ff_ngram_bal, xtest_ff_tfidf_ngram),
    ("NB, CharLevel Vectors: ", naive_bayes.MultinomialNB(),
     xtrain_ff_tfidf_ngram_chars_bal, train_y_ff_chars_bal, xtest_ff_tfidf_ngram_chars),
    ("RF, Count Vectors, accuracy: ", ensemble.RandomForestClassifier(n_estimators=200, random_state=42),
     xtrain_ff_count_bal, train_y_ff_count_bal, xtest_ff_count),
    ("RF, WordLevel TF-IDF: ", ensemble.RandomForestClassifier(n_estimators=100, random_state=42),
     xtrain_ff_tfidf_bal, train_y_ff_tfidf_bal, xtest_ff_tfidf),
]

# Fit each model on its balanced training set and score it on the untouched
# test split. Metrics take (y_true, y_pred).
for label, classifier, x_fit, y_fit, x_eval in experiments_ff:
    classifier.fit(x_fit, y_fit)
    predictions = classifier.predict(x_eval)
    accuracy = metrics.accuracy_score(test_y_ff, predictions)
    recall = metrics.recall_score(test_y_ff, predictions)
    precision = metrics.precision_score(test_y_ff, predictions)
    f1 = metrics.f1_score(test_y_ff, predictions)
    print(label, accuracy, 'recall: ', recall, 'precision: ', precision, 'f1', f1)

NB, Count Vectors, accuracy:  0.8488773747841105 recall:  0.8813559322033898 precision:  0.5032258064516129 f1 0.6406570841889117
NB, WordLevel TF-IDF:  0.8566493955094991 recall:  0.8813559322033898 precision:  0.5182724252491694 f1 0.6527196652719665
NB, N-Gram Vectors:  0.8713298791018999 recall:  0.15819209039548024 precision:  1.0 f1 0.2731707317073171
NB, CharLevel Vectors:  0.8592400690846287 recall:  0.864406779661017 precision:  0.523972602739726 f1 0.6524520255863538
RF, Count Vectors, accuracy:  0.9300518134715026 recall:  0.7796610169491526 precision:  0.7666666666666667 f1 0.773109243697479
RF, WordLevel TF-IDF:  0.9335060449050087 recall:  0.7740112994350282 precision:  0.7873563218390804 f1 0.7806267806267806


In [101]:
# others
# Compare Naive Bayes and Random Forest classifiers for the `Others` label on
# four text representations, after randomly oversampling the minority class of
# the training split.
#
# Every feature matrix is resampled against the ORIGINAL training labels and
# the result is stored under new `*_bal` names. Previously `train_y_oth` was
# overwritten by each fit_resample call, so later calls paired un-resampled
# matrices with already-resampled labels (a length mismatch), and re-running
# the cell resampled data that was already resampled.
# NOTE(review): any later cell that expected `xtrain_oth_count`,
# `xtrain_oth_tfidf` or `train_y_oth` to be oversampled must use the `*_bal`
# variables instead.

# oversampling
# random_state is fixed so the duplicated rows, and therefore the scores, are
# reproducible across runs.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
xtrain_oth_count_bal, train_y_oth_count_bal = oversample.fit_resample(xtrain_oth_count, train_y_oth)
xtrain_oth_tfidf_bal, train_y_oth_tfidf_bal = oversample.fit_resample(xtrain_oth_tfidf, train_y_oth)

# Word n-gram (1-2) TF-IDF features
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), dtype=np.float32, sublinear_tf=True)
xtrain_oth_tfidf_ngram = tfidf_vect_ngram.fit_transform(train_x_oth)
xtest_oth_tfidf_ngram = tfidf_vect_ngram.transform(test_x_oth)
xtrain_oth_tfidf_ngram_bal, train_y_oth_ngram_bal = oversample.fit_resample(xtrain_oth_tfidf_ngram, train_y_oth)

# Character n-gram (1-4) TF-IDF features.
# token_pattern is omitted: scikit-learn only uses it when analyzer == 'word'.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', strip_accents='unicode', ngram_range=(1,4), dtype=np.float32, sublinear_tf=True)
xtrain_oth_tfidf_ngram_chars = tfidf_vect_ngram_chars.fit_transform(train_x_oth)
xtest_oth_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(test_x_oth)
xtrain_oth_tfidf_ngram_chars_bal, train_y_oth_chars_bal = oversample.fit_resample(xtrain_oth_tfidf_ngram_chars, train_y_oth)

# (printed label, estimator, training features, training labels, test features)
# Hyper-parameters are the ones previously hard-coded in each stanza.
experiments_oth = [
    ("NB, Count Vectors, accuracy: ", naive_bayes.MultinomialNB(alpha=2),
     xtrain_oth_count_bal, train_y_oth_count_bal, xtest_oth_count),
    ("NB, WordLevel TF-IDF: ", naive_bayes.MultinomialNB(),
     xtrain_oth_tfidf_bal, train_y_oth_tfidf_bal, xtest_oth_tfidf),
    ("NB, N-Gram Vectors: ", naive_bayes.MultinomialNB(),
     xtrain_oth_tfidf_ngram_bal, train_y_oth_ngram_bal, xtest_oth_tfidf_ngram),
    ("NB, CharLevel Vectors: ", naive_bayes.MultinomialNB(),
     xtrain_oth_tfidf_ngram_chars_bal, train_y_oth_chars_bal, xtest_oth_tfidf_ngram_chars),
    ("RF, Count Vectors, accuracy: ", ensemble.RandomForestClassifier(n_estimators=100, random_state=42),
     xtrain_oth_count_bal, train_y_oth_count_bal, xtest_oth_count),
    ("RF, WordLevel TF-IDF: ", ensemble.RandomForestClassifier(n_estimators=100, random_state=42),
     xtrain_oth_tfidf_bal, train_y_oth_tfidf_bal, xtest_oth_tfidf),
]

# Fit each model on its balanced training set and score it on the untouched
# test split. Metrics take (y_true, y_pred).
for label, classifier, x_fit, y_fit, x_eval in experiments_oth:
    classifier.fit(x_fit, y_fit)
    predictions = classifier.predict(x_eval)
    accuracy = metrics.accuracy_score(test_y_oth, predictions)
    recall = metrics.recall_score(test_y_oth, predictions)
    precision = metrics.precision_score(test_y_oth, predictions)
    f1 = metrics.f1_score(test_y_oth, predictions)
    print(label, accuracy, 'recall: ', recall, 'precision: ', precision, 'f1', f1)

NB, Count Vectors, accuracy:  0.7728842832469776 recall:  0.7251908396946565 precision:  0.49868766404199477 f1 0.5909797822706065
NB, WordLevel TF-IDF:  0.7754749568221071 recall:  0.7175572519083969 precision:  0.5026737967914439 f1 0.5911949685534592
NB, N-Gram Vectors:  0.7797927461139896 recall:  0.026717557251908396 precision:  1.0 f1 0.0520446096654275
NB, CharLevel Vectors:  0.7806563039723662 recall:  0.7175572519083969 precision:  0.5108695652173914 f1 0.5968253968253969
RF, Count Vectors, accuracy:  0.803972366148532 recall:  0.6526717557251909 precision:  0.5570032573289903 f1 0.6010544815465729
RF, WordLevel TF-IDF:  0.8143350604490501 recall:  0.6106870229007634 precision:  0.5860805860805861 f1 0.5981308411214954
