## Models(NB, SVM, LR) with ngram

In [1]:
import pickle
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn import svm 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
ptrain = pd.read_csv('ptrain.csv')

In [3]:
def load_data(path):
    data = pd.read_csv(path)
    x = data.reviewText.tolist()
    y = data.sentiment.tolist()
    return x, y

In [4]:
train_x, train_y = load_data('ptrain.csv')
test_x, test_y = load_data('ptest.csv')

In [5]:
tf_ngram_vec = TfidfVectorizer(ngram_range=(1, 3), max_features=40000)
tf_ngram_vec.fit(train_x)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=40000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [6]:
train_features = tf_ngram_vec.transform(train_x)
test_features = tf_ngram_vec.transform(test_x)

In [7]:
start = time.time() 
mnb_model = MultinomialNB()
svm_model = svm.SVC(kernel='rbf', gamma=0.7, C=1.0).fit(train_features, train_y)
lr_model = LogisticRegression(penalty='l2', C=10)
models = [mnb_model, svm_model, lr_model]
model_names = ['Naive Bayes', 'SVM', 'Logistic Regression']
for name, model in zip(model_names, models):
    model.fit(train_features, train_y)
    pred = model.predict(test_features)
    accuracy = metrics.accuracy_score(pred, test_y)
    print('Accuracy of %s: %f' %(name, accuracy))
    print(metrics.classification_report(y_true=test_y, y_pred=pred))
print('Total processing time is: ', time.time()-start)

Accuracy of Naive Bayes: 0.858200
              precision    recall  f1-score   support

         neg       0.85      0.88      0.86     12500
         pos       0.87      0.84      0.86     12500

   micro avg       0.86      0.86      0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000

Accuracy of SVM: 0.889120
              precision    recall  f1-score   support

         neg       0.89      0.89      0.89     12500
         pos       0.89      0.89      0.89     12500

   micro avg       0.89      0.89      0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000





Accuracy of Logistic Regression: 0.884160
              precision    recall  f1-score   support

         neg       0.88      0.89      0.89     12500
         pos       0.89      0.88      0.88     12500

   micro avg       0.88      0.88      0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

Total processing time is:  2299.3191978931427


In [8]:
print('done!')

done!


In [9]:
print('done')

done


In [11]:
print('done')

done


In [12]:
print('done')

done
