In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [27]:
def preprocess_data(train, test, vectorizer):
    """Turn the ``text`` column of both splits into feature matrices.

    The vectorizer is fitted on the training split only and then reused,
    unchanged, to transform the test split. The shape of each resulting
    matrix is printed.

    Parameters
    ----------
    train, test : mapping-like (e.g. pandas.DataFrame)
        Must provide ``'text'`` and ``'label'`` entries.
    vectorizer : object
        Anything exposing ``fit_transform`` and ``transform``.

    Returns
    -------
    tuple
        ``(X_tr, y_tr, X_ts, y_ts)``: train features, ``train['label']``,
        test features, ``test['label']``.
    """
    # Fit on train only; the test split is encoded with the already-fitted vectorizer.
    features_train = vectorizer.fit_transform(train['text'])
    features_test = vectorizer.transform(test['text'])

    for caption, matrix in (("Shape of train matrix : ", features_train),
                            ("Shape of test matrix : ", features_test)):
        print(caption, matrix.shape)

    return features_train, train['label'], features_test, test['label']

In [28]:
def train_models(X_tr, y_tr, X_ts, y_ts, models,
                 target_names=('Positive', 'Negative')):
    """Fit each model on the training data and print its test-set metrics.

    For every estimator in ``models`` this prints, in order: accuracy, the
    classification report, the confusion matrix, and the matrix returned by
    ``predict_proba`` for the test features. Nothing is returned.

    Parameters
    ----------
    X_tr, y_tr : training features and labels, passed to ``model.fit``.
    X_ts, y_ts : test features and the labels the predictions are scored against.
    models : iterable of estimators
        Each must provide ``fit``, ``predict`` and ``predict_proba``.
        Estimators are fitted in place.
    target_names : sequence of str or None, optional
        Display names for the classes in the classification report. The
        default keeps the previously hard-coded ``['Positive', 'Negative']``.
        Pass ``None`` to let the report show the raw label values instead.
        NOTE(review): names are applied positionally; confirm that the
        default order matches how the labels in the data are encoded.
    """
    report_names = None if target_names is None else list(target_names)

    for model in models:
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_ts)

        accuracy = metrics.accuracy_score(y_ts, y_pred)
        print("accuracy:   %0.3f" % accuracy)
        print(metrics.classification_report(y_ts, y_pred,
                                            target_names=report_names))
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_ts, y_pred))

        # Kept under its own name so the hard predictions in y_pred are not
        # overwritten by the probability matrix.
        y_proba = model.predict_proba(X_ts)
        print("Prediction probability:")
        print(y_proba)

In [29]:
# Load the dataset (decoded with the 'latin' codec) and hold out 20% of the
# rows as the test split; random_state is fixed so the split is repeatable.
df = pd.read_csv('preprocessing/data.csv', encoding='latin')
train, test = train_test_split(df, test_size=0.2, random_state=1)

In [30]:
# Estimators to evaluate. train_models calls predict_proba on every entry,
# so only classifiers that provide that method belong in this list.
proba_models = [MultinomialNB()]

In [31]:
# Run 1: features from CountVectorizer (default settings).
countVectorizer = CountVectorizer()
X_tr, y_tr, X_ts, y_ts = preprocess_data(train, test, countVectorizer)
train_models(X_tr, y_tr, X_ts, y_ts, proba_models)

Shape of train matrix :  (35918, 118074)
Shape of test matrix :  (8980, 118074)
accuracy:   0.972
              precision    recall  f1-score   support

    Positive       0.96      0.98      0.97      4298
    Negative       0.98      0.96      0.97      4682

    accuracy                           0.97      8980
   macro avg       0.97      0.97      0.97      8980
weighted avg       0.97      0.97      0.97      8980

confusion matrix:
[[4226   72]
 [ 183 4499]]
Prediction probability:
[[3.68537926e-33 1.00000000e+00]
 [1.00000000e+00 1.24739818e-14]
 [2.65239791e-37 1.00000000e+00]
 ...
 [1.00000000e+00 4.69429536e-59]
 [1.00000000e+00 1.86692150e-62]
 [1.00000000e+00 1.05325955e-18]]


In [32]:
# Run 2: same train/test split, features from TfidfVectorizer (default settings).
# X_tr/y_tr/X_ts/y_ts are rebound here, and the estimator instances in
# proba_models are reused: train_models calls fit() on them again.
tfidfVectorizer = TfidfVectorizer()
X_tr, y_tr, X_ts, y_ts = preprocess_data(train, test, tfidfVectorizer)
train_models(X_tr, y_tr, X_ts, y_ts, proba_models)

Shape of train matrix :  (35918, 118074)
Shape of test matrix :  (8980, 118074)
accuracy:   0.958
              precision    recall  f1-score   support

    Positive       0.95      0.96      0.96      4298
    Negative       0.96      0.96      0.96      4682

    accuracy                           0.96      8980
   macro avg       0.96      0.96      0.96      8980
weighted avg       0.96      0.96      0.96      8980

confusion matrix:
[[4125  173]
 [ 203 4479]]
Prediction probability:
[[0.05927066 0.94072934]
 [0.72441826 0.27558174]
 [0.02096882 0.97903118]
 ...
 [0.95050425 0.04949575]
 [0.99210413 0.00789587]
 [0.83269941 0.16730059]]
