In [53]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from common import *
import os
import tarfile
import glob
from sentence_transformers import SentenceTransformer


In [54]:
if not os.path.exists('aclImdb'):
    if not os.path.exists('aclImdb_v1.tar.gz'):
        files = download_data()
    else:
        files = ['aclImdb_v1.tar.gz']
    
    tarfile.open(files[0]).extractall()

paths = glob.glob('aclImdb/**/pos/*.txt', recursive=True)
pos_frame = get_data(paths, 1)
paths = glob.glob('aclImdb/**/neg/*.txt', recursive=True)
neg_frame = get_data(paths, 0)

df = pd.concat([pos_frame, neg_frame])
print(f'Size of dataset: {df.size}')
df.head()

Size of dataset: 100000


Unnamed: 0,text,label
0,Tromeo and Juliet is perhaps the best Shakespe...,1
1,"The book ""The Railway Children"" is a children'...",1
2,"If you're into alternate realities, contemplat...",1
3,"If not the best movie ever made, ""Babette's Fe...",1
4,SPOILERS BELOW<br /><br />`A Dog's Life' was m...,1


In [55]:
df = df.sample(n=NUM_SAMPLES, random_state=RANDOM_SEED)
df.head()

Unnamed: 0,text,label
1247,"As a kid, this movie scared me green. As an ad...",0
10067,It is a movie which sheds the light on the beg...,0
9590,Talk about being boring!<br /><br />I got this...,0
16668,One of Disney's best films that I can enjoy wa...,1
12196,"Being a bit of a connoisseur of garbage, I hav...",1


In [56]:
"""if not os.path.exists('data.csv'):
    if not os.path.exists('Fake.csv') and not os.path.exists('True.csv'):
        download_data()
    label_data('Fake.csv', 'fake_labeled.csv', 1)
    label_data('True.csv', 'true_labeled.csv', 0)
    os.system("awk '(NR == 1) || (FNR > 1)' *labeled.csv > data.csv")"""

'if not os.path.exists(\'data.csv\'):\n    if not os.path.exists(\'Fake.csv\') and not os.path.exists(\'True.csv\'):\n        download_data()\n    label_data(\'Fake.csv\', \'fake_labeled.csv\', 1)\n    label_data(\'True.csv\', \'true_labeled.csv\', 0)\n    os.system("awk \'(NR == 1) || (FNR > 1)\' *labeled.csv > data.csv")'

In [57]:
"""df = pd.read_csv('data.csv', usecols=['text', 'label'], encoding='utf-8')
df = df.sample(n=NUM_SAMPLES, random_state=RANDOM_SEED)
df.head()"""

"df = pd.read_csv('data.csv', usecols=['text', 'label'], encoding='utf-8')\ndf = df.sample(n=NUM_SAMPLES, random_state=RANDOM_SEED)\ndf.head()"

# BoW

In [58]:
countVectorizer = CountVectorizer()
embeddings_countVectorizer = preprocess_data(df['text'], countVectorizer)

X_tr, X_ts, y_tr, y_ts= train_test_split(embeddings_countVectorizer, df['label'], test_size=TEST_SIZE, random_state=RANDOM_SEED)

X_tr = X_tr.toarray()
X_ts = X_ts.toarray()

feature_size = X_tr.shape[1]

print(f'Train shape: {X_tr.shape}')
print(f'Test shape: {X_ts.shape}')

Train shape: (2500, 51072)
Test shape: (2500, 51072)


In [59]:
def train_models(X_tr, y_tr, X_ts, y_ts, models):
    for model in models:
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_ts)

        score1 = metrics.accuracy_score(y_ts, y_pred)
        print("accuracy:   %0.3f" % score1)
        print(metrics.classification_report(y_ts, y_pred,
                                            target_names=['Positive', 'Negative']))
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_ts, y_pred))
        
        y_pred = model.predict_proba(X_ts)
        print("Prediction probability:")
        print(y_pred)

In [60]:
proba_models = [MultinomialNB()]

In [61]:

train_models(X_tr, y_tr, X_ts, y_ts, proba_models)

accuracy:   0.832
              precision    recall  f1-score   support

    Positive       0.81      0.86      0.83      1205
    Negative       0.86      0.81      0.83      1295

    accuracy                           0.83      2500
   macro avg       0.83      0.83      0.83      2500
weighted avg       0.83      0.83      0.83      2500

confusion matrix:
[[1036  169]
 [ 250 1045]]
Prediction probability:
[[6.54892099e-02 9.34510790e-01]
 [1.00000000e+00 5.95380434e-11]
 [9.99999818e-01 1.82457623e-07]
 ...
 [9.99789559e-01 2.10441486e-04]
 [6.10731872e-04 9.99389268e-01]
 [5.91363885e-01 4.08636115e-01]]


# TF-IDF

In [62]:
tfidfVectorizer = TfidfVectorizer()
embeddings_Tfidf = preprocess_data(df['text'], tfidfVectorizer)

X_tr, X_ts, y_tr, y_ts= train_test_split(embeddings_Tfidf, df['label'], test_size=TEST_SIZE, random_state=RANDOM_SEED)

X_tr = X_tr.toarray()
X_ts = X_ts.toarray()

feature_size = X_tr.shape[1]

print(f'Train shape: {X_tr.shape}')
print(f'Test shape: {X_ts.shape}')

Train shape: (2500, 51072)
Test shape: (2500, 51072)


In [63]:
train_models(X_tr, y_tr, X_ts, y_ts, proba_models)

accuracy:   0.859
              precision    recall  f1-score   support

    Positive       0.85      0.86      0.85      1205
    Negative       0.87      0.86      0.86      1295

    accuracy                           0.86      2500
   macro avg       0.86      0.86      0.86      2500
weighted avg       0.86      0.86      0.86      2500

confusion matrix:
[[1039  166]
 [ 187 1108]]
Prediction probability:
[[0.38613687 0.61386313]
 [0.69810695 0.30189305]
 [0.66353683 0.33646317]
 ...
 [0.56982261 0.43017739]
 [0.37954454 0.62045546]
 [0.45090318 0.54909682]]


In [66]:
sbert_vectorizer = SentenceTransformer('all-MiniLM-L6-v2')

df['text'] = df['text'].str.lower()
df['text'] = df['text'].apply(
    lambda text: text.translate(
        str.maketrans('', '', string.punctuation)
    )
)

X_tr, X_ts, y_tr, y_ts = train_test_split(df['text'], df['label'], test_size=TEST_SIZE, random_state=RANDOM_SEED)
X_tr = sbert_vectorizer.encode(X_tr.values.astype('U'), normalize_embeddings=True)
X_ts = sbert_vectorizer.encode(X_ts.values.astype('U'), normalize_embeddings=True)

train_models(X_tr, y_tr, X_ts, y_ts, proba_models)

ValueError: Negative values in data passed to MultinomialNB (input X)