In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, f1_score


# Paths of the two input files; each line is treated as one text sample.
positive_file = 'rt-polaritydata/rt-polarity.pos'
negative_file = 'rt-polaritydata/rt-polarity.neg'

# Materialize every line of each file (line terminators are kept as read).
with open(positive_file, mode='r', encoding='ISO-8859-1') as f:
    positive_texts = list(f)

with open(negative_file, mode='r', encoding='ISO-8859-1') as f:
    negative_texts = list(f)

# One label per line: 1 for the positive file, 0 for the negative file.
positive_labels = [1 for _ in positive_texts]
negative_labels = [0 for _ in negative_texts]

# Combined corpus with positives first; texts and labels stay index-aligned.
texts = [*positive_texts, *negative_texts]
labels = [*positive_labels, *negative_labels]


# Number of samples drawn from each class for this experiment.
SAMPLES_PER_CLASS = 4000
# Fixed seed so the train/validation/test partition (and therefore every
# metric reported below) is the same on each run.
RANDOM_STATE = 42

# Build the class-balanced subset directly from the per-class lists rather
# than slicing the combined list at a hard-coded offset (5331): that offset
# silently picks the wrong rows whenever the positive file does not contain
# exactly 5331 lines.
subset_texts = (
    positive_texts[:SAMPLES_PER_CLASS] + negative_texts[:SAMPLES_PER_CLASS]
)
subset_labels = (
    positive_labels[:SAMPLES_PER_CLASS] + negative_labels[:SAMPLES_PER_CLASS]
)

# First cut: 75% training data, 25% held out, preserving the class ratio.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    subset_texts,
    subset_labels,
    test_size=0.25,
    stratify=subset_labels,
    random_state=RANDOM_STATE,
)

# Second cut: divide the held-out portion evenly into validation and test,
# again stratified by label.
valid_texts, test_texts, valid_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=RANDOM_STATE,
)


# Bag-of-words counts with English stop words removed and the vocabulary
# capped at 5000 features. The vocabulary is fitted on the training split
# only; validation and test are projected onto that same vocabulary.
vectorizer = CountVectorizer(max_features=5000, stop_words='english')
X_train = vectorizer.fit_transform(train_texts)
X_valid, X_test = (
    vectorizer.transform(split) for split in (valid_texts, test_texts)
)


# Multinomial Naive Bayes with default settings; fit() returns the estimator.
nb_model = MultinomialNB().fit(X_train, train_labels)


# Score both held-out splits first; the printed reports follow below.
valid_predictions = nb_model.predict(X_valid)
test_predictions = nb_model.predict(X_test)

# F1 computed with f1_score's default settings for each split.
valid_f1_score = f1_score(valid_labels, valid_predictions)
test_f1_score = f1_score(test_labels, test_predictions)

# Display names for the report rows: label 0 -> Negative, label 1 -> Positive.
class_names = ["Negative", "Positive"]

valid_report = classification_report(
    valid_labels, valid_predictions, target_names=class_names
)
print("Validation Performance:")
print(valid_report)
print(f"Validation F1-score: {valid_f1_score:.4f}")

test_report = classification_report(
    test_labels, test_predictions, target_names=class_names
)
print("Test Performance:")
print(test_report)
print(f"Test F1-score: {test_f1_score:.4f}")


# Confusion matrices for both splits (rows: true label, columns: predicted).
conf_matrix_valid = confusion_matrix(valid_labels, valid_predictions)
conf_matrix_test = confusion_matrix(test_labels, test_predictions)

# With labels 0/1 the 2x2 matrix reads [[TN, FP], [FN, TP]]; unpack it row
# by row into the four named counts.
(tn_valid, fp_valid), (fn_valid, tp_valid) = conf_matrix_valid

print(
    "Validation Metrics:",
    f"True Positives (TP): {tp_valid}",
    f"True Negatives (TN): {tn_valid}",
    f"False Positives (FP): {fp_valid}",
    f"False Negatives (FN): {fn_valid}",
    sep="\n",
)

(tn_test, fp_test), (fn_test, tp_test) = conf_matrix_test

print(
    "Test Metrics:",
    f"True Positives (TP): {tp_test}",
    f"True Negatives (TN): {tn_test}",
    f"False Positives (FP): {fp_test}",
    f"False Negatives (FN): {fn_test}",
    sep="\n",
)


Validation Performance:
              precision    recall  f1-score   support

    Negative       0.74      0.73      0.73       500
    Positive       0.73      0.74      0.74       500

    accuracy                           0.73      1000
   macro avg       0.74      0.73      0.73      1000
weighted avg       0.74      0.73      0.73      1000

Validation F1-score: 0.7368
Test Performance:
              precision    recall  f1-score   support

    Negative       0.76      0.71      0.74       500
    Positive       0.73      0.78      0.76       500

    accuracy                           0.75      1000
   macro avg       0.75      0.75      0.75      1000
weighted avg       0.75      0.75      0.75      1000

Test F1-score: 0.7551
