In [10]:
import csv

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, accuracy_score


In [11]:
# Column names for the header-less TSV splits, in file order.
columns = [
    "id", "label", "statement", "subject", "speaker", "job_title",
    "state", "party",
    "barely_true_counts", "false_counts", "half_true_counts",
    "mostly_true_counts", "pants_on_fire_counts", "context"
]

# Reader options shared by all three splits:
#   sep="\t"           - fields are tab-separated.
#   header=None        - the files have no header row; names are assigned below.
#   quoting=QUOTE_NONE - treat '"' as an ordinary character. With pandas'
#       default quoting, a free-text field containing an unbalanced double
#       quote makes the parser read across line breaks until the next quote,
#       silently merging several records into one row.
# NOTE(review): the column layout matches the LIAR dataset, whose statements
# are known to contain stray quotes - confirm by comparing len(train_df) with
# the physical line count of train.tsv.
read_opts = dict(sep="\t", header=None, quoting=csv.QUOTE_NONE)

train_df = pd.read_csv("../data/raw/train.tsv", **read_opts)
valid_df = pd.read_csv("../data/raw/valid.tsv", **read_opts)
test_df  = pd.read_csv("../data/raw/test.tsv", **read_opts)

# Assigning the names after reading keeps the implicit check that every split
# has exactly len(columns) columns (pandas raises on a length mismatch).
train_df.columns = columns
valid_df.columns = columns
test_df.columns  = columns


In [12]:
# Model input is the raw statement text; the target is the label column.
X_train, y_train = train_df["statement"], train_df["label"]
X_valid, y_valid = valid_df["statement"], valid_df["label"]


In [13]:
# TF-IDF features: lowercase the text, drop scikit-learn's built-in English
# stop-word list, and limit the vocabulary to at most 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english", lowercase=True)

# The vocabulary and IDF weights are learned from the training statements
# only; the validation statements are just projected onto that vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_valid_tfidf = vectorizer.transform(X_valid)


In [14]:
# Fit logistic regression on the TF-IDF training matrix (solver allowed up to
# 1000 iterations), then predict a label for every validation statement.
# `fit` returns the estimator itself, so construction and fitting are chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_lr = log_reg.predict(X_valid_tfidf)


In [15]:
# Overall validation accuracy of the logistic regression predictions.
lr_accuracy = accuracy_score(y_valid, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)

# Per-class precision / recall / F1 table for the same predictions.
print("\nClassification Report (Logistic Regression):\n")
lr_report = classification_report(y_valid, y_pred_lr)
print(lr_report)


Logistic Regression Accuracy: 0.21495327102803738

Classification Report (Logistic Regression):

              precision    recall  f1-score   support

 barely-true       0.17      0.13      0.15       237
       false       0.25      0.31      0.28       263
   half-true       0.18      0.23      0.21       248
 mostly-true       0.25      0.25      0.25       251
  pants-fire       0.32      0.05      0.09       116
        true       0.19      0.22      0.21       169

    accuracy                           0.21      1284
   macro avg       0.23      0.20      0.20      1284
weighted avg       0.22      0.21      0.21      1284



In [16]:
# Fit multinomial Naive Bayes (default settings) on the TF-IDF training
# matrix, then predict a label for every validation statement.
# `fit` returns the estimator itself, so construction and fitting are chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_valid_tfidf)


In [17]:
# Overall validation accuracy of the Naive Bayes predictions.
nb_accuracy = accuracy_score(y_valid, y_pred_nb)
print("Naive Bayes Accuracy:", nb_accuracy)

# Per-class precision / recall / F1 table for the same predictions.
print("\nClassification Report (Naive Bayes):\n")
nb_report = classification_report(y_valid, y_pred_nb)
print(nb_report)


Naive Bayes Accuracy: 0.2336448598130841

Classification Report (Naive Bayes):

              precision    recall  f1-score   support

 barely-true       0.18      0.09      0.12       237
       false       0.27      0.33      0.30       263
   half-true       0.21      0.38      0.27       248
 mostly-true       0.26      0.30      0.28       251
  pants-fire       0.33      0.01      0.02       116
        true       0.20      0.12      0.15       169

    accuracy                           0.23      1284
   macro avg       0.24      0.21      0.19      1284
weighted avg       0.24      0.23      0.21      1284

