In [None]:
import re

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# Fetch the NLTK resources used by the preprocessing helpers (stop words,
# tokenizer models, WordNet). Downloads are skipped when already cached.
nltk.download('punkt', quiet=True)
# Recent NLTK releases make word_tokenize load 'punkt_tab' instead of 'punkt';
# without it a fresh-kernel run fails at tokenization. On older releases that
# do not know this resource the call is expected to just return False.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
# Kept last so the cell's displayed value is still the wordnet download status.
nltk.download('wordnet', quiet=True)

True

## 1. Data Loading and Preparation (Stratified Train/Validation Split with Rare-Class Filtering)

In [None]:
# Column names for the header-less TSV files (they are read with header=None).
col_names = [
    "ID", "Label", "Statement", "Subject(s)", "Speaker", 
    "Speaker Job Title", "State Info", "Party Affiliation", "Barely True Counts",
    "False Counts", "Half True Counts", "Mostly True Counts", "Pants on Fire Counts", 
    "Context"
]


def _pad_to_schema(frame):
    """Return a copy of ``frame`` holding exactly ``col_names``, in that order.

    Columns missing from ``frame`` are created with a neutral filler: 0 for the
    "... Counts" columns and '' for every other column. Columns that already
    exist keep their values and are only re-ordered (never renamed).
    """
    padded = frame.copy()
    for col in col_names:
        if col not in padded.columns:
            padded[col] = 0 if col.endswith("Counts") else ''
    return padded[col_names]


try:
    train_df = pd.read_csv("./Data/train.tsv", sep="\t", header=None, names=col_names)
    valid_df = pd.read_csv("./Data/valid.tsv", sep="\t", header=None, names=col_names)
    # The test split is deliberately not loaded here: the models below are only
    # compared on a validation split carved out of train + valid.
except FileNotFoundError:
    print("Using Dummy Data: Ensure train.tsv and valid.tsv are in the ./Data/ directory.")
    # Small synthetic frames so the rest of the notebook can still be exercised.
    data_train = {'Statement': ['The sun is green', 'Water is wet', 'The moon is cheese', 'Lies are bad'] * 100, 'Label': ['false', 'true', 'pants-fire', 'mostly-true'] * 100}
    data_valid = {'Statement': ['The sky is blue', 'Birds fly south', 'Cars drive fast', 'Trees are tall'] * 20, 'Label': ['true', 'mostly-true', 'half-true', 'barely-true'] * 20}
    data_test = {'Statement': ['Test statement A', 'Test statement B'], 'Label': ['false', 'true']}
    # Give the dummy frames the same columns as the real files. Missing columns
    # are added by name; the previous code read df[col] for a column that did
    # not exist yet (KeyError) and then relabelled columns by position.
    train_df = _pad_to_schema(pd.DataFrame(data_train))
    valid_df = _pad_to_schema(pd.DataFrame(data_valid))
    test_df = _pad_to_schema(pd.DataFrame(data_test))

# Step 1 - development set: stack train and validation rows, keeping only the
# text and its label.
dev_full = pd.concat([train_df, valid_df], ignore_index=True).loc[:, ['Statement', 'Label']]

# Step 2 - a stratified split cannot place a label that occurs only once, so
# collect those singleton labels and exclude their rows.
y_dev = dev_full['Label']
class_counts = y_dev.value_counts()
rare_classes = class_counts.loc[class_counts.lt(2)].index.tolist()

if not rare_classes:
    dev_full_filtered = dev_full
else:
    print(f"Warning: Dropping {len(rare_classes)} rare classes for stratification: {rare_classes}")
    keep_row = ~dev_full['Label'].isin(rare_classes)
    dev_full_filtered = dev_full[keep_row]

# Step 3 - features (missing statements become empty strings) and labels.
X_dev = dev_full_filtered['Statement'].fillna('')
y_dev_filtered = dev_full_filtered['Label']

# 80/20 train/validation split, stratified on the label, fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_dev,
    y_dev_filtered,
    test_size=0.2,
    stratify=y_dev_filtered,
    random_state=42,
)

# Integer-encode labels; the encoder learns its classes from the training labels.
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)
y_val_encoded = le.transform(y_val)



## 2. Text Preprocessing Implementations (BAT, Lemmatization, Stemming)

In [None]:
# Shared NLTK resources, built once and reused by the helpers below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def basic_text_cleaning(text):
    """Lower-case ``text``, strip non-letters, tokenize and drop stop words.

    Args:
        text: Raw statement. It is coerced with ``str()``, so non-string
            values are accepted.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    text = str(text).lower()
    # Replace (rather than delete) every non-letter with a space so that words
    # joined only by punctuation or digits are not fused together:
    # "and/or" -> "and or" instead of "andor".
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words]

def apply_lemmatization(tokens):
    """Lemmatize every token with the shared ``lemmatizer`` and join them with spaces."""
    lemmas = map(lemmatizer.lemmatize, tokens)
    return ' '.join(lemmas)

def apply_stemming(tokens):
    """Stem every token with the shared ``stemmer`` and join them with spaces."""
    stems = map(stemmer.stem, tokens)
    return ' '.join(stems)

def preprocess_ml(text, method='lemma'):
    """Clean ``text`` and return it as one space-joined, normalized string.

    Args:
        text: Raw statement passed to ``basic_text_cleaning``.
        method: ``'stem'`` selects stemming; any other value selects
            lemmatization.

    Returns:
        str: The normalized tokens joined by single spaces.
    """
    normalize = apply_stemming if method == 'stem' else apply_lemmatization
    return normalize(basic_text_cleaning(text))

# Lemmatized variants of the train / validation statements.
X_train_lem = X_train.apply(preprocess_ml, args=('lemma',))
X_val_lem = X_val.apply(preprocess_ml, args=('lemma',))

# Stemmed variants of the same statements.
X_train_stem = X_train.apply(preprocess_ml, args=('stem',))
X_val_stem = X_val.apply(preprocess_ml, args=('stem',))

## 3. Classical ML Models (Lemmatized Text + TF-IDF Features)

In [None]:
# Validation metrics per model name; the deep-learning cells add to this too.
results_ml = {}
classifiers = {
    # liblinear only supports binary problems natively, and both the
    # `multi_class` argument and multiclass use of liblinear are deprecated in
    # recent scikit-learn releases. Wrapping the estimator keeps the same
    # one-vs-rest scheme explicitly and stays valid on newer versions.
    'Logistic Regression': OneVsRestClassifier(
        LogisticRegression(solver='liblinear', random_state=42, max_iter=2000)
    ),
    'Naive Bayes': MultinomialNB(),
    'Linear SVM': LinearSVC(random_state=42, max_iter=2000)
}

for name, classifier in classifiers.items():
    # TF-IDF sits inside the pipeline so its vocabulary and IDF weights are
    # learned from the training texts only.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
        ('clf', classifier)
    ])
    
    pipeline.fit(X_train_lem, y_train_encoded)
    y_pred = pipeline.predict(X_val_lem)
    
    # Weighted averages account for the unequal class sizes; zero_division=0
    # silences the warning for classes that are never predicted.
    results_ml[name] = {
        'Accuracy': accuracy_score(y_val_encoded, y_pred),
        'Precision': precision_score(y_val_encoded, y_pred, average='weighted', zero_division=0),
        'Recall': recall_score(y_val_encoded, y_pred, average='weighted', zero_division=0),
        'F1-Score': f1_score(y_val_encoded, y_pred, average='weighted', zero_division=0)
    }
    
    print(f"--- {name} ---")
    print(classification_report(y_val_encoded, y_pred, target_names=le.classes_, zero_division=0))



--- Logistic Regression ---
              precision    recall  f1-score   support

 barely-true       0.20      0.15      0.17       378
       false       0.26      0.33      0.29       452
   half-true       0.24      0.32      0.27       472
 mostly-true       0.27      0.31      0.29       442
  pants-fire       0.37      0.07      0.12       191
        true       0.24      0.19      0.21       369

    accuracy                           0.25      2304
   macro avg       0.26      0.23      0.22      2304
weighted avg       0.25      0.25      0.24      2304

--- Naive Bayes ---
              precision    recall  f1-score   support

 barely-true       0.19      0.12      0.15       378
       false       0.25      0.31      0.28       452
   half-true       0.24      0.39      0.29       472
 mostly-true       0.24      0.31      0.27       442
  pants-fire       0.40      0.01      0.02       191
        true       0.28      0.11      0.16       369

    accuracy                 

## 4. Deep Learning Models (LSTM & GRU)

In [None]:
# Sequence-model hyperparameters.
vocab_size = 10000      # tokenizer keeps only the most frequent words
embedding_dim = 100     # size of each learned word vector
max_length = 50         # every statement is padded / truncated to this many tokens
trunc_type = 'post'     # cut overly long statements at the end
padding_type = 'post'   # pad short statements at the end

# The vocabulary is learned from the training statements only; words not seen
# there map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

X_val_sequences = tokenizer.texts_to_sequences(X_val)
X_val_padded = pad_sequences(X_val_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

num_classes = len(le.classes_)

# Pass the class count explicitly: by default to_categorical sizes the matrix
# from the largest label present, so a split that happens to lack the highest
# class would yield fewer columns than the models' output layer.
y_train_ohe = to_categorical(y_train_encoded, num_classes=num_classes)
y_val_ohe = to_categorical(y_val_encoded, num_classes=num_classes)

### 4.1 LSTM Model

In [None]:
# LSTM classifier over learned word embeddings.
model_lstm = Sequential([
    # mask_zero=True makes the recurrent layer skip the padding steps (index 0).
    # The sequences are post-padded, so without a mask the LSTM's final state is
    # computed after a long run of identical pad tokens; the unmasked model
    # ended up predicting a single class for every validation sample.
    # `input_length` is dropped: it is deprecated in current Keras and is not
    # needed when the embedding feeds a recurrent layer.
    Embedding(vocab_size, embedding_dim, mask_zero=True),
    LSTM(128),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once validation loss has not improved for 3 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history_lstm = model_lstm.fit(
    X_train_padded, y_train_ohe,
    epochs=20,
    validation_data=(X_val_padded, y_val_ohe),
    callbacks=[early_stopping],
    verbose=0
)

# Class probabilities -> predicted class index (same encoding as `le`).
y_pred_proba_lstm = model_lstm.predict(X_val_padded)
y_pred_lstm = np.argmax(y_pred_proba_lstm, axis=1)

results_dl_lstm = {
    'Accuracy': accuracy_score(y_val_encoded, y_pred_lstm),
    'Precision': precision_score(y_val_encoded, y_pred_lstm, average='weighted', zero_division=0),
    'Recall': recall_score(y_val_encoded, y_pred_lstm, average='weighted', zero_division=0),
    'F1-Score': f1_score(y_val_encoded, y_pred_lstm, average='weighted', zero_division=0)
}
# Stored next to the classical models so the final comparison covers every model.
results_ml['LSTM'] = results_dl_lstm

print("--- LSTM ---")
print(classification_report(y_val_encoded, y_pred_lstm, target_names=le.classes_, zero_division=0))



[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step
--- LSTM ---
              precision    recall  f1-score   support

 barely-true       0.00      0.00      0.00       378
       false       0.20      1.00      0.33       452
   half-true       0.00      0.00      0.00       472
 mostly-true       0.00      0.00      0.00       442
  pants-fire       0.00      0.00      0.00       191
        true       0.00      0.00      0.00       369

    accuracy                           0.20      2304
   macro avg       0.03      0.17      0.05      2304
weighted avg       0.04      0.20      0.06      2304



### 4.2 GRU Model

In [None]:
# GRU classifier over learned word embeddings.
model_gru = Sequential([
    # mask_zero=True makes the recurrent layer skip the padding steps (index 0).
    # The sequences are post-padded, so without a mask the GRU's final state is
    # computed after a long run of identical pad tokens; the unmasked model
    # ended up predicting a single class for every validation sample.
    # `input_length` is dropped: it is deprecated in current Keras and is not
    # needed when the embedding feeds a recurrent layer.
    Embedding(vocab_size, embedding_dim, mask_zero=True),
    GRU(128),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

model_gru.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# A dedicated callback instance: this cell no longer relies on the callback
# object created for another model, and no early-stopping state is shared
# between the two training runs.
early_stopping_gru = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history_gru = model_gru.fit(
    X_train_padded, y_train_ohe,
    epochs=20,
    validation_data=(X_val_padded, y_val_ohe),
    callbacks=[early_stopping_gru],
    verbose=0
)

# Class probabilities -> predicted class index (same encoding as `le`).
y_pred_proba_gru = model_gru.predict(X_val_padded)
y_pred_gru = np.argmax(y_pred_proba_gru, axis=1)

results_dl_gru = {
    'Accuracy': accuracy_score(y_val_encoded, y_pred_gru),
    'Precision': precision_score(y_val_encoded, y_pred_gru, average='weighted', zero_division=0),
    'Recall': recall_score(y_val_encoded, y_pred_gru, average='weighted', zero_division=0),
    'F1-Score': f1_score(y_val_encoded, y_pred_gru, average='weighted', zero_division=0)
}
# Stored next to the classical models so the final comparison covers every model.
results_ml['GRU'] = results_dl_gru

print("--- GRU ---")
print(classification_report(y_val_encoded, y_pred_gru, target_names=le.classes_, zero_division=0))



[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step
--- GRU ---
              precision    recall  f1-score   support

 barely-true       0.00      0.00      0.00       378
       false       0.00      0.00      0.00       452
   half-true       0.20      1.00      0.34       472
 mostly-true       0.00      0.00      0.00       442
  pants-fire       0.00      0.00      0.00       191
        true       0.00      0.00      0.00       369

    accuracy                           0.20      2304
   macro avg       0.03      0.17      0.06      2304
weighted avg       0.04      0.20      0.07      2304



## 5. Final Model Comparison

In [None]:
# One row per model, one column per metric.
df_results = pd.DataFrame.from_dict(results_ml, orient='index')

# Styler kept for rich (HTML) display of the table in a notebook front end.
df_results_styled = df_results.style.format({ 
    'Accuracy': "{:.4f}", 
    'Precision': "{:.4f}", 
    'Recall': "{:.4f}", 
    'F1-Score': "{:.4f}" 
})
print("Model Performance Comparison\n")
# Print from the DataFrame rather than the Styler: Styler.to_string() separates
# cells with a single space, which leaves the columns unaligned, whereas
# DataFrame.to_string() pads every column to a common width.
print(df_results.to_string(float_format="{:.4f}".format))

# Row label of the highest weighted F1 score.
best_model_name = df_results['F1-Score'].idxmax()
print(f"\nThe best performing model based on F1-Score is: {best_model_name}")

Model Performance Comparison

 Accuracy Precision Recall F1-Score
Logistic Regression 0.2478 0.2535 0.2478 0.2389
Naive Bayes 0.2396 0.2531 0.2396 0.2184
Linear SVM 0.2378 0.2365 0.2378 0.2367
LSTM 0.1957 0.0385 0.1957 0.0643
GRU 0.2049 0.0420 0.2049 0.0697


The best performing model based on F1-Score is: Logistic Regression
