# Install Dependencies

In [None]:
!pip install --upgrade pip
!pip install numpy==1.26.4
!pip install nltk==3.9.1
!pip install scipy==1.11.4
!pip install gensim==4.3.2
!pip install contractions==0.1.73
!pip install pandas==2.2.2
!pip install scikit-learn==1.6.1
!pip install tensorflow==2.16.1
!pip install lime shap

# Import Libraries and Setup

In [None]:
import numpy as np
import pandas as pd
import nltk
import string
import re
import warnings

In [None]:
# Suppress every Python warning for the rest of the session. This keeps cell
# output compact, but it also hides any deprecation or chained-assignment
# warnings that later cells might raise.
warnings.filterwarnings('ignore')

In [None]:
# Fetch the NLTK resources this notebook relies on.
nltk.download('stopwords')   # English stop-word list read via stopwords.words('english')
nltk.download('punkt')       # presumably the tokenizer models behind sent_tokenize / word_tokenize
nltk.download('wordnet')     # presumably the lexical database used by WordNetLemmatizer
nltk.download('punkt_tab')   # presumably the tokenizer tables newer NLTK releases look up

# Load Datasets

In [None]:
# Dataset 1: LIAR — three tab-separated files that carry no header row.
train_data, val_data, test_data = (
    pd.read_csv(tsv_name, sep='\t', header=None)
    for tsv_name in ('train.tsv', 'valid.tsv', 'test.tsv')
)

In [None]:
# Report (rows, columns) for each LIAR split.
for split_caption, liar_split in (
    ("Training set:", train_data),
    ("Validation set:", val_data),
    ("Test set:", test_data),
):
    print(split_caption, liar_split.shape)

In [None]:
# Dataset 2: ISOT — one CSV of real articles and one CSV of fake articles.
true_data, fake_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/True.csv', '/content/Fake.csv')
)

In [None]:
# Tag each ISOT source with its class (1 = real article, 0 = fake article),
# stack the two frames and shuffle the rows.
true_data['label'] = 1
fake_data['label'] = 0
combined_data = pd.concat([true_data, fake_data], axis=0, ignore_index=True)
# Seed the shuffle as well: the seeded train_test_split calls further down pick
# rows by position, so an unseeded shuffle here would still produce different
# train/validation/test membership on every run.
combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Train-Validation-Test Split

In [None]:
from sklearn.model_selection import train_test_split
# First cut: 70% of the ISOT rows become the training split, the remaining 30%
# are parked in temp_data. stratify keeps the label ratio the same in both parts.
train_data2, temp_data = train_test_split(
    combined_data,
    train_size=0.7,
    random_state=42,
    stratify=combined_data['label']
)

In [None]:
# Second cut: halve temp_data into validation and test splits (stratified on
# the label), which gives 70/15/15 overall.
val_data2, test_data2 = train_test_split(
    temp_data,
    train_size=0.5,
    random_state=42,
    stratify=temp_data['label']
)

In [None]:
# Report (rows, columns) for each ISOT split.
for split_caption, isot_split in (
    ("Training set:", train_data2),
    ("Validation set:", val_data2),
    ("Test set:", test_data2),
):
    print(split_caption, isot_split.shape)

In [None]:
# Display the LIAR training frame (read with header=None, so columns are still unnamed here).
train_data

In [None]:
# Display the ISOT training split.
train_data2

# Data Preprocessing

In [None]:
# Names for the 14 header-less LIAR columns; they are applied positionally.
columns = ['id', 'label', 'statement', 'subject(s)', 'speaker',
           'speaker_job_title', 'state info', 'party',
           'barely_true_count', 'false_count',
           'half_true_count', 'mostly_true_count',
           'pants_on_fire_count', 'context']
# Give all three LIAR splits the same column names.
for df in [train_data, val_data, test_data]:
    df.columns = columns

In [None]:
# Display the LIAR training frame to check its column names.
train_data

# Exploratory Data Analysis (EDA)

In [None]:
# Shape of every LIAR split, followed by the column list and a small preview.
print("LIAR Dataset Shape:")
for caption, liar_frame in (
    ("Training set:", train_data),
    ("Testing set:", test_data),
    ("Validation set:", val_data),
):
    print(caption, liar_frame.shape)
print("Columns:", train_data.columns.tolist())
print(train_data.head())

In [None]:
# Shape of every ISOT split, followed by the column list and a small preview.
print("ISOT Dataset Shape:")
for caption, isot_frame in (
    ("Training set:", train_data2),
    ("Testing set:", test_data2),
    ("Validation set:", val_data2),
):
    print(caption, isot_frame.shape)
print("Columns:", train_data2.columns.tolist())
print(train_data2.head())

In [None]:
print("Missing Values in LIAR:")
# Per-column null counts, added up across the three LIAR splits.
print(sum(split.isnull().sum() for split in (train_data, test_data, val_data)))

In [None]:
print("Missing Values in ISOT:")
# Per-column null counts, added up across the three ISOT splits.
print(sum(split.isnull().sum() for split in (train_data2, test_data2, val_data2)))

In [None]:
print("Duplicate Rows in LIAR:")
# Fully duplicated rows, counted within each split and then added together.
print(sum(split.duplicated().sum() for split in (train_data, test_data, val_data)))

In [None]:
print("Duplicate Rows in ISOT:")
# Fully duplicated rows, counted within each split and then added together.
print(sum(split.duplicated().sum() for split in (train_data2, test_data2, val_data2)))

In [None]:
# matplotlib must be imported here: this is the first cell that uses `plt`,
# and on a fresh kernel the notebook's later import has not run yet.
import matplotlib.pyplot as plt
import seaborn as sns
# Side-by-side label histograms: all LIAR rows on the left, all ISOT rows on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
liar = pd.concat([train_data, test_data, val_data], axis=0, ignore_index=True)
isot = pd.concat([train_data2, test_data2, val_data2], axis=0, ignore_index=True)
sns.countplot(x='label', data=liar, ax=axes[0])
axes[0].set_title("LIAR Label Distribution")
sns.countplot(x='label', data=isot, ax=axes[1])
axes[1].set_title("ISOT Label Distribution")
plt.show()

In [None]:
print("LIAR Label Counts:")
# normalize=True makes value_counts return each label's share of the training
# rows (fractions summing to 1) rather than raw counts.
print(train_data['label'].value_counts(normalize=True))

In [None]:
print("ISOT Label Counts:")
# normalize=True makes value_counts return each label's share of the training
# rows (fractions summing to 1) rather than raw counts.
print(train_data2['label'].value_counts(normalize=True))

In [None]:
# Stack the LIAR and ISOT frames split by split for the combined EDA cells.
# NOTE(review): at this stage the LIAR frames still hold all 14 named columns
# while the ISOT frames hold their raw CSV columns plus 'label', so the result
# is a column union padded with NaN, and 'label' mixes LIAR's original label
# values with ISOT's 0/1 integers.
combined_train = pd.concat([train_data, train_data2], axis=0, ignore_index=True)
combined_test = pd.concat([test_data, test_data2], axis=0, ignore_index=True)
combined_val = pd.concat([val_data, val_data2], axis=0, ignore_index=True)
print("Combined Train Dataset Shape:", combined_train.shape)
print("Combined Test Dataset Shape:", combined_test.shape)
print("Combined Validation Dataset Shape:", combined_val.shape)

In [None]:
# Label histogram for the combined training frame; LIAR and ISOT label values
# share a single x-axis here.
plt.figure(figsize=(6,4))
sns.countplot(x='label', data=combined_train)
plt.title("Combined Dataset Label Distribution")
plt.show()

In [None]:
print("Combined Label Distribution:")
# Share of each label value in the combined training frame (fractions, not counts).
print(combined_train['label'].value_counts(normalize=True))

In [None]:
# Word count per training example. LIAR rows carry their text in 'statement';
# ISOT rows have no 'statement' yet at this stage (it is NaN after the concat),
# so fall back to "<title>. <text>" for them. Without the fallback every ISOT
# row would be measured as the single word "nan".
text_for_length = combined_train['statement']
if {'title', 'text'}.issubset(combined_train.columns):
    isot_text = (
        combined_train['title'].fillna('').astype(str)
        + '. '
        + combined_train['text'].fillna('').astype(str)
    )
    text_for_length = text_for_length.where(text_for_length.notna(), isot_text)
combined_train['text_length'] = text_for_length.apply(lambda x: len(str(x).split()))
plt.figure(figsize=(8,5))
sns.boxplot(data=combined_train, x='label', y='text_length')
plt.title("Text Length by Label (Combined)")
plt.xlabel("Label")
plt.ylabel("Number of Words")
# Cap the y-axis at 500 words so very long articles do not flatten the boxes.
plt.ylim(0, 500)
plt.show()

# Data Cleaning and Missing Value Handling

In [None]:
for df in (train_data, val_data, test_data):
    # Discard every LIAR metadata column; only 'label' and 'statement' remain.
    df.drop(
        columns=[
            'id', 'subject(s)', 'state info', 'party', 'speaker',
            'speaker_job_title', 'barely_true_count', 'false_count',
            'half_true_count', 'mostly_true_count',
            'pants_on_fire_count', 'context',
        ],
        inplace=True,
    )
    # Remove exact duplicate rows within the split.
    df.drop_duplicates(inplace=True)
    # Treat empty strings as missing values.
    df.replace('', np.nan, inplace=True)

In [None]:
# Check which LIAR columns are left after the cleaning step.
print(train_data.columns)

In [None]:
for df in (train_data2, val_data2, test_data2):
    # Merge headline and body into one "<title>. <text>" string so ISOT rows
    # expose the same 'statement' column as the LIAR rows.
    df['statement'] = df['title'].astype(str) + '. ' + df['text'].astype(str)
    df.drop(columns=['title', 'text', 'date', 'subject'], inplace=True)
    # Remove exact duplicate rows within the split.
    df.drop_duplicates(inplace=True)
    # Treat empty strings as missing values.
    df.replace('', np.nan, inplace=True)

In [None]:
# Check which ISOT columns are left after the cleaning step.
print(train_data2.columns)

# Text Preprocessing

In [None]:
# Lower-case the statement text in all six frames (LIAR and ISOT splits).
for df in [train_data, val_data, test_data, train_data2, val_data2, test_data2]:
    df['statement'] = df['statement'].str.lower()

In [None]:
import contractions
def expand_contractions(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded = contractions.fix(text)
    return expanded

In [None]:
def sentence_tokenize(text):
    """Split `text` into sentences with NLTK and re-join them with single spaces."""
    return " ".join(nltk.sent_tokenize(text))

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# English stop words as a set, so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Lower-case and word-tokenize `text`, then drop every token found in `stop_words`.

    Returns the surviving tokens joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
def remove_punctuation(text):
    """Delete every character listed in `string.punctuation` from `text`."""
    # A translation table that maps a character to None deletes that character.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

In [None]:
def remove_repeating_chars(text):
    """Collapse any run of one repeated character down to exactly two copies.

    Runs of length two are left as they are; newlines are never matched
    because `.` does not match them.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1\1', text)

In [None]:
def remove_urls(text):
    """Strip every whitespace-delimited chunk that begins at `http` or `www`."""
    stripped, _replacements = re.subn(r'http\S+|www\S+', '', text)
    return stripped

In [None]:
def remove_numbers(text):
    """Remove every run of digits from `text`, leaving all other characters in place."""
    # Splitting on the digit runs and gluing the pieces back together is the
    # same as substituting each run with the empty string.
    return ''.join(re.split(r'\d+', text))

In [None]:
def preprocess_text(text):
    """Run `text` through the full cleaning pipeline and return the cleaned string.

    The steps are applied strictly in this order: sentence re-joining,
    contraction expansion, URL removal, punctuation removal, repeated-character
    collapsing, digit removal, stop-word removal.
    """
    cleaning_steps = (
        sentence_tokenize,
        expand_contractions,
        remove_urls,
        remove_punctuation,
        remove_repeating_chars,
        remove_numbers,
        remove_stopwords,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

In [None]:
# Clean the statement text of all six frames. astype(str) coerces every cell to
# a string first (a missing value becomes the literal text 'nan'), so
# preprocess_text always receives a str.
for df in [train_data, val_data, test_data, train_data2, test_data2, val_data2]:
    df['statement'] = df['statement'].astype(str).apply(preprocess_text)

# Tokenization, Stemming and Lemmatization

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Shared instances used by stem_words and lemmatize_words.
ps = PorterStemmer()
lm = WordNetLemmatizer()

In [None]:
def stem_words(tokens):
    """Return a list with the Porter stem of each token in `tokens`."""
    return list(map(ps.stem, tokens))

In [None]:
def lemmatize_words(tokens):
    """Return a list with the WordNet lemma of each token in `tokens`."""
    return list(map(lm.lemmatize, tokens))

In [None]:
# Normalise 'statement' to a plain string: a token list is joined with spaces,
# anything else is passed through str().
# NOTE(review): stem_words / lemmatize_words are defined in this section but
# are not applied in any visible cell — confirm whether that is intended.
for df in [train_data, test_data, val_data, train_data2, test_data2, val_data2]:
    df['statement'] = df['statement'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# Word Embedding (Word2Vec)

In [None]:
from gensim.models import Word2Vec
# Add a 'tokens' column holding the word-tokenized, lower-cased statement;
# these token lists feed Word2Vec and the sentence-vector averaging.
for df in [train_data, test_data, val_data, train_data2, test_data2, val_data2]:
    df['tokens'] = df['statement'].apply(lambda x: word_tokenize(str(x).lower()))

In [None]:
# Train ONE Word2Vec model on the token lists of both training sets.
# Previously a model was fitted on LIAR and immediately overwritten by a second
# model fitted on ISOT only, so the LIAR vocabulary never reached the embedding
# space used for every sentence vector. Only training splits are used, so no
# validation/test text leaks into the embeddings.
w2v_corpus = pd.concat(
    [train_data['tokens'], train_data2['tokens']], ignore_index=True
).tolist()
w2v_model = Word2Vec(sentences=w2v_corpus, vector_size=100, window=5, min_count=1, workers=4)

In [None]:
def sentence_vector(sentence, model):
    """Average the word vectors of the in-vocabulary tokens of `sentence`.

    `sentence` is an iterable of tokens; `model` must expose `wv` (supporting
    `in` and item lookup) and `vector_size`. When none of the tokens is known
    to the model, a zero vector of length `model.vector_size` is returned.
    """
    known_vectors = [model.wv[token] for token in sentence if token in model.wv]
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [None]:
# Add a 'vector' column: the averaged Word2Vec embedding of each row's tokens.
for df in [train_data, test_data, val_data, train_data2, test_data2, val_data2]:
    df['vector'] = df['tokens'].apply(lambda x: sentence_vector(x, w2v_model))

In [None]:
# Preview the cleaned text next to its sentence vector for both training sets.
print(train_data[['statement', 'vector']].head())
print(train_data2[['statement', 'vector']].head())

# Feature Engineering

In [None]:
# Encode the six LIAR verdicts as integer codes using ONE fixed category order.
# Letting pd.Categorical infer the categories separately per split makes the
# codes depend on which labels happen to occur in that split, while
# convert_to_binary relies on 3 == 'mostly-true' and 5 == 'true'.
LIAR_LABEL_ORDER = ['barely-true', 'false', 'half-true', 'mostly-true', 'pants-fire', 'true']
for df in [train_data, test_data, val_data]:
    # Fail loudly on unknown values (e.g. when this cell is re-run on labels
    # that were already encoded) instead of silently producing shifted codes.
    unexpected = set(df['label'].dropna().unique()) - set(LIAR_LABEL_ORDER)
    if unexpected:
        raise ValueError(f"Unexpected LIAR label values: {sorted(unexpected, key=str)}")
    df['label'] = pd.Categorical(df['label'], categories=LIAR_LABEL_ORDER).codes

In [None]:
def convert_to_binary(y):
    """Map integer label codes to a binary array: codes 3 and 5 become 1, all others 0.

    `y` must provide `.isin` (e.g. a pandas Series); a NumPy array is returned.
    """
    is_positive = y.isin([3, 5])
    return np.where(is_positive, 1, 0)

In [None]:
# Collapse the LIAR label codes to binary via convert_to_binary (codes 3 and 5
# map to 1, every other code to 0).
for df in [train_data, test_data, val_data]:
    df['label'] = convert_to_binary(df['label'])

In [None]:
# LIAR + ISOT training rows in one frame, shuffled with a fixed seed.
merged_train_data = (
    pd.concat([train_data, train_data2], axis=0, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print("Merged train data shape:", merged_train_data.shape)

In [None]:
# Check the columns of the merged training frame.
print(merged_train_data.columns)

In [None]:
# LIAR + ISOT test rows in one frame, shuffled with a fixed seed.
merged_test_data = (
    pd.concat([test_data, test_data2], axis=0, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print("Merged test data shape:", merged_test_data.shape)

In [None]:
# LIAR + ISOT validation rows in one frame, shuffled with a fixed seed.
merged_val_data = (
    pd.concat([val_data, val_data2], axis=0, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print("Merged validation data shape:", merged_val_data.shape)

In [None]:
# Stack the per-row sentence vectors into 2-D feature matrices
# (one row per example, one column per embedding dimension).
X_train_text = np.array(merged_train_data['vector'].tolist())
X_val_text = np.array(merged_val_data['vector'].tolist())
X_test_text = np.array(merged_test_data['vector'].tolist())

In [None]:
# Binary targets as NumPy arrays, row-aligned with the feature matrices.
y_train_binary = merged_train_data['label'].to_numpy()
y_val_binary = merged_val_data['label'].to_numpy()
y_test_binary = merged_test_data['label'].to_numpy()

# Model Definition

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Conv1D, MaxPooling1D, Dense, Dropout, BatchNormalization, GlobalAveragePooling1D, Reshape, GRU, LSTM, Bidirectional,
                                     MultiHeadAttention, LayerNormalization, Attention)
from tensorflow.keras.regularizers import l2

In [None]:
# Model input: one 100-dimensional sentence vector per example (the same size
# as the Word2Vec vector_size used in this notebook).
text_input = Input(shape=(100,), name='text_input')
# Add a trailing channel axis so the vector can be fed to Conv1D as a
# length-100 sequence with a single channel.
text_reshaped = Reshape((100, 1))(text_input)

In [None]:
# Two Conv1D + MaxPooling1D stages: 512 ReLU filters each, kernel sizes 5 then
# 3, each followed by pooling with pool_size=2.
conv_layer = Conv1D(filters=512, kernel_size=5, activation='relu')(text_reshaped)
pooling_layer = MaxPooling1D(pool_size=2)(conv_layer)
conv_layer2 = Conv1D(filters=512, kernel_size=3, activation='relu')(pooling_layer)
pooling_layer2 = MaxPooling1D(pool_size=2)(conv_layer2)

In [None]:
# Multi-head self-attention (pooling_layer2 is both query and value) ...
attention_output = MultiHeadAttention(num_heads=4, key_dim=64)(pooling_layer2, pooling_layer2)
# ... followed by a residual connection and layer normalisation.
attention_output = LayerNormalization()(attention_output + pooling_layer2)

In [None]:
# Bidirectional LSTM (256 units per direction) that returns the full sequence,
# then batch normalisation.
bi_lstm_layer = Bidirectional(LSTM(256, return_sequences=True, dropout=0.3, recurrent_dropout=0.2))(attention_output)
bi_lstm_layer = BatchNormalization()(bi_lstm_layer)

In [None]:
# Bidirectional GRU (256 units per direction) stacked on the BiLSTM output,
# again returning the full sequence, then batch normalisation.
bi_gru_layer = Bidirectional(GRU(256, return_sequences=True, dropout=0.3, recurrent_dropout=0.2))(bi_lstm_layer)
bi_gru_layer = BatchNormalization()(bi_gru_layer)

In [None]:
# Attention layer with the BiGRU output passed as both query and value ...
attention_output1 = Attention()([bi_gru_layer, bi_gru_layer])
# ... then global average pooling to get one fixed-size vector per example.
attention_output1 = GlobalAveragePooling1D()(attention_output1)

In [None]:
# Classification head: L2-regularised dense layer, dropout at rate 0.6, then
# batch normalisation.
combined = Dense(256, activation='relu', kernel_regularizer=l2(0.01))(attention_output1)
combined = Dropout(0.6)(combined)
combined = BatchNormalization()(combined)

In [None]:
# Single sigmoid unit: the predicted probability of label 1.
output_layer = Dense(1, activation='sigmoid')(combined)

In [None]:
# Assemble the graph and compile it for binary classification
# (Adam optimiser, binary cross-entropy loss, accuracy metric).
model = Model(inputs=text_input, outputs=output_layer)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Model Training

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Multiply the learning rate by 0.2 when val_loss has not improved for 3
# epochs, never going below 1e-6.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6, verbose=1)
# Stop training when val_loss has not improved for 5 epochs and restore the
# weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [None]:
# Train for at most 10 epochs on the merged training set, validating on the
# merged validation set after every epoch; the callbacks may lower the learning
# rate or stop training early.
history = model.fit(
    X_train_text, y_train_binary,
    validation_data=(X_val_text, y_val_binary),
    epochs=10,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr]
)

# Model Evaluation

In [None]:
# Loss and accuracy on the held-out merged test set.
test_loss, test_acc = model.evaluate(X_test_text, y_test_binary)
print(f"Final Test Accuracy: {test_acc:.4f}")

In [None]:
# Predicted probability of label 1 for every test example ...
y_pred_probs = model.predict(X_test_text)
# ... turned into hard 0/1 predictions with a 0.5 threshold.
y_pred = (y_pred_probs > 0.5).astype(int)

In [None]:
from sklearn.metrics import classification_report
print("\nClassification Report:")
# classification_report lists the classes in sorted label order, so
# target_names must read [name of label 0, name of label 1]. Throughout this
# notebook label 0 is fake and label 1 is true; the previous order
# ["True", "Fake"] attached each row's metrics to the wrong class name.
print(classification_report(y_test_binary, y_pred, target_names=["Fake", "True"]))

# Performance Visualization

In [None]:
from sklearn.metrics import precision_recall_curve, ConfusionMatrixDisplay, roc_curve, auc, average_precision_score
import matplotlib.pyplot as plt

In [None]:
# Precision-recall curve computed from the predicted probabilities (not the
# thresholded labels); the legend shows the average precision.
precision, recall, _ = precision_recall_curve(y_test_binary, y_pred_probs)
avg_precision = average_precision_score(y_test_binary, y_pred_probs)
plt.figure(figsize=(6, 4))
plt.plot(recall, precision, lw=2, color='purple', label=f'AP = {avg_precision:.2f}')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend(loc="upper right")
plt.show()

In [None]:
# Confusion matrix of the thresholded predictions against the true test labels.
ConfusionMatrixDisplay.from_predictions(y_test_binary, y_pred)
plt.show()

In [None]:
# ROC curve computed from the predicted probabilities, with its AUC in the legend.
fpr, tpr, _ = roc_curve(y_test_binary, y_pred_probs)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 4))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
# Dashed diagonal: the reference line of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Headline test metrics computed from the thresholded predictions; precision,
# recall and F1 use scikit-learn's default positive class (label 1).
metrics = {
    "Accuracy": accuracy_score(y_test_binary, y_pred),
    "Precision": precision_score(y_test_binary, y_pred),
    "Recall": recall_score(y_test_binary, y_pred),
    "F1-score": f1_score(y_test_binary, y_pred)
}

# One bar per metric on a fixed 0-1 scale.
plt.figure(figsize=(6, 4))
plt.bar(metrics.keys(), metrics.values(), color='skyblue')
plt.title("Performance Metrics")
plt.ylim(0, 1)
plt.show()

# Explainable AI (LIME + SHAP)

In [None]:
from lime.lime_text import LimeTextExplainer
def _tokenize_for_model(text: str):
    """Clean `text` with `preprocess_text` and return its word tokens.

    The input is coerced with `str()` first, so non-string values are accepted.
    """
    return word_tokenize(preprocess_text(str(text)))

In [None]:
def texts_to_vectors(texts):
    """Convert raw texts into a 2-D array holding one sentence vector per row.

    Each text is tokenized with `_tokenize_for_model` and embedded with
    `sentence_vector` using the notebook-level `w2v_model`.
    """
    return np.vstack(
        [sentence_vector(_tokenize_for_model(raw_text), w2v_model) for raw_text in texts]
    )

In [None]:
def predict_proba_texts(texts):
    """Return an (n_texts, 2) array of class probabilities for raw texts.

    Column 0 is `1 - p` and column 1 is `p`, where `p` is the sigmoid output
    of `model` for the text's sentence vector.
    """
    features = texts_to_vectors(texts)
    p_label_1 = model.predict(features, verbose=0).reshape(-1)
    return np.column_stack((1.0 - p_label_1, p_label_1))

In [None]:
def predict_pos(texts):
    """Return a flat 1-D array with the model's sigmoid output for each raw text."""
    features = texts_to_vectors(texts)
    raw_output = model.predict(features, verbose=0)
    return raw_output.reshape(-1)

In [None]:
# Use the first row of the merged test set as the example to explain; only its
# first 300 characters are printed.
example_text = merged_test_data['statement'].iloc[0]
print("Example text:", example_text[:300], "..." if len(example_text)>300 else "")

In [None]:
# Class names indexed by label: position 0 -> "Fake", position 1 -> "True"
# (the same column order that predict_proba_texts returns).
CLASS_NAMES = ["Fake", "True"]
lime_explainer = LimeTextExplainer(class_names=CLASS_NAMES)
# Explain the example using up to 12 word features, for the single most
# probable label only.
lime_exp = lime_explainer.explain_instance(
    example_text,
    predict_proba_texts,
    num_features=12,
    top_labels=1
)

In [None]:
# Print each explained label with its (word, weight) pairs; the weight is
# shown with an explicit sign.
print("\nLIME top features per label:")
for label in lime_exp.available_labels():
    print(f"\nLabel {label}:")
    for w, wgt in lime_exp.as_list(label=label):
        print(f"{w:>20s}  {wgt:+.4f}")

In [None]:
# Write the interactive LIME explanation to an HTML file in the working directory.
with open("lime_explanation.html","w", encoding="utf-8") as f:
    f.write(lime_exp.as_html())
print("\nSaved: lime_explanation.html (open/download to view interactive LIME)")

In [None]:
import shap
# Load SHAP's JavaScript so interactive plots can render in the notebook.
shap.initjs()
# Text-level SHAP: the Text masker perturbs the raw string and predict_pos
# scores each perturbed variant.
text_masker = shap.maskers.Text()
shap_explainer = shap.Explainer(predict_pos, text_masker)
shap_values = shap_explainer([example_text])

In [None]:
# Prefer the interactive text plot; when it cannot be produced, say why and
# fall back to a static bar plot instead of swallowing the error silently.
try:
    shap.plots.text(shap_values[0])
except Exception as exc:
    # print() is used rather than warnings.warn because this notebook filters
    # out all warnings.
    print(f"shap.plots.text failed ({type(exc).__name__}: {exc}); falling back to a bar plot.")
    shap.plots.bar(shap_values[0], max_display=12)
    plt.show()

In [None]:
# Feature-level SHAP on the embedding dimensions.
# Summarise the training vectors with 50 k-means centroids as the background set.
background_summary = shap.kmeans(X_train_text, 50)
# Explain at most the first 20 test vectors to keep KernelExplainer affordable.
te_size = min(20, len(X_test_text))
test_sample = X_test_text[:te_size]
kernel_explainer = shap.KernelExplainer(lambda X: model.predict(X).reshape(-1), background_summary)
shap_vals_num = kernel_explainer.shap_values(test_sample, nsamples=100)

In [None]:
# shap_values may come back as a list (presumably one array per model output,
# depending on the SHAP version); keep the first entry in that case.
if isinstance(shap_vals_num, list):
    shap_vals_num = shap_vals_num[0]

In [None]:
# Summary plot of the SHAP values per embedding dimension; the dimensions have
# no inherent names, so they are labelled dim_0, dim_1, ...
shap.summary_plot(
    shap_vals_num,
    test_sample,
    feature_names=[f"dim_{i}" for i in range(X_train_text.shape[1])]
)
plt.show()