#Installs

In [None]:
!pip install pandas torch torchtext scikit-learn


Collecting torchtext
  Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata

#datasets

In [None]:
import pandas as pd

# Column names for the header-less TSV files, listed in file order.
columns = ['id', 'label', 'statement', 'subjects', 'speaker', 'job_title',
           'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts',
           'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context']


def _read_split(path):
    """Read one tab-separated split that has no header row and name its columns."""
    frame = pd.read_csv(path, sep='\t', header=None)
    frame.columns = columns
    return frame


# NOTE(review): statements contain double-quote characters (see the preview
# output below); with pandas' default quoting, unbalanced quotes may merge
# rows — compare the loaded row counts against the raw files to confirm.
train_df = _read_split('/content/drive/MyDrive/liar_dataset/train.tsv')
valid_df = _read_split('/content/drive/MyDrive/liar_dataset/valid.tsv')
test_df = _read_split('/content/drive/MyDrive/liar_dataset/test.tsv')


In [None]:
# Preview the first rows to sanity-check the column assignment.
# NOTE(review): the saved output shows integer values in `label` even though
# this cell sits before the label-mapping cell — the output looks like it came
# from an out-of-order run; re-run top to bottom to confirm.
print(train_df.head())

           id  label                                          statement  \
0   2635.json      1  Says the Annies List political group supports ...   
1  10540.json      3  When did the decline of coal start? It started...   
2    324.json      4  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json      1  Health care reform legislation is likely to ma...   
4   9028.json      3  The economic turnaround started at the end of ...   

                             subjects         speaker             job_title  \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting                   NaN   
4                        economy,jobs   charlie-crist                   NaN   

  state_info party_affiliation  barely_true_counts  false_counts  \
0     

#Preprocess

In [None]:
# Collapse the six truthfulness ratings into a binary target:
# 0 = leaning false, 1 = leaning true.
label_mapping = {
    'pants-fire': 0,
    'false': 0,
    'barely-true': 0,
    'half-true': 1,
    'mostly-true': 1,
    'true': 1
}


def _binarize_labels(df):
    """Return a copy of ``df`` with ``label`` as integer 0/1 and unusable rows removed.

    Rating strings found in ``label_mapping`` are translated. Values that are
    already one of the mapped targets are kept, so running this cell a second
    time is a no-op instead of turning every label into NaN and dropping all
    rows. Anything else becomes NaN. Rows with a NaN label or a missing
    statement are dropped, and the label column is cast to int64 so it can be
    used directly as a class index.
    """
    targets = set(label_mapping.values())
    missing = float('nan')

    def translate(value):
        if value in label_mapping:
            return label_mapping[value]
        return value if value in targets else missing

    cleaned = (
        df.assign(label=df['label'].map(translate))
          .dropna(subset=['statement', 'label'])
    )
    return cleaned.astype({'label': 'int64'})


# Rebind instead of mutating in place so the cell stays idempotent.
train_df = _binarize_labels(train_df)
valid_df = _binarize_labels(valid_df)
test_df = _binarize_labels(test_df)


#Tokenize

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

# Simple tokenizer
def tokenize(text):
    """Split ``text`` into lowercase tokens on runs of whitespace."""
    lowered = text.lower()
    return lowered.split()

# Count token frequencies over the training statements only.
all_text = train_df['statement'].tolist()
counter = Counter(token for text in all_text for token in tokenize(text))

# The 10000 most frequent tokens get ids 2, 3, ... in descending frequency
# order; ids 0 and 1 are reserved for the padding and unknown-word markers.
vocab = {}
for token_id, (word, _) in enumerate(counter.most_common(10000), start=2):
    vocab[word] = token_id
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

# Encode a sentence
def encode(text):
    """Convert ``text`` to a list of vocabulary ids, using ``<UNK>`` for unseen tokens."""
    ids = []
    for word in tokenize(text):
        ids.append(vocab.get(word, vocab['<UNK>']))
    return ids

# Custom Dataset
class FakeNewsDataset(Dataset):
    """In-memory dataset of (token-id tensor, label) pairs built from a DataFrame.

    ``df`` must provide a ``statement`` column (text passed to ``encode``) and
    a ``label`` column of class ids.

    Raises:
        ValueError: if ``label`` contains missing values.
    """

    def __init__(self, df):
        if df['label'].isna().any():
            raise ValueError("label column contains missing values; drop or map them first")

        # dtype is pinned on purpose: an empty statement would otherwise become
        # a float32 tensor (not usable as embedding indices), and a float label
        # column would produce float targets that are not valid class indices.
        self.texts = [torch.tensor(encode(text), dtype=torch.long) for text in df['statement']]
        self.labels = torch.tensor(df['label'].tolist(), dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Pad sequences inside the batch
def collate_fn(batch):
    """Collate (token tensor, label) pairs into one padded batch.

    Returns the token ids as a (batch, longest sequence) tensor, right-padded
    with 0, together with a 1-D tensor of the labels.
    """
    sequences, targets = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.tensor(targets)

# Encode each split once up front, in train / valid / test order.
train_dataset, valid_dataset, test_dataset = (
    FakeNewsDataset(split) for split in (train_df, valid_df, test_df)
)

# Batches are padded on the fly by collate_fn. Only the training loader
# shuffles; the evaluation loaders keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=32, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32, collate_fn=collate_fn)


#RNN Model

In [None]:
import torch.nn as nn

class FakeNewsRNN(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier over token-id sequences.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: width of each token embedding.
        hidden_dim: width of the RNN hidden state.
        output_dim: number of logits produced per sequence.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape (batch, output_dim).

        ``x`` holds token ids of shape (batch, seq_len), right-padded with the
        embedding's padding index.
        """
        embedded = self.embedding(x)

        # Pack by true length so the final hidden state is taken at each
        # sequence's last real token. Without this the RNN keeps stepping over
        # the trailing padding and the result depends on how much padding the
        # batch happened to need. An all-padding row is treated as length 1
        # because packing rejects zero-length sequences. Lengths must live on
        # the CPU.
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )

        _, hidden = self.rnn(packed)
        # hidden is (num_layers, batch, hidden_dim); take the last layer.
        return self.fc(hidden[-1])

# Hyperparameters
vocab_size = len(vocab)  # counts the <PAD> and <UNK> entries too
embed_dim = 100
hidden_dim = 128
# One logit per distinct target class. label_mapping collapses the six ratings
# into two classes, so a hard-coded 6 left four output units that no label can
# ever match; deriving the size keeps the head in sync with the mapping.
output_dim = len(set(label_mapping.values()))

model = FakeNewsRNN(vocab_size, embed_dim, hidden_dim, output_dim)


#Training

In [None]:
import torch.optim as optim

# Move model to GPU if available, then build the optimizer over its parameters.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


def _accuracy(net, loader):
    """Return the percentage of samples in ``loader`` that ``net`` classifies correctly.

    Switches ``net`` to eval mode and runs without gradient tracking. Returns
    0.0 for an empty loader instead of dividing by zero.
    """
    net.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in loader:
            texts, labels = texts.to(device), labels.to(device)
            predicted = torch.max(net(texts), 1).indices
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100 * correct / total if total else 0.0


# Training Loop
for epoch in range(20):
    model.train()  # _accuracy() leaves the model in eval mode
    total_loss = 0
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}')
    # Per-epoch monitoring uses the validation split: the helper is defined in
    # this cell so a fresh top-to-bottom run does not depend on a later cell,
    # and the test split stays untouched until the final evaluation.
    print(f'Validation Accuracy: {_accuracy(model, valid_loader):.2f}%')


Epoch 1, Loss: 1.7130
Accuracy: 20.36%
Epoch 2, Loss: 1.7107
Accuracy: 21.31%
Epoch 3, Loss: 1.7057
Accuracy: 22.42%
Epoch 4, Loss: 1.7017
Accuracy: 22.97%
Epoch 5, Loss: 1.6884
Accuracy: 23.05%
Epoch 6, Loss: 1.6645
Accuracy: 21.94%
Epoch 7, Loss: 1.6500
Accuracy: 22.02%
Epoch 8, Loss: 1.6745
Accuracy: 20.76%
Epoch 9, Loss: 1.7019
Accuracy: 20.84%
Epoch 10, Loss: 1.7354
Accuracy: 18.00%
Epoch 11, Loss: 1.7452
Accuracy: 18.15%
Epoch 12, Loss: 1.7048
Accuracy: 21.39%
Epoch 13, Loss: 1.6873
Accuracy: 20.28%
Epoch 14, Loss: 1.6698
Accuracy: 20.60%
Epoch 15, Loss: 1.6769
Accuracy: 20.44%
Epoch 16, Loss: 1.6757
Accuracy: 22.02%
Epoch 17, Loss: 1.7098
Accuracy: 20.76%
Epoch 18, Loss: 1.6833
Accuracy: 21.78%
Epoch 19, Loss: 1.6557
Accuracy: 22.10%
Epoch 20, Loss: 1.6508
Accuracy: 22.57%


#Results

In [None]:
def result():
    """Print the accuracy of the global ``model`` on the global ``test_loader``.

    The model is switched to eval mode and evaluated without gradient
    tracking; each sample's prediction is the index of its largest logit.
    """
    model.eval()
    n_seen = 0
    n_right = 0

    with torch.no_grad():
        for batch_texts, batch_labels in test_loader:
            batch_texts = batch_texts.to(device)
            batch_labels = batch_labels.to(device)
            guesses = torch.max(model(batch_texts), 1).indices
            n_seen += batch_labels.size(0)
            n_right += (guesses == batch_labels).sum().item()

    print(f'Accuracy: {100 * n_right / n_seen:.2f}%')


#naive bayes

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Column names for the header-less TSV files, in file order.
columns = [
    "id", "label", "statement", "subject", "speaker", "speaker_job_title",
    "state_info", "party_affiliation", "barely_true_counts", "false_counts",
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"
]

# Six-way rating -> binary target (0: leaning false, 1: leaning true).
label_mapping = {
    'pants-fire': 0,
    'false': 0,
    'barely-true': 0,
    'half-true': 1,
    'mostly-true': 1,
    'true': 1
}


def _load_split(path):
    """Read one tab-separated split, name its columns and binarize ``label``."""
    frame = pd.read_csv(path, sep='\t', header=None)
    frame.columns = columns
    frame['label'] = frame['label'].map(label_mapping)
    return frame


train_df = _load_split('/content/drive/MyDrive/liar_dataset/train.tsv')
valid_df = _load_split('/content/drive/MyDrive/liar_dataset/valid.tsv')
test_df = _load_split('/content/drive/MyDrive/liar_dataset/test.tsv')

# Train on train + validation together; the test split is held out for scoring.
full_train_df = pd.concat([train_df, valid_df])
X_train, y_train = full_train_df['statement'], full_train_df['label']
X_test, y_test = test_df['statement'], test_df['label']

# TF-IDF features: the vocabulary is fitted on the training text only and
# then reused to transform the test text.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# NOTE(review): this rebinds the global `model` that result() reads, so
# result() no longer evaluates the RNN once this cell has run.
model = MultinomialNB().fit(X_train_vec, y_train)  # fit() returns the estimator
y_pred = model.predict(X_test_vec)

# Score on the held-out test split.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.611681136543015

Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.39      0.47       553
           1       0.62      0.78      0.69       714

    accuracy                           0.61      1267
   macro avg       0.60      0.59      0.58      1267
weighted avg       0.61      0.61      0.60      1267



#SVM, Random Forest, XGBoost, Gradient Boosting, KNN

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# 1. Load the train, validation, and test splits
train_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/train.tsv', sep='\t', header=None)
valid_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/valid.tsv', sep='\t', header=None)
test_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/test.tsv', sep='\t', header=None)

# 2. Set column names
columns = [
    "id", "label", "statement", "subject", "speaker", "speaker_job_title",
    "state_info", "party_affiliation", "barely_true_counts", "false_counts",
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"
]

train_df.columns = columns
valid_df.columns = columns
test_df.columns = columns

# 3. Label mapping: group into 0 (false) and 1 (true)
label_mapping = {
    'pants-fire': 0,
    'false': 0,
    'barely-true': 0,
    'half-true': 1,
    'mostly-true': 1,
    'true': 1
}

train_df['label'] = train_df['label'].map(label_mapping)
valid_df['label'] = valid_df['label'].map(label_mapping)
test_df['label'] = test_df['label'].map(label_mapping)

# 4. Combine train + valid for full training
full_train_df = pd.concat([train_df, valid_df])

# 5. Get text and labels
X_train = full_train_df['statement']
y_train = full_train_df['label']

X_test = test_df['statement']
y_test = test_df['label']

# 6. TF-IDF Vectorization (vocabulary fitted on the training text only)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 7. Candidate models, keyed by the heading printed above each report.
#    - The KNN entry has its own heading; it was previously reported under
#      "random forest", producing two indistinguishable sections.
#    - XGBClassifier no longer receives `use_label_encoder`: the installed
#      version ignores it and only emits a "not used" warning.
candidates = {
    "svm": SVC(kernel='linear', probability=True, random_state=42),
    "random forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "xgboost": XGBClassifier(eval_metric='logloss', random_state=42),
    "GD boost": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "knn": KNeighborsClassifier(n_neighbors=5),
}

# 8-10. Train, predict and evaluate each model on the same features.
# `model` and `y_pred` end up bound to the last candidate, as before.
for title, model in candidates.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    print(f"{title}\n")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

svm

Accuracy: 0.6069455406471981

Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.45      0.50       553
           1       0.63      0.73      0.68       714

    accuracy                           0.61      1267
   macro avg       0.60      0.59      0.59      1267
weighted avg       0.60      0.61      0.60      1267

random forest

Accuracy: 0.6108918705603789

Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.40      0.47       553
           1       0.63      0.77      0.69       714

    accuracy                           0.61      1267
   macro avg       0.60      0.59      0.58      1267
weighted avg       0.60      0.61      0.60      1267



Parameters: { "use_label_encoder" } are not used.



xgboost

Accuracy: 0.6093133385951065

Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.39      0.46       553
           1       0.62      0.78      0.69       714

    accuracy                           0.61      1267
   macro avg       0.60      0.58      0.58      1267
weighted avg       0.60      0.61      0.59      1267

GD boost

Accuracy: 0.6045777426992897

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.24      0.35       553
           1       0.60      0.88      0.72       714

    accuracy                           0.60      1267
   macro avg       0.61      0.56      0.53      1267
weighted avg       0.61      0.60      0.56      1267

random forest

Accuracy: 0.5777426992896606

Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.14      0.23       553
           1       0.58      0.91      0.

#voting model

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1. Load the train, validation, and test splits
train_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/train.tsv', sep='\t', header=None)
valid_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/valid.tsv', sep='\t', header=None)
test_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/test.tsv', sep='\t', header=None)

# 2. Set column names
columns = [
    "id", "label", "statement", "subject", "speaker", "speaker_job_title",
    "state_info", "party_affiliation", "barely_true_counts", "false_counts",
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"
]
train_df.columns = columns
valid_df.columns = columns
test_df.columns = columns

# 3. Label mapping: six ratings collapsed to 0 (leaning false) / 1 (leaning true)
label_mapping = {
    'pants-fire': 0,
    'false': 0,
    'barely-true': 0,
    'half-true': 1,
    'mostly-true': 1,
    'true': 1
}
train_df['label'] = train_df['label'].map(label_mapping)
valid_df['label'] = valid_df['label'].map(label_mapping)
test_df['label'] = test_df['label'].map(label_mapping)

# 4. Combine train + valid
full_train_df = pd.concat([train_df, valid_df])

# 5. Get text and labels
X_train = full_train_df['statement']
y_train = full_train_df['label']
X_test = test_df['statement']
y_test = test_df['label']

# 6. TF-IDF Vectorization (vocabulary fitted on the training text only)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 7. Create the models
# probability=True is required here: soft voting calls predict_proba on SVC.
svm_model = SVC(kernel='linear', probability=True, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# `use_label_encoder` is dropped: the installed XGBoost ignores it and only
# prints a "Parameters ... are not used" warning.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)

# 8. Voting Classifier
voting_model = VotingClassifier(
    estimators=[
        ('svm', svm_model),
        ('rf', rf_model),
        ('xgb', xgb_model)
    ],
    voting='soft'   # soft voting uses predicted probabilities
)

# 9. Train the ensemble model
voting_model.fit(X_train_vec, y_train)

# 10. Predict
y_pred = voting_model.predict(X_test_vec)

# 11. Evaluate
print("Voting Classifier (SVM + RF + XGB)\n")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.



Voting Classifier (SVM + RF + XGB)

Accuracy: 0.6203630623520127

Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.38      0.47       553
           1       0.63      0.81      0.71       714

    accuracy                           0.62      1267
   macro avg       0.61      0.59      0.59      1267
weighted avg       0.62      0.62      0.60      1267



#Gated Recurrent Unit

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Bidirectional

# 1. Load the train, valid, and test splits
train_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/train.tsv', sep='\t', header=None)
valid_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/valid.tsv', sep='\t', header=None)
test_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/test.tsv', sep='\t', header=None)

# 2. Set column names
columns = [
    "id", "label", "statement", "subject", "speaker", "speaker_job_title",
    "state_info", "party_affiliation", "barely_true_counts", "false_counts",
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"
]
train_df.columns = columns
valid_df.columns = columns
test_df.columns = columns

# 3. Label mapping: six ratings collapsed to 0 (leaning false) / 1 (leaning true)
label_mapping = {
    'pants-fire': 0,
    'false': 0,
    'barely-true': 0,
    'half-true': 1,
    'mostly-true': 1,
    'true': 1
}
train_df['label'] = train_df['label'].map(label_mapping)
valid_df['label'] = valid_df['label'].map(label_mapping)
test_df['label'] = test_df['label'].map(label_mapping)

# 4. Combine train + valid
full_train_df = pd.concat([train_df, valid_df])

# 5. Get texts and labels
X_train = full_train_df['statement'].astype(str)
y_train = full_train_df['label']

X_test = test_df['statement'].astype(str)
y_test = test_df['label']

# 6. Tokenization and Padding
vocab_size = 10000  # Tokenizer keeps only the most frequent words up to this id
maxlen = 100        # Maximum number of words per statement

# The tokenizer is fitted on the training text only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Sequences are right-padded with 0 up to maxlen.
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')

# 7. Build GRU Model
# mask_zero=True makes the GRU skip the padded timesteps. Without it the
# last state is computed after stepping over every trailing 0, and the saved
# run collapsed to predicting a single class.
# The deprecated `input_length` argument is no longer passed.
model = Sequential([
    Embedding(vocab_size, 128, mask_zero=True),
    GRU(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Bidirectional variant:
# model = Sequential([
#     Embedding(vocab_size, 128, mask_zero=True),
#     Bidirectional(GRU(64)),
#     Dropout(0.5),
#     Dense(32, activation='relu'),
#     Dense(1, activation='sigmoid')
# ])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 8. Train Model
# restore_best_weights=True rolls back to the epoch with the lowest val_loss;
# otherwise the weights from the last (patience-exhausted) epoch are kept.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_pad, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=2
)

# 9. Evaluate
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=0)
print(f"GRU Model Test Accuracy: {accuracy:.4f}")

# 10. Classification Report
from sklearn.metrics import classification_report

y_pred_prob = model.predict(X_test_pad)
# predict() returns one sigmoid column per sample; flatten it to a 1-D
# vector of 0/1 labels shaped like y_test.
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

print("\nClassification Report:\n", classification_report(y_test, y_pred))




Epoch 1/10
289/289 - 10s - 34ms/step - accuracy: 0.5593 - loss: 0.6872 - val_accuracy: 0.5323 - val_loss: 0.6930
Epoch 2/10
289/289 - 3s - 10ms/step - accuracy: 0.5632 - loss: 0.6869 - val_accuracy: 0.5323 - val_loss: 0.6942
Epoch 3/10
289/289 - 2s - 8ms/step - accuracy: 0.5632 - loss: 0.6867 - val_accuracy: 0.5323 - val_loss: 0.6914
Epoch 4/10
289/289 - 2s - 8ms/step - accuracy: 0.5633 - loss: 0.6857 - val_accuracy: 0.5323 - val_loss: 0.6911
Epoch 5/10
289/289 - 3s - 9ms/step - accuracy: 0.5634 - loss: 0.6858 - val_accuracy: 0.5323 - val_loss: 0.6953
Epoch 6/10
289/289 - 3s - 10ms/step - accuracy: 0.5634 - loss: 0.6854 - val_accuracy: 0.5323 - val_loss: 0.6911
Epoch 7/10
289/289 - 5s - 16ms/step - accuracy: 0.5623 - loss: 0.6861 - val_accuracy: 0.5323 - val_loss: 0.6926
GRU Model Test Accuracy: 0.5635
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#gru+cnn

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# 1. Load and preprocess the data
train_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/train.tsv', sep='\t', header=None)
valid_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/valid.tsv', sep='\t', header=None)
test_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/test.tsv', sep='\t', header=None)

columns = [
    "id", "label", "statement", "subject", "speaker", "speaker_job_title",
    "state_info", "party_affiliation", "barely_true_counts", "false_counts",
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"
]
train_df.columns = columns
valid_df.columns = columns
test_df.columns = columns

# Six ratings collapsed to 0 (leaning false) / 1 (leaning true)
label_mapping = {
    'pants-fire': 0,
    'false': 0,
    'barely-true': 0,
    'half-true': 1,
    'mostly-true': 1,
    'true': 1
}

train_df['label'] = train_df['label'].map(label_mapping)
valid_df['label'] = valid_df['label'].map(label_mapping)
test_df['label'] = test_df['label'].map(label_mapping)

# Combine train + valid
full_train_df = pd.concat([train_df, valid_df])

X_train = full_train_df['statement'].astype(str)
y_train = full_train_df['label']

X_test = test_df['statement'].astype(str)
y_test = test_df['label']

# Tokenization and Padding
vocab_size = 10000
maxlen = 100

# The tokenizer is fitted on the training text only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Sequences are right-padded with 0 up to maxlen.
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')

# 2. Build GRU + CNN Model
# The deprecated `input_length` argument is no longer passed to Embedding.
model = Sequential([
    Embedding(vocab_size, 128),
    GRU(64, return_sequences=True),    # Set return_sequences=True because CNN needs the full sequence
    Conv1D(64, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),               # Reduce dimensions after convolution
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 3. Train Model
# restore_best_weights=True rolls back to the epoch with the lowest val_loss.
# In the saved run val_loss was lowest after epoch 1 and kept rising, yet the
# test set was scored with the weights from the final, overfitted epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_pad, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=2
)

# 4. Evaluate
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=0)
print(f"GRU + CNN Model Test Accuracy: {accuracy:.4f}")

# 5. Classification Report
y_pred_prob = model.predict(X_test_pad)
# predict() returns one sigmoid column per sample; flatten it to a 1-D
# vector of 0/1 labels shaped like y_test.
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

print("\nClassification Report:\n", classification_report(y_test, y_pred))


Epoch 1/10




289/289 - 38s - 131ms/step - accuracy: 0.5727 - loss: 0.6753 - val_accuracy: 0.6130 - val_loss: 0.6530
Epoch 2/10
289/289 - 41s - 141ms/step - accuracy: 0.6704 - loss: 0.6098 - val_accuracy: 0.6022 - val_loss: 0.6652
Epoch 3/10
289/289 - 40s - 140ms/step - accuracy: 0.7682 - loss: 0.4892 - val_accuracy: 0.5874 - val_loss: 0.7124
Epoch 4/10
289/289 - 48s - 166ms/step - accuracy: 0.8471 - loss: 0.3518 - val_accuracy: 0.5918 - val_loss: 0.8980
GRU + CNN Model Test Accuracy: 0.5675
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.56      0.53       553
           1       0.63      0.58      0.60       714

    accuracy                           0.57      1267
   macro avg       0.57      0.57      0.56      1267
weighted avg       0.57      0.57      0.57      1267



#GRU+RNN

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, SimpleRNN, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# 1. Load and preprocess the data
train_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/train.tsv', sep='\t', header=None)
valid_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/valid.tsv', sep='\t', header=None)
test_df = pd.read_csv('/content/drive/MyDrive/liar_dataset/test.tsv', sep='\t', header=None)

columns = [
    "id", "label", "statement", "subject", "speaker", "speaker_job_title",
    "state_info", "party_affiliation", "barely_true_counts", "false_counts",
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"
]
train_df.columns = columns
valid_df.columns = columns
test_df.columns = columns

# Six ratings collapsed to 0 (leaning false) / 1 (leaning true)
label_mapping = {
    'pants-fire': 0,
    'false': 0,
    'barely-true': 0,
    'half-true': 1,
    'mostly-true': 1,
    'true': 1
}

train_df['label'] = train_df['label'].map(label_mapping)
valid_df['label'] = valid_df['label'].map(label_mapping)
test_df['label'] = test_df['label'].map(label_mapping)

# Combine train + valid
full_train_df = pd.concat([train_df, valid_df])

X_train = full_train_df['statement'].astype(str)
y_train = full_train_df['label']

X_test = test_df['statement'].astype(str)
y_test = test_df['label']

# Tokenization and Padding
vocab_size = 10000
maxlen = 100

# The tokenizer is fitted on the training text only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Sequences are right-padded with 0 up to maxlen.
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')

# 2. Build GRU + RNN Model
# mask_zero=True makes both recurrent layers skip the padded timesteps.
# Without it the SimpleRNN's last state is computed after stepping over every
# trailing 0, and the saved run collapsed to predicting a single class.
# The deprecated `input_length` argument is no longer passed.
model = Sequential([
    Embedding(vocab_size, 128, mask_zero=True),
    GRU(64, return_sequences=True),    # emits one vector per timestep for the next recurrent layer
    SimpleRNN(32, return_sequences=False),  # reduces the GRU outputs to a single vector
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 3. Train Model
# restore_best_weights=True rolls back to the epoch with the lowest val_loss;
# otherwise the weights from the last (patience-exhausted) epoch are kept.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_pad, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=2
)

# 4. Evaluate
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=0)
print(f"GRU + RNN Model Test Accuracy: {accuracy:.4f}")

# 5. Classification Report
y_pred_prob = model.predict(X_test_pad)
# predict() returns one sigmoid column per sample; flatten it to a 1-D
# vector of 0/1 labels shaped like y_test.
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

print("\nClassification Report:\n", classification_report(y_test, y_pred))




Epoch 1/10
289/289 - 42s - 146ms/step - accuracy: 0.5515 - loss: 0.6891 - val_accuracy: 0.5323 - val_loss: 0.6911
Epoch 2/10
289/289 - 32s - 110ms/step - accuracy: 0.5597 - loss: 0.6880 - val_accuracy: 0.5323 - val_loss: 0.6936
Epoch 3/10
289/289 - 40s - 137ms/step - accuracy: 0.5597 - loss: 0.6875 - val_accuracy: 0.5323 - val_loss: 0.6979
Epoch 4/10
289/289 - 33s - 113ms/step - accuracy: 0.5582 - loss: 0.6874 - val_accuracy: 0.5323 - val_loss: 0.6937
GRU + RNN Model Test Accuracy: 0.5635
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       553
           1       0.56      1.00      0.72       714

    accuracy                           0.56      1267
   macro avg       0.28      0.50      0.36      1267
weighted avg       0.32      0.56      0.41      1267



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Mount Google Drive at /content/drive, the root of the dataset paths the
# data-loading cells read from.
# NOTE(review): this cell appears after the cells that read from Drive; on a
# fresh runtime it presumably has to run before them — confirm, and consider
# moving it to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
