# 2. Setting up the enviroment

In [76]:
# Tabjular Data Analysis
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Utility
import time
import warnings
warnings.filterwarnings('ignore')

# 4. Load the dataset

In [77]:
all_df = pd.read_csv("/content/fake_news_classification_mal_train(3).csv")
all_df.head(3)

Unnamed: 0,ID,News,Label
0,FAKE_MAL_TR_0001,കേള്‍വി തകരാറുള്ള കുട്ടികള്‍ക്ക് നടത്തുന്ന സൗജ...,False
1,FAKE_MAL_TR_0002,ചന്ദ്രയാന് കേരള മുഖ്യമന്ത്രി പിണറായി വിജയൻ മാത...,False
2,FAKE_MAL_TR_0003,പിണറായി വിജയന്‍ സര്‍ക്കാര്‍ നിര്‍മിച്ച കേരളത്ത...,False


In [78]:
# List of columns to drop
#columns_to_drop = ['Unnamed: 0']

# Drop the columns
#train_df = train_df.drop(columns=columns_to_drop)


In [79]:
from sklearn.model_selection import train_test_split

In [80]:
# Split into train, validation
train_df, val_df = train_test_split(all_df, test_size=0.2, stratify=all_df["Label"], random_state=42)

In [81]:
test_df = pd.read_csv("/content/fake_news_classification_mal_test.xlsx - Sheet1.csv")
test_df.head(3)

Unnamed: 0,S.no,News
0,FAKE_MAL_TE_0001,കേരളത്തില്‍ പുരുഷന്മാര്‍ക്ക് രണ്ട് ഭാര്യമാര്‍ ...
1,FAKE_MAL_TE_0002,പാർട്ടിയുടെ കൊടിക്ക് മഹത്വം ഉണ്ടെന്നും സംരംഭങ്...
2,FAKE_MAL_TE_0003,നവകേരള സദസ്സ്: കാട്ടാക്കട ക്രിസ്ത്യൻ കോളേജ് കവ...


In [82]:
TEXT_VAR = "News"
LABEL_VAR = "Label"

Labels are not numerical. Let's make them numerical.

In [83]:
# Map text labels to numerical values
label_mapping = {label: idx for idx, label in enumerate(train_df[LABEL_VAR].unique())}
train_df[LABEL_VAR] = train_df[LABEL_VAR].map(label_mapping)  # Change as necessary
val_df[LABEL_VAR] = val_df[LABEL_VAR].map(label_mapping)  # Change as necessary

# Modeling

## Load Things

In [84]:
import os
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer, AutoProcessor
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

In [85]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [86]:
seed = 42
torch.manual_seed(seed)

<torch._C.Generator at 0x7d5f77f24310>

In [87]:
# Hyperparameters
model_name = "l3cube-pune/malayalam-topic-all-doc"
batch_size = 16
max_length = 512

In [88]:
# Load Tokenizer and Model
text_tokenizer = AutoTokenizer.from_pretrained(model_name)
text_model = AutoModel.from_pretrained(model_name).to(device)

In [89]:
class TextDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        transcription = self.df.iloc[idx][TEXT_VAR]
        transcription = transcription if isinstance(transcription, str) else ""
        inputs = self.tokenizer(
            transcription, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt"
        )
        return inputs, self.df.iloc[idx][LABEL_VAR]

## Collect Embeddings

In [90]:
def extract_text_embeddings(df, save_path, model, tokenizer):
    if os.path.exists(save_path):
        print(f"Embeddings already exist at {save_path}")
        return torch.load(save_path)

    embeddings = {}
    model.eval()
    with torch.no_grad():
        for idx, row in tqdm(df.iterrows(), desc="Extracting text embeddings", total=len(df)):
            transcription = row[TEXT_VAR]
            transcription = transcription if isinstance(transcription, str) else ""

            # Tokenize the text
            inputs = tokenizer(
                transcription, padding="max_length", truncation=True, max_length=512, return_tensors="pt"
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to GPU/CPU

            # Extract embeddings
            outputs = model(**inputs)
            cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token embeddings
            embeddings[idx] = cls_embedding.cpu()  # Use the index as the key

    torch.save(embeddings, save_path)
    return embeddings

In [91]:
train_text_embeddings = extract_text_embeddings(
    train_df, "train_text_embeddings.pt", text_model, text_tokenizer
)
val_text_embeddings = extract_text_embeddings(
    val_df, "val_text_embeddings.pt", text_model, text_tokenizer
)
test_text_embeddings = extract_text_embeddings(
    test_df, "test_text_embeddings.pt", text_model, text_tokenizer
)

Embeddings already exist at train_text_embeddings.pt
Embeddings already exist at val_text_embeddings.pt
Embeddings already exist at test_text_embeddings.pt


## Load Embeddings

In [92]:
def load_embeddings(embedding_path):
    if os.path.exists(embedding_path):
        print(f"Loading embeddings from {embedding_path}")
        return torch.load(embedding_path)
    else:
        raise FileNotFoundError(f"Embeddings file not found at {embedding_path}")

In [93]:
train_text_embeddings = load_embeddings("/content/train_text_embeddings.pt")
val_text_embeddings = load_embeddings("/content/val_text_embeddings.pt")
test_text_embeddings = load_embeddings("/content/test_text_embeddings.pt")

Loading embeddings from /content/train_text_embeddings.pt
Loading embeddings from /content/val_text_embeddings.pt
Loading embeddings from /content/test_text_embeddings.pt


## Modeling

In [94]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, TensorDataset

In [95]:
def prepare_text_embeddings(text_embeddings, df, LABEL_VAR, has_labels=True):
    combined_embeddings = []
    labels = [] if has_labels else None

    for idx, row in df.iterrows():
        # Ensure the index exists in the text embeddings
        if idx in text_embeddings:
            text_embedding = text_embeddings[idx].squeeze()  # Squeeze to remove unnecessary dimensions

            # Add the text embedding to the list
            combined_embeddings.append(text_embedding)

            if has_labels:
                labels.append(row[LABEL_VAR])  # Get the label from the DataFrame

    if has_labels:
        return torch.stack(combined_embeddings), torch.tensor(labels)
    else:
        return torch.stack(combined_embeddings)

In [96]:
X_train, y_train = prepare_text_embeddings(train_text_embeddings, train_df, LABEL_VAR)
X_val, y_val = prepare_text_embeddings(val_text_embeddings, val_df, LABEL_VAR)
X_test = prepare_text_embeddings(test_text_embeddings, test_df, LABEL_VAR, has_labels=False)

print(f"Training data shape: {X_train.shape}, Labels: {y_train.shape}")
print(f"Validation data shape: {X_val.shape}, Labels: {y_val.shape}")
print(f"Test data shape: {X_test.shape}")

Training data shape: torch.Size([1520, 768]), Labels: torch.Size([1520])
Validation data shape: torch.Size([380, 768]), Labels: torch.Size([380])
Test data shape: torch.Size([200, 768])


In [97]:
from collections import Counter
def new_balance_classes(X, y):

  # Load embeddings from .pt file
#     embeddings = torch.load(embeddings_path)

  # Count occurrences of each class
  train_df=pd.DataFrame({"Label":y})
  class_counts = Counter(train_df['Label'])



  # Find the maximum count
  max_count = max(class_counts.values())

  # Create a new list for balanced data
  label_lst = []
  embedding_lst=[]


  for idx, row in train_df.iterrows():
      label = row[LABEL_VAR]
      embedding = X[idx]  # Assuming embeddings are in the same order as DataFrame

      # Add original sample
      embedding_lst.append(embedding)  # Duplicate sample
      label_lst.append(label)



      # Duplicate samples for minority classes
      duplicates_needed = int( (max_count - class_counts[label]) / class_counts[label])
      for _ in range(duplicates_needed):
          embedding_lst.append(embedding)  # Duplicate sample
          label_lst.append(label)


  return torch.stack(embedding_lst), np.array(label_lst)


# # train_df.head()
X_train,y_train=new_balance_classes(X_train,y_train)

X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)

In [98]:
# Define the MLP model
class MLPModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout_p=0.35):
        """
        Initialize the MLP model.
        Args:
            input_dim (int): Dimension of the input features.
            hidden_dim (list of int): List of dimensions for hidden layers.
            output_dim (int): Dimension of the output layer.
            dropout_p (float): Dropout probability.
        """
        super(MLPModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim[0])
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(p=dropout_p)
        self.fc2 = nn.Linear(hidden_dim[0], hidden_dim[1])
        self.dropout2 = nn.Dropout(p=dropout_p)
        self.fc3 = nn.Linear(hidden_dim[1], output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout1(x)
        x = self.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

In [99]:
# Hyperparameters
input_dim = X_train.shape[1]
num_classes = len(train_df[LABEL_VAR].unique())
hidden_dim = [768, 512]  # Increased hidden layer dimensions
output_dim = num_classes
batch_size = 16  # Smaller batch size
learning_rate = 0.0001  # Increased learning rate
dropout_p = 0.35

In [100]:
# Prepare the data loaders
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test)

train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size)
test_loader = DataLoader(test_dataset, batch_size)

In [101]:
# Initialize model, loss function, and optimizer
model = MLPModel(input_dim, hidden_dim, output_dim, dropout_p).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [102]:
# Function to calculate metrics
def calculate_metrics(preds, labels):
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
    return accuracy, precision, recall, f1

In [103]:
max_length = train_df['News'].str.len().max()
print(f"Maximum text length: {max_length}")

Maximum text length: 250


In [104]:
max_length = val_df['News'].str.len().mean()
print(f"Maximum text length: {max_length}")

Maximum text length: 92.33947368421053


## Train and Val

In [105]:
# Train and save best model
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def train_and_save_best_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, save_dir):
    best_f1 = -float('inf')
    best_model_path = None

    for epoch in range(num_epochs):
        model.train() # Indent this line. Start of the loop
        train_loss = 0
        all_train_preds, all_train_labels = [], []

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs) # Remove the squeeze function

            #check if a single sample is passed in as a batch
            if outputs.shape[0] == 1:
                continue

            # Compute loss and backpropagate
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            _, preds = torch.max(outputs, dim=1)
            all_train_preds.extend(preds.cpu().tolist())
            all_train_labels.extend(labels.cpu().tolist())

        # Calculate training metrics
        train_accuracy, train_precision, train_recall, train_f1 = calculate_metrics(all_train_preds, all_train_labels)

        # Validation phase
        model.eval()
        val_loss = 0
        all_val_preds, all_val_labels = [], []

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs) # Remove the squeeze function
                #check if a single sample is passed in as a batch
                if outputs.shape[0] == 1:
                    continue

                loss = criterion(outputs, labels)
                val_loss += loss.item()

                _, preds = torch.max(outputs, dim=1)
                all_val_preds.extend(preds.cpu().tolist())
                all_val_labels.extend(labels.cpu().tolist())

        # Calculate validation metrics
        val_accuracy, val_precision, val_recall, val_f1 = calculate_metrics(all_val_preds, all_val_labels)

        print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss/len(train_loader):.4f}, "
              f"Train Acc: {train_accuracy:.4f}, Prec: {train_precision:.4f}, Rec: {train_recall:.4f}, F1: {train_f1:.4f} | "
              f"Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {val_accuracy:.4f}, Prec: {val_precision:.4f}, "
              f"Rec: {val_recall:.4f}, F1: {val_f1:.4f}")

        # Save the model if it has the best F1 score on validation
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_model_path = f"{save_dir}/best_model_epoch_{epoch + 1}_f1_{val_f1:.4f}.pth"
            torch.save(model.state_dict(), best_model_path)
            print(f"Best model saved with F1: {val_f1:.4f} at epoch {epoch + 1}")

    return best_model_path

In [106]:
# Set the directory where the best model will be saved
save_dir = "./models"
os.makedirs(save_dir, exist_ok=True)

In [107]:
# Train the model and save the best model
best_model_path = train_and_save_best_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=50,
    save_dir=save_dir
)

print(f"Best model saved at: {best_model_path}")

Epoch 1/50: Train Loss: 1.5250, Train Acc: 0.3341, Prec: 0.2923, Rec: 0.3303, F1: 0.2667 | Val Loss: 1.4116, Val Acc: 0.4842, Prec: 0.2685, Rec: 0.3349, F1: 0.2469
Best model saved with F1: 0.2469 at epoch 1
Epoch 2/50: Train Loss: 1.4096, Train Acc: 0.4006, Prec: 0.3645, Rec: 0.3981, F1: 0.3462 | Val Loss: 1.4171, Val Acc: 0.4395, Prec: 0.2234, Rec: 0.3131, F1: 0.2272
Epoch 3/50: Train Loss: 1.3361, Train Acc: 0.4496, Prec: 0.4256, Rec: 0.4472, F1: 0.4046 | Val Loss: 1.4185, Val Acc: 0.4158, Prec: 0.2587, Rec: 0.3335, F1: 0.2417
Epoch 4/50: Train Loss: 1.2616, Train Acc: 0.4845, Prec: 0.4594, Rec: 0.4824, F1: 0.4477 | Val Loss: 1.3946, Val Acc: 0.4316, Prec: 0.2421, Rec: 0.3306, F1: 0.2383
Epoch 5/50: Train Loss: 1.1929, Train Acc: 0.5074, Prec: 0.4846, Rec: 0.5049, F1: 0.4730 | Val Loss: 1.3659, Val Acc: 0.4316, Prec: 0.2720, Rec: 0.3515, F1: 0.2686
Best model saved with F1: 0.2686 at epoch 5
Epoch 6/50: Train Loss: 1.1400, Train Acc: 0.5345, Prec: 0.5116, Rec: 0.5325, F1: 0.5063 | V

## Test

In [108]:
def predict_and_generate_submission(test_loader, best_model_path, submission_file_path):
    # Load the best model with weights_only=True to avoid security warnings
    model = MLPModel(input_dim, hidden_dim, output_dim, dropout_p).to(device)
    model.load_state_dict(torch.load(best_model_path, weights_only=True))
    model.eval()  # Set the model to evaluation mode

    test_predictions = []
    with torch.no_grad():
        for inputs in test_loader:
            # Ensure inputs are converted to a tensor and stacked into a batch if necessary
            if isinstance(inputs, list):
                # Convert each item to tensor using .detach() to avoid the user warning
                inputs = [i.clone().detach().to(device) if isinstance(i, torch.Tensor) else torch.tensor(i).to(device) for i in inputs]
                inputs = torch.stack(inputs)  # Stack them into a batch tensor
            else:
                inputs = inputs.to(device)  # If inputs is already a tensor, move it to device

            outputs = model(inputs).squeeze()

            # Predict binary labels
            _, preds = torch.max(outputs, dim=1)
            test_predictions.extend(preds.tolist())

    # Prepare the submission DataFrame
    submission_df = pd.DataFrame({
        TEXT_VAR: [i for i in test_df[TEXT_VAR]],
        'predictions': test_predictions
    })

    # Save the predictions to a CSV file
    submission_df.to_csv(submission_file_path, index=False)
    print(f"Submission file saved to {submission_file_path}")

    return submission_df

In [109]:
submission_file_path = "submission.csv"
submission_df = predict_and_generate_submission(test_loader=test_loader, best_model_path=best_model_path, submission_file_path=submission_file_path)

Submission file saved to submission.csv


In [110]:
submission_df.head()

Unnamed: 0,News,predictions
0,കേരളത്തില്‍ പുരുഷന്മാര്‍ക്ക് രണ്ട് ഭാര്യമാര്‍ ...,3
1,പാർട്ടിയുടെ കൊടിക്ക് മഹത്വം ഉണ്ടെന്നും സംരംഭങ്...,4
2,നവകേരള സദസ്സ്: കാട്ടാക്കട ക്രിസ്ത്യൻ കോളേജ് കവ...,4
3,ശബരിമലയില്‍ അയ്യപ്പ ഭക്തന്‍റെ തല പോലീസ് അടിച്ച...,2
4,ബൈക്കുകള്‍ സ്വന്തം ജില്ലയില്‍ മാത്രം ഉപയോഗിക്ക...,2


In [111]:
import random

def get_error_examples(model, dataloader, data, label_mapping):
    model.eval()
    incorrect_examples = []
    reverse_label_mapping = {v: k for k, v in label_mapping.items()}

    with torch.no_grad():
        for i, (inputs, labels) in enumerate(dataloader):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            labels = labels.cpu().numpy()

            # Collect incorrect examples
            for idx, (pred, label) in enumerate(zip(preds, labels)):
                if pred != label:  # Only collect incorrect ones
                    incorrect_examples.append({
                        "text": data.iloc[i * dataloader.batch_size + idx]["News"],
                        "true_label": label,
                        "predicted_label": pred
                    })

    # Randomly sample 5 incorrect examples (or fewer if less than 5 total)
    if len(incorrect_examples) >= 5:
        sampled_errors = random.sample(incorrect_examples, 5)
    else:
        sampled_errors = incorrect_examples

    # Display the examples
    for error in sampled_errors:
        print(f"Text: {error['text']}")
        print(f"True Label: {error['true_label']} ({reverse_label_mapping[error['true_label']]})")
        print(f"Predicted Label: {error['predicted_label']} ({reverse_label_mapping[error['predicted_label']]})")
        print("-" * 50)

# Example call on the validation set:
get_error_examples(model, val_loader, val_df, label_mapping)


Text: ബഫല്ലോ മഞ്ഞുവീഴ്ചയെത്തുടർന്ന് കുടുങ്ങിയ ഒരു കാർ ഫോട്ടോ കാണിക്കുന്നു
True Label: 0 (FALSE)
Predicted Label: 3 (HALF TRUE)
--------------------------------------------------
Text: കെ.ടി ജലീല്‍ എംഎല്‍എയുടെ പ്രദേശിക വികസന ഫണ്ടില്‍ നിന്ന് നിര്‍മിച്ച ബസ് കാത്തിരിപ്പ് കേന്ദ്രം.
True Label: 0 (FALSE)
Predicted Label: 2 (MOSTLY FALSE)
--------------------------------------------------
Text: നദിയില്‍ ഇടിമിന്നല്‍ പതിക്കുന്ന ദൃശ്യം.
True Label: 2 (MOSTLY FALSE)
Predicted Label: 3 (HALF TRUE)
--------------------------------------------------
Text: ചന്ദ്രയാന് -3 ചന്ദ്രനില് ഇറങ്ങിയതിന് ശേഷം ടാറ്റ ഗ്രൂപ്പ് എല്ലാ ഇന്ത്യന് ഉപയോക്താക്കള് ക്കും 239 രൂപയുടെ 28 ദിവസത്തെ സൗജന്യ റീചാര് ജ് വാഗ്ദാനം ചെയ്യുന്നു
True Label: 0 (FALSE)
Predicted Label: 2 (MOSTLY FALSE)
--------------------------------------------------
Text: പ്രധാനമന്ത്രി നരേന്ദ്ര മോദിയുടെ ചിത്രം വെച്ച് തിരുവനന്തപുരം സര്‍ക്കാര്‍ എഞ്ചിനീയറിംഗ് കോളേജിലെ ടോപ്പര്‍മാരെ അഭിനന്ദനങ്ങള്‍ അര്‍പ്പിക്കുന്ന ബാനര്‍
True Label: 0 (FALSE)
Predicted Label: 4