# 2. Setting up the environment

In [None]:
# Tabular Data Analysis
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Utility
import time
import warnings
warnings.filterwarnings('ignore')

# 4. Load the dataset

In [None]:
# Read the training split and preview its first three rows.
# NOTE(review): "/content/" is an absolute path (presumably a Colab workspace) — confirm before running elsewhere.
train_df = pd.read_csv("/content/Fake_train.csv")
train_df.head(3)

Unnamed: 0,text,label
0,നല്ല അവതരണം. സത്യം പുറത്തു വരട്ടെ,Fake
1,Masha Allah,Fake
2,"അന്വേഷണം കഴിയുമ്പോൾ,. C. A. A. യ്ക്ക് എതിരായ ക...",Fake


In [None]:
# Read the development split (used for validation below) and preview its first three rows.
# NOTE(review): "/content/" is an absolute path (presumably a Colab workspace) — confirm before running elsewhere.
val_df = pd.read_csv("/content/Fake_dev.csv")
val_df.head(3)

Unnamed: 0,text,label
0,Full. Musilm. Verodamum,Fake
1,പക്ഷികളും മൃഗങ്ങളും ഈ ലോകത്ത് സുഖമായി ജീവിക്കു...,Fake
2,ഒരു താടിക്കാരൻ പാത്രം കൊട്ടാൻ പറഞ്ഞപ്പോ .........,original


In [None]:
# Read the unlabeled test split and preview its first three rows.
# NOTE(review): "/content/" is an absolute path (presumably a Colab workspace) — confirm before running elsewhere.
test_df = pd.read_csv("/content/test_Fake_test_without_labels.csv")
test_df.head(3)

Unnamed: 0,Id,text
0,Fake_01,5000 ഉള്ള പോൾ ലോഗ്‌ഡ്‌വൻ ഇപ്പോള് 250000 എന്താ...
1,Fake_02,ഓഷോ രജനീഷ് പറഞ്ഞപോലെ എനിക്കപ്പോൾ തോന്നിയത് അ...
2,Fake_03,ചേട്ടാ വാർത്ത വയ്ക്കുന്നത് കേരളത്തിലാണ് സം...


In [None]:
# Column names used throughout the notebook: the input text and the target label.
TEXT_VAR = "text"
LABEL_VAR = "label"

Labels are not numerical. Let's make them numerical.

In [None]:
# Map text labels to numerical values.
# Class ids follow the order in which labels first appear in the training split.
label_mapping = {label: idx for idx, label in enumerate(train_df[LABEL_VAR].unique())}

# Fail fast on validation labels the training split never contains: Series.map
# would otherwise turn them into NaN without any error.
unseen_labels = set(val_df[LABEL_VAR].unique()) - set(label_mapping)
if unseen_labels:
    raise ValueError(
        f"Validation labels missing from the training split: {sorted(map(str, unseen_labels))}"
    )

train_df[LABEL_VAR] = train_df[LABEL_VAR].map(label_mapping)
val_df[LABEL_VAR] = val_df[LABEL_VAR].map(label_mapping)

# Modeling

## Load Libraries, Tokenizer, and Model

In [None]:
import os
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer, AutoProcessor
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

In [None]:
# Set device: use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Seed PyTorch's global random number generator.
# NOTE(review): only torch is seeded in this cell; NumPy / Python `random` are not — confirm nothing relies on them.
seed = 42
torch.manual_seed(seed)

<torch._C.Generator at 0x78f3704e0c70>

In [None]:
# Hyperparameters
# Checkpoint id passed to AutoTokenizer / AutoModel.from_pretrained below.
model_name = "mdosama39/malayalam-bert-FakeNews-Dravidian"
batch_size = 16
# NOTE(review): 786 differs from the max_length=512 hard-coded in extract_text_embeddings
# and may be a typo (768? 512?) — confirm which value is intended.
max_length = 786

In [None]:
# Load Tokenizer and Model
# Both are created from the same `model_name` checkpoint; the model is moved to `device`.
text_tokenizer = AutoTokenizer.from_pretrained(model_name)
text_model = AutoModel.from_pretrained(model_name).to(device)

In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes one DataFrame row per item.

    Args:
        df: DataFrame providing the ``TEXT_VAR`` and ``LABEL_VAR`` columns.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")``.
        max_length: Length every sample is padded/truncated to. Defaults to
            512, the same limit ``extract_text_embeddings`` uses.
    """

    def __init__(self, df, tokenizer, max_length=512):
        self.df = df
        self.tokenizer = tokenizer
        # Stored on the instance so tokenization does not depend on
        # notebook-global state that other cells may reassign.
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(tokenizer output, label)`` for the row at position ``idx``."""
        row = self.df.iloc[idx]
        text = row[TEXT_VAR]
        # Non-string cells (e.g. NaN) are tokenized as an empty string.
        if not isinstance(text, str):
            text = ""
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return inputs, row[LABEL_VAR]

## Collect Embeddings

In [None]:
def extract_text_embeddings(df, save_path, model, tokenizer):
    """Build (or reload) a ``{row index: first-token embedding}`` dict for ``df``.

    If ``save_path`` already exists, its contents are loaded with
    ``torch.load`` and returned without touching the model. Otherwise every
    row's ``TEXT_VAR`` value is tokenized (padded/truncated to 512), passed
    through ``model`` on the notebook-level ``device``, and position 0 of
    ``last_hidden_state`` is stored on the CPU under the row's index. The
    resulting dict is written to ``save_path`` and returned.

    Args:
        df: DataFrame with a ``TEXT_VAR`` column.
        save_path: File used as the on-disk cache.
        model: Model whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``model``.
    """
    cache_hit = os.path.exists(save_path)
    if cache_hit:
        print(f"Embeddings already exist at {save_path}")
        return torch.load(save_path)

    model.eval()
    cls_by_index = {}
    progress = tqdm(df.iterrows(), desc="Extracting text embeddings", total=len(df))

    with torch.no_grad():
        for row_id, record in progress:
            text = record[TEXT_VAR]
            # Non-string cells (e.g. NaN) are embedded as an empty string.
            if not isinstance(text, str):
                text = ""

            encoded = tokenizer(
                text, padding="max_length", truncation=True, max_length=512, return_tensors="pt"
            )
            on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

            # Keep only the first position of the last hidden state, moved back to the CPU.
            hidden_states = model(**on_device).last_hidden_state
            cls_by_index[row_id] = hidden_states[:, 0, :].cpu()

    torch.save(cls_by_index, save_path)
    return cls_by_index

In [None]:
# Compute the embeddings for each split, or load them if the cache file already exists.
# The .pt paths are relative, so they resolve against the current working directory.
train_text_embeddings = extract_text_embeddings(
    train_df, "train_text_embeddings.pt", text_model, text_tokenizer
)
val_text_embeddings = extract_text_embeddings(
    val_df, "val_text_embeddings.pt", text_model, text_tokenizer
)
test_text_embeddings = extract_text_embeddings(
    test_df, "test_text_embeddings.pt", text_model, text_tokenizer
)

Embeddings already exist at train_text_embeddings.pt
Embeddings already exist at val_text_embeddings.pt
Embeddings already exist at test_text_embeddings.pt


## Load Embeddings

In [None]:
def load_embeddings(embedding_path):
    """Load previously saved embeddings from ``embedding_path``.

    Args:
        embedding_path: Path of a file readable by ``torch.load``.

    Returns:
        Whatever object ``torch.load`` reads from the file.

    Raises:
        FileNotFoundError: If ``embedding_path`` does not exist.
    """
    # Guard clause: report the missing file before announcing the load.
    if not os.path.exists(embedding_path):
        raise FileNotFoundError(f"Embeddings file not found at {embedding_path}")

    print(f"Loading embeddings from {embedding_path}")
    return torch.load(embedding_path)

In [None]:
# Reload the cached embeddings from the same working-directory-relative files that
# the extraction cell writes, rather than from a hard-coded "/content/" absolute path
# that only matches when the working directory happens to be /content.
train_text_embeddings = load_embeddings("train_text_embeddings.pt")
val_text_embeddings = load_embeddings("val_text_embeddings.pt")
test_text_embeddings = load_embeddings("test_text_embeddings.pt")

Loading embeddings from /content/train_text_embeddings.pt
Loading embeddings from /content/val_text_embeddings.pt
Loading embeddings from /content/test_text_embeddings.pt


## Modeling

In [None]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, TensorDataset

In [None]:
def prepare_text_embeddings(text_embeddings, df, LABEL_VAR, has_labels=True):
    """Stack per-row embeddings (and optionally labels) into tensors.

    Rows of ``df`` whose index is not a key of ``text_embeddings`` are
    skipped, so the result can have fewer rows than ``df``.

    Args:
        text_embeddings: Mapping from DataFrame index to an embedding tensor;
            each tensor is squeezed before stacking.
        df: DataFrame whose index selects the embeddings, in row order.
        LABEL_VAR: Name of the label column (read only when ``has_labels``).
        has_labels: When True, also return the labels of the kept rows.

    Returns:
        ``(features, labels)`` when ``has_labels`` is True, otherwise just
        ``features``.
    """
    vectors = []
    targets = []

    for row_id, record in df.iterrows():
        # Rows without a stored embedding are left out entirely.
        if row_id not in text_embeddings:
            continue

        vectors.append(text_embeddings[row_id].squeeze())
        if has_labels:
            targets.append(record[LABEL_VAR])

    features = torch.stack(vectors)
    if not has_labels:
        return features
    return features, torch.tensor(targets)

In [None]:
# Convert the {row index: embedding} dicts into stacked feature tensors,
# plus label tensors for the labeled splits; the test split has no labels.
X_train, y_train = prepare_text_embeddings(train_text_embeddings, train_df, LABEL_VAR)
X_val, y_val = prepare_text_embeddings(val_text_embeddings, val_df, LABEL_VAR)
X_test = prepare_text_embeddings(test_text_embeddings, test_df, LABEL_VAR, has_labels=False)

# Sanity check: print the resulting tensor shapes.
print(f"Training data shape: {X_train.shape}, Labels: {y_train.shape}")
print(f"Validation data shape: {X_val.shape}, Labels: {y_val.shape}")
print(f"Test data shape: {X_test.shape}")

Training data shape: torch.Size([3257, 768]), Labels: torch.Size([3257])
Validation data shape: torch.Size([815, 768]), Labels: torch.Size([815])
Test data shape: torch.Size([1019, 768])


In [None]:
# Define the MLP model
class MLPModel(nn.Module):
    """Feed-forward classifier: two hidden layers with ReLU and dropout, then a linear output."""

    def __init__(self, input_dim, hidden_dim, output_dim, dropout_p=0.2):
        """
        Build the layers of the network.

        Args:
            input_dim (int): Number of input features.
            hidden_dim (list of int): Hidden layer widths; entries 0 and 1
                are the only ones read.
            output_dim (int): Number of output units.
            dropout_p (float): Dropout probability used after each hidden layer.
        """
        super().__init__()
        first_width = hidden_dim[0]
        second_width = hidden_dim[1]

        self.fc1 = nn.Linear(input_dim, first_width)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(p=dropout_p)
        self.fc2 = nn.Linear(first_width, second_width)
        self.dropout2 = nn.Dropout(p=dropout_p)
        self.fc3 = nn.Linear(second_width, output_dim)
        # Registered on the module but not applied in forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the output of ``fc3`` for ``x``; no softmax is applied."""
        hidden = self.dropout1(self.relu(self.fc1(x)))
        hidden = self.dropout2(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)

In [None]:
# Hyperparameters
# The input width and class count are derived from the prepared data.
input_dim = X_train.shape[1]
num_classes = len(train_df[LABEL_VAR].unique())
# NOTE(review): 786 may be a typo for 768 — confirm the intended hidden width.
hidden_dim = [786, 512]
output_dim = num_classes
batch_size = 16
num_epochs = 50
learning_rate = 0.0001
dropout_p = 0.2

In [None]:
# Prepare the data loaders
# The test dataset wraps the features only, since that split has no labels.
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test)

# Only the training loader shuffles; validation and test keep their row order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Initialize model, loss function, and optimizer
# The MLP is moved to `device`; Adam updates all of its parameters with `learning_rate`.
model = MLPModel(input_dim, hidden_dim, output_dim, dropout_p).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Function to calculate metrics
def calculate_metrics(preds, labels):
    """Return ``(accuracy, precision, recall, f1)`` for the given predictions.

    Precision, recall and F1 are macro-averaged (``average="macro"``).

    Args:
        preds: Predicted class ids.
        labels: True class ids, aligned with ``preds``.
    """
    acc = accuracy_score(labels, preds)
    macro_scores = precision_recall_fscore_support(labels, preds, average="macro")
    # The fourth element (support) is not part of the result.
    macro_precision, macro_recall, macro_f1 = macro_scores[0], macro_scores[1], macro_scores[2]
    return acc, macro_precision, macro_recall, macro_f1

In [None]:
# Longest training text, measured in characters (not tokens). It is stored under
# its own name so the tokenizer hyperparameter `max_length` is not overwritten.
max_text_chars = train_df[TEXT_VAR].str.len().max()
print(f"Maximum text length: {max_text_chars}")

Maximum text length: 3125


## Train and Val

In [None]:
# Train and save best model
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def train_and_save_best_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, save_dir):
    """Train ``model`` and checkpoint it whenever the validation F1 improves.

    Each epoch runs one pass over ``train_loader`` (with optimizer steps) and
    one pass over ``val_loader`` (under ``torch.no_grad``), prints the loss
    and the metrics returned by the notebook-level ``calculate_metrics``, and
    saves ``model.state_dict()`` into ``save_dir`` when the validation F1
    beats the best value seen so far. Tensors are moved to the notebook-level
    ``device``.

    Every batch is used, including a trailing batch that holds one sample,
    so the reported metrics cover all samples the loaders yield.

    Args:
        model: Module to train; called as ``model(inputs)``.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        num_epochs: Number of epochs to run.
        save_dir: Existing directory that receives the checkpoint files.

    Returns:
        Path of the checkpoint with the best validation F1, or ``None`` when
        no epoch ran.
    """
    best_f1 = -float('inf')
    best_model_path = None

    for epoch in range(num_epochs):
        # ---- Training phase ----
        model.train()
        train_loss = 0.0
        all_train_preds, all_train_labels = [], []

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            # Outputs are not squeezed, so a one-sample batch keeps its batch
            # dimension and needs no special handling.
            outputs = model(inputs)

            # Compute loss and backpropagate
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            _, preds = torch.max(outputs, dim=1)
            all_train_preds.extend(preds.cpu().tolist())
            all_train_labels.extend(labels.cpu().tolist())

        # Calculate training metrics
        train_accuracy, train_precision, train_recall, train_f1 = calculate_metrics(all_train_preds, all_train_labels)

        # ---- Validation phase ----
        model.eval()
        val_loss = 0.0
        all_val_preds, all_val_labels = [], []

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)

                loss = criterion(outputs, labels)
                val_loss += loss.item()

                _, preds = torch.max(outputs, dim=1)
                all_val_preds.extend(preds.cpu().tolist())
                all_val_labels.extend(labels.cpu().tolist())

        # Calculate validation metrics
        val_accuracy, val_precision, val_recall, val_f1 = calculate_metrics(all_val_preds, all_val_labels)

        # Losses are averaged over the number of batches in each loader.
        print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss/len(train_loader):.4f}, "
              f"Train Acc: {train_accuracy:.4f}, Prec: {train_precision:.4f}, Rec: {train_recall:.4f}, F1: {train_f1:.4f} | "
              f"Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {val_accuracy:.4f}, Prec: {val_precision:.4f}, "
              f"Rec: {val_recall:.4f}, F1: {val_f1:.4f}")

        # Save the model if it has the best F1 score on validation.
        # Each improvement writes a new file named after the epoch and score.
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_model_path = f"{save_dir}/best_model_epoch_{epoch + 1}_f1_{val_f1:.4f}.pth"
            torch.save(model.state_dict(), best_model_path)
            print(f"Best model saved with F1: {val_f1:.4f} at epoch {epoch + 1}")

    return best_model_path

In [None]:
# Set the directory where the best model will be saved
# and create it if it does not exist yet (no error when it already does).
save_dir = "./models"
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Train the model and save the best model.
# The epoch count comes from the `num_epochs` hyperparameter rather than a
# repeated literal, so editing the hyperparameter cell takes effect here.
best_model_path = train_and_save_best_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
    save_dir=save_dir
)

print(f"Best model saved at: {best_model_path}")

Epoch 1/50: Train Loss: 0.0342, Train Acc: 0.9874, Prec: 0.9875, Rec: 0.9873, F1: 0.9874 | Val Loss: 1.2886, Val Acc: 0.8110, Prec: 0.8113, Rec: 0.8110, F1: 0.8110
Best model saved with F1: 0.8110 at epoch 1
Epoch 2/50: Train Loss: 0.0523, Train Acc: 0.9810, Prec: 0.9810, Rec: 0.9809, F1: 0.9810 | Val Loss: 1.2363, Val Acc: 0.8000, Prec: 0.8002, Rec: 0.8000, F1: 0.8000
Epoch 3/50: Train Loss: 0.0446, Train Acc: 0.9813, Prec: 0.9813, Rec: 0.9812, F1: 0.9813 | Val Loss: 1.2322, Val Acc: 0.7975, Prec: 0.7978, Rec: 0.7976, F1: 0.7975
Epoch 4/50: Train Loss: 0.0525, Train Acc: 0.9794, Prec: 0.9795, Rec: 0.9794, F1: 0.9794 | Val Loss: 1.1919, Val Acc: 0.8012, Prec: 0.8029, Rec: 0.8011, F1: 0.8009
Epoch 5/50: Train Loss: 0.0388, Train Acc: 0.9859, Prec: 0.9859, Rec: 0.9858, F1: 0.9859 | Val Loss: 1.2703, Val Acc: 0.8061, Prec: 0.8140, Rec: 0.8058, F1: 0.8048
Epoch 6/50: Train Loss: 0.0550, Train Acc: 0.9804, Prec: 0.9804, Rec: 0.9803, F1: 0.9803 | Val Loss: 1.1532, Val Acc: 0.8074, Prec: 0.80

## Test

In [None]:
def predict_and_generate_submission(test_loader, best_model_path, submission_file_path):
    """Predict a class id for every test sample and write them to a CSV file.

    A fresh ``MLPModel`` is built from the notebook-level ``input_dim``,
    ``hidden_dim``, ``output_dim`` and ``dropout_p``, loaded with the weights
    stored at ``best_model_path`` and run over ``test_loader`` on the
    notebook-level ``device``. The predictions are paired with the
    ``TEXT_VAR`` column of the notebook-level ``test_df``.

    Args:
        test_loader: Iterable of batches; each batch is either a feature
            tensor or a list/tuple whose first element is the feature tensor.
        best_model_path: Path of a saved ``state_dict`` for ``MLPModel``.
        submission_file_path: Destination CSV path.

    Returns:
        The submission DataFrame with columns ``TEXT_VAR`` and ``predictions``.

    Raises:
        ValueError: If the number of predictions differs from ``len(test_df)``.
    """
    model = MLPModel(input_dim, hidden_dim, output_dim, dropout_p).to(device)
    # map_location makes a checkpoint saved on another device load onto `device`.
    state_dict = torch.load(best_model_path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()  # Set the model to evaluation mode

    test_predictions = []
    with torch.no_grad():
        for batch in test_loader:
            # Unwrap a list/tuple batch instead of stacking and squeezing it,
            # so a final batch with one sample keeps its batch dimension.
            features = batch[0] if isinstance(batch, (list, tuple)) else batch
            features = features.to(device)

            outputs = model(features)

            # Predicted class = index of the largest output per row.
            _, preds = torch.max(outputs, dim=1)
            test_predictions.extend(preds.cpu().tolist())

    if len(test_predictions) != len(test_df):
        raise ValueError(
            f"Got {len(test_predictions)} predictions for {len(test_df)} test rows"
        )

    # Prepare the submission DataFrame
    # NOTE(review): rows are identified by text and predictions are numeric class ids —
    # confirm this matches the expected submission format.
    submission_df = pd.DataFrame({
        TEXT_VAR: list(test_df[TEXT_VAR]),
        'predictions': test_predictions
    })

    # Save the predictions to a CSV file
    submission_df.to_csv(submission_file_path, index=False)
    print(f"Submission file saved to {submission_file_path}")

    return submission_df

In [None]:
# Run inference with the best checkpoint and write the predictions to submission.csv.
submission_file_path = "submission.csv"
submission_df = predict_and_generate_submission(test_loader=test_loader, best_model_path=best_model_path, submission_file_path=submission_file_path)

Submission file saved to submission.csv


In [None]:
# Preview the first rows of the generated submission.
submission_df.head()

Unnamed: 0,text,predictions
0,5000 ഉള്ള പോൾ ലോഗ്‌ഡ്‌വൻ ഇപ്പോള് 250000 എന്താ...,1
1,ഓഷോ രജനീഷ് പറഞ്ഞപോലെ എനിക്കപ്പോൾ തോന്നിയത് അ...,1
2,ചേട്ടാ വാർത്ത വയ്ക്കുന്നത് കേരളത്തിലാണ് സം...,1
3,Shame for entire Woman&#39;s of Kerala,1
4,135 code janaghal andhu wide business cheythal...,0
