# Download/Upload Data

# 2. Setting up the environment

In [1]:
# Tabular Data Analysis
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Utility
import time
import warnings
warnings.filterwarnings('ignore')

# 4. Load the dataset

In [2]:
# Read the labelled training file and preview its first three rows.
all_df = pd.read_csv(
    "/kaggle/input/dataset3/fake_news_classification_mal_train(3).csv"
)
all_df.head(n=3)

Unnamed: 0,ID,News,Label
0,FAKE_MAL_TR_0001,കേള്‍വി തകരാറുള്ള കുട്ടികള്‍ക്ക് നടത്തുന്ന സൗജ...,False
1,FAKE_MAL_TR_0002,ചന്ദ്രയാന് കേരള മുഖ്യമന്ത്രി പിണറായി വിജയൻ മാത...,False
2,FAKE_MAL_TR_0003,പിണറായി വിജയന്‍ സര്‍ക്കാര്‍ നിര്‍മിച്ച കേരളത്ത...,False


In [3]:
from sklearn.model_selection import train_test_split

# Split into train, validation (85% / 15%), stratified on the label column so
# both partitions keep the class proportions of all_df.
train_df, val_df = train_test_split(
    all_df, test_size=0.15, stratify=all_df["Label"], random_state=42
)
# Both partitions get their label column overwritten further down. Taking
# explicit copies makes them independent frames, so those assignments never go
# through pandas' chained-assignment (SettingWithCopy) path.
train_df, val_df = train_df.copy(), val_df.copy()

In [4]:
#val_df = pd.read_csv("/content/CIOL-Winter-ML-Bootcamp/datasets/ResearchWriting/o1/val.csv")
#val_df.head(3)

In [5]:
# Read the unlabelled test file and preview its first three rows.
test_df = pd.read_csv(
    "/kaggle/input/dataset3/fake_news_classification_mal_test.xlsx - Sheet1.csv"
)
test_df.head(n=3)

Unnamed: 0,S.no,News
0,FAKE_MAL_TE_0001,കേരളത്തില്‍ പുരുഷന്മാര്‍ക്ക് രണ്ട് ഭാര്യമാര്‍ ...
1,FAKE_MAL_TE_0002,പാർട്ടിയുടെ കൊടിക്ക് മഹത്വം ഉണ്ടെന്നും സംരംഭങ്...
2,FAKE_MAL_TE_0003,നവകേരള സദസ്സ്: കാട്ടാക്കട ക്രിസ്ത്യൻ കോളേജ് കവ...


In [6]:
# Column names used throughout the notebook: the input text and the target label.
TEXT_VAR, LABEL_VAR = "News", "Label"

Labels are not numerical. Let's make them numerical.

In [7]:
# Give every distinct training label an integer code, numbered in the order the
# labels first appear in train_df.
label_mapping = {
    label: code for code, label in enumerate(pd.unique(train_df[LABEL_VAR]))
}

# Apply the same mapping to both labelled splits (change as necessary).
train_df[LABEL_VAR] = train_df[LABEL_VAR].map(label_mapping)
val_df[LABEL_VAR] = val_df[LABEL_VAR].map(label_mapping)

# Modeling

## Load Things

In [8]:
import os
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer, AutoProcessor
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

In [9]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [10]:
# Seed PyTorch's global random number generator.
seed = 42
torch.random.manual_seed(seed)

<torch._C.Generator at 0x7cbdd3e73e50>

In [11]:
# Hyperparameters
batch_size = 8

# Checkpoints whose embeddings are concatenated. Add your model names here.
model_names = [
    "l3cube-pune/malayalam-topic-all-doc",
    "mdosama39/malayalam-bert-FakeNews-Dravidian-finalwithPP",
]
# Max token length for each model, in the same order as model_names.
# If a value is bigger than the actual model's max_length, it'll show an error.
max_lengths = [
    512,
    512,
]

## Collect Embeddings

In [12]:
# Function to extract and concatenate embeddings from multiple models
def extract_text_embeddings_multiple_models(df, save_path, model_names, max_lengths):
    """Build, or reload from disk, one concatenated embedding per row of ``df``.

    When ``save_path`` already exists its content is loaded with ``torch.load``
    and returned as-is; ``df`` and the models are not touched in that case, so
    delete the file if the data or the model list has changed.

    Otherwise every model in ``model_names`` is loaded once, each row's
    ``TEXT_VAR`` value is tokenized (padded/truncated to that model's max
    length), the hidden state at token position 0 is taken from each model, and
    the per-model vectors are concatenated along the feature dimension.

    Args:
        df: DataFrame holding the text column named by the global ``TEXT_VAR``.
        save_path: File the resulting dictionary is cached in.
        model_names: Checkpoint names passed to ``from_pretrained``.
        max_lengths: Token limit per checkpoint, same order as ``model_names``.

    Returns:
        dict mapping each ``df`` index label to its concatenated embedding tensor.

    Raises:
        ValueError: If ``model_names`` and ``max_lengths`` differ in length.
    """
    # zip() below would otherwise silently ignore the surplus models/lengths.
    if len(model_names) != len(max_lengths):
        raise ValueError(
            f"model_names has {len(model_names)} entries but max_lengths has "
            f"{len(max_lengths)}; they must line up one-to-one"
        )

    if os.path.exists(save_path):
        print(f"Embeddings already exist at {save_path}")
        # NOTE(review): torch.load unpickles the file; only point this at caches
        # produced by this notebook.
        return torch.load(save_path)

    # Initialize models and tokenizers once
    models = [AutoModel.from_pretrained(model_name).to(device).eval() for model_name in model_names]
    tokenizers = [AutoTokenizer.from_pretrained(model_name) for model_name in model_names]

    all_embeddings = {}
    with torch.no_grad():
        for idx, row in tqdm(df.iterrows(), desc="Extracting text embeddings", total=len(df)):
            transcription = row[TEXT_VAR]
            # Non-string cells (e.g. missing values) are embedded as the empty string.
            transcription = transcription if isinstance(transcription, str) else ""

            model_embeddings = []

            # Loop through each model and its corresponding tokenizer and max_length
            for model, tokenizer, max_len in zip(models, tokenizers, max_lengths):
                # Tokenize the text
                inputs = tokenizer(
                    transcription, padding="max_length", truncation=True, max_length=max_len, return_tensors="pt"
                )
                inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to GPU/CPU

                # Extract embeddings
                outputs = model(**inputs)
                cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token embeddings
                model_embeddings.append(cls_embedding.cpu())

            # Concatenate embeddings from all models along the feature dimension
            all_embeddings[idx] = torch.cat(model_embeddings, dim=1)

    torch.save(all_embeddings, save_path)
    return all_embeddings

In [13]:
def _embedding_cache_path(split_name):
    """Return the cache file name for one split ("train", "val" or "test").

    The name encodes every model's short name (text after the last "/") and its
    max length, so a different model/length combination gets a different file.
    """
    model_tags = "__".join(
        f"{model_name.split('/')[-1]}_{max_len}"
        for model_name, max_len in zip(model_names, max_lengths)
    )
    return f"{split_name}_text_embeddings__{model_tags}.pt"


# Collect and save embeddings for all models (training split)
train_save_path = _embedding_cache_path("train")
train_text_embeddings = extract_text_embeddings_multiple_models(
    train_df, train_save_path, model_names, max_lengths
)

# Collect and save embeddings for all models (validation split)
val_save_path = _embedding_cache_path("val")
val_text_embeddings = extract_text_embeddings_multiple_models(
    val_df, val_save_path, model_names, max_lengths
)

# Collect and save embeddings for all models (test split)
test_save_path = _embedding_cache_path("test")
test_text_embeddings = extract_text_embeddings_multiple_models(
    test_df, test_save_path, model_names, max_lengths
)

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/950M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/881 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/950M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Extracting text embeddings: 100%|██████████| 1615/1615 [01:42<00:00, 15.70it/s]
Extracting text embeddings: 100%|██████████| 285/285 [00:19<00:00, 14.42it/s]
Extracting text embeddings: 100%|██████████| 200/200 [00:14<00:00, 14.18it/s]


## Load Embeddings

In [14]:
def load_embeddings(embedding_path):
    """Load a saved embeddings object from ``embedding_path`` with ``torch.load``.

    Prints the path being loaded. Raises ``FileNotFoundError`` when the file
    does not exist.
    """
    if not os.path.exists(embedding_path):
        raise FileNotFoundError(f"Embeddings file not found at {embedding_path}")

    print(f"Loading embeddings from {embedding_path}")
    return torch.load(embedding_path)

In [15]:
# Re-read the cached embeddings from disk: train first, then val, then test.
train_text_embeddings, val_text_embeddings, test_text_embeddings = [
    load_embeddings(cache_file)
    for cache_file in (train_save_path, val_save_path, test_save_path)
]

Loading embeddings from train_text_embeddings__malayalam-topic-all-doc_512__malayalam-bert-FakeNews-Dravidian-finalwithPP_512.pt
Loading embeddings from val_text_embeddings__malayalam-topic-all-doc_512__malayalam-bert-FakeNews-Dravidian-finalwithPP_512.pt
Loading embeddings from test_text_embeddings__malayalam-topic-all-doc_512__malayalam-bert-FakeNews-Dravidian-finalwithPP_512.pt


## Modeling

In [16]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, TensorDataset

In [17]:
def prepare_text_embeddings(text_embeddings, df, LABEL_VAR, has_labels=True):
    """Stack the stored embeddings of ``df``'s rows into one tensor.

    Rows are visited in ``df`` order. A row whose index label is not a key of
    ``text_embeddings`` is skipped, so the result can have fewer rows than ``df``.

    Args:
        text_embeddings: Mapping from ``df`` index label to an embedding tensor.
        df: DataFrame whose index selects the embeddings.
        LABEL_VAR: Name of the label column (only read when ``has_labels``).
        has_labels: Whether to also collect the labels.

    Returns:
        ``(features, labels)`` tensors when ``has_labels`` is true, otherwise
        just the ``features`` tensor.
    """
    features = []
    targets = [] if has_labels else None

    for row_key, record in df.iterrows():
        if row_key not in text_embeddings:
            continue  # no embedding stored for this row

        # squeeze() drops size-1 dimensions from the stored tensor
        features.append(text_embeddings[row_key].squeeze())
        if has_labels:
            targets.append(record[LABEL_VAR])

    stacked = torch.stack(features)
    if not has_labels:
        return stacked
    return stacked, torch.tensor(targets)

In [18]:
# Turn the embedding dictionaries into dense tensors; only the labelled splits
# also yield a label tensor.
X_train, y_train = prepare_text_embeddings(
    train_text_embeddings, train_df, LABEL_VAR, has_labels=True
)
X_val, y_val = prepare_text_embeddings(
    val_text_embeddings, val_df, LABEL_VAR, has_labels=True
)
X_test = prepare_text_embeddings(
    test_text_embeddings, test_df, LABEL_VAR, has_labels=False
)

# Report the resulting shapes.
print(f"Training data shape: {X_train.shape}, Labels: {y_train.shape}")
print(f"Validation data shape: {X_val.shape}, Labels: {y_val.shape}")
print(f"Test data shape: {X_test.shape}")

Training data shape: torch.Size([1615, 1536]), Labels: torch.Size([1615])
Validation data shape: torch.Size([285, 1536]), Labels: torch.Size([285])
Test data shape: torch.Size([200, 1536])


In [19]:
from collections import Counter
def new_balance_classes(X, y):
    """Oversample minority classes by repeating their samples.

    Every sample is kept once. A sample whose class has ``c`` members is then
    repeated ``(max_count - c) // c`` additional times, where ``max_count`` is
    the size of the largest class. Because of the floor division the result is
    only approximately balanced: a class larger than half of ``max_count``
    gets no extra copies.

    The function is self-contained; it does not read any notebook-level
    variable such as the label column name.

    Args:
        X: Indexable collection of embedding tensors; ``X[i]`` belongs to ``y[i]``.
        y: 1-D labels (CPU tensor, NumPy array or list), same length as ``X``.

    Returns:
        ``(embeddings, labels)`` where ``embeddings`` is the stacked tensor of
        original and repeated samples and ``labels`` is the matching NumPy array.

    Raises:
        ValueError: If ``y`` is empty.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        raise ValueError("new_balance_classes() needs at least one sample")

    label_values = labels.tolist()

    # Count occurrences of each class and find the largest one
    class_counts = Counter(label_values)
    max_count = max(class_counts.values())

    embedding_lst = []
    label_lst = []
    for position, label in enumerate(label_values):
        count = class_counts[label]
        # 1 original + the extra copies this class needs to approach max_count
        copies = 1 + (max_count - count) // count
        embedding_lst.extend([X[position]] * copies)
        label_lst.extend([label] * copies)

    return torch.stack(embedding_lst), np.array(label_lst, dtype=labels.dtype)


# Oversample the training split only; validation and test data stay as they are.
# NOTE: this cell overwrites X_train / y_train, so re-running it feeds the
# already oversampled tensors back into new_balance_classes.
X_train, y_train = new_balance_classes(X_train, y_train)

# new_balance_classes returns X as a tensor and y as a NumPy array.
# torch.as_tensor converts the array and passes an existing tensor through,
# avoiding the copy-construct warning torch.tensor() raises for tensor input.
X_train = torch.as_tensor(X_train)
y_train = torch.as_tensor(y_train)

In [20]:
# Define the MLP model
class MLPModel(nn.Module):
    """Feed-forward classifier: two hidden layers with ReLU and dropout, then a linear output layer.

    ``forward`` returns the raw outputs of the last linear layer; the
    ``softmax`` attribute is created but not applied in ``forward``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout_p=0.5):
        """
        Build the layers.
        Args:
            input_dim (int): Size of each input feature vector.
            hidden_dim (list of int): Widths of the two hidden layers
                (entries 0 and 1 are used).
            output_dim (int): Number of outputs of the final layer.
            dropout_p (float): Dropout probability after each hidden layer.
        """
        super().__init__()
        first_width, second_width = hidden_dim[0], hidden_dim[1]

        self.fc1 = nn.Linear(input_dim, first_width)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(p=dropout_p)
        self.fc2 = nn.Linear(first_width, second_width)
        self.dropout2 = nn.Dropout(p=dropout_p)
        self.fc3 = nn.Linear(second_width, output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Map a batch of feature vectors to one output vector per sample."""
        hidden = self.dropout1(self.relu(self.fc1(x)))
        hidden = self.dropout2(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)

In [38]:
# Hyperparameters
# Sizes derived from the data
input_dim = X_train.size(1)  # width of one embedding vector
num_classes = len(pd.unique(train_df[LABEL_VAR]))  # distinct labels in the training frame
output_dim = num_classes

# Network and optimisation settings
hidden_dim = [786, 512]
dropout_p = 0.5
batch_size = 16
num_epochs = 500
learning_rate = 1e-4

In [39]:
# Prepare the data loaders
# Wrap the tensors as datasets; the test set has features only.
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [40]:
# Initialize model, loss function, and optimizer
model = MLPModel(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    dropout_p=dropout_p,
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [41]:
# Function to calculate metrics
def calculate_metrics(preds, labels):
    """Return ``(accuracy, precision, recall, f1)`` for ``preds`` against ``labels``.

    Precision, recall and F1 are macro-averaged over the classes.
    """
    accuracy = accuracy_score(labels, preds)
    macro_scores = precision_recall_fscore_support(labels, preds, average="macro")
    # macro_scores is (precision, recall, f1, support); the support entry is dropped
    return (accuracy, *macro_scores[:3])

## Train and Val

In [42]:
# Train and save best model
def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader``; update weights only when ``optimizer`` is given.

    Returns ``(summed_batch_loss, predictions, labels)`` with the last two as
    plain Python lists. The caller sets train/eval mode and the grad context.
    """
    total_loss = 0.0
    all_preds, all_labels = [], []

    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)

        if optimizer is not None:
            optimizer.zero_grad()

        # Keep the (batch, classes) shape: squeezing would collapse a final
        # batch of size 1 to 1-D and break both the loss and max(dim=1).
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        if optimizer is not None:
            loss.backward()
            optimizer.step()

        total_loss += loss.item()

        _, preds = torch.max(outputs, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

    return total_loss, all_preds, all_labels


def train_and_save_best_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, save_dir):
    """Train ``model`` and checkpoint it whenever the validation macro-F1 improves.

    Each epoch runs one training pass and one validation pass, prints the loss
    and metrics of both, and saves ``model.state_dict()`` to a new file in
    ``save_dir`` when the validation F1 beats the best value so far. Earlier
    checkpoint files are left in place.

    Args:
        model: Network to train; must already be on the global ``device``.
        train_loader: Loader yielding ``(inputs, labels)`` training batches.
        val_loader: Loader yielding ``(inputs, labels)`` validation batches.
        criterion: Loss taking ``(outputs, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of epochs to run.
        save_dir: Existing directory the checkpoints are written to.

    Returns:
        Path of the last checkpoint written (the best validation F1), or
        ``None`` if nothing was saved.
    """
    best_f1 = -float('inf')
    best_model_path = None

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss, all_train_preds, all_train_labels = _run_epoch(
            model, train_loader, criterion, optimizer
        )
        train_accuracy, train_precision, train_recall, train_f1 = calculate_metrics(all_train_preds, all_train_labels)

        # Validation phase: no weight updates, no gradient tracking
        model.eval()
        with torch.no_grad():
            val_loss, all_val_preds, all_val_labels = _run_epoch(model, val_loader, criterion)
        val_accuracy, val_precision, val_recall, val_f1 = calculate_metrics(all_val_preds, all_val_labels)

        print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss/len(train_loader):.4f}, "
              f"Train Acc: {train_accuracy:.4f}, Prec: {train_precision:.4f}, Rec: {train_recall:.4f}, F1: {train_f1:.4f} | "
              f"Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {val_accuracy:.4f}, Prec: {val_precision:.4f}, "
              f"Rec: {val_recall:.4f}, F1: {val_f1:.4f}")

        # Save the model if it has the best F1 score on validation
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_model_path = f"{save_dir}/best_model_epoch_{epoch + 1}_f1_{val_f1:.4f}.pth"
            torch.save(model.state_dict(), best_model_path)
            print(f"Best model saved with F1: {val_f1:.4f} at epoch {epoch + 1}")

    return best_model_path

In [43]:
# Directory that receives the best-model checkpoints; created if it is missing
# and reused if it already exists.
save_dir = "./models"
os.makedirs(name=save_dir, exist_ok=True)

In [44]:
# Train the model and save the best model
# Arguments in signature order: model, train loader, val loader, loss,
# optimizer, epoch count, checkpoint directory.
best_model_path = train_and_save_best_model(
    model, train_loader, val_loader, criterion, optimizer, num_epochs, save_dir
)

print(f"Best model saved at: {best_model_path}")

Epoch 1/500: Train Loss: 1.5358, Train Acc: 0.3163, Prec: 0.3088, Rec: 0.3142, F1: 0.2570 | Val Loss: 1.4685, Val Acc: 0.4877, Prec: 0.2476, Rec: 0.3679, F1: 0.2649
Best model saved with F1: 0.2649 at epoch 1
Epoch 2/500: Train Loss: 1.4505, Train Acc: 0.3796, Prec: 0.3558, Rec: 0.3780, F1: 0.3327 | Val Loss: 1.4076, Val Acc: 0.4772, Prec: 0.2856, Rec: 0.3819, F1: 0.2805
Best model saved with F1: 0.2805 at epoch 2
Epoch 3/500: Train Loss: 1.3976, Train Acc: 0.4020, Prec: 0.3708, Rec: 0.4004, F1: 0.3613 | Val Loss: 1.3770, Val Acc: 0.4456, Prec: 0.2725, Rec: 0.3583, F1: 0.2700
Epoch 4/500: Train Loss: 1.3569, Train Acc: 0.4260, Prec: 0.3949, Rec: 0.4247, F1: 0.3865 | Val Loss: 1.4276, Val Acc: 0.3965, Prec: 0.2690, Rec: 0.3653, F1: 0.2557
Epoch 5/500: Train Loss: 1.3094, Train Acc: 0.4598, Prec: 0.4314, Rec: 0.4587, F1: 0.4226 | Val Loss: 1.3181, Val Acc: 0.4982, Prec: 0.2939, Rec: 0.3712, F1: 0.2792
Epoch 6/500: Train Loss: 1.2628, Train Acc: 0.4674, Prec: 0.4308, Rec: 0.4665, F1: 0.42

## Test

In [45]:
def predict_and_generate_submission(test_loader, best_model_path, submission_file_path):
    """Predict a class for every test sample and write the result to a CSV file.

    A fresh ``MLPModel`` is built from the notebook-level ``input_dim``,
    ``hidden_dim``, ``output_dim`` and ``dropout_p``, the weights at
    ``best_model_path`` are loaded into it, and every batch of ``test_loader``
    is classified by taking the index of the largest output.

    Args:
        test_loader: Loader over the test features. Each batch is either a
            tensor or a list/tuple whose first element is the feature batch.
        best_model_path: Path of the saved ``state_dict`` to load.
        submission_file_path: Destination CSV path.

    Returns:
        DataFrame with the ``TEXT_VAR`` column of the global ``test_df`` and a
        ``predictions`` column holding the predicted class indices.

    Raises:
        ValueError: If the number of predictions differs from ``len(test_df)``.
    """
    # Load the best model with weights_only=True to avoid security warnings
    model = MLPModel(input_dim, hidden_dim, output_dim, dropout_p).to(device)
    model.load_state_dict(torch.load(best_model_path, weights_only=True))
    model.eval()  # Set the model to evaluation mode

    test_predictions = []
    with torch.no_grad():
        for batch in test_loader:
            # A loader over a single-tensor dataset yields [features]. Take the
            # features out instead of stacking the list, which would add a
            # leading dimension that then has to be squeezed away again and
            # breaks on a final batch of size 1.
            inputs = batch[0] if isinstance(batch, (list, tuple)) else batch
            inputs = torch.as_tensor(inputs).to(device)

            outputs = model(inputs)

            # Predicted class = index of the largest output for each sample
            _, preds = torch.max(outputs, dim=1)
            test_predictions.extend(preds.cpu().tolist())

    texts = test_df[TEXT_VAR].tolist()
    if len(test_predictions) != len(texts):
        raise ValueError(
            f"Got {len(test_predictions)} predictions for {len(texts)} test rows; "
            "the test loader and test_df are out of sync"
        )

    # Prepare the submission DataFrame
    # NOTE(review): predictions are integer class indices; presumably they need
    # mapping back to the original label values before submitting. Confirm.
    submission_df = pd.DataFrame({
        TEXT_VAR: texts,
        'predictions': test_predictions
    })

    # Save the predictions to a CSV file
    submission_df.to_csv(submission_file_path, index=False)
    print(f"Submission file saved to {submission_file_path}")

    return submission_df

In [46]:
# Classify the test split with the best checkpoint and write the predictions to disk.
submission_file_path = "submission.csv"
submission_df = predict_and_generate_submission(
    test_loader, best_model_path, submission_file_path
)

Submission file saved to submission.csv


In [47]:
# Preview the first five rows of the submission.
submission_df.head(n=5)

Unnamed: 0,News,predictions
0,കേരളത്തില്‍ പുരുഷന്മാര്‍ക്ക് രണ്ട് ഭാര്യമാര്‍ ...,1
1,പാർട്ടിയുടെ കൊടിക്ക് മഹത്വം ഉണ്ടെന്നും സംരംഭങ്...,0
2,നവകേരള സദസ്സ്: കാട്ടാക്കട ക്രിസ്ത്യൻ കോളേജ് കവ...,3
3,ശബരിമലയില്‍ അയ്യപ്പ ഭക്തന്‍റെ തല പോലീസ് അടിച്ച...,0
4,ബൈക്കുകള്‍ സ്വന്തം ജില്ലയില്‍ മാത്രം ഉപയോഗിക്ക...,0


If you use it, cite:

*Azmine Toushik Wasi. (2024). CIOL Presents Winter ML BootCamp. https://github.com/ciol-researchlab/CIOL-Winter-ML-Bootcamp*

```
@misc{wasi2024CIOL-WMLB,
      title={CIOL Presents Winter ML BootCamp},
      author={Azmine Toushik Wasi},
      year={2024},
      url={https://github.com/ciol-researchlab/CIOL-Winter-ML-Bootcamp},
}
```