# Pre-Processing
No need to run, files are already generated

In [6]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manishank/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
import re
from html import unescape
from nltk.corpus import stopwords

def clean_english_text(text):
    text = unescape(text)
    text = re.sub(r'<.*?>', '', text)
    text = text.lower()
    text = re.sub(r'[0-9]', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    stop_words = set(stopwords.words('english'))
    return ' '.join(word for word in text.split() if word not in stop_words)

def clean_hindi_text(text):
    text = unescape(text)
    text = re.sub(r'<.*?>', '', text)
    text = text.lower()
    text = re.sub(r'[0-9]', ' ', text)
    text = re.sub(r'[^\u0900-\u097F\u1CD0-\u1CFF\uA8E0-\uA8FF\u0964\u0965\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def parse_dataset(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
    pattern = r"'(.*?)','(.*?)'(?:\n|$)"
    pairs = re.findall(pattern, content, re.DOTALL)
    if not pairs:
        lines = content.strip().split("\n")
        pairs = [tuple(line.split(",", 1)) for line in lines if "," in line]
    return pairs

def clean_file_for_glove(input_path, output_path, lang='english'):
    pairs = parse_dataset(input_path)
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for _, text in pairs:
            cleaned = clean_english_text(text) if lang == 'english' else clean_hindi_text(text)
            if cleaned:
                outfile.write(cleaned + '\n')

def clean_file_with_labels(input_path, output_path, lang='english'):
    pairs = parse_dataset(input_path)
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for label, text in pairs:
            cleaned = clean_english_text(text) if lang == 'english' else clean_hindi_text(text)
            if cleaned:
                outfile.write(f"{label}\t{cleaned}\n")

import re
from html import unescape

def convert_raw_to_tab(input_path, output_path):
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()
    pattern = r"'([^']+)','(.*?)'(?:\n|$)"
    pairs = re.findall(pattern, content, re.DOTALL)
    with open(output_path, 'w', encoding='utf-8') as out:
        for label, text in pairs:
            text = clean_english_text(text)
            label = label.strip()
            text = unescape(text.strip().replace("\n", " "))
            out.write(f"{label}\t{text}\n")
    print(f"Converted {len(pairs)} records into tab-separated format at {output_path}")

def clean_english_training():
    datasets = [
        ('datasets/english/english_2500.txt', 'datasets/english/cleaned_english_2500.txt'),
        ('datasets/english/english_15000.txt', 'datasets/english/cleaned_english_15000.txt'),
        ('datasets/english/english_30000.txt', 'datasets/english/cleaned_english_30000.txt'),
    ]
    for in_path, out_path in datasets:
        clean_file_for_glove(in_path, out_path, 'english')

def clean_hindi_training():
    datasets = [
        ('datasets/hindi/hindi_2500.txt', 'datasets/hindi/cleaned_hindi_2500.txt'),
        ('datasets/hindi/hindi_15000.txt', 'datasets/hindi/cleaned_hindi_15000.txt'),
        ('datasets/hindi/hindi_30000.txt', 'datasets/hindi/cleaned_hindi_30000.txt'),
    ]
    for in_path, out_path in datasets:
        clean_file_for_glove(in_path, out_path, 'hindi')

def clean_hindi_test():
    datasets = [
        ('datasets/hindi/hindi_test.txt', 'datasets/hindi/cleaned_hindi_test_labelled.txt'),
    ]
    for in_path, out_path in datasets:
        clean_file_with_labels(in_path, out_path, 'hindi')

def clean_english_training_labelled():
    datasets = [
        ('datasets/english/english_2500.txt', 'datasets/english/cleaned_english_2500_labelled.txt'),
        ('datasets/english/english_15000.txt', 'datasets/english/cleaned_english_15000_labelled.txt'),
        ('datasets/english/english_30000.txt', 'datasets/english/cleaned_english_30000_labelled.txt'),
    ]
    for in_path, out_path in datasets:
        clean_file_with_labels(in_path, out_path, 'english')

def clean_hindi_training_labelled():
    datasets = [
        ('datasets/hindi/hindi_2500.txt', 'datasets/hindi/cleaned_hindi_2500_labelled.txt'),
        ('datasets/hindi/hindi_15000.txt', 'datasets/hindi/cleaned_hindi_15000_labelled.txt'),
        ('datasets/hindi/hindi_30000.txt', 'datasets/hindi/cleaned_hindi_30000_labelled.txt'),
    ]
    for in_path, out_path in datasets:
        clean_file_with_labels(in_path, out_path, 'hindi')

clean_english_training()
clean_hindi_training()
clean_hindi_test()
clean_english_training_labelled()
clean_hindi_training_labelled()

convert_raw_to_tab(
    "datasets/english/english_test.txt", 
    "datasets/english/english_final_test_cleaned.txt"
 )

Converted 6096 records into tab-separated format at datasets/english/english_final_test_cleaned.txt


In [8]:
convert_raw_to_tab(
    "datasets/english/english_30000.txt", 
    "datasets/english/english_final30k_train_cleaned.txt"
 )

Converted 28775 records into tab-separated format at datasets/english/english_final30k_train_cleaned.txt


In [9]:
convert_raw_to_tab(
    "datasets/english/english_15000.txt", 
    "datasets/english/english_final15k_train_cleaned.txt"
 )

Converted 15000 records into tab-separated format at datasets/english/english_final15k_train_cleaned.txt


In [10]:
convert_raw_to_tab(
    "datasets/english/english_2500.txt", 
    "datasets/english/english_final2.5k_train_cleaned.txt"
 )

Converted 2500 records into tab-separated format at datasets/english/english_final2.5k_train_cleaned.txt


# Building Co-Occurance matrix and saving it 
Necessary if we want to do the training of text to word embeddings model.
Files of this are not given, are not necessary to use trained models.

In [None]:
import numpy as np
from scipy.sparse import dok_matrix
import pickle
import os

def load_our_data(filepath):
    with open(filepath,'r',encoding = 'utf-8') as f:
        return [line.strip().split() for line in f]

def build_and_save_cooccurrence_matrix(training_data, output_dir, window=20):
    print(f"Loading data from {training_data}...")
    training = load_our_data(training_data)
    print("Building vocabulary...")
    total_words = [word for sentence in training for word in sentence]
    vocabulary = list(set(total_words))
    word_to_id = {word: i for i,word in enumerate(vocabulary)}
    id_to_word = {i:word for word,i in word_to_id.items()}
    size = len(vocabulary)
    print(f"Vocabulary size: {size}")
    print(f"Total sentences: {len(training)}")
    print(f"Window size: {window}")
    print("Building co-occurrence matrix...")
    x = dok_matrix((size,size), dtype=np.float32)
    for sentence_idx, sentence in enumerate(training):
        if sentence_idx % 5000 == 0:
            print(f"Processed {sentence_idx}/{len(training)} sentences ({sentence_idx/len(training)*100:.1f}%)")
        indice = [word_to_id.get(w) for w in sentence if w in word_to_id]
        for word_index, word in enumerate(indice):
            start = max(0, word_index - window)
            end = min(len(indice), word_index + window + 1)
            for context_index in range(start, end):
                if word_index == context_index:
                    continue
                distance = abs(context_index - word_index)
                context_word = indice[context_index]
                x[word, context_word] = x.get((word, context_word), 0) + 1.0 / distance
    print(f"Co-occurrence matrix built with {len(x.keys())} non-zero entries")
    os.makedirs(output_dir, exist_ok=True)
    print("Saving co-occurrence matrix...")
    non_zero_indices = list(x.keys())
    co_occurrence_data = {
        'indices': non_zero_indices,
        'values': [x[idx] for idx in non_zero_indices]
    }
    with open(f"{output_dir}/cooccurrence_data.pkl", "wb") as f:
        pickle.dump(co_occurrence_data, f)
    print("Saving vocabulary...")
    with open(f"{output_dir}/word_to_id.pkl", "wb") as f:
        pickle.dump(word_to_id, f)
    with open(f"{output_dir}/id_to_word.pkl", "wb") as f:
        pickle.dump(id_to_word, f)
    metadata = {
        'vocab_size': size,
        'window_size': window,
        'dataset': training_data,
        'total_sentences': len(training),
        'non_zero_entries': len(non_zero_indices)
    }
    with open(f"{output_dir}/preprocessing_metadata.pkl", "wb") as f:
        pickle.dump(metadata, f)
    print(f"All preprocessing data saved to {output_dir}/")
    print(f"Files created:")
    print(f"  - cooccurrence_data.pkl ({len(non_zero_indices)} entries)")
    print(f"  - word_to_id.pkl ({size} words)")
    print(f"  - id_to_word.pkl ({size} words)")
    print(f"  - preprocessing_metadata.pkl")
    return output_dir

if __name__ == "__main__":
    training_data = 'datasets/english/cleaned_english_30000.txt'
    output_dir = "models/matrixes/preprocessed_30k_w20"
    window = 20
    build_and_save_cooccurrence_matrix(training_data, output_dir, window)

# Training text to word embedding model using the Co-Occurance matrix
Trained model files are in the same directory, No need to run this



 Note this code only works on cuda enabled systems


 Requirement: cupy-cuda12x or cupy-cuda13x

In [None]:
import numpy as np
import cupy as cp
import pickle
import os

def load_preprocessed_data(data_dir):
    print(f"Loading preprocessed data from {data_dir}...")
    with open(f"{data_dir}/cooccurrence_data.pkl", "rb") as f:
        co_occurrence_data = pickle.load(f)
    with open(f"{data_dir}/word_to_id.pkl", "rb") as f:
        word_to_id = pickle.load(f)
    with open(f"{data_dir}/id_to_word.pkl", "rb") as f:
        id_to_word = pickle.load(f)
    with open(f"{data_dir}/preprocessing_metadata.pkl", "rb") as f:
        metadata = pickle.load(f)
    print(f"Loaded:")
    print(f"  - Vocabulary: {metadata['vocab_size']} words")
    print(f"  - Co-occurrences: {metadata['non_zero_entries']} entries")
    print(f"  - Window size: {metadata['window_size']}")
    print(f"  - Dataset: {metadata['dataset']}")
    return co_occurrence_data, word_to_id, id_to_word, metadata

def safe_gpu_init():
    try:
        print("Initializing GPU...")
        device_count = cp.cuda.runtime.getDeviceCount()
        print(f"Found {device_count} GPU(s)")
        cp.cuda.Device(0).use()
        test = cp.array([1, 2, 3])
        result = cp.sum(test)
        print(f"GPU available: {cp.cuda.Device()}")
        print(f"Test operation successful: {result}")
        del test
        try:
            cp.get_default_memory_pool().free_all_blocks()
            print("GPU memory cleared")
        except Exception as mem_error:
            print(f"Could not clear memory pool: {mem_error}")
            print("Continuing anyway...")
        return True
    except Exception as e:
        print(f"GPU initialization failed: {e}")
        print("1. Check GPU status: nvidia-smi")
        print("2. Restart system: sudo reboot")
        print("3. Kill Python processes: sudo pkill -f python")
        print("4. Reset GPU: sudo nvidia-smi --gpu-reset")
        return False

def train_glove_gpu(data_dir, dimension=100, epochs=100, learning_rate=0.01, 
                   x_max=100, alpha=0.75, batch_size=30000):
    co_occurrence_data, word_to_id, id_to_word, metadata = load_preprocessed_data(data_dir)
    if not safe_gpu_init():
        print("Cannot proceed without GPU")
        return None, None, None, None
    non_zero_indices = co_occurrence_data['indices']
    non_zero_values = co_occurrence_data['values']
    vocab_size = metadata['vocab_size']
    print(f"Training Configuration:")
    print(f"  - Embedding dimension: {dimension}")
    print(f"  - Epochs: {epochs}")
    print(f"  - Learning rate: {learning_rate}")
    print(f"  - Batch size: {batch_size}")
    def weighting_function(x):
        return cp.where(x < x_max, (x/x_max) ** alpha, 1.0)
    print(f"Initializing embeddings ({vocab_size} x {dimension})...")
    try:
        cp.random.seed(0)
        word_vec = cp.random.uniform(-0.01, 0.01, (vocab_size, dimension), dtype=cp.float32)
        context_vec = cp.random.uniform(-0.01, 0.01, (vocab_size, dimension), dtype=cp.float32)
        b = cp.zeros(vocab_size, dtype=cp.float32)
        c = cp.zeros(vocab_size, dtype=cp.float32)
        print("Embeddings initialized on GPU")
    except Exception as e:
        print(f"Failed to allocate embeddings: {e}")
        print("Try reducing dimension size")
        return None, None, None, None
    print("Converting co-occurrence data to GPU...")
    try:
        all_indices_i = cp.array([idx[0] for idx in non_zero_indices], dtype=cp.int32)
        all_indices_j = cp.array([idx[1] for idx in non_zero_indices], dtype=cp.int32)
        all_values = cp.array(non_zero_values, dtype=cp.float32)
        print(f"{len(non_zero_indices)} co-occurrences loaded to GPU")
    except Exception as e:
        print(f"Failed to load co-occurrence data to GPU: {e}")
        return None, None, None, None
    def print_gpu_memory():
        mempool = cp.get_default_memory_pool()
        print(f"GPU memory used: {mempool.used_bytes() / 1024**3:.2f} GB")
    print_gpu_memory()
    print(f"Starting training...")
    for epoch in range(epochs):
        total_loss = 0
        for batch_start in range(0, len(non_zero_indices), batch_size):
            batch_end = min(batch_start + batch_size, len(non_zero_indices))
            indices_i = all_indices_i[batch_start:batch_end]
            indices_j = all_indices_j[batch_start:batch_end]
            values = all_values[batch_start:batch_end]
            Xij = values
            wgt = weighting_function(Xij)
            word_vecs_batch = word_vec[indices_i]
            context_vecs_batch = context_vec[indices_j]
            b_batch = b[indices_i]
            c_batch = c[indices_j]
            dots = cp.sum(word_vecs_batch * context_vecs_batch, axis=1)
            diff = dots + b_batch + c_batch - cp.log(Xij + 1)
            batch_loss = cp.sum(wgt * diff ** 2)
            total_loss += float(batch_loss)
            grad = 2 * wgt * diff
            grad_reshaped = grad.reshape(-1, 1)
            word_vec_updates = grad_reshaped * context_vecs_batch
            context_vec_updates = grad_reshaped * word_vecs_batch
            cp.add.at(word_vec, indices_i, -learning_rate * word_vec_updates)
            cp.add.at(context_vec, indices_j, -learning_rate * context_vec_updates)
            cp.add.at(b, indices_i, -learning_rate * grad)
            cp.add.at(c, indices_j, -learning_rate * grad)
            del word_vecs_batch, context_vecs_batch, b_batch, c_batch
            del dots, diff, grad, grad_reshaped, word_vec_updates, context_vec_updates
        if epoch % 10 == 0:
            print(f"Epoch {epoch:3d}, Loss: {total_loss:.4f}")
            print_gpu_memory()
    embeddings = word_vec + context_vec
    embeddings_cpu = cp.asnumpy(embeddings)
    print(f"Training completed!")
    print(f"Final embeddings shape: {embeddings_cpu.shape}")
    return embeddings_cpu, word_to_id, id_to_word, metadata

def save_trained_model(embeddings, word_to_id, id_to_word, metadata, 
                      training_config, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    np.save(f"{output_dir}/embeddings.npy", embeddings)
    with open(f"{output_dir}/word_to_id.pkl", "wb") as f:
        pickle.dump(word_to_id, f)
    with open(f"{output_dir}/id_to_word.pkl", "wb") as f:
        pickle.dump(id_to_word, f)
    complete_metadata = {**metadata, **training_config}
    with open(f"{output_dir}/metadata.pkl", "wb") as f:
        pickle.dump(complete_metadata, f)
    print(f"Model saved to {output_dir}/")

if __name__ == "__main__":
    preprocessed_dir = "models/matrixes/preprocessed_30k_w20"
    model_output_dir = "models/test_to_word_embeddings/glove_model_30k"
    dimension = 300
    epochs = 500
    learning_rate = 0.005
    batch_size = 10000
    print("Starting GloVe Training")
    print("=" * 40)
    result = train_glove_gpu(
        preprocessed_dir, 
        dimension=dimension,
        epochs=epochs,
        learning_rate=learning_rate,
        batch_size=batch_size
    )
    embeddings, word_to_id, id_to_word, metadata = result
    if embeddings is not None:
        print("Training successful! Saving model...")
        training_config = {
            'embedding_dim': dimension,
            'epochs': epochs,
            'learning_rate': learning_rate,
            'batch_size': batch_size
        }
        save_trained_model(embeddings, word_to_id, id_to_word, metadata, 
                          training_config, model_output_dir)
        print(f"Complete! Model saved to: {model_output_dir}")
    else:
        print("Training failed!")
        print("Next steps:")
        print("1. Fix GPU issues (restart system)")
        print("2. Or reduce dimension/batch_size")
        print("3. Or try CPU-based training instead")

# Training the Random Forest classifier and saving pkl file

In [6]:
!pip install tensorflow

Collecting tensorflow
[0m  Downloading tensorflow-2.20.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (4.5 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Using cached flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting protobuf>=5.28.0 (from tensorflow)
  Downloading protobuf-6.32.0-cp39-abi3-macosx_10_9_universal2.whl.metadata (593 bytes)
Collecting termcolor>=1.1.0 

In [None]:
import numpy as np
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

def load_glove_model(model_dir):
    embeddings = np.load(f"{model_dir}/embeddings.npy")
    with open(f"{model_dir}/word_to_id.pkl", "rb") as f:
        word_to_id = pickle.load(f)
    with open(f"{model_dir}/id_to_word.pkl", "rb") as f:
        id_to_word = pickle.load(f)
    with open(f"{model_dir}/metadata.pkl", "rb") as f:
        metadata = pickle.load(f)
    return embeddings, word_to_id, id_to_word, metadata

def text_to_indices(text, word_to_id, max_seq_len=600):
    tokens = text.lower().split()
    indices = [word_to_id.get(w, 0) for w in tokens]
    if len(indices) > max_seq_len:
        indices = indices[:max_seq_len]
    return indices

def prepare_cnn_dataset(file_path, word_to_id, max_seq_len=600):
    sequences, labels = [], []
    print(f"Loading dataset from {file_path}...")
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            parts = line.split('\t', 1)
            if len(parts) < 2:
                print(f"Warning: Skipping malformed line {line_num}")
                continue
            genres_part = parts[0]
            text_part = parts[1]
            primary_genre = genres_part.split(',')[0].strip()
            indices = text_to_indices(text_part, word_to_id, max_seq_len)
            sequences.append(torch.tensor(indices, dtype=torch.long))
            labels.append(primary_genre)
    print(f"Loaded {len(sequences)} samples")
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
    labels_array = np.array(labels)
    counts = Counter(labels_array)
    print(f"Label distribution before filtering:")
    for label, count in sorted(counts.items(), key=lambda x: x[1], reverse=True):
        print(f"  {label}: {count} samples")
    filtered_indices = [i for i, label in enumerate(labels_array) if counts[label] >= 3]
    print(f"Filtered out {len(labels_array) - len(filtered_indices)} samples with rare labels")
    print(f"Final dataset size: {len(filtered_indices)} samples")
    return padded_sequences[filtered_indices], labels_array[filtered_indices]

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, pretrained_embeddings=None, 
                 filter_sizes=[3, 4, 5], num_filters=128, dropout=0.5, max_seq_len=600):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
        self.convs = nn.ModuleList([
            nn.Conv1d(embedding_dim, num_filters, kernel_size=fs)
            for fs in filter_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(filter_sizes) * num_filters, num_classes)
    def forward(self, x):
        x = self.embedding(x)
        x = x.transpose(1, 2)
        conv_outputs = []
        for conv in self.convs:
            conv_out = F.relu(conv(x))
            pooled = F.max_pool1d(conv_out, kernel_size=conv_out.size(2))
            conv_outputs.append(pooled.squeeze(2))
        x = torch.cat(conv_outputs, dim=1)
        x = self.dropout(x)
        x = self.fc(x)
        return x

def train_cnn_model(model_dir, dataset_path, output_pkl, max_seq_len=600, 
                   filter_sizes=[3,4,5], num_filters=128, epochs=30, 
                   batch_size=32, lr=0.001, dropout=0.3, weight_decay=0.0005):
    print(f"Training CNN with {model_dir}...")
    print(f"Configuration:")
    print(f"  - Max sequence length: {max_seq_len}")
    print(f"  - Filter sizes: {filter_sizes}")
    print(f"  - Number of filters per size: {num_filters}")
    print(f"  - Batch size: {batch_size}")
    print(f"  - Learning rate: {lr}")
    print(f"  - Weight decay (L2): {weight_decay}")
    print(f"  - Dropout: {dropout}")
    print(f"  - Epochs: {epochs}")
    embeddings, word_to_id, id_to_word, metadata = load_glove_model(model_dir)
    vocab_size, embedding_dim = embeddings.shape
    X, y = prepare_cnn_dataset(dataset_path, word_to_id, max_seq_len)
    unique_labels = sorted(set(y))
    label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
    idx_to_label = {idx: label for label, idx in label_to_idx.items()}
    y_idx = np.array([label_to_idx[label] for label in y])
    print(f"Dataset info:")
    print(f"  - Sequences shape: {X.shape}")
    print(f"  - Number of classes: {len(unique_labels)}")
    print(f"  - Vocabulary size: {vocab_size}")
    print(f"  - Embedding dimension: {embedding_dim}")
    print(f"Final label distribution:")
    final_counts = Counter(y)
    for label, count in sorted(final_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"  {label}: {count} samples")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_idx, test_size=0.2, random_state=42, stratify=y_idx
    )
    print(f"Train-Test Split:")
    print(f"  - Training samples: {len(X_train)}")
    print(f"  - Test samples: {len(X_test)}")
    print(f"  - Train ratio: {len(X_train)/len(X)*100:.1f}%")
    print(f"  - Test ratio: {len(X_test)/len(X)*100:.1f}%")
    class_counts = np.bincount(y_train)
    class_weights = 1. / class_counts
    class_weights = class_weights / class_weights.sum() * len(class_weights)
    weights = torch.tensor(class_weights, dtype=torch.float)
    print(f"Training set class distribution:")
    for i, (label, count) in enumerate(zip([idx_to_label[i] for i in range(len(unique_labels))], class_counts)):
        print(f"  {label}: {count} samples (weight: {class_weights[i]:.3f})")
    print(f"Test set class distribution:")
    test_class_counts = np.bincount(y_test)
    for i, (label, count) in enumerate(zip([idx_to_label[i] for i in range(len(unique_labels))], test_class_counts)):
        print(f"  {label}: {count} samples")
    train_dataset = TensorDataset(X_train, torch.tensor(y_train, dtype=torch.long))
    test_dataset = TensorDataset(X_test, torch.tensor(y_test, dtype=torch.long))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    if torch.backends.mps.is_available() and torch.backends.mps.is_built():
     device = torch.device("mps")
    elif torch.cuda.is_available():
     device = torch.device("cuda")
    else:
     device = torch.device("cpu")
    print(f"Using device: {device}")
    model = TextCNN(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        num_classes=len(unique_labels),
        pretrained_embeddings=embeddings,
        filter_sizes=filter_sizes,
        num_filters=num_filters,
        dropout=dropout,
        max_seq_len=max_seq_len
    ).to(device)
    criterion = nn.CrossEntropyLoss(weight=weights.to(device))
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    checkpoint_dir = "rnn"
    os.makedirs(checkpoint_dir, exist_ok=True)
    best_test_acc = 0.0
    train_losses = []
    train_accuracies = []
    val_accuracies = []
    patience_loss = 3
    patience_acc = 3
    min_loss_improvement = 0.005
    best_loss = float('inf')
    epochs_without_loss_improvement = 0
    epochs_without_acc_improvement = 0
    early_stopped = False
    best_model_state = None
    best_model_epoch = 0
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            total += target.size(0)
        avg_loss = total_loss / len(train_loader)
        train_accuracy = 100. * correct / total
        train_losses.append(avg_loss)
        train_accuracies.append(train_accuracy)
        loss_improvement = 0.0
        if epoch == 0:
            best_loss = avg_loss
            epochs_without_loss_improvement = 0
        else:
            if avg_loss < best_loss:
                loss_improvement = (best_loss - avg_loss) / best_loss
                if loss_improvement >= min_loss_improvement:
                    best_loss = avg_loss
                    epochs_without_loss_improvement = 0
                else:
                    epochs_without_loss_improvement += 1
            else:
                epochs_without_loss_improvement += 1
        if (epoch + 1) % 5 == 0:
            model.eval()
            val_correct = 0
            val_total = 0
            with torch.no_grad():
                for data, target in test_loader:
                    data, target = data.to(device), target.to(device)
                    output = model(data)
                    pred = output.argmax(dim=1)
                    val_correct += pred.eq(target).sum().item()
                    val_total += target.size(0)
            val_accuracy = 100. * val_correct / val_total
            val_accuracies.append(val_accuracy)
            print(f'Epoch {epoch+1:2d}/{epochs}, Loss: {avg_loss:.4f} (↓{loss_improvement*100:.2f}%), Train Acc: {train_accuracy:.2f}%, Test Acc: {val_accuracy:.2f}%')
            if val_accuracy > best_test_acc:
                best_test_acc = val_accuracy
                epochs_without_acc_improvement = 0
                best_model_state = model.state_dict().copy()
                best_model_epoch = epoch + 1
                print(f"    New best test accuracy: {val_accuracy:.2f}% (saved)")
            else:
                epochs_without_acc_improvement += 1
                print(f"    No test accuracy improvement for {epochs_without_acc_improvement} epoch(s)")
            checkpoint_path = os.path.join(checkpoint_dir, f"cnn_model_epoch_{epoch+1}.pkl")
            checkpoint_data = {
                "epoch": epoch + 1,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
                "train_loss": avg_loss,
                "train_accuracy": train_accuracy,
                "val_accuracy": val_accuracy,
                "label_to_idx": label_to_idx,
                "idx_to_label": idx_to_label,
                "vocab_size": vocab_size,
                "embedding_dim": embedding_dim,
                "num_classes": len(unique_labels),
                "filter_sizes": filter_sizes,
                "num_filters": num_filters,
                "max_seq_len": max_seq_len,
                "dropout": dropout,
                "weight_decay": weight_decay
            }
            with open(checkpoint_path, "wb") as f:
                pickle.dump(checkpoint_data, f)
            print(f"    Checkpoint saved: {checkpoint_path}")
            model.train()
        if epoch >= 4:
            if epochs_without_acc_improvement >= patience_acc:
                print(f"Early stopping triggered after epoch {epoch+1}")
                print(f"Test accuracy hasn't improved for {epochs_without_acc_improvement} validation checks")
                print(f"Best test accuracy: {best_test_acc:.2f}% at epoch {best_model_epoch}")
                early_stopped = True
                break
            if epochs_without_loss_improvement >= patience_loss:
                print(f"Early stopping triggered after epoch {epoch+1}")
                print(f"Loss improvement of {loss_improvement*100:.3f}% is below threshold of {min_loss_improvement*100:.1f}%")
                print(f"No significant loss improvement for {epochs_without_loss_improvement} epochs")
                early_stopped = True
                break
    if early_stopped and best_model_state is not None:
        print(f"Restoring best model from epoch {best_model_epoch}")
        model.load_state_dict(best_model_state)
    print(f"FINAL EVALUATION ON TEST SET")
    model.eval()
    y_true, y_pred, y_probs = [], [], []
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            probs = F.softmax(output, dim=1)
            pred = output.argmax(dim=1)
            y_true.extend(target.cpu().numpy())
            y_pred.extend(pred.cpu().numpy())
            y_probs.extend(probs.cpu().numpy())
    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    print(f'Test Accuracy:           {acc:.4f} ({acc*100:.2f}%)')
    print(f'Weighted Precision:      {precision:.4f}')
    print(f'Weighted Recall:         {recall:.4f}')
    print(f'Weighted F1-Score:       {f1:.4f}')
    print(f'Macro Precision:         {macro_precision:.4f}')
    print(f'Macro Recall:            {macro_recall:.4f}')
    print(f'Macro F1-Score:          {macro_f1:.4f}')
    print(f'Best Test Accuracy:      {best_test_acc:.2f}%')
    target_names = [idx_to_label[i] for i in sorted(set(y_true + y_pred))]
    print(classification_report(y_true, y_pred, target_names=target_names, digits=3))
    for i in sorted(set(y_true + y_pred)):
        class_mask = np.array(y_true) == i
        if np.sum(class_mask) > 0:
            class_acc = np.sum(np.array(y_pred)[class_mask] == i) / np.sum(class_mask)
            print(f'{idx_to_label[i]:<25}: {class_acc:.3f} ({class_acc*100:.1f}%)')
    cm = confusion_matrix(y_true, y_pred)
    print(f'Total predictions: {len(y_true)}')
    print(f'Correct predictions: {np.trace(cm)}')
    print(f'Incorrect predictions: {len(y_true) - np.trace(cm)}')
    print(f'Total epochs completed: {len(train_accuracies)}')
    print(f'Early stopped: {"Yes" if early_stopped else "No"}')
    if early_stopped:
        print(f'Best model from epoch: {best_model_epoch}')
        print(f'Best test accuracy: {best_test_acc:.2f}%')
    print(f'Final training accuracy: {train_accuracies[-1]:.2f}%')
    print(f'Final training loss: {train_losses[-1]:.4f}')
    print(f'Best training loss: {min(train_losses):.4f}')
    print(f'Training improvement: {train_accuracies[-1] - train_accuracies[0]:.2f}%')
    print(f'Checkpoints saved in: {checkpoint_dir}/')
    final_model_path = output_pkl
    model_data = {
        "final_epoch": len(train_accuracies),
        "best_epoch": best_model_epoch if early_stopped else len(train_accuracies),
        "model_state": model.state_dict(),
        "label_to_idx": label_to_idx,
        "idx_to_label": idx_to_label,
        "vocab_size": vocab_size,
        "embedding_dim": embedding_dim,
        "num_classes": len(unique_labels),
        "filter_sizes": filter_sizes,
        "num_filters": num_filters,
        "max_seq_len": max_seq_len,
        "dropout": dropout,
        "weight_decay": weight_decay,
        "final_train_loss": train_losses[-1],
        "final_train_accuracy": train_accuracies[-1],
        "best_val_accuracy": best_test_acc,
        "early_stopped": early_stopped,
        "test_accuracy": acc,
        "test_f1": f1,
        "val_accuracies": val_accuracies,
        "train_losses_history": train_losses,
        "train_accuracies_history": train_accuracies
    }
    with open(final_model_path, "wb") as f:
        pickle.dump(model_data, f)
    print(f"Final model saved to {final_model_path}")
    print(f"Checkpoints available in {checkpoint_dir}/ folder")
    return model, acc, f1

models_to_train = [
    {
        "model_dir": "embeddings/glove_model_30k",
        "dataset_path": "datasets/hindi/cleaned_hindi_test_labelled.txt",
        "output_pkl": "models/classification_models/engnew30k_cnn.pkl",
        "max_seq_len": 600,
        "filter_sizes": [2, 3, 4, 5, 6],
        "num_filters": 128,
        "epochs": 5,
        "batch_size": 32,
        "lr": 0.001,
        "weight_decay": 0.001
    }
]
for config in models_to_train:
    train_cnn_model(**config)

Training CNN with models/text_to_word_embeddings/glove_model_hindi_30k_d150...
Configuration:
  - Max sequence length: 600
  - Filter sizes: [2, 3, 4, 5, 6]
  - Number of filters per size: 128
  - Batch size: 32
  - Learning rate: 0.001
  - Weight decay (L2): 0.001
  - Dropout: 0.3
  - Epochs: 5
Loading dataset from datasets/hindi/cleaned_hindi_test_labelled.txt...
Loaded 30000 samples
Label distribution before filtering:
  [entertainment]: 3431 samples
  [politics: 3408 samples
  [national]: 3068 samples
  [miscellaneous]: 2744 samples
  [world: 2579 samples
  [business]: 2228 samples
  [technology]: 2201 samples
  [world]: 2100 samples
  [sports: 1657 samples
  [विश्व_कप_2023: 1572 samples
  [business: 1388 samples
  [national: 1003 samples
  [miscellaneous: 975 samples
  [politics]: 628 samples
  [sports]: 561 samples
  [entertainment: 208 samples
  [hatke: 186 samples
  [automobile: 37 samples
  [technology: 13 samples
  [एशियन_गेम्स_2022: 11 samples
  [एशिया_कप_2023: 2 samples
Fil