## ELMO Implementation version 2

In [1]:
!pip install indic-nlp-library wandb tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import os
import re
import numpy as np
from indicnlp.tokenize import indic_tokenize
from torch.nn.utils.rnn import pad_sequence

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.4.0-py3-none-any.whl.metadata (1.9 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinx>=1.2.0 (from sphinx-argparse->indic-nlp-library)
  Downloading sphinx-7.2.6-py3-none-any.whl.metadata (5.9 kB)
Collecting sphinxcontrib-applehelp (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library)
  Downloading sphinxcontrib_applehelp-1.0.8-py3-none-any.whl.metadata (2.3 kB)
Collecting sphinxcontrib-devhelp (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library)
  Downloading sphinxcontrib_devhelp-1.0.6-py3-none-any.whl.metadata (2.3 kB)
Collecting sphinxcontrib-jsmath (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library)
  Downloading sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting sphinxcontrib-h

In [2]:
import fasttext
import fasttext.util
ft_model = fasttext.load_model('/kaggle/input/pre-trained-model-indicft/indicnlp.ft.mr.300.bin')
word = "नृत्य"
print("Embedding Shape is {}".format(ft_model.get_word_vector(word)))

Embedding Shape is [ 0.41362646 -0.0721162  -0.09042576 -0.0080447   0.24940777 -0.0564224
 -0.06691281 -0.02353338 -0.01068915  0.22869106 -0.05456081  0.05685291
 -0.3771104   0.17173615 -0.19166155  0.05876774  0.2110943  -0.06690847
 -0.17985937  0.19748911 -0.17697716  0.0982177  -0.73754513  0.26441753
 -0.14425668 -0.3502157  -0.0930915  -0.26033682  0.04246099 -0.0807714
  0.25401157  0.62080336  0.02260456  0.16584569 -0.08181361  0.18448925
  0.06636861  0.18036523 -0.24447897  0.0946254  -0.05784336  0.27843988
 -0.09996741  0.14146516 -0.2521708  -0.01767177 -0.03513876  0.16193527
 -0.4139789  -0.06065518 -0.13225324  0.0381115   0.404005   -0.39212966
  0.45432544 -0.18739994  0.16050169 -0.41535494  0.09758026  0.12121335
 -0.464044    0.05734312 -0.11185544  0.0205804  -0.03070647 -0.02953663
  0.43329865 -0.25726065 -0.2399962   0.17885959  0.03350684  0.03437545
 -0.43484426 -0.05221066  0.07860021 -0.32815468  0.3373454  -0.16823411
  0.5529572  -0.2737693   0.321741



In [3]:
import os
import re
from indicnlp.tokenize import indic_tokenize
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import fasttext
from tqdm import tqdm

folder_path = '/kaggle/input/micro-marathi-dataset'
model_path = '/kaggle/input/pre-trained-model-indicft/indicnlp.ft.mr.300.bin'  # Path to the INDICFT model

# loading the pre-trained model
ft_model = fasttext.load_model(model_path)

token_to_index = {'<PAD>': 0, '<UNK>': 1, '<SOS>':2, '<EOS>':3}
next_token_index = 4  

# This might be unnecessary if using fixed embeddings from INDICFT, but keeping for consistency
def update_indices(token_list, token_to_index):
    global next_token_index
    for token in token_list:
        if token not in token_to_index:
            token_to_index[token] = next_token_index
            next_token_index += 1

texts = []  
threshold = 256

# Loading and preprocessing texts
for file_name in os.listdir(folder_path):
    if file_name.endswith('.txt'):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        sentences = re.split(r'[।\n\.]+', text)
        sentences = ["<SOS> "+sentence.strip()+" <EOS>" for sentence in sentences if sentence.strip()]

        for sentence in sentences:
            tokens = indic_tokenize.trivial_tokenize(sentence, lang='mr')
            update_indices(tokens, token_to_index)
            if len(tokens) > threshold:  
                continue  
            texts.append(sentence)

print(f"Number of sentences processed: {len(texts)}")

# Custom Dataset class for handling Marathi data
class MarathiDataset(Dataset):
    def __init__(self, texts, ft_model, token_to_index):
        self.texts = texts
        self.ft_model = ft_model
        self.token_to_index = token_to_index
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        sentence = self.texts[idx]
        tokens = indic_tokenize.trivial_tokenize(sentence, lang='mr')
        embeddings = [self.ft_model.get_word_vector(token) for token in tokens]  # Get embeddings for each token
        input_embeddings = torch.tensor(embeddings[:-1], dtype=torch.float) # Exclude the last token for input
        target_indices = [self.token_to_index.get(token, self.token_to_index['<UNK>']) for token in tokens[1:]]  # Exclude the first token for target
        target_indices = torch.tensor(target_indices, dtype=torch.long)  # Targets are the indices of the next token
        return input_embeddings, target_indices

# Padding function for batches
def collate_fn(batch):
    (inputs, targets) = zip(*batch)
    input_embeddings = pad_sequence(inputs, batch_first=True, padding_value=0.0)  # Padding embeddings
    target_sequences = pad_sequence(targets, batch_first=True, padding_value=token_to_index['<PAD>'])  # Padding target indices
    return input_embeddings, target_sequences

dataset = MarathiDataset(texts, ft_model, token_to_index)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

for input_data, targets in dataloader:
    print(f"Input batch shape: {input_data.shape}")
    print(f"Target batch shape: {targets.shape}")
    break



Number of sentences processed: 1674
Input batch shape: torch.Size([4, 24, 300])
Target batch shape: torch.Size([4, 24])


  input_embeddings = torch.tensor(embeddings[:-1], dtype=torch.float) # Exclude the last token for input


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ELMoLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(ELMoLanguageModel, self).__init__()
        
        # Forward and backward LSTMs, extracting all layer outputs
        self.forward_lstm1 = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.forward_lstm2 = nn.LSTM(hidden_dim, hidden_dim, num_layers=1, batch_first=True)
        self.backward_lstm1 = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.backward_lstm2 = nn.LSTM(hidden_dim, hidden_dim, num_layers=1, batch_first=True)

        # Linear layers for final predictions
        self.forward_pred = nn.Linear(hidden_dim, vocab_size)
        self.backward_pred = nn.Linear(hidden_dim, vocab_size)

        # Weight parameters for combining the layers
        self.gamma = nn.Parameter(torch.ones(3)) 

    def forward(self, x):
    
        forward_out1, _ = self.forward_lstm1(x)
        forward_out2, _ = self.forward_lstm2(forward_out1)

        reversed_embeddings = torch.flip(x, [1])
        backward_out1, _ = self.backward_lstm1(reversed_embeddings)
        backward_out2, _ = self.backward_lstm2(backward_out1)

        # Flipping backward outputs back to original sequence order
        backward_out1 = torch.flip(backward_out1, [1])
        backward_out2 = torch.flip(backward_out2, [1])

        # last hidden states for predictions
        forward_predictions = self.forward_pred(forward_out2[:, -1, :])
        backward_predictions = self.backward_pred(backward_out2[:, 0, :])

        # Weighted sum of embeddings and LSTM outputs
        combined_embeddings = self.gamma[0] * x + self.gamma[1] * torch.cat((forward_out1, backward_out1), dim=-1) + self.gamma[2] * torch.cat((forward_out2, backward_out2), dim=-1)

        return forward_predictions, backward_predictions, combined_embeddings


In [5]:
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)
device = torch.device("cuda" if cuda_available else "cpu")

CUDA Available: True


In [6]:
hidden_dim = 150  
num_layers = 2  
vocab_size = len(token_to_index) + 1

model = ELMoLanguageModel(vocab_size, 300, hidden_dim).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=token_to_index['<PAD>']) 
optimizer = torch.optim.Adam(model.parameters()) 

# Training Loop
num_epochs = 3  
for epoch in range(num_epochs):
    model.train() 
    total_loss = 0
    for input_data, targets in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
        input_data, targets = input_data.to(device), targets.to(device)
        optimizer.zero_grad()  
        forward_pred, backward_pred, _ = model(input_data) , _ = self.forward(inputs)
                
        loss_f = criterion(forward_pred, targets[:, 1]) 
        loss_b = criterion(backward_pred, targets[:, -1])  

        total_loss = loss_f + loss_b
        
        total_loss.backward()  # Backpropagate the loss
        optimizer.step()       # Updating the weights
        total_loss += total_loss.item()
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.15f}")

Epoch 1: 100%|██████████| 419/419 [00:06<00:00, 62.60it/s]


Epoch 1/3, Average Loss: 0.000006811817457


Epoch 2: 100%|██████████| 419/419 [00:06<00:00, 67.87it/s]


Epoch 2/3, Average Loss: 0.000002561656174


Epoch 3: 100%|██████████| 419/419 [00:06<00:00, 68.45it/s]

Epoch 3/3, Average Loss: 0.000001347338753





In [7]:
model_path = './bilm_marathi_model.pth'
torch.save(model.state_dict(), model_path)

In [8]:
import json

mappings_path = './marathi_mappings.json'
with open(mappings_path, 'w', encoding='utf-8') as f:
    json.dump({
        'token_to_index': token_to_index
    }, f, ensure_ascii=False, indent=4)


## News classification

In [9]:
!pip install indic-nlp-library



In [10]:
import os
import re
from indicnlp.tokenize import indic_tokenize

def preprocess_text(text, language='mr'):
    """
    Apply preprocessing steps to the given text.
    """
    text = remove_non_textual_elements(text)
    text = normalize_quotation_marks(text)
    text = ensure_utf8_encoding(text)
    sentences = tokenize_sentences(text)
    sentences_SOS = ["<SOS> "+sentence+" <EOS>" for sentence in sentences]
    tokenized_sentences = [tokenize_words_indicnlp(sentence, language) for sentence in sentences_SOS]
    return ' '.join([' '.join(sentence) for sentence in tokenized_sentences])

def remove_non_textual_elements(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def normalize_quotation_marks(text):
    text = text.replace('“', '"').replace('”', '"')
    text = text.replace("‘", "'").replace("’", "'")
    return text

def ensure_utf8_encoding(text):
    return text.encode('utf-8', errors='ignore').decode('utf-8')

def tokenize_sentences(text):
    sentences = re.split(r'[।\n\.]+', text)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences

def tokenize_words_indicnlp(sentence, language='mr'):
    return indic_tokenize.trivial_tokenize(sentence, lang=language)


In [11]:
!pip install pandas pyarrow



In [12]:
import pandas as pd

def load_dataset(parquet_path):
    """Load dataset from a Parquet file."""
    return pd.read_parquet(parquet_path)

train_path = '/kaggle/input/news-category-classification/marathi/train-00000-of-00001.parquet'
test_path = '/kaggle/input/news-category-classification/marathi/test-00000-of-00001.parquet'
val_path = '/kaggle/input/news-category-classification/marathi/validation-00000-of-00001.parquet'

train_df = load_dataset(train_path)
test_df = load_dataset(test_path)
val_df = load_dataset(val_path)

print("Train Dataset:", train_df.head())
print("Test Dataset:", test_df.head())
print("Validation Dataset:", val_df.head())

Train Dataset:                                                 text  label
0           …म्हणून सानिया मिर्झाची ड्यू डेट आहे खास      0
1  Video : दीपिका-रणवीरच्या लग्नाचा मेन्यू झाला लीक!      0
2   सचिनच्या रणजी कारकीर्दीचा शेवट गोड, मुंबईचा विजय      3
3  पुरंदरेंना महाराष्ट्र भूषण पुरस्काराविरोधात सं...      4
4  जयपूर पोलिसांच्या जाहिरातीवर बुमराह संतापला, स...      3
Test Dataset:                                                 text  label
0  काजोल पुन्हा माझ्या आयुष्यात येणार नाही - करण ...      0
1             विराटला चीअर करण्यासाठी अनुष्का सिडनीत      0
2                                     संतोषचा अड्डा!      0
3  मुरूड समुद्रात बुडालेल्या विद्यार्थ्याचा मृतदे...      4
4  गोव्यात शिवसेना वेलिंगकरांसोबत,लवकरच युतीची घोषणा      4
Validation Dataset:                                                 text  label
0  CWG 2018 : संजिता चानूची सुवर्णभरारी, भारताच्य...      3
1  कर्मचार्‍यांच्या हलगर्जीपणामुळे होणार होते जिव...      4
2                श्रीलंकेचा भारतावर 7 गडी राखून विज

In [18]:
def preprocess_dataset(df, text_column='text'):
    df[text_column] = df[text_column].apply(lambda x: preprocess_text(x))
    return df

train_df_preprocessed = preprocess_dataset(train_df, 'text')
test_df_preprocessed = preprocess_dataset(test_df, 'text')
val_df_preprocessed = preprocess_dataset(val_df, 'text')

texts_train = train_df_preprocessed['text'].tolist()
texts_test = test_df_preprocessed['text'].tolist()
texts_val = val_df_preprocessed['text'].tolist()
labels_train = train_df_preprocessed['label'].tolist()
labels_test = test_df_preprocessed['label'].tolist()
labels_val = val_df_preprocessed['label'].tolist()

print(train_df_preprocessed.head())

                                                text  label
0  < SOS > …म्हणून सानिया मिर्झाची ड्यू डेट आहे ख...      0
1  < SOS > Video : दीपिका - रणवीरच्या लग्नाचा मेन...      0
2  < SOS > सचिनच्या रणजी कारकीर्दीचा शेवट गोड , म...      3
3  < SOS > पुरंदरेंना महाराष्ट्र भूषण पुरस्कारावि...      4
4  < SOS > जयपूर पोलिसांच्या जाहिरातीवर बुमराह सं...      3


In [21]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import numpy as np



class MarathiDatasetCreate(Dataset):
    def __init__(self, texts, labels, ft_model, token_to_index, lang='mr'):
        self.texts = texts
        self.labels = labels
        self.ft_model = ft_model
        self.token_to_index = token_to_index
        self.lang = lang

        # Normalize labels
        self.label_encoder = LabelEncoder()
        self.labels = self.label_encoder.fit_transform(labels)

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokens = indic_tokenize.trivial_tokenize(text, lang=self.lang)
        embeddings = [self.ft_model.get_word_vector(token) for token in tokens]
        input_embeddings = torch.tensor(embeddings, dtype=torch.float)
        target_label = torch.tensor(label, dtype=torch.long)
        return input_embeddings, target_label

    def get_label_encoder(self):
        return self.label_encoder

def collate_fn(batch):
    inputs, labels = zip(*batch)
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0.0)
    labels = torch.tensor(labels, dtype=torch.long)
    return inputs_padded, labels

class BiLSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes, num_layers=1, dropout=0.5):
        super(BiLSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, bidirectional=True, dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)  # Multiply by 2 for bidirectional output

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        last_outputs = lstm_out[:, -1, :]  # Get the last time step output
        output = self.fc(last_outputs)
        return output
    

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = len(token_to_index)  
hidden_dim = 150
input_dim = 300  
num_classes = len(set(train_df_preprocessed['label']))

elmo_model = ELMoLanguageModel(vocab_size, 300, hidden_dim).to(device)
classifier = BiLSTMClassifier(input_dim=input_dim, hidden_dim=hidden_dim, num_classes=num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(list(elmo_model.parameters()) + list(classifier.parameters()), lr=0.001)

train_dataset = MarathiDatasetCreate(texts_train, labels_train, ft_model, token_to_index)
val_dataset = MarathiDatasetCreate(texts_val, labels_val, ft_model, token_to_index)
test_dataset = MarathiDatasetCreate(texts_test, labels_test, ft_model, token_to_index)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Training Loop
num_epochs = 5
for epoch in range(num_epochs):
    elmo_model.train()
    classifier.train()
    total_loss = 0

    for input_data, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        input_data, labels = input_data.to(device), labels.to(device)
        optimizer.zero_grad()

        _, _, embeddings = elmo_model(input_data)

        outputs = classifier(embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Average Loss Epoch {epoch+1}: {total_loss / len(train_loader)}")

def extract_features(dataloader, elmo_model, classifier, device):
    elmo_model.eval()
    classifier.eval()
    all_embeddings = []
    all_labels = []

    with torch.no_grad():
        for input_data, labels in dataloader:
            input_data = input_data.to(device)
            _, _, embeddings = elmo_model(input_data)
            outputs = classifier(embeddings)
            all_embeddings.extend(outputs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return np.array(all_embeddings), np.array(all_labels)

# Evaluate the model
test_embeddings, test_labels = extract_features(test_loader, elmo_model, classifier, device)
predicted_labels = np.argmax(test_embeddings, axis=1)

print(classification_report(test_labels, predicted_labels))

Epoch 1/5: 100%|██████████| 303/303 [00:18<00:00, 16.02it/s]


Average Loss Epoch 1: 0.4158588737101838


Epoch 2/5: 100%|██████████| 303/303 [00:18<00:00, 16.17it/s]


Average Loss Epoch 2: 0.18889940986809362


Epoch 3/5: 100%|██████████| 303/303 [00:18<00:00, 16.22it/s]


Average Loss Epoch 3: 0.14980837889323043


Epoch 4/5: 100%|██████████| 303/303 [00:18<00:00, 16.20it/s]


Average Loss Epoch 4: 0.12693508030391812


Epoch 5/5: 100%|██████████| 303/303 [00:18<00:00, 16.20it/s]


Average Loss Epoch 5: 0.11668757034201335
              precision    recall  f1-score   support

           0       0.92      0.89      0.90       335
           1       0.98      0.79      0.88       116
           2       0.94      0.98      0.96       759

    accuracy                           0.94      1210
   macro avg       0.95      0.89      0.91      1210
weighted avg       0.94      0.94      0.94      1210

