## Tokenizing the dataset

In [None]:
import pandas as pd
from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, LlamaForCausalLM, LlamaConfig
import torch
from torch.utils.data import Dataset, DataLoader
import math
import numpy as np
import ast

In [None]:
tokenizer = AutoTokenizer.from_pretrained('WORDPIECE_CSV/malayalam_wp_v2')

# Load and tokenize dataset
df = pd.read_csv("WORDPIECE_CSV/wp_4.csv")

def tokenize_text(text, tokenizer, max_length=128):
    if not isinstance(text, str):  # Check if text is not a string
        text = str(text)  # Convert to string
    encoding = tokenizer(text, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    return encoding.input_ids.squeeze().tolist(), encoding.attention_mask.squeeze().tolist()

def tokenize_text(text, tokenizer, max_length=512):
    encoding = tokenizer(text, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    return encoding.input_ids.squeeze().tolist(), encoding.attention_mask.squeeze().tolist()

df['tokenized_input_ids'], df['tokenized_attention_mask'] = zip(*df['Text'].apply(lambda x: tokenize_text(x, tokenizer)))
df.to_csv('tokenized_data5.csv', index=False)


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

## Training the model

In [9]:
tokenizer = AutoTokenizer.from_pretrained('WORDPIECE_CSV/malayalam_wp_v2')

Creating the dataset compatible with the model

In [4]:
# Load tokenized data
df = pd.read_csv('tokenized_data2.csv')
df['tokenized_input_ids'] = df['tokenized_input_ids'].apply(ast.literal_eval)
df['tokenized_attention_mask'] = df['tokenized_attention_mask'].apply(ast.literal_eval)

In [5]:
df.head()

Unnamed: 0,Serial Number,Source Name,Text,tokenized_input_ids,tokenized_attention_mask
0,1,wikipedia_final_page_747.txt,"<doc id=""510272"" url=""https://ml wikipedia org...","[24, 2088, 2286, 25, 9, 4, 9, 2293, 25, 9, 227...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,2,deshabhimani_7_7158.txt,നഴ്‌സറി കുട്ടികൾക്കെതിരെ ലൈം​ഗികാതിക്രമം; മഹാര...,"[18730, 1945, 5852, 3393, 15971, 23, 6481, 192...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,3,deshabhimani_5_27197.txt,"olumpics, Deshabhimani, Malayalam News, Online...","[48, 1076, 3715, 3125, 1079, 1078, 19, 3218, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,4,deshabhimani_3_20576.txt,"കുട്ടനാടൻ അതിജീവനം സമഗ്രശിക്ഷാ കേരളം, Deshabhi...","[2612, 11654, 17020, 1009, 7406, 8231, 1022, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,5,deshabhimani_5_25794.txt,"ആനുകൂല്യങ്ങൾ, Deshabhimani, Malayalam News, On...","[17164, 19, 3218, 19, 2860, 2405, 19, 5810, 28...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


Creating the dataset compatible with the model

In [4]:
class MalayalamDataset(Dataset):
    def __init__(self, dataframe, max_length=512):
        self.dataframe = dataframe
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.dataframe['tokenized_input_ids'][idx], dtype=torch.long)
        attention_mask = torch.tensor(self.dataframe['tokenized_attention_mask'][idx], dtype=torch.long)
        padding_length = self.max_length - input_ids.size(0)
        if padding_length > 0:
            input_ids = torch.cat([input_ids, torch.zeros(padding_length, dtype=torch.long)])
            attention_mask = torch.cat([attention_mask, torch.zeros(padding_length, dtype=torch.long)])
        return {'input_ids': input_ids, 'attention_mask': attention_mask}


In [5]:
malayalam_dataset = MalayalamDataset(df)

In [6]:
train_size = int(0.8 * len(malayalam_dataset))
val_size = len(malayalam_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(malayalam_dataset, [train_size, val_size])


In [7]:
len(malayalam_dataset)

6

Selecting the model

In [25]:
config = LlamaConfig(
    hidden_size=768,  # Reduce hidden size
    vocab_size=len(tokenizer.vocab),
    num_attention_heads=8,  # Reduce the number of attention heads
    num_key_value_heads=4,  # Reduce the number of key-value heads
    num_hidden_layers=12,  # Reduce the number of layers
    intermediate_size=1024  # Keep the intermediate size reasonable
)




In [35]:
model = LlamaForCausalLM(config)
device = torch.device("cpu")
model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32768, 768)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (k_proj): Linear(in_features=768, out_features=384, bias=False)
          (v_proj): Linear(in_features=768, out_features=384, bias=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=768, out_features=1024, bias=False)
          (up_proj): Linear(in_features=768, out_features=1024, bias=False)
          (down_proj): Linear(in_features=1024, out_features=768, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((768,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((768,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm

In [28]:
training_args = TrainingArguments(
    output_dir="./bert",
    max_steps=10,  # For testing, increase this as needed
    evaluation_strategy="epoch",
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
)


In [29]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(
max_steps is given, it will override any value given in num_train_epochs


In [36]:
def calculate_perplexity(model, eval_dataset, batch_size=8):
    model.eval()
    total_loss = 0.0
    dataloader = DataLoader(eval_dataset, batch_size=batch_size)
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            total_loss += outputs.loss.item()
    avg_loss = total_loss / len(dataloader)
    return math.exp(avg_loss)

In [37]:
num_epochs = 10
epoch_intervals = int(num_epochs * 10)
perplexity_matrix = np.zeros((epoch_intervals,))

Calculating the perplexity score

In [None]:
for epoch_idx in range(epoch_intervals):
    current_epoch = (epoch_idx + 1) / 10
    trainer.train()
    perplexity = calculate_perplexity(model, val_dataset)
    perplexity_matrix[epoch_idx] = perplexity
    print(f"Perplexity at {current_epoch:.1f} epochs: {perplexity}")

model.save_pretrained('saved_llama_model')
tokenizer.save_pretrained('saved_llama_tokenizer')

print("Training complete. Perplexity Matrix:", perplexity_matrix)


Prompting the trained model

In [None]:
prompts = [
    "ഞാൻ മലയാളം", "കൂടുതൽ വിവരങ്ങൾ", "ഇന്നത്തെ കാലാവസ്ഥ", "എന്റെ പേര്", "ഇന്നലെ രാത്രി",
    "കേരളത്തിലെ കായിക", "സിനിമ കാഴ്ച", "നല്ല പുസ്തകം", "ആത്മവിശ്വാസം", "വായനക്കാരൻ"
]

print("Model outputs for prompts:")
for i, prompt in enumerate(prompts):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=50)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt {i+1}: {prompt}")
    print(f"Output {i+1}: {generated_text}\n")

## Workflow for training a trained model with more data

In [None]:
model = LlamaForCausalLM.from_pretrained('saved_llama_model')
tokenizer = AutoTokenizer.from_pretrained('saved_llama_tokenizer')

In [None]:
df_new = pd.read_csv('new_tokenized_data.csv')
df_new['tokenized_input_ids'] = df_new['tokenized_input_ids'].apply(ast.literal_eval)
df_new['tokenized_attention_mask'] = df_new['tokenized_attention_mask'].apply(ast.literal_eval)

# Create a new dataset for further training
new_malayalam_dataset = MalayalamDataset(df_new)

# Split the new dataset
train_size = int(0.8 * len(new_malayalam_dataset))
val_size = len(new_malayalam_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(new_malayalam_dataset, [train_size, val_size])

# Continue training on the new dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Continue training the model with the new data
for epoch_idx in range(epoch_intervals):
    current_epoch = (epoch_idx + 1) / 10
    trainer.train()
    perplexity = calculate_perplexity(model, val_dataset)
    perplexity_matrix[epoch_idx] = perplexity
    print(f"Perplexity at {current_epoch:.1f} epochs: {perplexity}")

# Optionally, save the updated model and tokenizer
model.save_pretrained('updated_llama_model')
tokenizer.save_pretrained('updated_llama_tokenizer')
print("Further training complete!")
print("Training complete. Perplexity Matrix:", perplexity_matrix)
