In [None]:
import os
import argparse
import datetime
import re
import glob
from tqdm import tqdm
import pandas as pd
from tokenizers import SentencePieceBPETokenizer
from transformers import PreTrainedTokenizerFast


In [None]:
english_pattern = re.compile(r'[A-Za-z]')

file = '/content/cleaned_taasir_articles.csv'

df=pd.read_csv(file)
df['Content'] = df['Content'].astype(str)
df = df[~df["Content"].str.contains(english_pattern)]
df.to_csv(f"{file[:-4]}_filtered.csv",index=False)

In [None]:
english_pattern = re.compile(r'[A-Za-z]')

file = '/content/deduplicated_cleaned_Jang_all_articles.csv'

df=pd.read_csv(file)
df['Content'] = df['Content'].astype(str)
df = df[~df["Content"].str.contains(english_pattern)]
df.to_csv(f"{file[:-4]}filtered.csv",index=False)

In [None]:
def train_tokenizer(data_list, vocab_size=32768, model_name="./urdu_tokenizer_v2"):

    ## Change bos & eos
    bos_tok = "<sos>"
    eos_tok = "<end_of_sen>"

    ## Add basic characters to this below list, including numbers & special language characters.
    special_char = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]

    tokenizer = SentencePieceBPETokenizer()

    tokenizer.train_from_iterator(
        data_list,
        vocab_size,
        min_frequency = 5,
        special_tokens = ["<pad>", "<unk>", bos_tok, eos_tok, "<user>", "<assistant>"] + special_char,
        show_progress = True,
    )

    ## Don't forget to add special tokens.
    transformer_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        bos_token = bos_tok,
        eos_token = eos_tok,
        unk_token = "<unk>",
        pad_token = "<pad>",
        mask_token = "<mask>",
        padding_side = "left",
        truncation_side = "right",
        additional_special_tokens = ["<user>", "<assistant>"],
        clean_up_tokenization_spaces = False,
    )

    transformer_tokenizer.save_pretrained(model_name)

In [None]:
### Importing Data
df = pd.read_csv("/content/deduplicated_cleaned_Jang_all_articlesfiltered.csv", encoding = "utf-8")

In [None]:
len(df["Content"].to_list())

448588

In [None]:
### Executing Training Function to Train tokenizer
train_tokenizer(df["Content"].to_list())

In [None]:
### Testing Training Tokenizer
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("./urdu_tokenizer_v2")

In [None]:
len(tokenizer.get_vocab())

32769

In [None]:
df_test= pd.read_csv("/content/cleaned_taasir_articles_filtered.csv")

In [None]:
import numpy as np
input_list=df_test["Content"].to_list()
input_ids_len=[]
counts=[]
for i in range(len(df_test)):
  if isinstance(input_list[i], str) and pd.notna(input_list[i]):
    input_ids = tokenizer.encode(input_list[i])
    input_ids_len.append(len(input_ids))
    word_count = len(input_list[i])
    counts.append(word_count)
  else:
    print(f"Skipping element at index {i} due to NaN or non-string value.")

input_ids_len=np.array(input_ids_len)
counts=np.array(counts)
f_score = np.mean(input_ids_len/counts)

In [None]:
f_score

0.30628303299649967

#### Data preperation for PRE-Training

In [None]:
### Importing Data
df_1 = pd.read_csv("/content/cleaned_taasir_articles_filtered.csv")
# df_2 = pd.read_csv("English_2.csv")

In [None]:
df_1.iloc[:, 0] = df_1.iloc[:, 0].astype(str) + "<eos>"
# df_2.iloc[:, 0] = df_2.iloc[:, 0] + "<eos>"

1           1<eos>
2           3<eos>
3           4<eos>
4           5<eos>
           ...    
7418     9766<eos>
7419     9767<eos>
7420    12690<eos>
7421    13097<eos>
7422    13318<eos>
Name: Unnamed: 0, Length: 7423, dtype: object' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_1.iloc[:, 0] = df_1.iloc[:, 0].astype(str) + "<eos>"


In [None]:
df_1.iloc[0, 0]

'0<eos>'

In [None]:
input_ids = tokenizer(df_1.iloc[:, 0].to_list())["input_ids"]

In [None]:
df_1["tokens"] = input_ids

In [None]:
df_1

Unnamed: 0.1,Unnamed: 0,Title,URL,Content,tokens
0,0<eos>,گنپتی وسرجن اورعید میلاد النبی (ص) جلوس کے دور...,https://taasir.com/2024/09/demand-for-investig...,تاثیر ۲۱ستمبر۲۰۲۴:- ایس -ایم- حسن ممبئی ، 21 ...,"[6, 215, 1, 1, 1, 1, 34]"
1,1<eos>,اگر دہلی کے عوام نے کیجریوال کو وزیر اعلیٰ نہی...,https://taasir.com/2024/09/if-the-people-of-de...,تاثیر ۲۱ستمبر۲۰۲۴:- ایس -ایم- حسن نئی دہلی، 2...,"[7, 215, 1, 1, 1, 1, 34]"
2,3<eos>,عوامی شکایات کے 68 مقدمات کی ہوئی سماعت,https://taasir.com/2024/09/68-cases-of-public-...,تاثیر ۲۱ستمبر۲۰۲۴:- ایس -ایم- حسن,"[9, 215, 1, 1, 1, 1, 34]"
3,4<eos>,وزیراعلیٰ نے گنگا ندی کے بڑھتے پانی کی سطح کا ...,https://taasir.com/2024/09/chief-minister-insp...,تاثیر ۲۱ستمبر۲۰۲۴:- ایس -ایم- حسن ۔ سیلاب کے ...,"[10, 215, 1, 1, 1, 1, 34]"
4,5<eos>,آتشی مرلینا کی تاجپوشی,https://taasir.com/2024/09/atishi-take-oath-as...,تاثیر ۲۱ستمبر۲۰۲۴:- ایس -ایم- حسن,"[11, 215, 1, 1, 1, 1, 34]"
...,...,...,...,...,...
7418,9766<eos>,नूंह हिंसा पर अजमेर दरगाह के दीवान ने की शांति...,https://taasir.com/2023/08/%e0%a4%a8%e0%a5%82%...,नूंह हिंसा पर अजमेर दरगाह के दीवान ने की शांति...,"[15, 13, 12, 12, 215, 1, 1, 1, 1, 34]"
7419,9767<eos>,इरफान अंसारी सदन के अंदर कान पकड़ कर माफी मांग...,https://taasir.com/2023/08/%e0%a4%87%e0%a4%b0%...,इरफान अंसारी सदन के अंदर कान पकड़ कर माफी मांग...,"[15, 13, 12, 13, 215, 1, 1, 1, 1, 34]"
7420,12690<eos>,قومی اردو کونسل کے کمپیوٹر سینٹرز کے کامیاب طل...,https://taasir.com/2023/03/%d9%82%d9%88%d9%85%...,جدید ٹکنالوجی میں مہارت و اختصاص پیدا کرنا وقت...,"[7, 8, 12, 15, 6, 215, 1, 1, 1, 1, 34]"
7421,13097<eos>,مہاراشٹر میں غیر قانونی طریقے سے رہ رہے 18بنگل...,https://taasir.com/2023/03/%d9%85%db%81%d8%a7%...,تھانے،4مارچ : مہاراشٹر کے تھانے ضلع میں پولیس ...,"[7, 9, 6, 15, 13, 215, 1, 1, 1, 1, 34]"
