### Article Generation using N-grams

You have to write an Urdu article using n-grams (1-grams to 5-grams). In short, your output must consist of 5 paragraphs: the first is generated using the unigram model, the second using the bigram model, and so on.

Input: Your input is the seed sentence, e.g. the first 3 to 4 words of the paragraph.

Output: Your output consists of 5 paragraphs, one for each n-gram order, each of 200 words.

You have to build the N-gram model using the provided dataset. The dataset can be downloaded from https://www.kaggle.com/datasets/saurabhshahane/urdu-news-dataset

You have to use the entire News Text column.

In [2]:
"""
Urdu N-gram generator (1-gram to 5-gram) using a local CSV file.

Dependencies:
  pip install pandas tqdm
"""

import sys
import random
import re
from collections import defaultdict, Counter

import pandas as pd
from tqdm import tqdm

# Token pattern, tried left to right at each position:
#   1. a run of Arabic-block characters that are NOT punctuation (a word),
#   2. a run of ASCII letters/digits,
#   3. a run of punctuation marks.
# The Arabic comma (U+060C), semicolon (U+061B), question mark (U+061F) and the
# Urdu full stop (U+06D4) all live inside U+0600-U+06FF, so they must be
# excluded from the word class explicitly; otherwise "word," is matched as a
# single token and the punctuation alternative is never reached.
WORD_RE = re.compile(
    r"(?:(?![\u060C\u061B\u061F\u06D4])[\u0600-\u06FF])+"
    r"|[A-Za-z0-9]+"
    r"|[\u060C\u061B\u061F\u06D4.!]+",
    flags=re.UNICODE,
)
# Single punctuation characters recognised by the tokenizer.
PUNCT = set("\u060C\u061B\u061F\u06D4.!")


def tokenize(text):
    """Split *text* into word and punctuation tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens in order of appearance: Arabic-script words, ASCII
        alphanumeric runs, and runs of punctuation. Characters matched by none
        of the alternatives (whitespace, other symbols) are dropped.
    """
    return WORD_RE.findall(text)

def detokenize(tokens):
    """Join tokens into text, attaching punctuation to the preceding token.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        The tokens joined by single spaces, except that a token made up solely
        of characters from ``PUNCT`` is appended directly to the token before
        it. A punctuation token in first position is kept as its own item.
    """
    out = []
    for t in tokens:
        # The tokenizer's punctuation pattern matches runs such as "!!" or
        # "...", so test every character instead of looking the whole token up
        # in the single-character PUNCT set.
        if out and t and all(ch in PUNCT for ch in t):
            out[-1] += t
        else:
            out.append(t)
    return " ".join(out)

def load_dataset(csv_path):
    """Read the CSV and return one text string per row from its Urdu columns.

    A column counts as Urdu text when at least one of its first 200 non-missing
    values contains a character in the Arabic Unicode block (U+0600-U+06FF).
    The selected columns are joined row-wise with a single space.

    Args:
        csv_path: Path to a UTF-8 encoded CSV file.

    Returns:
        A list of strings, one per row whose stripped joined text is longer
        than 20 characters.

    Raises:
        ValueError: If no column contains Arabic-block characters.
    """
    df = pd.read_csv(csv_path, encoding="utf-8", low_memory=False)
    text_cols = []
    for col in df.columns:
        # Drop missing cells before stringifying: astype(str) would turn NaN
        # into the literal text "nan", making a later dropna() a no-op.
        sample = df[col].dropna().head(200).astype(str).str.strip()
        if sample.str.contains(r"[\u0600-\u06FF]", regex=True).any():
            text_cols.append(col)
    if not text_cols:
        raise ValueError("No Urdu text column found in dataset.")
    # Same ordering concern here: blank out missing cells first so that the
    # word "nan" is never injected into the corpus.
    series = df[text_cols].fillna("").astype(str).agg(" ".join, axis=1)
    return series[series.str.strip().str.len() > 20].tolist()

def preprocess_corpus(raw_texts):
    """Tokenize every document and keep only the non-empty token lists.

    Args:
        raw_texts: Iterable of documents; each item is converted with ``str``
            and stripped before tokenizing.

    Returns:
        A list of token lists, in input order, skipping documents that yield
        no tokens. Progress is reported through a tqdm bar.
    """
    progress = tqdm(raw_texts, desc="Tokenizing corpus")
    token_lists = (tokenize(str(doc).strip()) for doc in progress)
    return [tokens for tokens in token_lists if tokens]

def build_ngrams(corpus_tokens_list, max_n=5):
    """Count next-word frequencies for every n-gram order from 1 to *max_n*.

    Args:
        corpus_tokens_list: Iterable of token sequences (one per document).
        max_n: Highest n-gram order to collect.

    Returns:
        A dict mapping each order ``n`` to a ``defaultdict(Counter)`` keyed by
        the (n-1)-token context tuple; the Counter holds how often each word
        followed that context. The unigram table uses the empty tuple as its
        only context.
    """
    tables = {order: defaultdict(Counter) for order in range(1, max_n + 1)}
    for sentence in tqdm(corpus_tokens_list, desc="Building n-grams"):
        size = len(sentence)
        for start in range(size):
            # Only orders whose window still fits inside the sentence.
            longest = min(max_n, size - start)
            for order in range(1, longest + 1):
                *history, target = sentence[start:start + order]
                tables[order][tuple(history)][target] += 1
    return tables

def sample_next(counter):
    """Draw one key from *counter* with probability proportional to its count.

    Args:
        counter: Mapping of word -> frequency.

    Returns:
        The sampled word, or an empty string when the mapping is empty or its
        frequencies sum to zero.
    """
    if not counter:
        return ""
    vocabulary = tuple(counter.keys())
    counts = tuple(counter.values())
    mass = sum(counts)
    if mass == 0:
        return ""
    probabilities = [count / mass for count in counts]
    (picked,) = random.choices(vocabulary, weights=probabilities, k=1)
    return picked

def generate_paragraph(models, order, seed_tokens, target_words=200):
    """Extend *seed_tokens* to *target_words* tokens with a back-off n-gram model.

    For every new word the longest context allowed by *order* is tried first,
    then progressively shorter ones down to the empty (unigram) context.

    Args:
        models: Mapping of n-gram order -> {context tuple -> word counts}.
        order: Highest n-gram order to use for this paragraph.
        seed_tokens: Tokens the paragraph starts with.
        target_words: Number of tokens at which generation stops.

    Returns:
        The detokenized text of at most *target_words* tokens. Generation stops
        early when no word can be sampled even from the unigram table.
    """
    generated = list(seed_tokens)
    while len(generated) < target_words:
        choice = ""
        for history_size in range(order - 1, -1, -1):
            history = tuple(generated[-history_size:]) if history_size > 0 else ()
            candidates = models[history_size + 1].get(history)
            if not candidates:
                continue
            choice = sample_next(candidates)
            if choice:
                break
        if not choice:
            # Last resort: sample straight from the unigram counts.
            choice = sample_next(models[1][()])
            if not choice:
                break
        generated.append(choice)
    return detokenize(generated[:target_words])

# Modified main function to accept arguments directly
def main(csv_path, seed, words=200, output="generated_urdu_paragraphs.txt"):
    """Build the n-gram models and write one generated paragraph per order.

    Args:
        csv_path: Path of the dataset CSV.
        seed: Seed phrase that every paragraph starts with.
        words: Target number of tokens per paragraph.
        output: Path of the UTF-8 text file that receives the paragraphs.
    """
    corpus = preprocess_corpus(load_dataset(csv_path))
    models = build_ngrams(corpus, max_n=5)
    seed_tokens = tokenize(seed)

    # One paragraph for each order, unigram through 5-gram.
    paragraphs = [
        (order, generate_paragraph(models, order, seed_tokens, target_words=words))
        for order in range(1, 6)
    ]

    with open(output, "w", encoding="utf-8") as handle:
        for order, text in paragraphs:
            handle.write(f"--- {order}-gram paragraph ---\n")
            handle.write(text + "\n\n")
    print("Saved generated paragraphs to", output)

# Example usage within the notebook:
# main(csv_path="/content/urdu-news-dataset-1M.csv", seed="یہ ایک مثال")

In [4]:
# Example usage within the notebook: run the full pipeline on the CSV with a
# three-word seed phrase; words and output keep their default values.
main(csv_path="/content/urdu-news-dataset-1M.csv", seed="یہ ایک مثال")

Tokenizing corpus: 100%|██████████| 46216/46216 [00:06<00:00, 6813.61it/s] 
Building n-grams: 100%|██████████| 46216/46216 [03:03<00:00, 252.11it/s]


Saved generated paragraphs to generated_urdu_paragraphs.txt


### Classify the language of the text, out of the list given below, using just stop words. Remove punctuation and convert the text to lowercase.

In [6]:
# Import nltk in this cell: no earlier cell imports it, so running the
# notebook from top to bottom would otherwise raise NameError here.
import nltk

# Fetch the stopword lists used by the language classifier.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
import nltk
from nltk.corpus import stopwords
# List the file ids of the stopwords corpus; the cell output shows one id per language.
stopwords.fileids()

['albanian',
 'arabic',
 'azerbaijani',
 'basque',
 'belarusian',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'tamil',
 'turkish']

In [11]:
# Sample input for the classifier: an English sentence with several non-English words mixed in.
Test="An article is qualunque member van un class of dedicated words naquele estão used with noun phrases per mark the identifiability of the referents of the noun phrases"

In [12]:
import nltk
from nltk.corpus import stopwords
import string

def classify_language(text, languages):
    """Score *text* against the stopword list of each language.

    The text is lowercased, stripped of ASCII punctuation and split on
    whitespace; a language's score is the number of distinct words that also
    appear in its stopword list.

    Args:
        text: The text to score.
        languages: Iterable of stopword corpus language names.

    Returns:
        A dict mapping each language to its overlap count. Languages whose
        stopword file cannot be read (``OSError``) score 0.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    vocabulary = set(text.lower().translate(strip_punctuation).split())

    def overlap(lang):
        try:
            return len(vocabulary & set(stopwords.words(lang)))
        except OSError:
            # No stopword file available for this language.
            return 0

    return {lang: overlap(lang) for lang in languages}

# Get the list of available languages for stopwords (the corpus file ids)
available_languages = stopwords.fileids()

# Score the sample text against every available language
language_scores = classify_language(Test, available_languages)

# Show the raw per-language overlap counts; no single language is selected here
display(language_scores)

{'albanian': 1,
 'arabic': 0,
 'azerbaijani': 1,
 'basque': 0,
 'belarusian': 0,
 'bengali': 0,
 'catalan': 3,
 'chinese': 0,
 'danish': 0,
 'dutch': 3,
 'english': 5,
 'finnish': 0,
 'french': 1,
 'german': 1,
 'greek': 0,
 'hebrew': 0,
 'hinglish': 8,
 'hungarian': 1,
 'indonesian': 1,
 'italian': 2,
 'kazakh': 0,
 'nepali': 0,
 'norwegian': 0,
 'portuguese': 1,
 'romanian': 1,
 'russian': 0,
 'slovene': 0,
 'spanish': 1,
 'swedish': 0,
 'tajik': 0,
 'tamil': 0,
 'turkish': 0}

### Rule Based Roman Urdu Text Normalization

Roman Urdu lacks a standard lexicon, and many spelling variations usually exist for a given word; e.g., the word zindagi (life) is also written as zindagee, zindagy, zaindagee and zndagi. In this question you have to normalize Roman Urdu words using the rules given in the attached PDF. Your code must work for a complete sentence or multiple sentences.

For example: zaroori, zaruri and zarori all map to 'zrory', so zrory becomes the canonical form for all the spellings mentioned above.

In [13]:
import re

# Ordered rewrite rules as (compiled pattern, replacement) pairs. Each rule
# sees the output of the previous one, so the order is part of the behavior.
_ROMAN_URDU_RULES = tuple(
    (re.compile(pattern), repl)
    for pattern, repl in (
        (r'ain$', 'ein'),                 # 1
        (r'(?<!^)(ar)', 'r'),             # 2 (not at start)
        (r'ai', 'ae'),                    # 3
        (r'i[yY]+', 'i'),                 # 4 multiple y's
        (r'ay$', 'e'),                    # 5
        (r'ih[hH]+', 'eh'),               # 6
        (r'ey$', 'e'),                    # 7
        (r's+', 's'),                     # 8
        (r'ie$', 'e'),                    # 9
        (r'ry(?!$)', 'ri'),               # 10 (not at end)
        (r'^es', 'is'),                   # 11 (start)
        (r'sy$', 'si'),                   # 12 (end)
        (r'a+', 'a'),                     # 13
        (r'ty(?!$)', 'ti'),               # 14 (not at end)
        (r'j+', 'j'),                     # 15
        (r'o+', 'o'),                     # 16
        (r'e{2,}', 'i'),                  # 17 multiple e's
        (r'(?<=[a-zA-Z])i$', 'y'),        # 18 i→y if preceded by letter
        (r'd+', 'd'),                     # 19
        (r'u', 'o'),                      # 20
        (r'(?<=\w)h', ''),                # 21 remove h if preceded by letter
    )
)


def _normalize_word(word):
    """Apply every rule, in order, to one whitespace-free token."""
    for pattern, repl in _ROMAN_URDU_RULES:
        word = pattern.sub(repl, word)
    return word


def normalize_roman_urdu(text):
    """Normalize Roman Urdu spelling variants in a word, sentence or paragraph.

    Several rules are anchored with ``^`` / ``$`` and only make sense for one
    word at a time, so the rules are applied to each whitespace-delimited
    token separately. All whitespace between tokens is preserved as is.
    Punctuation attached to a word is treated as part of that token.

    Args:
        text: A single word or any longer text.

    Returns:
        The text with every token rewritten by the rule list. For input that
        contains no whitespace the result is the same as applying the rules to
        the whole string.
    """
    return re.sub(r'\S+', lambda match: _normalize_word(match.group()), text)

# Example: spelling variants of two words. Several rules are anchored to the
# start or end of the string, so the sentence is split and each word is
# normalized on its own before the results are joined back together.
sentence = "zaroori zaruri zarori zindagee zindagy zaindagee zndagi"
words = sentence.split()
normalized_words = [normalize_roman_urdu(word) for word in words]

print(" ".join(normalized_words))


zrory zrory zrory zindagy zindagy zaendagy zndagy
