In [1]:
from datasets import load_dataset

dataset = load_dataset("KANZOO/scrapped_articles")

print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['title', 'text'],
        num_rows: 144
    })
})


In [2]:

# Access the 'train' subset
train_dataset = dataset['train']


# Function to merge all text into one large dataset
def merge_all_text(dataset):
    combined_text = ""
    for example in dataset['train']:
        combined_text += example['text'] + " "  
    return combined_text.strip()


# Merge all text from the dataset
combined_text = merge_all_text(dataset)



In [3]:
# Function to count the number of words
def count_words(text):
    words = text.split()  
    return len(words)

# Count the words in the merged text
word_count = count_words(combined_text)

# Print the total word count
print("Total Word Count:", word_count)

Total Word Count: 247616


In [4]:
import re
# Function to remove all punctuation except for Urdu period (۔)
def remove_punctuation(text):
    # Regex pattern to remove all punctuation except for "۔"
    pattern = r'[^\w\s۔]'  
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text


# Remove all punctuation except for "۔"
cleaned_text = remove_punctuation(combined_text)

In [5]:
# Function to count the number of words
def count_words(cleaned_text):
    words = cleaned_text.split()  
    return len(words)

# Count the words in the merged text
word_count = count_words(cleaned_text)

# Print the total word count
print("Total Word Count:", word_count)

Total Word Count: 247421


In [6]:
# Function to count the number of sentences based on the period "۔"
def count_sentences(cleaned_text):
    sentences = cleaned_text.split('۔') 
    # Filter out empty strings resulting from splitting
    return len([s for s in sentences if s.strip()])  

# Get the sentences in the cleaned text
sentences = count_sentences(cleaned_text) 

# Print the total sentence count
print("Total Sentence Count:", sentences)

Total Sentence Count: 9251


In [7]:
# Function to remove the Urdu period "۔" from the text
def remove_period(text):
    cleaned_text_without_period = text.replace('۔', '')  
    return cleaned_text_without_period

cleaned_text_without_period = remove_period(cleaned_text)

### Embedding Similarity Thresholding for SBD.

In [8]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Load the pre-trained mBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')



In [9]:
input_text = cleaned_text_without_period

In [10]:
# Step 1: Tokenize the input text in batches
def chunk_text(text, chunk_size=128):
    """Split text into smaller batches."""
    tokens = tokenizer.tokenize(text)
    return [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]


In [11]:
# Step 2: Process each batch through BERT and collect embeddings
def process_batches(token_batches):
    all_embeddings = []
    all_tokens = []
    
    for token_batch in token_batches:
        # Convert tokens to token ids
        inputs = tokenizer.convert_tokens_to_ids(token_batch)
        inputs = torch.tensor([inputs])  
        
        # Get embeddings from BERT
        with torch.no_grad():
            outputs = model(inputs)
            last_hidden_states = outputs.last_hidden_state 
        
        # Collect the embeddings and tokens
        embeddings = last_hidden_states.squeeze(0).numpy()  
        all_embeddings.append(embeddings)
        all_tokens.extend(token_batch)
    
    # Concatenate embeddings for all batches
    all_embeddings = np.vstack(all_embeddings)
    return all_tokens, all_embeddings

In [12]:
# Split the text into batches and process
token_batches = chunk_text(input_text)
tokens, embeddings = process_batches(token_batches)

# Normalize embeddings for comparison
norm_embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)


In [16]:
# Step 3: Detect sentence boundaries based on similarity threshold
threshold = 0.6 
cosine_similarities = np.array([np.dot(norm_embeddings[i], norm_embeddings[i+1]) for i in range(len(norm_embeddings)-1)])

detected_sentences = []
current_sentence = []

for i, token in enumerate(tokens[:-1]):  
    current_sentence.append(token)
    
    # If the cosine similarity drops below the threshold, it indicates a sentence boundary
    if cosine_similarities[i] < threshold:
        detected_sentences.append(" ".join(current_sentence))
        current_sentence = []

In [17]:
# Add the last sentence if not empty
if current_sentence:
    detected_sentences.append(" ".join(current_sentence))

# Print only the first few detected sentences 
for i, sentence in enumerate(detected_sentences[:10]):  
    print(f"Detected Sentence {i+1}: {sentence}")

# Optionally, print the total number of detected sentences
print(f"\nTotal sentences detected: {len(detected_sentences)}")

Detected Sentence 1: اس
Detected Sentence 2: س
Detected Sentence 3: ##لس
Detected Sentence 4: ##لے
Detected Sentence 5: کی
Detected Sentence 6: دیگر
Detected Sentence 7: ا
Detected Sentence 8: ##ق
Detected Sentence 9: ##سا
Detected Sentence 10: ##ط

Total sentences detected: 393397
