###### © Habibi Group, Fall 2024
This the second model for the project. It uses a custom build vectorizer to make sparse vectors for each sentence and then uses cosine distance (dot product) as the nearess measure. The model is trained on the training data and then tested on the test data.

*THIS IS THE COMBINED DATA FLAVOR 2*

In [1]:
# Importing libraries
import re
import json
import math
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

Preparing the dataset for the *Naive Bayes* model.

In [2]:
# Loading data
df = pd.read_csv('../combined_data/dataset.csv')
df = df.dropna()
df.head()

Unnamed: 0,id,local_id,link,title,content,gold_label
0,1,1,https://urdu.arynews.tv/car-sales-in-pakistan/,پاکستان میں گاڑیوں کی فروخت میں بڑا اضافہ,ملکی آٹو سیکٹر سے زبردست خبر آگئی۔ پاکستان می...,Business
1,2,2,https://urdu.arynews.tv/gold-rates-in-pakistan-3/,پاکستان میں سونے کی قیمت آج کتنی کم ہوئی؟,کراچی: کاروباری ہفتے کے پہلے روز سونے کی قیمت ...,Business
2,3,5,https://urdu.arynews.tv/cotton-production-cott...,امریکا سے معیاری روئی کی درآمد بڑھ گئی,کراچی: پاکستان میں کپاس کی پیداوار میں کمی کے ...,Business
3,4,3,https://urdu.arynews.tv/psx-today-11-nov/,پاکستان اسٹاک ایکسچینج میں نئی تاریخ رقم,پاکستان اسٹاک ایکسچینج نے ایک اور سنگ میل عبور...,Business
4,5,4,https://urdu.arynews.tv/ghee-and-cooking-oil-p...,عوام کے لیے نئی مشکل : گھی اور کوکنگ آئل کی قی...,لاہور : گھی اور کوکنگ آئل کی قیمتوں میں ایک با...,Business


Cleaning the data and preprocessing.

In [3]:
#Preprocessing the data
# Add this function to perform stemming
def simple_urdu_stemmer(word):
    suffixes = ['یں', 'اں', 'وں', 'یں', 'ہاں', 'ی', 'ے', 'و', 'ہ']
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

# Loading Urdu stopwords from the json file
with open('../data/kaggle_stopwords.json', 'r', encoding='utf-8') as file:
    urdu_stopwords = set(json.load(file).keys())

#Loading Shanzae Stopwords
with open('../data/shanzae/stopwords.json', 'r', encoding='utf-8') as file:
    shanzae_stopwords = set(json.load(file).keys())

#Loading Yamsheen Stopwords
with open('../data/yamsheen/stopwords.json', 'r', encoding='utf-8') as file:
    yamsheen_stopwords = set(json.load(file).keys())

# Function to clean our Urdu sentences
def clean_content(text, stopwords):
    text = str(text)
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)
    text = ' '.join(word for word in text.split() if word not in stopwords)
    text = ' '.join(word for word in text.split() if word not in shanzae_stopwords)
    text = ' '.join(word for word in text.split() if word not in yamsheen_stopwords)
    text = text.lower()
    text = ' '.join(simple_urdu_stemmer(word) for word in text.split())
    return text

df['content'] = df['content'].apply(lambda x: clean_content(x, urdu_stopwords))
df.head()

Unnamed: 0,id,local_id,link,title,content,gold_label
0,1,1,https://urdu.arynews.tv/car-sales-in-pakistan/,پاکستان میں گاڑیوں کی فروخت میں بڑا اضافہ,ملک آٹ سیکٹر س زبردست خبر آگئی۔ پاکستان گاڑی ...,Business
1,2,2,https://urdu.arynews.tv/gold-rates-in-pakistan-3/,پاکستان میں سونے کی قیمت آج کتنی کم ہوئی؟,کراچ کاروبار ہفت پہل روز سون قیمت رجحان رہا۔ پ...,Business
2,3,5,https://urdu.arynews.tv/cotton-production-cott...,امریکا سے معیاری روئی کی درآمد بڑھ گئی,کراچ پاکستان کپاس پیداوار باعث اسپننگ مل س معی...,Business
3,4,3,https://urdu.arynews.tv/psx-today-11-nov/,پاکستان اسٹاک ایکسچینج میں نئی تاریخ رقم,پاکستان اسٹاک ایکسچینج ن میل عبور لیا۔ کاروبار...,Business
4,5,4,https://urdu.arynews.tv/ghee-and-cooking-oil-p...,عوام کے لیے نئی مشکل : گھی اور کوکنگ آئل کی قی...,لاہور کوکنگ آئل قیمت اضاف ہوا، قمیت س تجاوز کر...,Business


Implementation of the Naive Bayes model.

In [4]:
# Implementing TF-IDF Vectorizer
def compute_tfidf(corpus):
    tf = defaultdict(Counter)
    df = Counter()
    N = len(corpus)
    
    for doc_id, doc in enumerate(corpus):
        tokens = doc.split()
        tf[doc_id].update(tokens)
        for token in set(tokens):
            df[token] += 1
    
    tfidf = defaultdict(dict)
    for doc_id, term_freqs in tf.items():
        for term, freq in term_freqs.items():
            tfidf[doc_id][term] = (freq / len(term_freqs)) * math.log(N / (df[term] + 1))
    
    return tfidf

In [5]:
# Implementing Naive Bayes Classifier
class NaiveBayesClassifier:
    def __init__(self):
        self.prior_probs = {}
        self.ngram_counts = {}
        self.total_ngrams_per_class = {}
        self.vocabulary = set()

    def train(self, data, n=1):
        class_counts = data['gold_label'].value_counts()
        total_documents = len(data)
        self.prior_probs = {cls: count / total_documents for cls, count in class_counts.items()}

        self.ngram_counts = {cls: defaultdict(int) for cls in class_counts.index}
        self.total_ngrams_per_class = {cls: 0 for cls in class_counts.index}

        for index, row in data.iterrows():
            cls = row['gold_label']
            tokens = row['content'].split()
            ngrams = [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
            self.vocabulary.update(ngrams)
            for ngram in ngrams:
                self.ngram_counts[cls][ngram] += 1
                self.total_ngrams_per_class[cls] += 1

    def predict(self, data, n=1):
        predictions = []
        vocab_size = len(self.vocabulary)

        for index, row in data.iterrows():
            tokens = row['content'].split()
            ngrams = [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
            class_scores = {}

            for cls in self.prior_probs:
                log_prob = 0
                for ngram in ngrams:
                    count = self.ngram_counts[cls].get(ngram, 0) + 1  # Laplace smoothing
                    total = self.total_ngrams_per_class[cls] + vocab_size
                    log_prob += math.log(count / total)
                class_scores[cls] = math.log(self.prior_probs[cls]) + log_prob

            predicted_class = max(class_scores, key=class_scores.get)
            predictions.append(predicted_class)

        return predictions

Training the Model.<br>
*We are using unigrams for **Naive Bayes** because unigrams gives out the best accuracies in our testing of the model.*

In [6]:
# Split into train and test
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Train the classifier
classifier = NaiveBayesClassifier()
n = 1  # Use unigrams
classifier.train(train_data, n)

# Predict on the test set
predictions = classifier.predict(test_data, n)

# Evaluate the performance
y_val = test_data['gold_label'].values
accuracy = accuracy_score(y_val, predictions)
report = classification_report(y_val, predictions)

print(f"Accuracy on test dataset: {accuracy:.4f}")
print("\nClassification Report on test dataset:")
print(report)

Accuracy on test dataset: 0.8782

Classification Report on test dataset:
                    precision    recall  f1-score   support

          Business       0.84      0.89      0.86       246
     Entertainment       0.91      0.92      0.91       284
     International       0.82      0.80      0.81       299
Science-Technology       0.86      0.83      0.85       311
            Sports       0.96      0.95      0.96       305

          accuracy                           0.88      1445
         macro avg       0.88      0.88      0.88      1445
      weighted avg       0.88      0.88      0.88      1445



##### Testing on Externally Source Data *(to mimic the real-world scenario)*

- The test is on `DAWN` dataset, which follows a similar distribution as our training set.

In [7]:
# Load the new dataset
new_df = pd.read_csv('../data/dawn_dataset_c.csv')

# Preprocess the content
new_df['content'] = new_df['content'].apply(lambda x: clean_content(x, urdu_stopwords))

# Predict using the trained Naive Bayes classifier
predictions = classifier.predict(new_df, n)

# If the new dataset has labels, evaluate the performance
if 'gold_label' in new_df.columns:
    y_true = new_df['gold_label'].values
    accuracy = accuracy_score(y_true, predictions)
    report = classification_report(y_true, predictions)
    print("Accuracy on new dataset:", accuracy)
    print("\nClassification Report on new dataset:")
    print(report)
else:
    # Add predictions to the dataframe
    new_df['predicted_labels'] = predictions

    # Save predictions to CSV
    new_df.to_csv('./data/dawn_dataset_predictions.csv', index=False)

    # Display the predictions
    print(new_df[['content', 'predicted_labels']])

Accuracy on new dataset: 0.8974358974358975

Classification Report on new dataset:
                    precision    recall  f1-score   support

          Business       0.96      0.86      0.91        63
     Entertainment       0.00      0.00      0.00         3
     International       0.90      0.97      0.93       149
Science-Technology       0.85      0.58      0.69        19
            Sports       0.00      0.00      0.00         0

          accuracy                           0.90       234
         macro avg       0.54      0.48      0.51       234
      weighted avg       0.90      0.90      0.89       234



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


- The test is on `BBC` dataset, which follows a different distribution as our training set with long articles.

In [8]:
# Load the new dataset
new_df = pd.read_csv('../data/bbc_dataset_c.csv')

# Preprocess the content
new_df['content'] = new_df['content'].apply(lambda x: clean_content(x, urdu_stopwords))

# Predict using the trained Naive Bayes classifier
predictions = classifier.predict(new_df, n)

# If the new dataset has labels, evaluate the performance
if 'gold_label' in new_df.columns:
    y_true = new_df['gold_label'].values
    accuracy = accuracy_score(y_true, predictions)
    report = classification_report(y_true, predictions)
    print("Accuracy on new dataset:", accuracy)
    print("\nClassification Report on new dataset:")
    print(report)
else:
    # Add predictions to the dataframe
    new_df['predicted_labels'] = predictions

    # Save predictions to CSV
    new_df.to_csv('./data/dawn_dataset_predictions.csv', index=False)

    # Display the predictions
    print(new_df[['content', 'predicted_labels']])

Accuracy on new dataset: 0.8458188153310104

Classification Report on new dataset:
                    precision    recall  f1-score   support

          Business       0.91      0.86      0.88       222
     Entertainment       0.85      0.89      0.87       240
     International       0.75      0.88      0.81       208
Science-Technology       0.76      0.67      0.71       239
            Sports       0.97      0.94      0.95       239

          accuracy                           0.85      1148
         macro avg       0.85      0.85      0.84      1148
      weighted avg       0.85      0.85      0.85      1148



*Thank you for bearing through this end*.<br>
For testing your dataset, please change one of the above *External Test Datasets* to your dataset and run the code. The notebook will automatically test the model on the new dataset. Please ensure that the file direcotry is correct and the dataset is in the same format as the training and testing datasets. See *Testing your dataset* section in the report for more details.
###### (c) Habibi Group, Fall 2024