###### © Habibi Group, Fall 2024
This is the third model for the project. It uses a custom-built vectorizer to make sparse vectors for each sentence and then uses cosine distance (dot product) as the nearness measure. The model is trained on the training data and then tested on the test data.

*THIS IS THE COMBINED DATA FLAVOR 2*

In [1]:
# Importing libraries
import re
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

Preparing the dataset for the *Neural Network* model.

In [2]:
# Read the combined dataset and discard every row that has a missing value
df = pd.read_csv('../combined_data/dataset.csv').dropna()
df.head()

Unnamed: 0,id,local_id,link,title,content,gold_label
0,1,1,https://urdu.arynews.tv/car-sales-in-pakistan/,پاکستان میں گاڑیوں کی فروخت میں بڑا اضافہ,ملکی آٹو سیکٹر سے زبردست خبر آگئی۔ پاکستان می...,Business
1,2,2,https://urdu.arynews.tv/gold-rates-in-pakistan-3/,پاکستان میں سونے کی قیمت آج کتنی کم ہوئی؟,کراچی: کاروباری ہفتے کے پہلے روز سونے کی قیمت ...,Business
2,3,5,https://urdu.arynews.tv/cotton-production-cott...,امریکا سے معیاری روئی کی درآمد بڑھ گئی,کراچی: پاکستان میں کپاس کی پیداوار میں کمی کے ...,Business
3,4,3,https://urdu.arynews.tv/psx-today-11-nov/,پاکستان اسٹاک ایکسچینج میں نئی تاریخ رقم,پاکستان اسٹاک ایکسچینج نے ایک اور سنگ میل عبور...,Business
4,5,4,https://urdu.arynews.tv/ghee-and-cooking-oil-p...,عوام کے لیے نئی مشکل : گھی اور کوکنگ آئل کی قی...,لاہور : گھی اور کوکنگ آئل کی قیمتوں میں ایک با...,Business


Cleaning the data and preprocessing.

In [3]:
#Preprocessing the data
# Add this function to perform stemming
def simple_urdu_stemmer(word):
    """Strip at most one common Urdu suffix from ``word``.

    Suffixes are tried in order and only the first match is removed.
    A suffix is removed only when something is left afterwards, so the
    function never turns a non-empty word into an empty string.

    Args:
        word: A single token.

    Returns:
        The token without its matching suffix, or the token unchanged when
        no suffix matches or the token consists solely of the suffix.
    """
    # 'ہاں' is not listed: every word ending in it also ends in 'اں',
    # which is tried first, so such an entry could never be selected.
    suffixes = ('یں', 'اں', 'وں', 'ی', 'ے', 'و', 'ہ')
    for suffix in suffixes:
        # Require a non-empty remainder so a bare suffix is kept as-is.
        if len(word) > len(suffix) and word.endswith(suffix):
            return word[:-len(suffix)]
    return word

# Each stopword file is a JSON object; its top-level keys are used as the stopwords
def _read_stopword_keys(path):
    """Return the top-level keys of the JSON object stored at ``path`` as a set."""
    with open(path, 'r', encoding='utf-8') as handle:
        return set(json.load(handle).keys())

# Kaggle Urdu stopwords
urdu_stopwords = _read_stopword_keys('../data/kaggle_stopwords.json')

# Shanzae stopwords
shanzae_stopwords = _read_stopword_keys('../data/shanzae/stopwords.json')

# Yamsheen stopwords
yamsheen_stopwords = _read_stopword_keys('../data/yamsheen/stopwords.json')

# Function to clean our Urdu sentences
def clean_content(text, stopwords):
    """Normalise one article body into a space-separated string of stems.

    Steps: cast to ``str``, drop every character that is neither whitespace
    nor in the U+0600-U+06FF block, remove tokens found in ``stopwords`` or
    in the module-level ``shanzae_stopwords`` / ``yamsheen_stopwords`` sets,
    lowercase, and stem each remaining token with ``simple_urdu_stemmer``.

    Args:
        text: Raw content; any value is accepted and converted with ``str``.
        stopwords: Collection of tokens to discard.

    Returns:
        The cleaned text with tokens joined by single spaces.
    """
    # NOTE(review): the kept range also contains punctuation such as '۔' and
    # '،', so those characters stay attached to their tokens.
    urdu_only = re.sub(r'[^\u0600-\u06FF\s]', '', str(text))

    surviving = [
        token
        for token in urdu_only.split()
        if token not in stopwords
        and token not in shanzae_stopwords
        and token not in yamsheen_stopwords
    ]

    return ' '.join(simple_urdu_stemmer(token.lower()) for token in surviving)

# Clean every article body with the Kaggle stopword list, then preview the frame
df['content'] = df['content'].apply(clean_content, args=(urdu_stopwords,))
df.head()

Unnamed: 0,id,local_id,link,title,content,gold_label
0,1,1,https://urdu.arynews.tv/car-sales-in-pakistan/,پاکستان میں گاڑیوں کی فروخت میں بڑا اضافہ,ملک آٹ سیکٹر س زبردست خبر آگئی۔ پاکستان گاڑی ...,Business
1,2,2,https://urdu.arynews.tv/gold-rates-in-pakistan-3/,پاکستان میں سونے کی قیمت آج کتنی کم ہوئی؟,کراچ کاروبار ہفت پہل روز سون قیمت رجحان رہا۔ پ...,Business
2,3,5,https://urdu.arynews.tv/cotton-production-cott...,امریکا سے معیاری روئی کی درآمد بڑھ گئی,کراچ پاکستان کپاس پیداوار باعث اسپننگ مل س معی...,Business
3,4,3,https://urdu.arynews.tv/psx-today-11-nov/,پاکستان اسٹاک ایکسچینج میں نئی تاریخ رقم,پاکستان اسٹاک ایکسچینج ن میل عبور لیا۔ کاروبار...,Business
4,5,4,https://urdu.arynews.tv/ghee-and-cooking-oil-p...,عوام کے لیے نئی مشکل : گھی اور کوکنگ آئل کی قی...,لاہور کوکنگ آئل قیمت اضاف ہوا، قمیت س تجاوز کر...,Business


Implementation of the Neural Network.

In [4]:
# Implementing Multi-Label Classification Neural Network
class MultiLabelNeuralNetwork:
    """Feed-forward network of sigmoid layers trained by mini-batch gradient descent.

    Every layer, including the output layer, applies a sigmoid, so each output
    unit produces an independent score in (0, 1); ``predict`` thresholds those
    scores at 0.5.

    Args:
        input_size: Number of input features.
        hidden_sizes: Sequence holding the width of each hidden layer.
        output_size: Number of output units (one per label).
        learning_rate: Step size used for every weight and bias update.
        random_state: Optional seed. With ``None`` (default) weights and batch
            shuffling draw from NumPy's global random state; with a seed the
            instance uses its own ``RandomState`` and is reproducible.

    Attributes:
        weights, biases: Per-layer parameter arrays.
        history: Dict of per-epoch metrics. ``fit`` appends to ``'loss'`` and
            ``'accuracy'``; ``'val_loss'`` and ``'val_accuracy'`` are kept for
            compatibility and are not filled by this class.
    """

    # Lower/upper clipping margin applied before taking logs in the loss.
    _LOG_EPS = 1e-12

    def __init__(self, input_size, hidden_sizes, output_size, learning_rate=0.01,
                 random_state=None):
        self.learning_rate = learning_rate
        self.random_state = random_state
        # A private generator is created only when a seed is given; otherwise
        # the global NumPy stream is used so existing seeding keeps working.
        self._rng = None if random_state is None else np.random.RandomState(random_state)

        # Initialize weights and biases
        self.layers_sizes = [input_size] + list(hidden_sizes) + [output_size]
        self.weights = []
        self.biases = []
        self.history = {'loss': [], 'accuracy': [], 'val_loss': [], 'val_accuracy': []}

        rng = self._random_source()
        for fan_in, fan_out in zip(self.layers_sizes[:-1], self.layers_sizes[1:]):
            # Gaussian weights scaled by sqrt(2 / fan_in); biases start at zero.
            self.weights.append(rng.randn(fan_in, fan_out) * np.sqrt(2. / fan_in))
            self.biases.append(np.zeros((1, fan_out)))

    def _random_source(self):
        """Return the seeded generator, or the ``np.random`` module when unseeded."""
        return np.random if self._rng is None else self._rng

    def sigmoid(self, x):
        """Element-wise logistic function; inputs are clipped to [-500, 500] first."""
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def sigmoid_derivative(self, x):
        """Derivative of the sigmoid expressed through its output.

        ``x`` must already be a sigmoid activation (not a pre-activation).
        """
        return x * (1 - x)

    def forward_propagation(self, X):
        """Run ``X`` through every layer and return the output activations.

        The activations of all layers (input included) are stored in
        ``self.activations`` for use by ``backward_propagation``.
        """
        self.activations = [X]
        for W, b in zip(self.weights, self.biases):
            net = np.dot(self.activations[-1], W) + b
            self.activations.append(self.sigmoid(net))
        return self.activations[-1]

    def backward_propagation(self, X, y, output):
        """Compute batch-averaged gradients for all weights and biases.

        Relies on ``self.activations`` from the preceding forward pass.

        Args:
            X: Batch inputs; only the number of rows is used.
            y: Batch targets with the same shape as ``output``.
            output: Result of ``forward_propagation`` for this batch.

        Returns:
            ``(dWeights, dBiases)``, lists ordered from first to last layer.
        """
        m = X.shape[0]
        # Output-layer error term: the gradient of the binary cross-entropy
        # with respect to the pre-activation of a sigmoid output unit.
        delta = output - y

        dWeights = []
        dBiases = []

        for i in range(len(self.weights) - 1, -1, -1):
            dW = np.dot(self.activations[i].T, delta) / m
            db = np.sum(delta, axis=0, keepdims=True) / m

            if i > 0:
                # Propagate the error to the previous layer before moving on.
                delta = np.dot(delta, self.weights[i].T) * self.sigmoid_derivative(self.activations[i])

            dWeights.insert(0, dW)
            dBiases.insert(0, db)

        return dWeights, dBiases

    def fit(self, X, y, epochs=100, batch_size=32, verbose=True):
        """Train with shuffled mini-batches and record per-epoch metrics.

        After each epoch the mean binary cross-entropy and the exact-match
        accuracy, both measured on the mini-batch outputs seen during that
        epoch (i.e. before each batch's update), are appended to
        ``self.history['loss']`` and ``self.history['accuracy']``.

        Args:
            X: Dense 2-D array of samples.
            y: 2-D binary indicator array with one row per sample.
            epochs: Number of passes over the data.
            batch_size: Samples per update; must be at least 1.
            verbose: When true, print the training accuracy every 10 epochs.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        m = X.shape[0]
        rng = self._random_source()

        for epoch in range(epochs):
            # Mini-batch gradient descent
            indices = rng.permutation(m)
            X = X[indices]
            y = y[indices]

            running_loss = 0.0
            exact_matches = 0

            for i in range(0, m, batch_size):
                batch_X = X[i:i+batch_size]
                batch_y = y[i:i+batch_size]

                output = self.forward_propagation(batch_X)
                dWeights, dBiases = self.backward_propagation(batch_X, batch_y, output)

                # Metrics reuse the forward pass already computed for this batch.
                probs = np.clip(output, self._LOG_EPS, 1 - self._LOG_EPS)
                running_loss += float(-np.sum(
                    batch_y * np.log(probs) + (1 - batch_y) * np.log(1 - probs)
                ))
                exact_matches += int(np.sum(
                    np.all((output >= 0.5).astype(int) == batch_y, axis=1)
                ))

                for j in range(len(self.weights)):
                    self.weights[j] -= self.learning_rate * dWeights[j]
                    self.biases[j] -= self.learning_rate * dBiases[j]

            if m > 0:
                self.history['loss'].append(running_loss / y.size)
                self.history['accuracy'].append(exact_matches / m)

            if verbose and epoch % 10 == 0:
                predictions = self.predict(X)
                accuracy = np.mean(np.all(predictions == y, axis=1))
                print(f"Epoch {epoch}, Accuracy: {accuracy:.4f}")

    def predict(self, X):
        """Return a 0/1 integer array: 1 where the output score is at least 0.5."""
        output = self.forward_propagation(X)
        return (output >= 0.5).astype(int)

In [5]:
# Custom Multi-Label Binarizer from Scratch, One-Hot encoding
class CustomMultiLabelBinarizer:
    """Convert label collections to and from binary indicator rows.

    After ``fit``, ``classes_`` holds the sorted distinct labels; column ``j``
    of every indicator row corresponds to ``classes_[j]``.
    """

    def __init__(self):
        # Sorted list of known labels; None until fit() has been called.
        self.classes_ = None

    def fit(self, y):
        """Learn the set of labels.

        Args:
            y: Iterable whose items are either a single label string or an
                iterable of labels.

        Returns:
            ``self``, to allow chaining.
        """
        unique_labels = set()
        for labels in y:
            # A bare string is one label, not an iterable of characters.
            if isinstance(labels, str):
                unique_labels.add(labels)
            else:
                unique_labels.update(labels)

        self.classes_ = sorted(unique_labels)
        return self

    def transform(self, y):
        """Encode ``y`` as a list of 0/1 rows, one row per item.

        Labels that were not seen during ``fit`` are ignored.

        Raises:
            ValueError: If called before ``fit``.
        """
        if self.classes_ is None:
            raise ValueError("Call fit before transform")

        n_classes = len(self.classes_)
        label_to_idx = {label: idx for idx, label in enumerate(self.classes_)}

        binary_matrix = []
        for labels in y:
            if isinstance(labels, str):
                labels = [labels]
            row = [0] * n_classes
            for label in labels:
                idx = label_to_idx.get(label)
                if idx is not None:
                    row[idx] = 1
            binary_matrix.append(row)

        return binary_matrix

    def fit_transform(self, y):
        """Equivalent to ``fit(y)`` followed by ``transform(y)``."""
        return self.fit(y).transform(y)

    def inverse_transform(self, binary_matrix):
        """Decode indicator rows back into tuples of labels.

        Args:
            binary_matrix: Iterable of rows (lists or array rows); every
                truthy entry in column ``j`` selects ``classes_[j]``.

        Returns:
            A list with one tuple of labels per row, in ``classes_`` order.

        Raises:
            ValueError: If called before ``fit``.
        """
        if self.classes_ is None:
            raise ValueError("Call fit before inverse_transform")

        return [
            tuple(label for label, flag in zip(self.classes_, row) if flag)
            for row in binary_matrix
        ]

In [6]:
# Count Vectorizer implementation
class SimpleCountVectorizer:
    """Bag-of-words vectorizer restricted to the most frequent tokens.

    ``fit`` ranks tokens by total frequency (ties broken alphabetically) and
    keeps the first ``max_features`` of them; ``transform`` counts those
    tokens per document and returns a sparse CSR matrix.
    """

    def __init__(self, max_features=1000):
        self.max_features = max_features
        self.vocabulary_ = {}   # token -> column index
        self.vocab_size = 0     # number of columns produced by transform()

    def _tokenize(self, text):
        """Lowercase ``text`` and return its runs of word characters."""
        return re.findall(r'\b\w+\b', text.lower())

    def fit(self, documents):
        """Build the vocabulary from ``documents`` and return ``self``."""
        frequencies = defaultdict(int)
        for document in documents:
            for term in self._tokenize(document):
                frequencies[term] += 1

        # Most frequent first; equal counts are ordered by the token itself.
        ranked = sorted(frequencies.items(), key=lambda item: (-item[1], item[0]))
        kept_terms = [term for term, _ in ranked[:self.max_features]]

        self.vocabulary_ = dict(zip(kept_terms, range(len(kept_terms))))
        self.vocab_size = len(self.vocabulary_)
        return self

    def transform(self, documents):
        """Return a ``(len(documents), vocab_size)`` CSR matrix of token counts."""
        row_ids, col_ids, counts = [], [], []

        for row, document in enumerate(documents):
            in_vocab = defaultdict(int)
            for term in self._tokenize(document):
                if term in self.vocabulary_:
                    in_vocab[term] += 1

            # One (row, column, count) triple per distinct known token.
            for term, occurrences in in_vocab.items():
                row_ids.append(row)
                col_ids.append(self.vocabulary_[term])
                counts.append(occurrences)

        # Create sparse matrix
        matrix_shape = (len(documents), self.vocab_size)
        return csr_matrix((counts, (row_ids, col_ids)), shape=matrix_shape)

    def fit_transform(self, documents):
        """Fit on ``documents`` and return their transformed matrix."""
        return self.fit(documents).transform(documents)

Training the Model, and testing it.

In [7]:
# Prepare the data: build a 1000-token vocabulary from the cleaned articles
# and turn every article into a sparse row of token counts
vectorizer = SimpleCountVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['content'])

# Prepare labels: each gold label is wrapped in a one-element list, so every
# row of y has a single 1 in the column of its class
mlb = CustomMultiLabelBinarizer()
y = np.array(mlb.fit_transform([[label] for label in df['gold_label']]))

# Split data: 80% train / 20% test, with a fixed seed for the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to dense arrays before handing the data to the network
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Initialize and train the network: layer widths follow the data
# (one input per vocabulary column, one output per class) with two hidden
# layers of 256 and 128 units in between
input_size = X_train_dense.shape[1]
hidden_sizes = [256, 128]
output_size = y_train.shape[1]

nn = MultiLabelNeuralNetwork(
    input_size=input_size,
    hidden_sizes=hidden_sizes,
    output_size=output_size,
    learning_rate=0.01
)

# Train (prints the training accuracy every 10 epochs)
nn.fit(X_train_dense, y_train, epochs=250, batch_size=32)

# Evaluate on the held-out split; predictions are 0/1 indicator rows, so a
# sample counts as correct only when its whole row matches
y_pred = nn.predict(X_test_dense)
print("\nNeural Network Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

Epoch 0, Accuracy: 0.0000
Epoch 10, Accuracy: 0.5182
Epoch 20, Accuracy: 0.7606
Epoch 30, Accuracy: 0.8252
Epoch 40, Accuracy: 0.8555
Epoch 50, Accuracy: 0.8699
Epoch 60, Accuracy: 0.8821
Epoch 70, Accuracy: 0.8920
Epoch 80, Accuracy: 0.9036
Epoch 90, Accuracy: 0.9095
Epoch 100, Accuracy: 0.9178
Epoch 110, Accuracy: 0.9235
Epoch 120, Accuracy: 0.9270
Epoch 130, Accuracy: 0.9316
Epoch 140, Accuracy: 0.9356
Epoch 150, Accuracy: 0.9410
Epoch 160, Accuracy: 0.9455
Epoch 170, Accuracy: 0.9502
Epoch 180, Accuracy: 0.9508
Epoch 190, Accuracy: 0.9521
Epoch 200, Accuracy: 0.9553
Epoch 210, Accuracy: 0.9579
Epoch 220, Accuracy: 0.9590
Epoch 230, Accuracy: 0.9612
Epoch 240, Accuracy: 0.9628

Neural Network Accuracy: 0.8207612456747405

Classification Report:
                    precision    recall  f1-score   support

          Business       0.83      0.79      0.81       246
     Entertainment       0.93      0.90      0.92       284
     International       0.78      0.74      0.76       299
S

  _warn_prf(average, modifier, msg_start, len(result))


*Note: It gives an accuracy of about 82.1% on the test dataset (see the output above).*

##### Testing on Externally Sourced Data *(to mimic the real-world scenario)*

- The test is on the `DAWN` dataset, which follows a similar distribution to our training set.

In [8]:
# Load the new dataset
new_df = pd.read_csv('../data/dawn_dataset_c.csv')

# Preprocess the content with the same cleaning used for the training data
new_df['content'] = new_df['content'].apply(lambda x: clean_content(x, urdu_stopwords))

# Transform the new data using the existing vectorizer (vocabulary is not refit)
X_new = vectorizer.transform(new_df['content'])

# Convert to dense array
X_new_dense = X_new.toarray()

# Predict using the trained model
y_pred_new = nn.predict(X_new_dense)

# If the new dataset has labels, evaluate the performance
if 'gold_label' in new_df.columns:
    # Prepare labels: a cell may hold several comma-separated labels; missing
    # cells become an empty label list
    new_df['gold_label'] = new_df['gold_label'].fillna('')
    new_df['gold_label'] = new_df['gold_label'].str.split(',')
    new_df['gold_label'] = new_df['gold_label'].apply(lambda x: [label.strip() for label in x if label.strip()])
    y_new = mlb.transform(new_df['gold_label'])

    # Evaluate performance
    print("Accuracy on new dataset:", accuracy_score(y_new, y_pred_new))
    print("\nClassification Report on new dataset:")
    print(classification_report(y_new, y_pred_new, target_names=mlb.classes_))
else:
    # Convert predictions to labels: map each active column back to its class
    # name through mlb.classes_ (does not rely on an inverse_transform method
    # being available on the binarizer)
    predicted_labels = [
        [mlb.classes_[col] for col, flag in enumerate(row) if flag]
        for row in y_pred_new
    ]

    # Add predictions to the dataframe
    new_df['predicted_labels'] = [';'.join(labels) for labels in predicted_labels]

    # Save predictions to CSV
    # NOTE(review): inputs are read from '../data/' but this writes under
    # './data/' - confirm the intended output directory.
    new_df.to_csv('./data/dawn_dataset_predictions.csv', index=False)

    # Display the predictions
    print(new_df[['content', 'predicted_labels']])

Accuracy on new dataset: 0.8803418803418803

Classification Report on new dataset:
                    precision    recall  f1-score   support

          Business       0.98      0.83      0.90        63
     Entertainment       0.00      0.00      0.00         3
     International       0.90      0.97      0.93       149
Science-Technology       0.88      0.74      0.80        19
            Sports       0.00      0.00      0.00         0

         micro avg       0.89      0.90      0.90       234
         macro avg       0.55      0.51      0.53       234
      weighted avg       0.91      0.90      0.90       234
       samples avg       0.89      0.90      0.89       234



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


- The test is on the `BBC` dataset, which has long articles and follows a different distribution from our training set.

In [9]:
# Load the new dataset
new_df = pd.read_csv('../data/bbc_dataset_c.csv')

# Preprocess the content with the same cleaning used for the training data
new_df['content'] = new_df['content'].apply(lambda x: clean_content(x, urdu_stopwords))

# Transform the new data using the existing vectorizer (vocabulary is not refit)
X_new = vectorizer.transform(new_df['content'])

# Convert to dense array
X_new_dense = X_new.toarray()

# Predict using the trained model
y_pred_new = nn.predict(X_new_dense)

# If the new dataset has labels, evaluate the performance
if 'gold_label' in new_df.columns:
    # Prepare labels: a cell may hold several comma-separated labels; missing
    # cells become an empty label list
    new_df['gold_label'] = new_df['gold_label'].fillna('')
    new_df['gold_label'] = new_df['gold_label'].str.split(',')
    new_df['gold_label'] = new_df['gold_label'].apply(lambda x: [label.strip() for label in x if label.strip()])
    y_new = mlb.transform(new_df['gold_label'])

    # Evaluate performance
    print("Accuracy on new dataset:", accuracy_score(y_new, y_pred_new))
    print("\nClassification Report on new dataset:")
    print(classification_report(y_new, y_pred_new, target_names=mlb.classes_))
else:
    # Convert predictions to labels: map each active column back to its class
    # name through mlb.classes_ (does not rely on an inverse_transform method
    # being available on the binarizer)
    predicted_labels = [
        [mlb.classes_[col] for col, flag in enumerate(row) if flag]
        for row in y_pred_new
    ]

    # Add predictions to the dataframe
    new_df['predicted_labels'] = [';'.join(labels) for labels in predicted_labels]

    # Save predictions to CSV under a BBC-specific name so this cell does not
    # overwrite the predictions file written for the DAWN dataset
    # NOTE(review): inputs are read from '../data/' but this writes under
    # './data/' - confirm the intended output directory.
    new_df.to_csv('./data/bbc_dataset_predictions.csv', index=False)

    # Display the predictions
    print(new_df[['content', 'predicted_labels']])

Accuracy on new dataset: 0.6846689895470384

Classification Report on new dataset:
                    precision    recall  f1-score   support

          Business       0.90      0.64      0.75       222
     Entertainment       0.76      0.94      0.84       240
     International       0.68      0.84      0.75       208
Science-Technology       0.53      0.67      0.60       239
            Sports       0.93      0.96      0.95       239

         micro avg       0.74      0.81      0.78      1148
         macro avg       0.76      0.81      0.78      1148
      weighted avg       0.76      0.81      0.78      1148
       samples avg       0.75      0.81      0.77      1148



  _warn_prf(average, modifier, msg_start, len(result))


*Thank you for bearing with us to the end*.<br>
For testing your dataset, please change one of the above *External Test Datasets* to your dataset and run the code. The notebook will automatically test the model on the new dataset. Please ensure that the file directory is correct and the dataset is in the same format as the training and testing datasets. See the *Testing your dataset* section in the report for more details.
###### (c) Habibi Group, Fall 2024