In [161]:
import os
import json

import pandas as pd
import numpy as np
import json
import nltk
import re

from module.utils import get_project_root
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from typing import List


In [162]:
root_path = os.path.join(get_project_root(), 'data', 'bbc')
text_path = os.path.join(root_path, 'raw_text')

class_id_map = {
    'business': 0,
    'entertainment': 1,
    'politics': 2,
    'sport': 3,
    'tech': 4
}
df_dict = {
    'class': [],
    'text': [],
    'classid' : [], 
    'title': [],
    'filename': []
}

# Load the data into dataframe
for _class in class_id_map.keys():
    _path = os.path.join(text_path, _class)
    text_list = os.listdir(_path)
    for _text in text_list:
        with open(os.path.join(_path, _text), 'r') as f:
            text = f.read()
        title = text.split('\n')[0]
        text = text.replace(title, '')
        df_dict['class'].append(_class)
        df_dict['text'].append(text)
        df_dict['classid'].append(class_id_map[_class])
        df_dict['title'].append(title)
        df_dict['filename'].append(_text)

df = pd.DataFrame(df_dict)
df.head()
    

Unnamed: 0,class,text,classid,title,filename
0,business,\n\nUK house prices dipped slightly in Novembe...,0,UK house prices dip in November,415.txt
1,business,\n\nShares in struggling German football club ...,0,Rescue hope for Borussia Dortmund,219.txt
2,business,\n\nIndia's rupee has hit a five-year high aft...,0,India's rupee hits five-year high,018.txt
3,business,\n\nTurkey's investment in Iran's mobile indus...,0,Turkey-Iran mobile deal 'at risk',074.txt
4,business,\n\nTelecoms equipment maker Nortel Networks h...,0,Nortel in $300m profit revision,425.txt


In [163]:
len(df)

2225

In [164]:
def clean_and_split_words(text: str, use_stemming: bool = False) -> list:
    """
    Clean and split words from text.
    - Converts text to lowercase.
    - Removes special characters and numbers.
    - Removes stopwords.
    - Optionally applies stemming.
    """
    text = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())
    words = text.split()
    stop_words = set(stopwords.words('english'))
    words = [word.strip() for word in words if word not in stop_words]
    if use_stemming:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]
    return words


class TfIdfVectorizer:
    def __init__(self, sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2), stop_word_lang='english'):
        nltk.download('stopwords')
        self.sublinear_tf = sublinear_tf
        self.min_df = min_df
        self.norm = norm
        self.ngram_range = ngram_range
        self.stop_words = stopwords.words(stop_word_lang)
        self.token_map = {}
    
    def fit_transform(self, documents: pd.Series) -> np.ndarray:
        # Step 1: Tokenize and preprocess all documents
        docs_tokens = [clean_and_split_words(doc) for doc in documents]
        vocabulary = set(word for doc in docs_tokens for word in doc)
        vocab_index = {word: idx for idx, word in enumerate(vocabulary)}

        # Step 2: Calculate document frequencies (DF) for IDF calculation
        df = {word: 0 for word in vocabulary}
        for tokens in docs_tokens:
            unique_tokens = set(tokens)
            for token in unique_tokens:
                df[token] += 1

        # Step 3: Calculate IDF
        total_docs = len(documents)
        idf = {word: np.log(total_docs / (df[word] + 1)) + 1 for word in vocabulary}

        # Step 4: Calculate total frequency for all tokens
        tf = {word: 0 for word in vocabulary}
        for tokens in docs_tokens:
            for token in tokens:
                tf[token] += 1
        
        # Step 4.5 filter out words with low frequency
        vocab_index = {word: idx for idx, word in enumerate(vocabulary) if tf[word] >= self.min_df}
        vocabulary = set(vocab_index.keys())

        # Step 4: Calculate TF-IDF matrix
        tfidf_matrix = np.zeros((total_docs, len(vocabulary)))
        for doc_idx, tokens in enumerate(docs_tokens):
            doc_freq = {word: 0 for word in vocab_index.keys()}
            for token in tokens:
                if token in vocab_index:
                    doc_freq[token] += 1
            vector = np.array([doc_freq[word] * idf[word] for word in vocab_index.keys()])
            if self.sublinear_tf:
                vector = np.log(vector + 1)
            tfidf_matrix[doc_idx] = vector

        # Step 5: Apply normalization (L2 norm)
        if self.norm == 'l2':
            norms = np.linalg.norm(tfidf_matrix, axis=1, keepdims=True)
            tfidf_matrix = tfidf_matrix / norms
        elif self.norm == 'l1':
            norms = np.linalg.norm(tfidf_matrix, ord=1, axis=1, keepdims=True)
            tfidf_matrix = tfidf_matrix / norms
        elif self.norm == None:
            pass
        
        self.token_map = vocab_index

        return tfidf_matrix

In [165]:
# Usage example
texts = pd.Series(["example text data", "another set of text data"])
vectorizer = TfIdfVectorizer(norm=None)
features = vectorizer.fit_transform(texts)
print(features)

[nltk_data] Downloading package stopwords to /home/nemit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [166]:
texts = df['text']
features = vectorizer.fit_transform(texts)
features.shape
print(features)

In [167]:
from rich import print
features.shape
vectorizer.token_map

{'statistics': 2,
 'emotions': 5,
 'wow': 8,
 'yoran': 12,
 'phenomenal': 13,
 'combines': 14,
 'best': 15,
 'passage': 18,
 'durham': 23,
 'initial': 26,
 'payers': 29,
 'powerful': 32,
 'hop': 35,
 'manifesto': 38,
 'piece': 39,
 'manufacturer': 40,
 'later': 42,
 'perpignan': 46,
 'cruise': 49,
 'trick': 51,
 'thrive': 53,
 'pornographic': 54,
 'ended': 56,
 'spaniard': 59,
 'louvre': 60,
 'atp': 63,
 'resigned': 68,
 'cheap': 69,
 'drivers': 70,
 'improper': 71,
 'lambert': 75,
 'zombies': 85,
 'zone': 87,
 'conducting': 92,
 'issuing': 93,
 'foundation': 96,
 'campaign': 98,
 'fold': 99,
 'attack': 100,
 'winners': 106,
 'appointed': 108,
 'attorney': 110,
 'liverpool': 112,
 'stadium': 116,
 'overcame': 117,
 'closures': 120,
 'smashed': 121,
 'revamped': 127,
 'delete': 128,
 'engaging': 130,
 'wholesale': 133,
 'quietly': 134,
 'trade': 135,
 'lives': 136,
 'picking': 138,
 'rate': 139,
 'undermining': 140,
 'procedures': 141,
 'danish': 145,
 'kilroy': 147,
 'stealth': 148,
 '

In [168]:
from sklearn.feature_selection import chi2

N = 3
for category, category_id in sorted(class_id_map.items()):
    features_chi2 = chi2(features, df['classid'] == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(list(vectorizer.token_map.keys()))[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    print("# '{}':".format(category))
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))

In [169]:
df['embedding']=list(features)

In [170]:
df.head()

Unnamed: 0,class,text,classid,title,filename,embedding
0,business,\n\nUK house prices dipped slightly in Novembe...,0,UK house prices dip in November,415.txt,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,business,\n\nShares in struggling German football club ...,0,Rescue hope for Borussia Dortmund,219.txt,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,business,\n\nIndia's rupee has hit a five-year high aft...,0,India's rupee hits five-year high,018.txt,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,business,\n\nTurkey's investment in Iran's mobile indus...,0,Turkey-Iran mobile deal 'at risk',074.txt,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,business,\n\nTelecoms equipment maker Nortel Networks h...,0,Nortel in $300m profit revision,425.txt,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [171]:
df.set_index('filename', inplace=True)
df.head()

Unnamed: 0_level_0,class,text,classid,title,embedding
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
415.txt,business,\n\nUK house prices dipped slightly in Novembe...,0,UK house prices dip in November,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
219.txt,business,\n\nShares in struggling German football club ...,0,Rescue hope for Borussia Dortmund,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
018.txt,business,\n\nIndia's rupee has hit a five-year high aft...,0,India's rupee hits five-year high,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
074.txt,business,\n\nTurkey's investment in Iran's mobile indus...,0,Turkey-Iran mobile deal 'at risk',"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
425.txt,business,\n\nTelecoms equipment maker Nortel Networks h...,0,Nortel in $300m profit revision,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [172]:
if isinstance(df.iloc[0]['embedding'], list):
    # Convert list to numpy arrays if necessary
    df['embedding'] = df['embedding'].apply(np.array)

# Group by 'classid' and calculate the mean of embeddings
centroid_df = df.groupby('classid')['embedding'].apply(lambda x: np.mean(np.stack(x), axis=0)).reset_index()

# Print the resulting DataFrame with centroids
print(centroid_df)

In [173]:
from scipy.spatial import distance

df_test = df.copy()

# Convert embeddings to numpy arrays if they are lists
df['embedding'] = df['embedding'].apply(lambda x: np.array(x) if isinstance(x, list) else x)

# Check if centroid DataFrame has embeddings as numpy arrays
centroid_df['embedding'] = centroid_df['embedding'].apply(lambda x: np.array(x) if isinstance(x, list) else x)

# Merge the original DataFrame with the centroids DataFrame
merged_df = df.reset_index().merge(centroid_df, on='classid', suffixes=('', '_centroid'))

# Calculate Euclidean distance from each document's embedding to its category's centroid
merged_df['distance_to_centroid'] = merged_df.apply(lambda row: distance.euclidean(row['embedding'], row['embedding_centroid']), axis=1)

# Inspect the merged DataFrame for any issues
print(merged_df[['filename', 'classid', 'distance_to_centroid']].head())

# Update the original DataFrame
df['distance_to_centroid'] = merged_df.set_index('filename')['distance_to_centroid']

# Print to check if distances are still NaN
print(df[['class', 'distance_to_centroid']].head())


In [174]:
distance_variance = merged_df.groupby('classid')['distance_to_centroid'].var().reset_index(name='distance_variance')
centroid_df = centroid_df.merge(distance_variance, on='classid')

In [175]:
centroid_df

Unnamed: 0,classid,embedding,distance_variance
0,0,"[0.0036010061355603046, 0.0, 0.0, 0.0, 0.00019...",0.000237
1,1,"[0.000253538761023488, 0.000461578521913221, 0...",0.000247
2,2,"[0.002593148193533404, 0.00019137929848217345,...",0.000587
3,3,"[0.0, 0.0007328688548199845, 0.000214456346198...",0.000342
4,4,"[0.002286735726424549, 0.0004704532128799674, ...",0.000371


In [176]:
def gaussian_pdf(x, variance:float, mean=0):
    return (1 / np.sqrt(2 * np.pi * variance)) * np.exp(-0.5 * ((x - mean) ** 2) / variance)

In [177]:
df['pmf_cat0'] = df['distance_to_centroid'].apply(lambda x: gaussian_pdf(x, centroid_df.iloc[0]['distance_variance']))
df['pmf_cat1'] = df['distance_to_centroid'].apply(lambda x: gaussian_pdf(x, centroid_df.iloc[1]['distance_variance']))
df['pmf_cat2'] = df['distance_to_centroid'].apply(lambda x: gaussian_pdf(x, centroid_df.iloc[2]['distance_variance']))
df['pmf_cat3'] = df['distance_to_centroid'].apply(lambda x: gaussian_pdf(x, centroid_df.iloc[3]['distance_variance']))
df['pmf_cat4'] = df['distance_to_centroid'].apply(lambda x: gaussian_pdf(x, centroid_df.iloc[4]['distance_variance']))

In [178]:
print(df)
df.to_csv('./test.csv')