In [80]:
import string

import nltk
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import sklearn.model_selection

# Small constant added to the weighted term probability before taking the log
# during inference, so that a zero product does not evaluate log(0).
EPSILON = 1e-5

In [81]:
# Load the labelled articles and split them 50/50 into train and test frames.
data_df = pd.read_csv('data/train.csv')
train_df, test_df = sklearn.model_selection.train_test_split(
    data_df,
    test_size=0.5,
    random_state=42,
)
print(f'Train set shape: {train_df.shape}')
print(f'Test set shape: {test_df.shape}')

# Map every category seen in the training split to a consecutive integer id
# (np.unique returns the categories sorted, so ids follow that order).
labels_unq = np.unique(train_df['Category'])
labels_map = dict(zip(labels_unq, range(len(labels_unq))))
print(labels_map)

Train set shape: (745, 3)
Test set shape: (745, 3)
{'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}


In [82]:
def lower(df):
    """Return a copy of ``df`` whose 'Text' column is converted to lower case.

    The input frame is not modified.
    """
    def _to_lower(text):
        return text.lower()

    lowered = df.copy()
    lowered['Text'] = lowered['Text'].map(_to_lower)
    return lowered

def remove_punctuations(df):
    """Return a copy of ``df`` with ASCII punctuation stripped from 'Text'.

    Every character listed in ``string.punctuation`` is deleted (not replaced
    by a space); all other characters are kept as they are. The input frame
    is not modified.
    """
    # Translation table that maps each punctuation character to "delete".
    strip_table = str.maketrans('', '', string.punctuation)

    cleaned = df.copy()
    cleaned['Text'] = cleaned['Text'].map(lambda text: text.translate(strip_table))
    return cleaned

def tokenize(df):
    """Return a copy of ``df`` with a new 'tokens' column.

    Each entry of 'tokens' is the result of ``nltk.word_tokenize`` applied to
    the corresponding 'Text' value. The input frame is not modified.
    """
    def _split(text):
        return nltk.word_tokenize(text)

    tokenized = df.copy()
    tokenized['tokens'] = tokenized['Text'].map(_split)
    return tokenized

def filter_stopwords(df, stopwords=None):
    """Return a copy of ``df`` with stop words removed from every token list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'tokens' column holding lists of tokens.
    stopwords : iterable of str, optional
        Words to drop. Defaults to NLTK's English stop-word list.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df``; the input frame is not modified.
    """
    if stopwords is None:
        stopwords = nltk.corpus.stopwords.words('english')
    # A frozenset gives O(1) membership tests; the NLTK list would otherwise
    # be scanned linearly once for every token in the corpus.
    stopword_set = frozenset(stopwords)

    df = df.copy()
    df['tokens'] = df['tokens'].apply(
        lambda tokens: [word for word in tokens if word not in stopword_set]
    )
    return df

def stem(df):
    """Return a copy of ``df`` with every token replaced by its Porter stem.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'tokens' column holding lists of tokens.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df``; the input frame is not modified.
    """
    df = df.copy()
    # Only the Porter stemmer is used; no lemmatizer is needed here.
    porter = nltk.stem.porter.PorterStemmer()
    df['tokens'] = df['tokens'].apply(lambda tokens: [porter.stem(word) for word in tokens])
    return df

def preprocess(df):
    """Run the full text pipeline on ``df`` and return the processed frame.

    The steps are applied in this order: lower-casing, punctuation removal,
    tokenization, stop-word filtering and stemming.
    """
    pipeline = (lower, remove_punctuations, tokenize, filter_stopwords, stem)
    result = df
    for step in pipeline:
        result = step(result)
    return result

# Apply the same preprocessing pipeline to both splits.
train_df_preprocessed = preprocess(train_df)
test_df_preprocessed = preprocess(test_df)

In [83]:
def get_vocab(df):
    """Build the vocabulary of all distinct tokens in ``df['tokens']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'tokens' column holding lists of tokens.

    Returns
    -------
    vocab2idx : dict
        Maps each token to its integer index.
    idx2vocab : dict
        Inverse mapping, from integer index to token.

    Notes
    -----
    Tokens are indexed in sorted order, so the mapping is identical on every
    run. An empty frame yields two empty dictionaries. The vocabulary size is
    printed as a side effect.
    """
    # Collect tokens row by row; summing the column of lists would build a new,
    # ever-growing list for each row and fails on an empty column.
    unique_words = sorted({word for tokens in df['tokens'] for word in tokens})
    vocab2idx = {word: idx for idx, word in enumerate(unique_words)}
    idx2vocab = dict(enumerate(unique_words))
    print(f'Vocabulary size = {len(unique_words)}')

    return vocab2idx, idx2vocab

# The vocabulary is built from the preprocessed training split only.
vocab2idx, idx2vocab = get_vocab(train_df_preprocessed)

Vocabulary size = 14306


In [84]:
def get_tf_mat(df, vocab2idx):
    """Count how often each vocabulary term occurs in each category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Category' and 'tokens' columns.
    vocab2idx : dict
        Maps each term to its column index in the result.

    Returns
    -------
    numpy.ndarray
        int32 matrix of shape ``(len(labels_map), len(vocab2idx))``; entry
        ``[label, idx]`` is the number of occurrences of the term in the
        documents of that category.

    Raises
    ------
    KeyError
        If a token is missing from ``vocab2idx``.

    Notes
    -----
    Reads the module-level ``labels_map`` (category -> row index). A category
    without any document in ``df`` simply keeps an all-zero row.
    """
    tf_mat = np.zeros((len(labels_map), len(vocab2idx)), dtype=np.int32)
    for category, label in labels_map.items():
        # Walk the token lists directly; summing the column of lists is
        # quadratic and returns 0 (not a list) when the category is empty.
        for tokens in df.loc[df['Category'] == category, 'tokens']:
            for term in tokens:
                tf_mat[label, vocab2idx[term]] += 1
    return tf_mat

def get_icf_mat(tf_mat):
    """Compute the inverse category frequency of every term.

    For each column of ``tf_mat`` the number of rows with a positive count is
    taken as the category frequency; the result is
    ``log10(number_of_rows / category_frequency)`` with shape ``(1, n_terms)``.
    """
    n_categories = tf_mat.shape[0]
    appears_in_category = (tf_mat > 0).astype(int)
    category_freq = appears_in_category.sum(axis=0, keepdims=True)
    return np.log10(n_categories / category_freq)

def get_tf_icf_mat(tf_mat, icf_mat):
    """Return the element-wise product of ``tf_mat`` and ``icf_mat``.

    The two arguments are combined with NumPy broadcasting.
    """
    weighted = np.multiply(tf_mat, icf_mat)
    return weighted

# Term counts per category, their inverse category frequency, and the product
# of the two, all derived from the preprocessed training split.
tf_mat = get_tf_mat(train_df_preprocessed, vocab2idx)
icf_mat = get_icf_mat(tf_mat)
tf_icf_mat = get_tf_icf_mat(tf_mat, icf_mat)

In [85]:
def get_probabilities_mat(df, tf_mat):
    """Estimate term likelihoods per category and the category priors.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Category' column; used for the priors.
    tf_mat : numpy.ndarray
        Term-count matrix with one row per category.

    Returns
    -------
    prob_mat : numpy.ndarray
        ``tf_mat`` with every row divided by its row sum. A row whose sum is
        zero stays all-zero instead of becoming NaN.
    class_prob_mat : numpy.ndarray
        Vector of length ``len(labels_map)``; entry ``labels_map[c]`` is the
        fraction of rows of ``df`` belonging to category ``c``.

    Raises
    ------
    KeyError
        If ``df`` contains a category that is missing from ``labels_map``.

    Notes
    -----
    Reads the module-level ``labels_map`` (category -> index).
    """
    tf_mat = np.asarray(tf_mat)
    denominator = np.sum(tf_mat, axis=1, keepdims=True)
    # Divide only where the row total is positive; other rows keep the zeros
    # of ``out`` rather than turning into 0/0 = NaN.
    prob_mat = np.divide(
        tf_mat,
        denominator,
        out=np.zeros(tf_mat.shape, dtype=float),
        where=denominator > 0,
    )

    categories, category_counts = np.unique(df['Category'], return_counts=True)
    total_count = np.sum(category_counts)
    # Size the prior vector from the label map instead of assuming 5 classes.
    class_prob_mat = np.zeros((len(labels_map),))
    for category, count in zip(categories, category_counts):
        class_prob_mat[labels_map[category]] = count / total_count

    return prob_mat, class_prob_mat

# Term likelihoods and category priors estimated from the training split.
prob_mat, class_prob_mat = get_probabilities_mat(train_df_preprocessed, tf_mat)


In [86]:
def get_inference(df, tf_icf_mat, prob_mat, class_prob_mat, vocab2idx):
    """Predict a label for every row of ``df``.

    The score of label ``c`` for a document is::

        log(class_prob_mat[c])
            + sum over tokens t of log(prob_mat[c, t] * tf_icf_mat[c, t] + EPSILON)

    and the label with the highest score is chosen (the first one in
    ``labels_map`` order on ties).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'tokens' column holding lists of tokens.
    tf_icf_mat, prob_mat : numpy.ndarray
        Matrices with one row per label and one column per vocabulary term.
    class_prob_mat : numpy.ndarray
        Prior probability of every label.
    vocab2idx : dict
        Maps each known term to its column index.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose 'tokens' lists only keep terms present in
        ``vocab2idx`` and which has a new 'prediction' column with the
        predicted label ids. The input frame is not modified.

    Notes
    -----
    Reads the module-level ``labels_map`` and ``EPSILON``.
    """
    df = df.copy()
    df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word in vocab2idx])

    # Row order of the score arrays follows labels_map iteration order; the
    # winning position is mapped back to its label id below.
    labels = list(labels_map.values())
    log_prior = np.log(np.asarray(class_prob_mat)[labels])
    # The per-term log weights do not depend on the document, so compute them
    # once instead of once per token, class and document.
    log_weight = np.log(
        np.asarray(prob_mat)[labels] * np.asarray(tf_icf_mat)[labels] + EPSILON
    )

    def infer(tokens):
        term_ids = np.array([vocab2idx[term] for term in tokens], dtype=np.intp)
        scores = log_prior + log_weight[:, term_ids].sum(axis=1)
        return labels[int(np.argmax(scores))]

    df['prediction'] = df['tokens'].apply(infer)
    return df

# Predict labels for the held-out split and for the training split itself,
# using the matrices estimated from the training data.
test_df_pred = get_inference(test_df_preprocessed, tf_icf_mat, prob_mat, class_prob_mat, vocab2idx)
train_df_pred = get_inference(train_df_preprocessed, tf_icf_mat, prob_mat, class_prob_mat, vocab2idx)

In [87]:
def get_metrics(df):
    """Build a scikit-learn classification report for the predictions in ``df``.

    The true labels are obtained by translating 'Category' through the
    module-level ``labels_map``; the predicted labels come from the
    'prediction' column. The value returned by
    ``sklearn.metrics.classification_report`` is passed through unchanged.
    """
    expected = []
    for category in df['Category']:
        expected.append(labels_map[category])
    predicted = df['prediction'].tolist()

    return sklearn.metrics.classification_report(expected, predicted)

In [88]:
# Classification reports for the training and the held-out predictions.
train_report = get_metrics(train_df_pred)
test_report = get_metrics(test_df_pred)

In [89]:
# Metrics on the training split (rows are the integer label ids from labels_map).
print(train_report)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       165
           1       0.99      1.00      1.00       143
           2       1.00      1.00      1.00       132
           3       1.00      0.99      1.00       182
           4       1.00      0.99      1.00       123

    accuracy                           1.00       745
   macro avg       1.00      1.00      1.00       745
weighted avg       1.00      1.00      1.00       745



In [90]:
# Metrics on the held-out test split (rows are the integer label ids from labels_map).
print(test_report)

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       171
           1       0.98      0.98      0.98       130
           2       0.98      0.96      0.97       142
           3       0.99      1.00      1.00       164
           4       0.95      0.99      0.97       138

    accuracy                           0.98       745
   macro avg       0.98      0.98      0.98       745
weighted avg       0.98      0.98      0.98       745

