# Exercise B: Traditional Text Classification
#### Tzanis Nikolaos mtn2217

First of all we import all the necessary libraries.

In [51]:
import numpy as np
import pandas as pd

from time import time
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score


Then we use pandas to read the train and test CSVs and convert them into DataFrames.

In [52]:
train_df_raw = pd.read_csv('../data/train.csv')
test_df_raw = pd.read_csv('../data/test.csv')
train_df_raw.head(10)

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."
5,3,"Stocks End Up, But Near Year Lows (Reuters)",Reuters - Stocks ended slightly higher on Frid...
6,3,Money Funds Fell in Latest Week (AP),AP - Assets of the nation's retail money marke...
7,3,Fed minutes show dissent over inflation (USATO...,USATODAY.com - Retail sales bounced back a bit...
8,3,Safety Net (Forbes.com),Forbes.com - After earning a PH.D. in Sociolog...
9,3,Wall St. Bears Claw Back Into the Black,"NEW YORK (Reuters) - Short-sellers, Wall Stre..."


The next step is to merge titles and descriptions and convert everything to lowercase. For that purpose we create a function that is going to be called for both the training and the test set.

In [53]:
def mergeTitlesAndDescriptionsAndConvertToLowercase(dataframe):
    new_dataframe = pd.DataFrame({'Class Index': dataframe['Class Index'], 'Text': dataframe['Title'].str.lower() + ' ' + dataframe['Description'].str.lower()})
    return new_dataframe

The above function is called for the training set first:

In [54]:
train_df = mergeTitlesAndDescriptionsAndConvertToLowercase(train_df_raw)

X = train_df['Text'].copy()
y = train_df['Class Index'].copy()

And then for the test set:

In [55]:
test_df = mergeTitlesAndDescriptionsAndConvertToLowercase(test_df_raw)

### Model Training

Next we train the models and record the training time for each one of them. For that purpose we create a function that is goint to be called for all models. This function returns the predictions of each model and the results for Accuracy, Dimensionality and Time Cost.

In [56]:
def trainModel(analyzer, ngram_range, model):
    params = {
    "analyzer": analyzer,
    "stop_words": 'english',
    "ngram_range": ngram_range
    }

    start = time()
    vectorizer = TfidfVectorizer(**params)

    X_train = vectorizer.fit_transform(train_df['Text'])
    X_test = vectorizer.transform(test_df['Text'])

    clf = model
    clf.fit(X_train, y)
    end = time()

    time_needed = np.round(end-start, 2)

    y_pred = clf.predict(X_test)
    accuracy = np.round(accuracy_score(test_df['Class Index'], y_pred), 2)
    print("Accuracy:", accuracy)
    print("Time needed:", time_needed)
    print("Dimensionality:", len(vectorizer.vocabulary_))
    return y_pred, [accuracy, len(vectorizer.vocabulary_), time_needed] 

The array 'results' is created to store the results of each model in regards to Accuracy, Dimensionality and Time Cost, in order to create the Dataframe that will display the results later.

In [57]:
results = []

Then we call the function for each of the models, starting with Multinomial Naive Bayes with word unigrams.

In [58]:
nb_word_unigrams_preds, result = trainModel('word', (1,1), MultinomialNB())
results.append(result)

Accuracy: 0.9
Time needed: 3.96
Dimensionality: 64695


Then for the Multinomial Naive Bayes with character trigrams.

In [59]:
nb_char_trigrams_preds, result = trainModel('char', (3,3), MultinomialNB())
results.append(result)

Accuracy: 0.87
Time needed: 18.02
Dimensionality: 31074


Then for the SVM Linear kernel with C=1 and word unigrams.

In [60]:
svm_word_unigrams_preds, result = trainModel('word', (1,1), LinearSVC(C=1))
results.append(result)

Accuracy: 0.92
Time needed: 7.32
Dimensionality: 64695


Then for the SVM Linear kernel with C=1 and character trigrams.

In [61]:
svm_char_trigrams_preds, result = trainModel('char', (3,3), LinearSVC(C=1))
results.append(result)

Accuracy: 0.91
Time needed: 27.6
Dimensionality: 31074


Finally, using the results array created earlier we create a pandas DataFrame to display the results in the matrix form requested.

In [62]:
results_df = pd.DataFrame(results, columns=['Accuracy', 'Dimensionality', 'Time Cost'])
results_df = results_df.rename(index={0: 'NB (word 1-grams)', 1: 'NB (char 3-grams)', 2: 'SVM (word 1-grams)', 3: 'SVM (char 3-grams)'})
display(results_df.T)

Unnamed: 0,NB (word 1-grams),NB (char 3-grams),SVM (word 1-grams),SVM (char 3-grams)
Accuracy,0.9,0.87,0.92,0.91
Dimensionality,64695.0,31074.0,64695.0,31074.0
Time Cost,3.96,18.02,7.32,27.6


### Exploration of Misclassified Texts

Next we are going to explore various texts that were misclassified from the various models and delve deeper trying to identify texts that were misclassified by all the models. After that, we print a random text that was classified incorrectly by all the models.

In [86]:
import random

misclassified_texts = []
for i in range(len(test_df)):
    if ((nb_word_unigrams_preds[i] != test_df['Class Index'][i]) and (nb_char_trigrams_preds[i] != test_df['Class Index'][i]) and (svm_word_unigrams_preds[i] != test_df['Class Index'][i]) and (svm_char_trigrams_preds[i] != test_df['Class Index'][i])):
        misclassified_texts.append(i)

if len(misclassified_texts) != 0:
    print('An example text was was misclassified by all the models was:')
    print(test_df.iloc[random.choice(misclassified_texts)]['Text'])
else:
    print('All texts were classified correctly by at least one model.')

An example text was was misclassified by all the models was:
bbc 'must keep up' bbc boss mark thompson says the corporation must keep up with change, after announcing nearly 3,000 job cuts.


Then using the test dataframe we explore how many texts were classified incorrectly per category. Actually, by using the index of the misclassified texts we count how many times this particular index was misclassified in the test dataframe. There is also a line of code that converts numeric indexes to strings for better visibility.

In [77]:
misclassified_per_category = test_df.iloc[misclassified_texts]['Class Index'].value_counts().sort_index()
misclassified_per_category.index = misclassified_per_category.index.map({1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'})
print('Number of texts that were misclassified by all models per category:')
print(misclassified_per_category)

Number of texts that were misclassified by all models per category:
World       112
Sports        9
Business    138
Sci/Tech     88
Name: Class Index, dtype: int64


For the last part of the exercise a dictionary to save the misclassification counts is initialized. Then we go through all the misclassified texts, creating tuples where the first number represents the correct category and the second number represents the wrong prediction of the models. Using the Counter library we get the total counts of each tuple and then print the one that has the highest count.

In [90]:

misclassification_counts = {}

for i in misclassified_texts:
    true_label = test_df.iloc[i]['Class Index']
    
    predicted_labels = [nb_word_unigrams_preds[i], nb_char_trigrams_preds[i], svm_word_unigrams_preds[i], svm_char_trigrams_preds[i]]    
    predicted_label_counts = Counter(predicted_labels)
    
    del predicted_label_counts[true_label]
    
    for predicted_label, count in predicted_label_counts.items():
        pair = (true_label, predicted_label)
        misclassification_counts[pair] = misclassification_counts.get(pair, 0) + count

most_frequent_pair, count = max(misclassification_counts.items(), key=lambda x: x[1])

map_dict = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}

print(f'The most common pair of correct category and wrong prediction was {tuple(map_dict[num] for num in most_frequent_pair)}. It was encountered a total of {count} times.')

The most common pair of correct category and wrong prediction was ('Business', 'Sci/Tech'). It was encountered a total of 395 times.
