In [None]:
# utilities
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# preprocessing
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# metrics & model selection
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# Read data
First the data provided by `build_dataset.py` is read into separate dataframes:
- `train_df` containing the training data
- `val_df` containing the validation data

In [None]:
# Load the training and validation splits into two separate dataframes.
train_df, val_df = (
    pd.read_csv('data/train.csv'),
    pd.read_csv('data/val.csv'),
)

In [None]:
# (rows, columns) of the training split
train_df.shape

In [None]:
# (rows, columns) of the validation split
val_df.shape

# Preprocessing

We use the spaCy library to preprocess our data. TF-IDF is calculated using only the frequency of a term in a document, the number of documents in the corpus and the number of documents a term appears in, so there is no need for us to add tags to the tokens. Therefore the preprocessing pipeline includes only:
- Tokenisation
- Removal of punctuation, whitespace and non-alphabetic tokens
- Lemmatisation

The lemmas of the tokens are then used to calculate the TF-IDF scores. 

In [None]:
# Install the spaCy English model `en_core_web_sm`; the preprocessing cells
# load it with `spacy.load('en_core_web_sm')`.
# NOTE(review): `python` is resolved by the shell and may not be the kernel's
# interpreter — confirm the model is installed into the kernel's environment.
!python -m spacy download en_core_web_sm

In [None]:
# Load the spaCy English pipeline used for tokenisation and lemmatisation.
nlp = spacy.load('en_core_web_sm')

# Components switched off while tokenising: named entities and dependency
# parses are never read by the tokenizer, so computing them is wasted work.
#
# The tagger is deliberately NOT in this list. In spaCy v3 the rule-based
# lemmatizer shipped with `en_core_web_sm` needs the part-of-speech annotation
# derived from the tagger's output; with the tagger disabled `token.lemma_`
# degrades to the lower-cased token text (spaCy warns with W108) and the
# lemmatisation step silently does nothing.
# NOTE(review): under spaCy v2 the tagger is not strictly required (a lookup
# lemmatizer is used instead), but keeping it enabled is valid there as well.
unwanted_pipes = ["ner", "parser"]

In [None]:
# Custom tokenizer that is handed to sklearn's TfidfVectorizer.
def spacy_tokenizer(doc):
    """Return the lemmas of the alphabetic tokens found in `doc`.

    `doc` is run through the module-level `nlp` pipeline while the
    components listed in `unwanted_pipes` are temporarily disabled.
    Tokens flagged as punctuation or whitespace, and tokens that are not
    purely alphabetic, are dropped; for every remaining token its
    `lemma_` is collected, in document order.
    """
    lemmas = []
    with nlp.disable_pipes(*unwanted_pipes):
        for token in nlp(doc):
            # Skip anything that is not a plain alphabetic word.
            if token.is_punct or token.is_space or not token.is_alpha:
                continue
            lemmas.append(token.lemma_)
    return lemmas

In [None]:
# calculate tf-idf vectors
# `token_pattern=None` states explicitly that the default token regex is not
# used when a custom tokenizer is supplied; leaving the default in place makes
# scikit-learn emit a "token_pattern will not be used" UserWarning.
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)

# The vocabulary and IDF weights are learned from the training text only and
# then re-used unchanged for the validation text.
# `.toarray()` converts the sparse TF-IDF matrices to dense arrays, since the
# model comparison includes estimators (e.g. GaussianNB) that need dense input.
X_train = vectorizer.fit_transform(train_df.text).toarray()
X_val = vectorizer.transform(val_df.text).toarray()

# Target labels for both splits.
y_train = train_df.label
y_val = val_df.label

# Classification Models
With the TF-IDF scores as our features, we then go on to find the best classification model for our task. 

## Models
In this comparison we included the following classifiers from sklearn:
- Random Forest
- Linear SVC
- Multinomial Naive Bayes
- Hist Gradient Boosting
- Gaussian Naive Bayes
- Linear Discriminant Analysis
- Quadratic Discriminant Analysis

In [None]:
# Seed shared by every estimator that accepts `random_state`, so that
# re-running the notebook reproduces the same cross-validation scores.
RANDOM_STATE = 0

# define models to compare
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=RANDOM_STATE),
    LinearSVC(random_state=RANDOM_STATE),
    MultinomialNB(),
    HistGradientBoostingClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()
]

# 5-fold cross-validation
CV = 5

# NOTE(review): X_train was vectorised with IDF weights fitted on the whole
# training set, so every fold's held-out part has already influenced the
# features. Wrapping vectorizer + model in a Pipeline would remove that leak.

# One (model_name, fold_idx, accuracy) record per model and fold.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

# Build the results frame once from the collected records.
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Comparison of model performance
With this first comparison we take the average and standard deviation of each model's accuracies across the cross-validation folds. 

In [None]:
# Group the fold accuracies by model once and derive both statistics from it.
accuracy_by_model = cv_df.groupby('model_name').accuracy
mean_accuracy = accuracy_by_model.mean()
std_accuracy = accuracy_by_model.std()

# Summary table: one row per model, one column per statistic.
acc = pd.DataFrame({
    'Mean Accuracy': mean_accuracy,
    'Standard deviation': std_accuracy,
})
acc

The `GaussianNB` model achieves the highest accuracy, which is why we continue using this model.

# Model Evaluation

In [None]:
# Train a fresh GaussianNB on the full training set, then predict the labels
# of the validation features.
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_val)

In [None]:
# print classification report
# Row names must follow the label order the report uses. `model.classes_`
# holds the sorted class labels seen during fitting, whereas
# `Series.unique()` returns labels in order of first appearance, which can
# attach the wrong name to a row.
class_labels = model.classes_
print('\t\t\t\tCLASSIFICATION METRICS\n')
print(metrics.classification_report(y_val, y_pred,
                                    labels=class_labels,
                                    target_names=[str(label) for label in class_labels]))

In [None]:
# plot confusion matrix
# Use one explicit label order for both the matrix and the tick labels.
# `Series.unique()` returns labels in order of first appearance, which need not
# match the sorted order confusion_matrix uses by default and would mislabel
# rows/columns.
class_labels = model.classes_
conf_mat = confusion_matrix(y_val, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(8,8))
sns.heatmap(conf_mat, annot=True, cmap="Blues", fmt='d',
            xticklabels=class_labels,
            yticklabels=class_labels,
            ax=ax)
# Rows of the matrix are the true labels, columns the predicted labels.
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title("CONFUSION MATRIX - GaussianNB\n", size=16);

# Export Model

In [None]:
# Make sure the output directory exists; open(..., 'wb') does not create it.
os.makedirs('models', exist_ok=True)

# Serialise the fitted GaussianNB classifier.
with open('models/gaussian_nb.pkl', 'wb') as f:
    pickle.dump(model, f)

# Serialise the fitted TF-IDF vectorizer as well: the classifier was trained on
# its feature space and cannot be applied to new text without it.
# NOTE: the vectorizer references `spacy_tokenizer` by name, so that function
# must be defined/importable wherever this pickle is loaded.
with open('models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)