# Model

Now that a rough EDA is complete, I'll build a simple TF-IDF + Random Forest model and evaluate performance with standard classification metrics: precision, recall, and F1.

In [1]:
from glob import glob
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 200)
plt.style.use('bmh')

In [2]:
def get_text_file_contents(path: str) -> str: 
    """Read in a text file and return the contents.

    Parameters
    ----------
    path : str
        Input path to a .txt file.
    """
    
    with open(path, "r") as f:
        contents = f.read()
    return contents

category_folders = glob("./trellis_assessment_ds/*")

category_contents = {}
for folder_path in category_folders:
    category = folder_path.split("\\")[-1]
    document_contents = []
    for path in glob(folder_path + "\\*"):
        tokens = get_text_file_contents(path)
        document_contents.append(tokens)
    category_contents[category] = document_contents


In [3]:
data = pd.DataFrame.from_dict(category_contents, orient="index").reset_index()
data = (
    pd.melt(data, id_vars="index")
    .rename({"index": "category", "value": "text"}, axis=1)
    .drop("variable", axis=1)
    .dropna()
    .sort_values("category")
    .reset_index() # This is just for the aesthetics of the index aligning with the categories...
    .drop("index", axis=1)
)

data["tokens"] = data["text"].apply(lambda x: x.split())
data["token_count"] = data["tokens"].apply(len)
data["characters"] = data["text"].apply(len)
data["characters_per_token"] = data["characters"] / data["token_count"]

data.head()

Unnamed: 0,category,text,tokens,token_count,characters,characters_per_token
0,business,"Lufthansa flies back to profit\n\nGerman airline Lufthansa has returned to profit in 2004 after posting huge losses in 2003.\n\nIn a preliminary report, the airline announced net profits of 400m e...","[Lufthansa, flies, back, to, profit, German, airline, Lufthansa, has, returned, to, profit, in, 2004, after, posting, huge, losses, in, 2003., In, a, preliminary, report,, the, airline, announced,...",149,848,5.691275
1,business,"German economy rebounds\n\nGermany's economy, the biggest among the 12 countries sharing the euro, grew at its fastest rate in four years during 2004, driven by strong exports.\n\nGross domestic p...","[German, economy, rebounds, Germany's, economy,, the, biggest, among, the, 12, countries, sharing, the, euro,, grew, at, its, fastest, rate, in, four, years, during, 2004,, driven, by, strong, exp...",222,1377,6.202703
2,business,Weak data buffets French economy\n\nA batch of downbeat government data has cast doubt over the French economy's future prospects.\n\nOfficial figures showed on Friday that unemployment was unchan...,"[Weak, data, buffets, French, economy, A, batch, of, downbeat, government, data, has, cast, doubt, over, the, French, economy's, future, prospects., Official, figures, showed, on, Friday, that, un...",256,1621,6.332031
3,business,Christmas shoppers flock to tills\n\nShops all over the UK reported strong sales on the last Saturday before Christmas with some claiming record-breaking numbers of festive shoppers.\n\nA spokesma...,"[Christmas, shoppers, flock, to, tills, Shops, all, over, the, UK, reported, strong, sales, on, the, last, Saturday, before, Christmas, with, some, claiming, record-breaking, numbers, of, festive,...",374,2187,5.847594
4,business,"Business fears over sluggish EU economy\n\nAs European leaders gather in Rome on Friday to sign the new EU constitution, many companies will be focusing on matters much closer to home - namely how...","[Business, fears, over, sluggish, EU, economy, As, European, leaders, gather, in, Rome, on, Friday, to, sign, the, new, EU, constitution,, many, companies, will, be, focusing, on, matters, much, c...",823,4603,5.592953


# Initial Modeling Attempt

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, make_scorer

X = data.query("category != 'other'")["text"].values
y = data.query("category != 'other'")["category"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [5]:
originalclass = []
predictedclass = []


def classification_report_cv(y_true, y_pred):
    """Classification Report that can be used in CV.

    Credit: https://stackoverflow.com/a/42567557"""
    originalclass.extend(y_true)
    predictedclass.extend(y_pred)
    return f1_score(y_true, y_pred, average="macro")


tfidf = TfidfVectorizer()

# TODO: in production I would refactor this as a pipeline.
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

clf = RandomForestClassifier()

nested_score = cross_val_score(
    clf, X=X_train, y=y_train, cv=5, scoring=make_scorer(classification_report_cv)
)

print(classification_report(originalclass, predictedclass))

               precision    recall  f1-score   support

     business       0.94      0.93      0.93        80
entertainment       0.94      0.95      0.94        80
         food       0.99      0.95      0.97        80
     graphics       0.76      0.89      0.82        80
   historical       0.98      0.99      0.98        80
      medical       0.89      0.94      0.91        80
     politics       0.94      0.97      0.96        80
        space       0.93      0.84      0.88        80
        sport       0.97      0.96      0.97        80
  technologie       0.94      0.84      0.89        80

     accuracy                           0.93       800
    macro avg       0.93      0.93      0.93       800
 weighted avg       0.93      0.93      0.93       800



As it stands, this is great performance! Especially without knowing any business context such as cost of false positives or false negatives. A few notes:
* I'm just doing a simple 5-fold cross validation. Stratifying on the classes would be better, but given we have an even number of each class it should be roughly even in each split.
* F1 could be swapped for F-beta if we want to weight precision or recall differently.

### Future Model Improvements

If performance needed work I could do the following:
1) Hyperparameter optimize (for this small data set it would be trivial to do a random grid search, but otherwise `hyperopt` or `optuna` are good choices).
2) Change models. I often start with Random Forest for a baseline but then move on to XGBoost.
3) If I switch to XGBoost (or something else), then optimize that too.
4) Feature engineering including fiddling with the parameters in TF-IDF or using some of the features I already created like token counts or characters per token.
5) Rinse and repeat on any optimizing after changing features.

As it stands, this doesn't handle the requirement of dealing with the other category.

### Scaling Considerations

Assuming the documents stay approximately the same size, then this approach could be easily scaled to millions of documents. I've done this exact approach on a corpus around 100k using a beefy laptop or small CPU cluster (many years ago). 

If this started to choke out modest compute then it could easily be horizontally scaled across multiple machines or rebuilt in PySpark.


In [6]:
# Save the models so they can be used in the API.
clf.fit(X_train, y_train)

with open("./models/clf.pickle", "wb") as f:
    pickle.dump(clf, f)

with open("./models/tfidf.pickle", "wb") as f:
    pickle.dump(tfidf, f)

## Dealing with Other Class

The requirement to mark documents as 'Other' if they don't fall into the existing categories is tricky.

In [9]:
# I know I really shouldn't be doing this on the test data. 
# I should have split into train, eval, and test to be more proper.
maxium_predictions = np.amax(clf.predict_proba(X_test), axis=1)
print(f"Mean: {np.mean(maxium_predictions):.4f}")
print(f"Std.: {np.std(maxium_predictions):.4f}")
print(f"Median: {np.median(maxium_predictions):.4f}")

Mean: 0.4720
Std.: 0.1649
Median: 0.4600


The average/median probability is ~0.47. Assuming a low cost to misclassifying, I'll assume any prediction that is the `mean - 2 * standard_devations` should be classifed as other. I am assuming that my classifier is well-calibrated. The literature claims this is true for random forest, but I have not verified it for this example.

In [12]:
prediction_threshold = round(
    np.mean(maxium_predictions) - 1 * np.std(maxium_predictions), 3
)
print(f"Prediction threshold: {prediction_threshold}")


def evaluate_prediction_probability(
    prediction_probabilities: np.array, prediction_threshold: float = 0.155
) -> str:
    # If no probabilities are above the threshold.
    if sum(prediction_probabilities > prediction_threshold) == 0:
        return "other"
    else:
        # Get the index of the highest probability and return the corresponding class.
        return clf.classes_[np.argmax(prediction_probabilities)]

# TODO: not shown, but I explored this on the "other" files and it wasn't particularly performant.
# I think it is an easy solution, but perhaps far from the best solution.
# How much this needs to be improved depends on the cost of classifying correctly or not.

Prediction threshold: 0.307
