# Import Libraries

In [None]:
# suppress future warnings - unfortunately pycaret/sklearn simply do not obey this
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from tqdm.notebook import tqdm
import IPython
import math

In [None]:
# configure logging and test that it is working
# NOTE(review): with level=logging.CRITICAL the test warning below, and every
# logging.info/logging.warning call made later in this notebook, is filtered out
# and never written to cw01.log; lower the level (e.g. logging.INFO) to see them.
import logging
logging.basicConfig(filename='cw01.log', filemode='w', level=logging.CRITICAL)
logging.warning('Watch out!')

# Load and explore the data (4 marks)

In [None]:
# load the dataset from a CSV file (the path is relative to the working directory)
source_path = 'data/raw/product-cat-dataset.csv'
# alternative dataset file, kept for reference
#source_path = 'data/raw/product-category-dataset.csv'

df_source = pd.read_csv(source_path)
# print the columns, their dtypes and the non-null counts
df_source.info()

In [None]:
df_source.head()

## Explore Levels

Get an overview of hierarchy of levels used in the dataset.

In [None]:
# count the rows of every (Level_1, Level_2, Level_3) combination
source_grouped = df_source.groupby(['Level_1', 'Level_2', 'Level_3'])
source_grouped_count = source_grouped.size().rename('count').to_frame()
source_grouped_count.head(10)

In [None]:
# visualize the dataset hierarchy as a treemap; every tile is sized by the
# row count of its Level_1 > Level_2 > Level_3 path
fig = px.treemap(source_grouped_count.reset_index(), path=['Level_1', 'Level_2', 'Level_3'], values='count')
# use the same small margin on all four sides of the figure
fig.update_layout(margin = dict(t=25, l=25, r=25, b=25))
fig.show()

In [None]:
def print_categories(data:pd.DataFrame):
    """
    Print how many distinct categories the dataset has on every level.

    The count is printed for Level_1, Level_2 and Level_3 individually, followed by
    the number of distinct combinations of the three levels.
    """
    def distinct_rows(columns):
        # number of unique rows when only the given columns are considered
        return len(data[columns].drop_duplicates())

    all_levels = ['Level_1', 'Level_2', 'Level_3']
    per_level = [distinct_rows([level]) for level in all_levels]

    print('--- Distinct Categories ---')
    print('Level 1 : %d' % per_level[0])
    print('Level 2 : %d' % per_level[1])
    print('Level 3 : %d' % per_level[2])
    print('All     : %d' % distinct_rows(all_levels))

# show the number of distinct categories in the dataset
print_categories(df_source)

## Deal with Missing Data (4 marks)

In [None]:
# Count the missing values in every column (not only in the Description column)
df_source.isna().sum()

In [None]:
# show the rows with missing values
# a boolean mask selects the rows directly; a positional lookup (iloc) with
# index labels is only correct while the index is the default 0..n-1 range
df_source[df_source.isnull().any(axis=1)]

In [None]:
# Deal with missing values: drop every row that has a missing value in any column
df_clean = df_source.dropna()
df_clean.shape

In [None]:
# show the number of categories remaining after dropping null values
print_categories(df_clean)

## Drop Classes where the number of instances is < 10 (4 marks)

In [None]:
def remove_small_categories(data:pd.DataFrame, column:str, n:int):
    """
    Remove the rows that belong to a category with fewer than n rows.

    Parameters:
        data: pandas.DataFrame
            The dataset to filter. It is not modified.
        column: str
            The name of the column that holds the category labels.
        n: int
            The minimum number of rows a category needs in order to be kept.

    Returns:
        A new DataFrame (a copy that keeps the original index labels) without the rows whose
        category in the specified column occurs fewer than n times. Rows with a missing value
        in the column are not counted as a category and are kept.
    """
    counts = data[column].value_counts()
    remove_list = counts[counts < n].index

    # a boolean mask (instead of a query string) also works for column names that are not valid
    # Python identifiers; the copy avoids SettingWithCopyWarning on later column assignments
    return data[~data[column].isin(remove_list)].copy()

In [None]:
# Apply to Level_1: drop the rows of every Level_1 class with fewer than 10 rows
df_clean = remove_small_categories(df_clean, 'Level_1', 10)
df_clean.shape

In [None]:
# Apply to Level_2: drop the rows of every Level_2 class with fewer than 10 rows
df_clean = remove_small_categories(df_clean, 'Level_2', 10)
df_clean.shape

In [None]:
# Apply to Level_3: drop the rows of every Level_3 class with fewer than 10 rows
# NOTE(review): the levels are filtered once, top-down; dropping Level_3 rows can push a
# Level_1 or Level_2 class back below 10 rows - confirm whether that is acceptable
df_clean = remove_small_categories(df_clean, 'Level_3', 10)
df_clean.shape

In [None]:
# convert the levels to categories
# astype with a per-column mapping returns a new DataFrame, which avoids the
# SettingWithCopyWarning raised when assigning columns on a filtered frame
level_cols = ['Level_1', 'Level_2', 'Level_3']
df_clean = df_clean.astype(dict.fromkeys(level_cols, 'category'))
df_clean.info()

In [None]:
# renumber the rows from 0 now that rows have been filtered out
df_clean = df_clean.reset_index(drop=True)
df_clean.head()

In [None]:
# level 1 distribution
df_clean.Level_1.value_counts().plot(kind='bar')

In [None]:
# level 2 distribution
df_clean.Level_2.value_counts()

In [None]:
# level 3 distribution
df_clean.Level_3.value_counts()

### Now let's write a Function to Prepare Text (4 marks)
We will apply it to our DataFrame later on

* This function receives a text string and performs the following:
* Convert text to lower case
* Remove punctuation marks
* Apply stemming using the popular Snowball or Porter Stemmer (optional)
* Apply NGram Tokenisation
* Return the tokenised text as a list of strings

In [None]:
import nltk
nltk.download('punkt')

In [None]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import string
import re

def process_text(text, n = 1):
    """
    Normalise a text string and split it into word n-grams.

    The text is processed as follows:
    1. Convert the text to lower case.
    2. Remove every character that is not a letter or whitespace (this drops punctuation and digits).
    3. Split the text into word tokens.
    4. Stem every token with the Porter stemmer.
    5. Build the n-grams from the stemmed tokens.

    Parameters:
        text: str
            The text to process.
        n: int, default = 1
            The number of consecutive tokens that make up one n-gram.
    Returns:
        A list of strings; each string holds the tokens of one n-gram separated by single spaces.
    """
    # convert to lower case
    text = text.lower()

    # remove punctuation and digits, but keep every kind of whitespace (not only the space
    # character) so that words separated by a tab or a line break are not glued together
    text = re.sub(r"[^a-z\s]", "", text)

    # split the text into tokens
    tokens = word_tokenize(text)

    # perform stemming
    ps = PorterStemmer()
    tokens = [ps.stem(token) for token in tokens]

    # build the ngrams and return them as a list of strings
    return [' '.join(grams) for grams in ngrams(tokens, n)]

In [None]:
# Here is an example function call
process_text("Here we're testing the process_text function, results are as follows:", n = 3)

In [None]:
# Results should look like this:
['here were test',
 'were test the',
 'test the processtext',
 'the processtext function',
 'processtext function result',
 'function result are',
 'result are as',
 'are as follow']

### Now let's apply TF-IDF to extract features from plain text (10 marks)

In [None]:
# Might take a while...
# Here you apply the process_text function to the Description column of the data
# Then you pass the results to the bag of words tranformer
# See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD

In [None]:
def bow_transform(corpus, n=3, max_features=None):
    """
    Build the bag-of-words representation of a document corpus.

    Every document is tokenised with process_text, so the tokens are n-grams of size n.

    Parameters:
        corpus: iterable of str
            The documents to vectorize.
        n: int, default = 3
            The n-gram size passed to process_text.
        max_features: int, default = None
            Passed through to CountVectorizer as max_features.
    Returns:
        A tuple with the fitted CountVectorizer and the result of its fit_transform on the corpus.
    """
    def analyze(document):
        # tokenise one document into n-grams of the requested size
        return process_text(document, n)

    vectorizer = CountVectorizer(analyzer=analyze, max_features=max_features)
    counts = vectorizer.fit_transform(corpus)

    return vectorizer, counts

# test the function on two short example sentences
test_docs = [
    "Here we're testing the process_text function, results are as follows:",
    "Here you apply the process_text function to the Description column of the data"
]

# uses the default n-gram size of bow_transform (n=3)
count_vectorizer, bow = bow_transform(test_docs)
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# (replaced by get_feature_names_out()) - confirm against the installed scikit-learn version
print(count_vectorizer.get_feature_names())
print(bow.toarray())

#### Please Note
During model evaluation it was found that using 3-grams is not ideal and that better model performance is possible if n-grams are not created. When initially reading the coursework specification I was also puzzled by this approach, as it is something I have never done in practice and I could not wrap my head around how it would improve accuracy as a preprocessing step to TF-IDF. My intuition is that it is very likely to increase the number of tokens (and thereby worsen the curse of dimensionality) and make the matrix even more sparse.

In [None]:
# vectorize the description column in the dataset
count_vectorizer, bow = bow_transform(df_clean.Description, n=1)

In [None]:
#print(count_vectorizer.get_feature_names())
# the number of columns is the number of features; reading it from the shape avoids
# converting the whole sparse matrix to a dense array just to measure one row
print('Feature Count:', bow.shape[1])

Now we can use .transform on our bag-of-words (bow) transformed object and transform the text of the entire DataFrame. Let's go ahead and check out how the bag-of-words counts for the entire corpus look in a large, sparse matrix:

In [None]:
# After that you pass the result of the previous step to sklearn's TfidfTransformer
# which will convert them into a feature matrix
# See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html

tfidf_transformer = TfidfTransformer()
text_tfidf = tfidf_transformer.fit_transform(bow)
# read the feature count from the shape instead of densifying the sparse matrix
print('Feature Count:', text_tfidf.shape[1])

In [None]:
# optionally reduce the dimensionality of the TF-IDF features
n_components = 50 #500 #100 #50 #2000
dim_reduction = True

if not dim_reduction:
    # keep every TF-IDF feature, converted to a dense array
    text_svd = text_tfidf.toarray()
else:
    # reduce the TF-IDF features to n_components columns with a truncated SVD
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    text_svd = svd.fit_transform(text_tfidf)

    print('Total variance explained: %.2f' % svd.explained_variance_ratio_.sum())

In [None]:
# Wrap the feature matrix (the SVD output, or the dense TF-IDF matrix when
# dim_reduction is False) in a DataFrame so the results can be inspected.
# NOTE: this rebinds text_tfidf, which held the sparse TF-IDF matrix until now.
#text_tfidf = pd.DataFrame(text_tfidf.toarray())
text_tfidf = pd.DataFrame(text_svd)
text_tfidf.shape

Because dimensionality reduction was done on the TF-IDF features, the matrix will not contain many zeros, unlike what the original instructions describe.

_I have tried to stick to the instructions as closely as I could, but many of the steps did not make 100% sense to me, as I have done a number of NLP projects in the past and the approach is not what is typically seen in industry, so my thinking might have steered me away from the specific restrictions in this coursework. I can only hope that this does not negatively affect my grade._

In [None]:
# Preview the first rows of the feature matrix
# (the original template expected mostly zero values; after the dimensionality
# reduction above that no longer holds)
text_tfidf.head()

# Now the Data is Ready for Classifier Usage

### Split Data into Train and Test sets (4 marks)

In [None]:
from sklearn.model_selection import train_test_split

# get the dependent and independent variables
# y holds every column except the description text; X holds the text features
y = df_clean.drop('Description', axis=1)
X = text_tfidf

# Train/Test split: 30% of the rows are held out for testing, with a fixed random seed
#X_train, X_test, y_train, y_test = train_test_split(X.index,y,test_size=0.3)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5429)

print('Training Set : %d' % X_train.shape[0])
print('Testing  Set : %d' % X_test.shape[0])

In [None]:
# Renumber the rows of all four frames from 0, so that the features and the
# targets of each set can be joined on the index later on
for frame in (X_train, X_test, y_train, y_test):
    frame.reset_index(inplace=True, drop=True)

In [None]:
# Take the training classes of every level as separate string columns
class1, class2, class3 = (
    y_train[level].astype(str) for level in ('Level_1', 'Level_2', 'Level_3')
)

## Model training for the three levels (8 marks)

In [None]:
from pycaret.classification import *

In [None]:
def create_model(data:pd.DataFrame, target:str, session_id:int=None, exclude=None):
    '''
    Create a classification model for the dataset provided.

    A pycaret experiment is set up on the data, the candidate models are compared and the best
    one is tuned. When no best model is found a decision tree is created as a fallback.

    Parameters:
        data: pandas.DataFrame
            The DataFrame to build the model for.
        target: str
            The name of the target variable.
        session_id: int, default = None
            The session id to use to control the model randomness.
        exclude: list of str, default = None
            To omit certain models from training and evaluation, pass a list containing model id in the exclude parameter.
    Returns:
        The tuned model. The model is not finalized.
    '''
    # imported under an explicit name because this function shadows pycaret's own create_model
    import pycaret.classification as pycaret_classification

    # use 10 folds unless the training portion (70% of the rows) is too small for that;
    # cross validation needs at least 2 folds
    n_train = data.shape[0] * 0.7
    fold = 10 if n_train > 10 else max(2, math.floor(n_train) - 1)

    # create the classifier for the model
    logging.info('Setting up the classifier...')
    setup(
        data = data,
        target = target,
        session_id=session_id,
        fold = fold,
        verbose=False,
        silent=True)

    # if the dataset is too small knn can not be used
    # (a new list is built so the caller's exclude list is never modified)
    if data.shape[0] < 20:
        exclude = list(exclude or []) + ['knn']

    # search for the best classifier
    logging.info('Finding the best model...')
    best = compare_models(
        verbose=False,
        exclude=exclude)

    # if the data is so limited that a model could not be found, create a default model
    if best is None or (isinstance(best, list) and not best):
        logging.warning('No best model found')
        best = pycaret_classification.create_model('dt')

    # tune the model
    logging.info('Tuning the model...')
    tuned_model = tune_model(best, verbose=False)
    print(tuned_model)

    return tuned_model


#create_model(
#    data=X_train.join(class1),
#    target='Level_1',
#    session_id=23)

## Create and save model for level 1

In [None]:
## Create and save model for level 1
exclude_models = ['gbc', 'lightgbm'] # gbc and lightgbm are excluded for debugging as they take significantly longer than other models to fit and only offer a fairly small increase in accuracy.
train_l1 = X_train.join(class1)

level1_model = create_model(
    data=train_l1,
    target='Level_1',
    session_id=23,
    exclude=exclude_models)

# display the model parameters
print(level1_model)

# display the model evaluation
_ = predict_model(level1_model)

# save the model
# finalize_model returns the finalized model instead of changing its argument,
# so it is the returned model that has to be saved
level1_final = finalize_model(level1_model)
save_model(level1_final, 'models/level_1')

## Create and save models for level 2

In [None]:
def get_level_values(data:pd.DataFrame, column:str):
    '''
    Return the distinct values of the specified column as a list, in order of first appearance.
    '''
    distinct = data[column].unique()
    return [*distinct]

#get_level_values(y_train, 'Level_1')

In [None]:
def filter_level(data:pd.DataFrame, column:str, value:str):
    '''
    Filter the dataframe where the specified column matches the provided value.

    Parameters:
        data: pandas.DataFrame
            The DataFrame to filter.
        column: str
            The name of the column to compare.
        value: str
            The value the column has to be equal to.
    Returns:
        The rows of data whose column equals the value, with their original index labels.
    '''
    # a boolean mask (instead of a query string) also supports column names that are not
    # valid Python identifiers, e.g. names that contain a space
    return data[data[column] == value]

#filter_level(y_train, 'Level_1', '014303D1')

In [None]:
def create_level_model(data:pd.DataFrame, features:pd.DataFrame, target:str, parent:str, parent_value:str, session_id:int=None, exclude=None, extra_features=None):
    '''
    Create a classification model for the rows that belong to a single parent category.

    Parameters:
        data: pandas.DataFrame
            The DataFrame holding the level columns, including the target and the parent column.
        features: pandas.DataFrame
            The DataFrame containing the features used for predicting the target variable.
            It is joined to the filtered data on the index.
        target: str
            The name of the target variable.
        parent: str
            The parent column name to filter the data on.
        parent_value: str
            The value to filter the parent column on.
        session_id: int
            The session id to use to control the model randomness.
        exclude: list of str, default = None
            To omit certain models from training and evaluation, pass a list containing model id in the exclude parameter.
        extra_features: list of str, default = None
            Additional columns of data to keep as features next to the target.
    Returns:
        The model created for the level.
    '''
    # the target comes first, followed by any extra feature columns
    columns = [target]
    if extra_features is not None:
        columns = columns + extra_features

    # keep only the rows of the requested parent, then attach the features by index
    subset = filter_level(data, parent, parent_value)[columns]
    training_data = subset.join(features, how='left')

    return create_model(
        data=training_data,
        target=target,
        session_id=session_id,
        exclude=exclude)

#level_model = create_level_model(
#    data=y_train, 
#    features=X_train,
#    target='Level_2',
#    session_id=23,
#    exclude=['gbc', 'lightgbm'],
#    parent='Level_1',
#    parent_value='014303D1'
#    )

#IPython.display.clear_output()

#print(level_model)
#_ = predict_model(level_model)

### Create One Model per parent level as per the coursework instructions

In [None]:
## Create and save models for level 2
# get the unique level 1 values
parent_values = get_level_values(y_train, 'Level_1')

for parent_value in tqdm(parent_values):
    print('---------------- Building Model For: %s ----------------' % parent_value)

    current_model = create_level_model(
        data=y_train,
        features=X_train,
        target='Level_2',
        session_id=23,
        exclude=exclude_models,
        parent='Level_1',
        parent_value=parent_value,
        extra_features=['Level_1']
        )

    # show the model created
    print(current_model)
    _ = predict_model(current_model)

    # finalize and save the model
    # finalize_model returns the finalized model instead of changing its argument,
    # so it is the returned model that has to be saved
    final_model = finalize_model(current_model)
    save_model(final_model, 'models/level_2/' + parent_value)

IPython.display.clear_output()

### Experiment with creating a single level 2 model

If I was doing this task in practice I would not have started with a multi-model approach as it might be very inefficient for real-time scoring to load models in and out of memory with every prediction, considering that there can potentially be a large number of models in the recommended approach.

In [None]:
level2_model = create_model(
    data=X_train.join(class2).join(class1),
    target='Level_2',
    session_id=23,
    exclude=exclude_models)

# display the model parameters
print(level2_model)

# display the model evaluation
_ = predict_model(level2_model)

# finalize_model returns the finalized model instead of changing its argument;
# keep the returned model so that later predictions use the finalized one
level2_model = finalize_model(level2_model)

In [None]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, precision_recall_fscore_support

# create a classification report for the single level 2 model
# the Level_2 and Level_1 columns of the test set are joined to the test features as
# strings, mirroring how the training data for this model was assembled
df_predicted = predict_model(
    level2_model, 
    data=X_test.join(y_test['Level_2'].astype(str)).join(y_test['Level_1'].astype(str)))

# compare the true Level_2 labels with the predicted labels (the Label column)
print(classification_report(y_test['Level_2'].astype(str), df_predicted.Label))

## Create and save models for level 3

In [None]:
# build the level 3 model for one hard-coded Level_2 parent value
# NOTE(review): presumably a trial run before looping over all parents - confirm
current_model = create_level_model(
    data=y_train, 
    features=X_train,
    target='Level_3',
    session_id=23,
    exclude=['gbc', 'lightgbm'],
    parent='Level_2',
    #parent_value='0864A',
    parent_value='7AED7',
    extra_features=['Level_1', 'Level_2']
    )

In [None]:
## Create and save models for level 3
parent_values = get_level_values(y_train, 'Level_2')

for parent_value in tqdm(parent_values):
    print('---------------- Building Model For: %s ----------------' % parent_value)

    current_model = create_level_model(
        data=y_train,
        features=X_train,
        target='Level_3',
        session_id=23,
        exclude=exclude_models + ['dummy'], # knn and the dummy model are causing a problem because some of the categories are too small
        parent='Level_2',
        parent_value=parent_value,
        extra_features=['Level_1', 'Level_2']
        )

    # show the model created
    print(current_model)
    _ = predict_model(current_model)

    # finalize and save the model
    # finalize_model returns the finalized model instead of changing its argument,
    # so it is the returned model that has to be saved
    final_model = finalize_model(current_model)
    save_model(final_model, 'models/level_3/' + parent_value)

IPython.display.clear_output()

## Predict the test set (8 marks)

In [None]:
def predict_level(data:pd.DataFrame, features:pd.DataFrame, target:str, parent:str):
    '''
    Predict the target level of every row with the saved model of the row's parent value.

    Parameters:
        data: pandas.DataFrame
            The DataFrame holding the parent column (and any other level columns the models use).
        features: pandas.DataFrame
            The DataFrame containing the features; it is joined to data on the index.
        target: str
            The name of the level to predict, e.g. 'Level_2'. The models are loaded from the
            lower case directory of that name (models/level_2/<parent value>).
        parent: str
            The name of the column in data that selects the model to use.
    Returns:
        A new DataFrame: data with the predictions joined on as a column named after the target.
    '''
    # get the parent values to predict for
    parent_values = get_level_values(data, parent)
    predictions = []

    for parent_value in tqdm(parent_values):
        # load the model; the models are saved in lower case directories, so the target name
        # is lower-cased to also find them on case-sensitive file systems
        current_model = load_model('models/%s/%s' % (target.lower(), parent_value), verbose=False)

        # get the dataset to perform the predictions on
        current_data = filter_level(data, parent, parent_value)
        current_data = current_data.join(features, how='left')

        # perform the predictions
        current_predictions = predict_model(current_model, data=current_data)
        predictions.append(current_predictions.Label.rename(target))

    # concatenate once instead of growing a Series inside the loop
    if predictions:
        predicted = pd.concat(predictions)
    else:
        predicted = pd.Series(name=target, dtype=object)

    # join the predictions to the predicted dataframe
    return data.join(predicted)

```python
# Creating an empty Dataframe with column names only (depends on you how you do things)
results = pd.DataFrame(columns=['Level1_Pred', 'Level2_Pred', 'Level3_Pred'])

## Here we reload the saved models and use them to predict the levels
# load model for level 1 (done for you)
with open('level1.pk', 'rb') as nb:
    model = pickle.load(nb)

## loop through the test data, predict level 1, then based on that predict level 2
## and based on level 2 predict level 3 (you need to load saved models accordingly)
``` 

In [None]:
# predict the level 1 classes
current_model = load_model('models/level_1', verbose=False)
# keep only the predicted label, renamed to Level_1, as the first column of the results
results = pd.DataFrame(predict_model(current_model, data=X_test).Label.rename('Level_1'))

# perform the predictions for level 2, driven by the predicted Level_1 values
results = predict_level(data=results, features=X_test, target='Level_2', parent='Level_1')

# perform the predictions for level 3, driven by the predicted Level_2 values
results = predict_level(data=results, features=X_test, target='Level_3', parent='Level_2')

In [None]:
## After you add the predictions to the results dataframe
## they should look like this
results = results.add_suffix('_Pred')
results

## Compute Accuracy on each level (4 marks)
Now that you have the predictions for each level (in the test data), and you also have the actual levels, you can compute the accuracy

In [None]:
from sklearn.metrics import accuracy_score

#create the evaluation dataframe
df_evaluation = y_test.join(results)

### Level 1 accuracy


In [None]:
def print_accuracy(data:pd.DataFrame, y_true, y_pred):
    '''
    Print the accuracy score of the predictions for one level.

    Parameters:
        data: pandas.DataFrame
            The DataFrame holding both the true and the predicted column.
        y_true: str
            The name of the column with the true labels; also used in the printed message.
        y_pred: str
            The name of the column with the predicted labels.
    '''
    # both columns are compared as strings
    actual = data[y_true].astype(str)
    predicted = data[y_pred].astype(str)

    print('%s Accuracy: %.2f' % (y_true, accuracy_score(y_true=actual, y_pred=predicted)))

In [None]:
# display the accuracy score
print_accuracy(data=df_evaluation, y_true='Level_1', y_pred='Level_1_Pred')

### Level 2 accuracy                


In [None]:
print_accuracy(data=df_evaluation, y_true='Level_2', y_pred='Level_2_Pred')

### Level 3 accuracy


In [None]:
print_accuracy(data=df_evaluation, y_true='Level_3', y_pred='Level_3_Pred')

### Correct Parent Evaluation

The following evaluations test the accuracy of the level models by assuming the parent level was correctly predicted.

This information is useful to determine if a certain level is performing particularly badly and if the n-gram or feature count variations affect certain levels more than others.

#### Level 1 Evaluation

In [None]:
# load the model
current_model = load_model('models/level_1', verbose=False)
df_predicted = pd.DataFrame(predict_model(current_model, data=X_test).Label.rename('Level_1'))

# join the predictions to the evaluation dataframe
# (the predicted column clashes with the true Level_1 column and gets the _Pred suffix)
df_evaluation = y_test.join(df_predicted, rsuffix='_Pred')

# output the model evaluation
print(classification_report(df_evaluation['Level_1'].astype(str), df_evaluation['Level_1_Pred'].astype(str)))

#### Level 2 Evaluation

In [None]:
# start from the true labels of the parent level only (everything except Level_2 and Level_3)
df_predicted = y_test.drop(columns=['Level_2', 'Level_3'])

# predict Level_2 with the model that belongs to the true Level_1 value of every row
df_predicted = predict_level(data=df_predicted, features=X_test, target='Level_2', parent='Level_1')

# put the predictions next to the true labels; clashing column names get the _Pred suffix
df_evaluation = y_test.join(df_predicted, rsuffix='_Pred')

# output the model evaluation
print(classification_report(df_evaluation['Level_2'].astype(str), df_evaluation['Level_2_Pred'].astype(str)))

#### Level 3 Evaluation

In [None]:
# start from the true labels of the parent levels only (everything except Level_3)
df_predicted = y_test.drop(columns='Level_3')

# predict Level_3 with the model that belongs to the true Level_2 value of every row
df_predicted = predict_level(data=df_predicted, features=X_test, target='Level_3', parent='Level_2')

# put the predictions next to the true labels; clashing column names get the _Pred suffix
df_evaluation = y_test.join(df_predicted, rsuffix='_Pred')

# output the model evaluation
print(classification_report(df_evaluation['Level_3'].astype(str), df_evaluation['Level_3_Pred'].astype(str)))

## Well done!