# Import Libraries

In [38]:
import pandas as pd
import plotly.express as px

# Load and explore the data (4 marks)

In [39]:
# load the dataset
source_path = 'data/raw/product-cat-dataset.csv'
#source_path = 'data/raw/product-category-dataset.csv'

df_source = pd.read_csv(source_path)
df_source.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10649 entries, 0 to 10648
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Description  10637 non-null  object
 1   Level_1      10649 non-null  object
 2   Level_2      10649 non-null  object
 3   Level_3      10649 non-null  object
dtypes: object(4)
memory usage: 332.9+ KB


In [40]:
df_source.head()

Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,D06E
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,98CF


## Explore Levels

Get an overview of hirachy of levels used in the dataset.

In [41]:
# perform data grouping
source_grouped = df_source.groupby(['Level_1', 'Level_2', 'Level_3'])
source_grouped_count = source_grouped.size().to_frame(name = 'count')
source_grouped_count.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
Level_1,Level_2,Level_3,Unnamed: 3_level_1
014303D1,77F62,5AE1,229
014303D1,7AED7,6539,282
09BF5150,262E7,29B3,65
09BF5150,5E038,6BE5,118
09BF5150,6C6B1,3AAD,38
09BF5150,915D4,A2FA,47
09BF5150,A6301,DC8D,1
09BF5150,AF6B9,A104,38
09BF5150,C7E19,D06E,429
09BF5150,F824F,7288,74


In [42]:
# visualize the dataset hirachy
fig = px.treemap(source_grouped_count.reset_index(), path=['Level_1', 'Level_2', 'Level_3'], values='count')
fig.update_layout(margin = dict(t=25, l=25, r=25, b=25))
fig.show()

In [43]:
def print_categories(data:pd.DataFrame):
    """
    Print a count of the distinct categories in the dataset.
    """
    # determine the number of distinct
    print('--- Distinct Categories ---')
    print('Level 1 : %d' % data[['Level_1']].drop_duplicates().shape[0])
    print('Level 2 : %d' % data[['Level_2']].drop_duplicates().shape[0])
    print('Level 3 : %d' % data[['Level_3']].drop_duplicates().shape[0])
    print('All     : %d' % data[['Level_1', 'Level_2', 'Level_3']].drop_duplicates().shape[0])

# show the number of distinct categories in the dataset
print_categories(df_source)

--- Distinct Categories ---
Level 1 : 15
Level 2 : 39
Level 3 : 43
All     : 43


## Deal with Missing Data (4 marks)

In [44]:
# Check if data has missing values in the Description column
df_source.isna().sum()

Description    12
Level_1         0
Level_2         0
Level_3         0
dtype: int64

In [45]:
# show the rows with missing values
df_source.iloc[df_source.index[df_source.isnull().any(axis=1)]]

Unnamed: 0,Description,Level_1,Level_2,Level_3
1063,,4C3D8686,74974,62E8
3435,,09BF5150,F824F,7288
3459,,09BF5150,F824F,7288
7763,,09BF5150,6C6B1,3AAD
7797,,09BF5150,6C6B1,3AAD
7805,,09BF5150,5E038,6BE5
7817,,09BF5150,5E038,6BE5
7868,,09BF5150,5E038,6BE5
7945,,09BF5150,262E7,29B3
7971,,09BF5150,262E7,29B3


In [46]:
# Deal with missing values
df_clean = df_source.dropna()
df_clean.shape

(10637, 4)

In [47]:
# show the number of caterories remaining after dropping null values
print_categories(df_clean)

--- Distinct Categories ---
Level 1 : 15
Level 2 : 39
Level 3 : 43
All     : 43


In [48]:
# convert the levels to categories
level_cols = ['Level_1', 'Level_2', 'Level_3']
df_clean[level_cols] = df_clean[level_cols].astype('category')
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10637 entries, 0 to 10648
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Description  10637 non-null  object  
 1   Level_1      10637 non-null  category
 2   Level_2      10637 non-null  category
 3   Level_3      10637 non-null  category
dtypes: category(3), object(1)
memory usage: 200.8+ KB




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Drop Classes where the number of instances is < 10 (4 marks)

In [72]:
def remove_small_categories(data:pd.DataFrame, column:str, n:int):
    df_counts = data[column].value_counts().to_frame(columns=['count'])
    return df_counts

remove_small_categories(df_clean, 'Level_1', 10)

SyntaxError: invalid syntax (2357650066.py, line 3)

In [49]:
# Apply to Level_1 


In [50]:
# Apply to Level_2


In [51]:
# Apply to Level_3


### Now let's write a Function to Prepare Text (4 marks)
We will apply it to our DataFrame later on

* This function receives a text string and performs the following:
* Convert text to lower case
* Remove punctuation marks
* Apply stemming using the popular Snowball or Porter Stemmer (optional)
* Apply NGram Tokenisation
* Return the tokenised text as a list of strings

In [52]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
import string

def process_text(text, n = 1):
    """
    Takes in a string of text, then performs the following:
    1. Convert text to lower case and remove all punctuation
    2. Optionally apply stemming
    3. Apply Ngram Tokenisation
    4. Returns the tokenised text as a list
    """
    # write steps here
    
    return tokenised

In [53]:
# Here is an example function call
process_text("Here we're testing the process_text function, results are as follows:", n = 3)

NameError: name 'tokenised' is not defined

In [None]:
# Results should look like this:
['here were test',
 'were test the',
 'test the processtext',
 'the processtext function',
 'processtext function result',
 'function result are',
 'result are as',
 'are as follow']

### Now let's apply TF-IDF to extract features from plain text (10 marks)

In [None]:
# Might take a while...
# Here you apply the process_text function to the Description column of the data
# Then you pass the results to the bag of words tranformer
# See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html



Now we can use .transform on our Bag-of-Words (bow) transformed object and transform the entire DataFrame of text file contents. Let's go ahead and check out how the bag-of-words counts for the entire corpus in a large, sparse matrix:

In [None]:
# After that you pass the result of the previous step to sklearn's TfidfTransformer
# which will convert them into a feature matrix
# See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
    


In [None]:
# The resulting matrix is in sparse format, we can transform it into dense
# Code prepared for you so you can see what results look like
text_tfidf = pd.DataFrame(text_tfidf.toarray())

In [None]:
# This is an example result, the matrix will contain lots of zero values, that is expected
# Some values will be non-zero
text_tfidf.head()

# Now the Data is Ready for Classifier Usage

### Split Data into Train and Test sets (4 marks)

In [None]:
# Train/Test split


In [None]:
# You might need to reset index in each dataframe (depends on you how you do things)
# done for you to make it clearer
X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

In [None]:
# You might need to take classes as separate columns (depends on you how you do things)
class1 = y_train['Level_1'].astype(str)
class2 = y_train['Level_2'].astype(str)
class3 = y_train['Level_3'].astype(str)

## Model training for the three levels (8 marks)

In [None]:
# Create and save model for level 1


In [None]:
## Create and save models for level 2


In [None]:
## Create and save models for level 3


## Predict the test set (8 marks)

In [None]:
# Creating an empty Dataframe with column names only (depends on you how you do things)
results = pd.DataFrame(columns=['Level1_Pred', 'Level2_Pred', 'Level3_Pred'])

## Here we reload the saved models and use them to predict the levels
# load model for level 1 (done for you)
with open('level1.pk', 'rb') as nb:
    model = pickle.load(nb)

## loop through the test data, predict level 1, then based on that predict level 2
## and based on level 2 predict level 3 (you need to load saved models accordingly)

    

In [None]:
## After you add the predictions to the results dataframe
## they should look like this
results

## Compute Accuracy on each level (4 marks)
Now you have the predictions for each level (in the test data), and you also have the actual levels, you can compute the accurcay

In [None]:
# Level 1 accuracy


In [None]:
# Level 2 accuracy


In [None]:
# Level 3 accuracy


## Well done!