### Import Libraries

In [7]:
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load and explore the data (4 marks)

In [8]:
dataset= pd.read_csv('./product-category-dataset.csv')
display(dataset)

Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,FDCF
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,ED0D
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,3918
...,...,...,...,...
10634,term 10 issu on year subscriptionyo sav 75 cov...,90A8B052,C719A,1BE5
10635,term 12 issu on year subscriptionyo sav 86 cov...,90A8B052,C719A,F45B
10636,term 9 issu on year subscriptionyo sav 64 cov ...,90A8B052,C719A,A0E2
10637,term 26 issu on year subscriptionyo sav 54 cov...,90A8B052,C719A,1BE5


### Deal with Missing Data (4 marks)

In [9]:
# Check if data has missing values in the Description column
dataset.loc[dataset['Description'].isnull()]

Unnamed: 0,Description,Level_1,Level_2,Level_3
1063,,4C3D8686,74974,62E8
3434,,09BF5150,F824F,7288
3458,,09BF5150,F824F,7288
7754,,09BF5150,6C6B1,3AAD
7788,,09BF5150,6C6B1,3AAD
7796,,09BF5150,5E038,6BE5
7808,,09BF5150,5E038,6BE5
7859,,09BF5150,5E038,6BE5
7936,,09BF5150,262E7,29B3
7962,,09BF5150,262E7,29B3


In [10]:
# Deal with missing values
dataset.dropna(subset = ["Description"], inplace=True)
dataset

Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,FDCF
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,ED0D
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,3918
...,...,...,...,...
10634,term 10 issu on year subscriptionyo sav 75 cov...,90A8B052,C719A,1BE5
10635,term 12 issu on year subscriptionyo sav 86 cov...,90A8B052,C719A,F45B
10636,term 9 issu on year subscriptionyo sav 64 cov ...,90A8B052,C719A,A0E2
10637,term 26 issu on year subscriptionyo sav 54 cov...,90A8B052,C719A,1BE5


In [11]:
# Check if there is a row duplication
duplicateDFRow = dataset[dataset.duplicated()]
print(duplicateDFRow)

                                             Description  ... Level_3
5      newborn inf toddl boy hoody jacket oshkosh b g...  ...    ED0D
118                          trendy styl wom dress papil  ...    D97D
164                          trendy styl wom dress papil  ...    D97D
170    giv complet look girl dress set swak dress fea...  ...    A2B2
172                          trendy styl wom dress papil  ...    D97D
...                                                  ...  ...     ...
10292                         revlon wom 0 5 oz nail pol  ...    33D1
10294  glory gorg col fing complet outfit express moo...  ...    F0EF
10295                                 5 oz 15ml nail pol  ...    9203
10305  claim fam bet ev purs prid new improv formul p...  ...    F0EF
10306                         revlon wom 0 5 oz nail pol  ...    33D1

[502 rows x 4 columns]


In [12]:
# Remove Row Duplication
dataset.drop_duplicates(subset ='Description',keep = 'first', inplace = True)
dataset

Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,FDCF
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,ED0D
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,3918
6,mit warm protect real stay dainty littl hand p...,09BF5150,C7E19,D06E
...,...,...,...,...
10634,term 10 issu on year subscriptionyo sav 75 cov...,90A8B052,C719A,1BE5
10635,term 12 issu on year subscriptionyo sav 86 cov...,90A8B052,C719A,F45B
10636,term 9 issu on year subscriptionyo sav 64 cov ...,90A8B052,C719A,A0E2
10637,term 26 issu on year subscriptionyo sav 54 cov...,90A8B052,C719A,1BE5


### Drop Classes where the number of instances is < 10 (4 marks)

In [13]:
# Apply to Level_1 
lvl1cnts = dataset['Level_1'].value_counts() 
print('lvl1cnts: ',lvl1cnts)
dataset=dataset[dataset.isin(lvl1cnts.index[lvl1cnts >= 10]).values]


lvl1cnts:  B092BA29    876
AAC8EE56    807
57164AC1    806
2CEC27F1    804
35E04739    802
09BF5150    736
69286F45    735
EFEF723B    693
96F95EEC    565
3E1E0D78    556
4C3D8686    530
90A8B052    503
4513C920    455
014303D1    442
D410C91A    358
Name: Level_1, dtype: int64


In [14]:
# Apply to Level_2
lvl2cnts = dataset['Level_2'].value_counts() 
print('lvl2cnts: ',lvl2cnts)
dataset=dataset[dataset.isin(lvl2cnts.index[lvl2cnts >= 10]).values]


lvl2cnts:  2D5A3    735
C719A    482
375FE    446
9D9EE    439
5A8AB    430
BAE8A    412
9B69F    410
B2DB4    408
94728    406
74974    404
A04D3    397
914A1    397
C7E19    397
390F1    394
ADAD6    392
CB803    386
7B638    385
ACD06    358
F4055    286
7AED7    232
02FA0    227
77F62    210
36080    168
223B2    126
E6162    117
5E038    105
E69F5     91
D5531     80
31FED     78
F824F     68
262E7     62
915D4     43
6C6B1     33
AF6B9     28
08960     21
0864A     15
Name: Level_2, dtype: int64


In [15]:
# Apply to Level_3
lvl3cnts = dataset['Level_3'].value_counts() 
print('lvl3cnts: ',lvl3cnts)
dataset=dataset[dataset.isin(lvl3cnts.index[lvl3cnts >= 10]).values]


lvl3cnts:  28A7    309
AA6B    219
2CFE    211
A0E2    206
BB6B    205
       ... 
74C9     26
1000     21
D55B     20
98A8     17
96B8     15
Name: Level_3, Length: 94, dtype: int64


In [16]:
dataset.reset_index(inplace=True)
dataset

Unnamed: 0,index,Description,Level_1,Level_2,Level_3
0,0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,FDCF
1,1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,ED0D
2,2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,3918
4,6,mit warm protect real stay dainty littl hand p...,09BF5150,C7E19,D06E
...,...,...,...,...,...
9663,10634,term 10 issu on year subscriptionyo sav 75 cov...,90A8B052,C719A,1BE5
9664,10635,term 12 issu on year subscriptionyo sav 86 cov...,90A8B052,C719A,F45B
9665,10636,term 9 issu on year subscriptionyo sav 64 cov ...,90A8B052,C719A,A0E2
9666,10637,term 26 issu on year subscriptionyo sav 54 cov...,90A8B052,C719A,1BE5


### Now let's write a Function to Prepare Text (4 marks)
We will apply it to our DataFrame later on

* This function receives a text string and performs the following:
* Convert text to lower case
* Remove punctuation marks
* Apply stemming using the popular Snowball or Porter Stemmer (optional)
* Apply NGram Tokenisation
* Return the tokenised text as a list of strings

In [17]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk import ngrams
import string
import re
stemmer = SnowballStemmer("english")
def process_text(text, n = 1):
    """
    Takes in a string of text, then performs the following:
    1. Convert text to lower case and remove all punctuation
    2. Optionally apply stemming
    3. Apply Ngram Tokenisation
    4. Returns the tokenised text as a list
    """
    # write steps here
    text=text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text=text.split()
    stem_sentence=[]
    for word in text:
        stem_sentence.append(stemmer.stem(word))
    n_gramm = ngrams(stem_sentence, n)

    tokenised = []
    for w in n_gramm:
        w=' '.join(w)
        tokenised.append(w)
    return tokenised

In [18]:
# Here is an example function call
process_text("Here we're testing the process_text function, results are as follows:", n = 3)

['here were test',
 'were test the',
 'test the process_text',
 'the process_text function',
 'process_text function result',
 'function result are',
 'result are as',
 'are as follow']

In [None]:
# Results should look like this:
['here were test',
 'were test the',
 'test the processtext',
 'the processtext function',
 'processtext function result',
 'function result are',
 'result are as',
 'are as follow']

['here were test',
 'were test the',
 'test the processtext',
 'the processtext function',
 'processtext function result',
 'function result are',
 'result are as',
 'are as follow']

### Now let's apply TF-IDF to extract features from plain text (10 marks)

In [19]:
# Might take a while...
# Here you apply the process_text function to the Description column of the data
# Then you pass the results to the bag of words tranformer
# See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

dataset['tokenized_sents'] = dataset.apply(lambda row: process_text(row['Description']), axis=1)
dataset['tokenized_sents'] = dataset.apply(lambda row: ' '.join(row['tokenized_sents']) , axis=1)
dataset
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(dataset['tokenized_sents']).toarray()

Now we can use .transform on our Bag-of-Words (bow) transformed object and transform the entire DataFrame of text file contents. Let's go ahead and check out how the bag-of-words counts for the entire corpus in a large, sparse matrix:

In [21]:
# After that you pass the result of the previous step to sklearn's TfidfTransformer
# which will convert them into a feature matrix
# See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
text_tfidf = tfidfconverter.fit_transform(X)
text_tfidf 


<9668x1500 sparse matrix of type '<class 'numpy.float64'>'
	with 220336 stored elements in Compressed Sparse Row format>

In [22]:
# The resulting matrix is in sparse format, we can transform it into dense
# Code prepared for you so you can see what results look like
text_tfidf = pd.DataFrame(text_tfidf.toarray())
text_tfidf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1460,1461,1462,1463,1464,1465,1466,1467,1468,1469,1470,1471,1472,1473,1474,1475,1476,1477,1478,1479,1480,1481,1482,1483,1484,1485,1486,1487,1488,1489,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.149423
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.134054
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9663,0.0,0.0,0.0,0.302873,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.160239,0.0,0.0,0.0,0.0,0.0,0.000000
9664,0.0,0.0,0.0,0.111130,0.0,0.0,0.0,0.134634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186457,0.0,0.0,0.0,0.0,0.117590,0.0,0.0,0.0,0.0,0.0,0.000000
9665,0.0,0.0,0.0,0.146853,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.155389,0.0,0.0,0.0,0.0,0.0,0.000000
9666,0.0,0.0,0.0,0.150201,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.229888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.158932,0.0,0.0,0.0,0.0,0.0,0.000000


In [23]:
# This is an example result, the matrix will contain lots of zero values, that is expected
# Some values will be non-zero
text_tfidf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1460,1461,1462,1463,1464,1465,1466,1467,1468,1469,1470,1471,1472,1473,1474,1475,1476,1477,1478,1479,1480,1481,1482,1483,1484,1485,1486,1487,1488,1489,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.149423
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134054
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Now the Data is Ready for Classifier Usage

### Split Data into Train and Test sets (4 marks)

In [24]:
# Train/Test split
from sklearn.model_selection import train_test_split
X_train_lvl1, X_test_lvl1, y_train_lvl1, y_test_lvl1 = train_test_split(text_tfidf, dataset['Level_1'], test_size=0.2, random_state=0)


In [None]:
# # You might need to reset index in each dataframe (depends on you how you do things)
# # done for you to make it clearer
# X_train_lvl1.reset_index(inplace=True, drop=True)
# X_test_lvl1.reset_index(inplace=True, drop=True)
# y_train_lvl1.reset_index(inplace=True, drop=True)
# y_test_lvl1.reset_index(inplace=True, drop=True)

In [None]:
# # You might need to take classes as separate columns (depends on you how you do things)
# class1 = y_train_lvl1['Level_1'].astype(str)
# class2 = y_train_lvl1['Level_2'].astype(str)
# class3 = y_train_lvl1['Level_3'].astype(str)

## Model training for the three levels (8 marks)

In [25]:
# Create and save model for level 1
classifier = MultinomialNB(fit_prior=True)

classifier.fit(X_train_lvl1, y_train_lvl1)
with open("classifierLevel1", 'wb') as picklefile:
   pickle.dump(classifier,picklefile) 


In [26]:
y_pred = classifier.predict(X_test_lvl1)
y_pred

array(['69286F45', '57164AC1', '3E1E0D78', ..., '96F95EEC', 'EFEF723B',
       '4513C920'], dtype='<U8')

In [27]:
## Create and save models for level 2
lvl1_unique_values =  pd.unique(dataset['Level_1'])
for u in lvl1_unique_values:
  df=dataset[dataset['Level_1']==u]
  print("u ",u)
  X_train_lvl2, X_test_lvl2, y_train_lvl2, y_test_lvl2 = train_test_split(text_tfidf.loc[df.index], df['Level_2'], test_size=0.2, random_state=0)
  # classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
  classifier = MultinomialNB(fit_prior=True)
  classifier.fit(X_train_lvl2, y_train_lvl2) 
  y_pred_lvl2 = classifier.predict(X_test_lvl2)
  y_pred_lvl2
  with open(u, 'wb') as picklefile:
    pickle.dump(classifier,picklefile)


u  09BF5150
u  2CEC27F1
u  AAC8EE56
u  4C3D8686
u  69286F45
u  57164AC1
u  4513C920
u  35E04739
u  EFEF723B
u  96F95EEC
u  014303D1
u  90A8B052
u  B092BA29
u  3E1E0D78
u  D410C91A


In [28]:
## Create and save models for level 3
lvl2_unique_values =  pd.unique(dataset['Level_2'])
for u in lvl2_unique_values:
  # dataset[u] where u=cats,dogs,tigers // first iteration only cats, second iteration dogs, third iteration tigers
  # df=dataset[u] thus df is only // catS,.....
  # yeb2a keda el df dih el dataset KOLAHAAA
  # df dih per iteartion 
  df=dataset[dataset['Level_2']==u]
  print("u ",u)
  X_train_lvl3, X_test_lvl3, y_train_lvl3, y_test_lvl3 = train_test_split(text_tfidf.loc[df.index], df['Level_3'], test_size=0.2, random_state=0)
  # classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
  classifier = MultinomialNB(fit_prior=True)
  classifier.fit(X_train_lvl3, y_train_lvl3) 
  with open(u, 'wb') as picklefile:
    pickle.dump(classifier,picklefile)
  y_pred_lvl3 = classifier.predict(X_test_lvl3)
  y_pred_lvl3

u  C7E19
u  ADAD6
u  914A1
u  74974
u  2D5A3
u  9B69F
u  7B638
u  F4055
u  0864A
u  F824F
u  B2DB4
u  02FA0
u  D5531
u  CB803
u  BAE8A
u  31FED
u  E69F5
u  390F1
u  94728
u  36080
u  77F62
u  A04D3
u  7AED7
u  915D4
u  6C6B1
u  5E038
u  262E7
u  AF6B9
u  C719A
u  375FE
u  5A8AB
u  08960
u  9D9EE
u  E6162
u  ACD06
u  223B2


## Predict the test set (8 marks)

In [29]:
# Creating an empty Dataframe with column names only (depends on you how you do things)
results = pd.DataFrame(columns=['Level1_Pred', 'Level2_Pred', 'Level3_Pred'])

## Here we reload the saved models and use them to predict the levels
# load model for level 1 (done for you)
# with open('level1.pk', 'rb') as nb:
with open('classifierLevel1', 'rb') as nb:
    model = pickle.load(nb)

y_pred_all = model.predict(X)
print(results['Level1_Pred'])
# print(y_pred_all)
## loop through the test data, predict level 1, then based on that predict level 2
## and based on level 2 predict level 3 (you need to load saved models accordingly)
k=0
lvl1_pred=[]
lvl2_pred=[]
lvl3_pred=[]
_, X_test_all, _, y_test_all = train_test_split(X, y_pred_all, test_size=0.05, random_state=0)
print(len(y_test_all))   
for i in y_test_all:
  print("i " ,i)
  lvl1_pred.append(i)
  with open(i, 'rb') as training_model:
    print("training_model : ",training_model)
    model = pickle.load(training_model)
    y_pred_level2 = model.predict([X_test_all[k]])
    lvl2_pred.append(y_pred_level2[0])
    with open(y_pred_level2[0], 'rb') as training_model:
      model = pickle.load(training_model)
      y_pred_level3 = model.predict([X_test_all[k]])
      lvl3_pred.append(y_pred_level3[0])


  # print(lvl2_pred)
  k=k+1

results['Level1_Pred']=(lvl1_pred)
results['Level2_Pred']=(lvl2_pred)
results['Level3_Pred']=(lvl3_pred)



Series([], Name: Level1_Pred, dtype: object)
484
i  69286F45
training_model :  <_io.BufferedReader name='69286F45'>
i  57164AC1
training_model :  <_io.BufferedReader name='57164AC1'>
i  3E1E0D78
training_model :  <_io.BufferedReader name='3E1E0D78'>
i  09BF5150
training_model :  <_io.BufferedReader name='09BF5150'>
i  3E1E0D78
training_model :  <_io.BufferedReader name='3E1E0D78'>
i  09BF5150
training_model :  <_io.BufferedReader name='09BF5150'>
i  69286F45
training_model :  <_io.BufferedReader name='69286F45'>
i  09BF5150
training_model :  <_io.BufferedReader name='09BF5150'>
i  4513C920
training_model :  <_io.BufferedReader name='4513C920'>
i  2CEC27F1
training_model :  <_io.BufferedReader name='2CEC27F1'>
i  69286F45
training_model :  <_io.BufferedReader name='69286F45'>
i  35E04739
training_model :  <_io.BufferedReader name='35E04739'>
i  57164AC1
training_model :  <_io.BufferedReader name='57164AC1'>
i  AAC8EE56
training_model :  <_io.BufferedReader name='AAC8EE56'>
i  2CEC27F1
t

In [30]:
## After you add the predictions to the results dataframe
## they should look like this
results

Unnamed: 0,Level1_Pred,Level2_Pred,Level3_Pred
0,69286F45,2D5A3,28A7
1,57164AC1,94728,5912
2,3E1E0D78,9D9EE,05A0
3,09BF5150,5E038,6BE5
4,3E1E0D78,9D9EE,818C
...,...,...,...
479,2CEC27F1,ADAD6,98CF
480,AAC8EE56,914A1,D97D
481,96F95EEC,36080,C563
482,AAC8EE56,914A1,D97D


## Compute Accuracy on each level (4 marks)
Now you have the predictions for each level (in the test data), and you also have the actual levels, you can compute the accurcay

In [31]:
# Level 1 accuracy
print(confusion_matrix(y_test_lvl1,y_pred))
print(classification_report(y_test_lvl1,y_pred))
print(accuracy_score(y_test_lvl1, y_pred))

[[ 63   0   0   0   0   0   0   0   0   0  21   0   0   0   0]
 [  0  85  11   1   1   4   3   7   6   0   0  12   3   0   1]
 [  0   3 128  12   0   1   0   0   0   0   0  11   0   0   0]
 [  0   2   8 122   0   1   0  14   0   0   0   1   1   0   0]
 [  0   3   0   0  94   0   3   1   1   0   0   2   0  10   1]
 [  0   8  11  12   1  47   1   8   0   0   0   0   0   0   2]
 [  0   5   0   0   2   0  98   0   1   0   1   0   0   2   0]
 [  0   3   2   7   0   0   0 139   0   0   0   1   0   0   0]
 [  0   3   0   0   1   0   3   0 132   2   0   0   2   0   0]
 [  0   0   0   0   0   0   0   0   1  89   0   0   2   0   0]
 [  6   0   0   0   0   0   0   0   0   0 114   0   0   0   0]
 [  0   0   4   1   0   0   0   2   0   0   0 168   0   0   0]
 [  0   1   0   0   0   0   0   1   1   3   0   1 172   0   0]
 [  0   3   0   0   7   0   1   0   0   0   3   0   4  61   0]
 [  0   1   2   0   0   0   0   0   0   0   0   2   0   0 153]]
              precision    recall  f1-score   support


In [32]:
# Level 2 accuracy
print(confusion_matrix(y_test_lvl2,y_pred_lvl2))
print(classification_report(y_test_lvl2,y_pred_lvl2))
print(accuracy_score(y_test_lvl2, y_pred_lvl2))

[[72]]
              precision    recall  f1-score   support

       ACD06       1.00      1.00      1.00        72

    accuracy                           1.00        72
   macro avg       1.00      1.00      1.00        72
weighted avg       1.00      1.00      1.00        72

1.0


In [33]:
# Level 3 accuracy
print(confusion_matrix(y_test_lvl3,y_pred_lvl3))
print(classification_report(y_test_lvl3,y_pred_lvl3))
print(accuracy_score(y_test_lvl3, y_pred_lvl3))

[[0 1 0 4]
 [0 0 0 5]
 [0 1 0 4]
 [0 3 0 8]]
              precision    recall  f1-score   support

        74C9       0.00      0.00      0.00         5
        8FEF       0.00      0.00      0.00         5
        D55B       0.00      0.00      0.00         5
        F213       0.38      0.73      0.50        11

    accuracy                           0.31        26
   macro avg       0.10      0.18      0.12        26
weighted avg       0.16      0.31      0.21        26

0.3076923076923077


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Well done!