# Supervised Classification Model

### Load TSV data into a DataFrame

In [1]:
import pandas as pd
import numpy as np
import sklearn
import re
import string
import nltk

In [2]:
# Read the tab-separated file; it has no header row, so pandas numbers the columns.
data = pd.read_csv("data.tsv", delimiter="\t", header=None)

In [3]:
data.head()

Unnamed: 0,0,1,2,3
0,0,28,a,"Meat, fish, poultry and game; Meat extracts; P..."
1,1,34,a,Business management and organization consultan...
2,2,29,a,"Vinegar; Salts, seasonings, flavourings and co..."
3,3,41,a,Conducting technical project studies; Scientif...
4,4,35,a,Brokerage; Brokerage house services; Agencies ...


In [4]:
data.tail()

Unnamed: 0,0,1,2,3
34676,34676,4,a,Pharmaceutical preparations for use in dermato...
34677,34677,6,a,"Outdoor power equipment, namely, chainsaws, wo..."
34678,34678,30,a,"Raw and unprocessed agricultural, aquacultural..."
34679,34679,28,a,"Cooked truffles; Truffles, preserved; Dried tr..."
34680,34680,2,a,"Soap; Cosmetics; Dentifrices; Perfumery, essen..."


In [5]:
# Discard the first and third columns by position: in the rows displayed
# above they hold a copy of the row index and a constant "a".
data = data.drop(columns=data.columns[[0, 2]])

In [6]:
data.head()

Unnamed: 0,1,3
0,28,"Meat, fish, poultry and game; Meat extracts; P..."
1,34,Business management and organization consultan...
2,29,"Vinegar; Salts, seasonings, flavourings and co..."
3,41,Conducting technical project studies; Scientif...
4,35,Brokerage; Brokerage house services; Agencies ...


In [7]:
# Give the two remaining columns descriptive names (class label first, text second).
data.columns = [
    "NICE Category",
    "Product Description",
]

In [8]:
data.head()

Unnamed: 0,NICE Category,Product Description
0,28,"Meat, fish, poultry and game; Meat extracts; P..."
1,34,Business management and organization consultan...
2,29,"Vinegar; Salts, seasonings, flavourings and co..."
3,41,Conducting technical project studies; Scientif...
4,35,Brokerage; Brokerage house services; Agencies ...


In [9]:
# Put the text column first and the label column second.
column_order = ["Product Description", "NICE Category"]
data = data[column_order]

In [10]:
data.head()

Unnamed: 0,Product Description,NICE Category
0,"Meat, fish, poultry and game; Meat extracts; P...",28
1,Business management and organization consultan...,34
2,"Vinegar; Salts, seasonings, flavourings and co...",29
3,Conducting technical project studies; Scientif...,41
4,Brokerage; Brokerage house services; Agencies ...,35


### Preprocess and Clean Data

In [11]:
# Load stopwords from NLTK corpus.
# Stored as a set because clean() tests membership once per token; a set
# lookup is O(1) whereas scanning the original list is O(len(stopwords)).
stopwords = set(nltk.corpus.stopwords.words("english"))

In [12]:
# Shared WordNet lemmatizer instance, reused by clean() for every token.
lemmatizer = nltk.stem.WordNetLemmatizer()

In [13]:
def clean(text):
    """Turn a raw description string into a list of lemmatized tokens.

    The text has ASCII punctuation removed and is lowercased, then split on
    runs of non-word characters. Empty strings and stopwords are dropped and
    the remaining tokens are lemmatized.

    Relies on the module-level ``stopwords`` collection and ``lemmatizer``
    object created in the preceding cells.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lowercased, lemmatized tokens containing no punctuation,
        no empty strings and no stopwords.
    """
    # remove punctuation in one pass, then lowercase
    text = text.translate(str.maketrans("", "", string.punctuation)).lower()

    # tokenize text; the raw string keeps \W from being parsed as a
    # (invalid) string escape sequence
    tokens = re.split(r"\W+", text)

    # remove stopwords, and also the empty strings re.split produces when
    # the text starts or ends with a separator (otherwise "" becomes a
    # spurious feature in the vocabulary)
    kept = [token for token in tokens if token and token not in stopwords]

    # lemmatize the tokens
    return [lemmatizer.lemmatize(token) for token in kept]

### TF-IDF Vectorization

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features, passing clean() as the analyzer so every
# description is tokenized by the custom preprocessing defined above.
# NOTE(review): the vectorizer is fitted on every row, before the
# train/test split performed later in this notebook, so the rows that end
# up in the test set also contribute to the vocabulary and IDF weights.
descriptions = data["Product Description"]
tfidf_vect = TfidfVectorizer(analyzer=clean)
X_features = tfidf_vect.fit_transform(descriptions)

In [15]:
# Convert the sparse TF-IDF matrix into a dense DataFrame (it is inspected
# with .head() in the next cell).
# NOTE(review): toarray() materializes every cell. Going by the outputs
# shown in this notebook (row index up to 34680, feature columns up to
# 22770) that is roughly 790 million values, i.e. about 6 GB if they are
# float64 - confirm there is enough memory, or keep the matrix sparse.
X_features=pd.DataFrame(X_features.toarray())

In [16]:
X_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22761,22762,22763,22764,22765,22766,22767,22768,22769,22770
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Train, Test Split

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    data["NICE Category"],
    test_size=0.2,
    random_state=20,
)

### Naive Bayes Model

In [19]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training split.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

# fit() returns the estimator itself, so clf refers to the same fitted model.
clf = mnb

# Predict a category for every held-out row.
y_pred = clf.predict(X_test)

In [20]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Per-class confusion counts (rows: true label, columns: predicted label).
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Precision / recall / F1 / support for each category.
report = classification_report(y_test, y_pred)
print(report)

# Overall fraction of held-out rows classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

[[ 59   0   3 ...   0   0   0]
 [  0  20   3 ...   0   0   0]
 [  0   0 239 ...   0   0   0]
 ...
 [  0   0   0 ... 111   0   0]
 [  0   0   2 ...   0  64   0]
 [  0   0   0 ...   0   0  30]]
              precision    recall  f1-score   support

           0       1.00      0.53      0.69       111
           1       1.00      0.74      0.85        27
           2       0.92      0.98      0.95       244
           3       1.00      0.57      0.72        37
           4       0.86      0.96      0.91       228
           5       0.97      0.63      0.76       102
           6       0.91      0.83      0.87       150
           7       0.99      0.88      0.93        83
           8       0.75      0.96      0.84       801
           9       0.97      0.56      0.71       121
          10       0.94      0.94      0.94       193
          11       0.96      0.81      0.88       109
          12       0.00      0.00      0.00         6
          13       0.96      0.95      0.95        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
