In [28]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score, plot_roc_curve, plot_precision_recall_curve, plot_confusion_matrix
import nltk
from nltk import word_tokenize
import re
from sklearn import feature_extraction, feature_selection, model_selection, naive_bayes, pipeline, manifold, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [29]:
# Load the raw product listings from the Excel workbook into a DataFrame.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# cannot run elsewhere until it is pointed at a local copy of the file.
data = pd.read_excel(
    io=r"C:\Users\gog10\GfK_spectacle_lenses_data.xlsx",
)

In [30]:
# Give rows without a description a placeholder string so that later text
# handling always receives a str, then print the frame summary to confirm
# that no nulls remain.
data['main_text'] = data['main_text'].fillna('Not Known')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3549 entries, 0 to 3548
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   item_id         3549 non-null   int64 
 1   main_text       3549 non-null   object
 2   country_name    3549 non-null   object
 3   retailer_pg     3549 non-null   object
 4   mdm_brand_text  3549 non-null   object
 5   mdm_model_text  3549 non-null   object
dtypes: int64(1), object(5)
memory usage: 166.5+ KB


In [31]:
# Count how many rows each item_id has (item_id is renamed to "y" and used as
# the class label further down), most frequent first.
data['item_id'].value_counts()

82981040     290
138176095    226
87099837     116
88210952      71
79429232      51
            ... 
81343196       5
122769301      5
124336576      5
119795873      5
64840970       5
Name: item_id, Length: 260, dtype: int64

In [32]:
# Keep only item_id and main_text; the other four columns are not used by the model.
data = data.drop(
    columns=['country_name', 'retailer_pg', 'mdm_model_text', 'mdm_brand_text'],
)

In [33]:
# Preview the first rows of the two remaining columns (item_id, main_text).
data.head()

Unnamed: 0,item_id,main_text
0,138176095,ID1.60LIFESTL3IURBAN(RX)
1,138176095,ID1.60LIFESTL3IURBAN(RX)
2,138176095,ID1.60LIFESTL3IURBAN(RX)
3,138176095,ID1.60LIFESTL3IURBAN(RX)
4,138176095,ID1.60LIFESTL3IURBAN(RX)


In [34]:
# Switch to the generic names the modelling cells use:
# "y" is the class label (item_id) and "text" is the raw description (main_text).
data = data.rename(
    {"item_id": "y", "main_text": "text"},
    axis="columns",
)
data

Unnamed: 0,y,text
0,138176095,ID1.60LIFESTL3IURBAN(RX)
1,138176095,ID1.60LIFESTL3IURBAN(RX)
2,138176095,ID1.60LIFESTL3IURBAN(RX)
3,138176095,ID1.60LIFESTL3IURBAN(RX)
4,138176095,ID1.60LIFESTL3IURBAN(RX)
...,...,...
3544,25278162,CRISTALES 34212 MI MF XPERIENCE 1.5 BASIC HV P...
3545,115519055,CRISTALES 34186 MI HILUX 1.6 SHV FAB MONOFOCAL...
3546,112904161,CRISTALES 34317 NULUX ACTIVE TF 1.5 HV ORGÁNIC...
3547,138176095,CRISTALES 34244 MI LIFESTYLE 3I 1.5 HVLL BC PR...


In [35]:
# English stop-word list from NLTK's stopwords corpus; it is passed to
# preprocess_text when the cleaned text column is built.
# NOTE(review): the sample rows displayed earlier are upper-case and partly
# Spanish, while this list is lower-case English and preprocess_text compares
# tokens exactly - confirm that it actually removes anything from this data.
lst_stopwords = nltk.corpus.stopwords.words("english")
lst_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [36]:
# Text-cleaning helper: strips punctuation, drops stop words and optionally
# stems and/or lemmatises each token.
# NOTE: the same function is defined again in the following cell; the later
# definition is the one that remains bound to the name.
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """
    Clean a single text value and return it as a space-separated string.

    :param text: value to clean; it is converted with str() first
    :param flg_stemm: bool - apply NLTK's PorterStemmer to every token
    :param flg_lemm: bool - apply NLTK's WordNetLemmatizer to every token
    :param lst_stopwords: list of tokens to drop (exact match), or None to keep all
    :return: cleaned text
    """
    ## strip the ends, then delete everything that is neither a word character nor whitespace
    stripped = str(text).strip()
    tokens = re.sub(r'[^\w\s]', '', stripped).split()

    ## drop stop words
    if lst_stopwords is not None:
        tokens = list(filter(lambda token: token not in lst_stopwords, tokens))

    ## stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        stem = nltk.stem.porter.PorterStemmer().stem
        tokens = list(map(stem, tokens))

    ## lemmatisation (convert the word into its root word)
    if flg_lemm == True:
        lemmatize = nltk.stem.wordnet.WordNetLemmatizer().lemmatize
        tokens = list(map(lemmatize, tokens))

    ## list of tokens back to a single string
    return " ".join(tokens)

In [37]:
# NOTE: this cell repeats the definition from the previous cell; being the
# later one, it is the definition that callers actually get. Keep only one.
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """
    Clean a single text value and return it as a space-separated string.

    Steps, in order: strip leading/trailing whitespace, delete every character
    that is neither a word character nor whitespace, split on whitespace,
    drop stop words, optionally stem, optionally lemmatise, re-join with
    single spaces.

    :param text: the value to clean (one cell of the text column, not the
        column name); it is converted with str() first. None and float NaN
        are treated as missing and yield an empty string.
    :param flg_stemm: bool - whether NLTK's PorterStemmer is applied
    :param flg_lemm: bool - whether NLTK's WordNetLemmatizer is applied
    :param lst_stopwords: iterable of tokens to remove, or None to keep all.
        Matching is exact and case-sensitive.
    :return: cleaned text
    """
    ## missing values would otherwise become the literal tokens "None" / "nan"
    ## (NaN is the only float that is not equal to itself)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    ## clean (remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## remove Stopwords (a set gives constant-time membership tests)
    if lst_stopwords is not None:
        stopword_set = set(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopword_set]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    return " ".join(lst_text)

In [38]:
# Build the cleaned text column: lemmatised, stop words removed, no stemming.
# Series.apply forwards the keyword arguments to preprocess_text for every row.
data["text_clean"] = data["text"].apply(
    preprocess_text,
    flg_stemm=False,
    flg_lemm=True,
    lst_stopwords=lst_stopwords,
)
data.head()

Unnamed: 0,y,text,text_clean
0,138176095,ID1.60LIFESTL3IURBAN(RX),ID160LIFESTL3IURBANRX
1,138176095,ID1.60LIFESTL3IURBAN(RX),ID160LIFESTL3IURBANRX
2,138176095,ID1.60LIFESTL3IURBAN(RX),ID160LIFESTL3IURBANRX
3,138176095,ID1.60LIFESTL3IURBAN(RX),ID160LIFESTL3IURBANRX
4,138176095,ID1.60LIFESTL3IURBAN(RX),ID160LIFESTL3IURBANRX


In [39]:
## split dataset (70% train / 30% test)
## A fixed seed keeps the partition - and therefore every score computed
## below - identical across kernel restarts.
SPLIT_SEED = 42
data_train, data_test = model_selection.train_test_split(
    data, test_size=0.3, random_state=SPLIT_SEED
)
## get target
y_train = data_train["y"].values
y_test = data_test["y"].values

In [40]:
## Tf-Idf (advanced variant of BoW) over unigrams and bigrams, capped at
## 10,000 features.
## The classic-count alternative is feature_extraction.text.CountVectorizer
## with the same arguments. It used to be assigned to `vectorizer` here and
## was overwritten by the next statement, so it never took effect.
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))

In [41]:
# Learn the vocabulary and IDF weights from the training text only, and
# vectorise that same text in one pass.
texts = data_train["text_clean"]
X_train = vectorizer.fit_transform(texts)
# term -> column index mapping learned by the vectorizer
dic_vocabulary = vectorizer.vocabulary_

In [42]:
# Show the learned feature names (single tokens and two-token n-grams).
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 in
# favour of get_feature_names_out(); check the installed version before upgrading.
vectorizer.get_feature_names()

['00',
 '00 fab',
 '000',
 '000 0000',
 '000 ess',
 '0000',
 '0000 0000',
 '0000 0050',
 '0000 0075',
 '0000 0125',
 '0000 0150',
 '0000 0175',
 '0000 0200',
 '0000 0225',
 '0000 0250',
 '0000 0275',
 '0000 0325',
 '0000 0350',
 '0000 0375',
 '0000 0400',
 '0000 0750',
 '0000 125',
 '0000 200',
 '0000 300',
 '0000 airwear',
 '0000 varilux',
 '0002',
 '001101',
 '001101 hifin',
 '001102',
 '001102 hifin',
 '0016',
 '0025',
 '0025 0050',
 '0025 0075',
 '0025 0100',
 '0025 0125',
 '0025 0200',
 '0025 0225',
 '0025 0250',
 '0025 0275',
 '0025 0300',
 '0025 0350',
 '0025 250',
 '0025 275',
 '0025 airwear',
 '004',
 '004 0125',
 '005',
 '005 0025',
 '005 0075',
 '005 0125',
 '005 0150',
 '005 0225',
 '005 0325',
 '0050',
 '0050 0000',
 '0050 0025',
 '0050 0050',
 '0050 0075',
 '0050 0100',
 '0050 0150',
 '0050 0175',
 '0050 0250',
 '0050 0275',
 '0050 0300',
 '0050 0375',
 '0050 0400',
 '0050 0475',
 '0050 0750',
 '0050 125',
 '0050 200',
 '0050 250',
 '0050 airwear',
 '0050 aiwear',
 '0050 

In [43]:
# Show the fitted vocabulary: a dict mapping each term to its column index
# in the feature matrix.
vectorizer.vocabulary_

{'varilux': 6307,
 'comfort': 2803,
 'ne': 4919,
 'ormix': 5137,
 '16': 716,
 'crizal': 3052,
 'sun': 5976,
 'color': 2772,
 'gris': 3900,
 'varilux comfort': 6311,
 'comfort ne': 2811,
 'ne ormix': 4922,
 'ormix 16': 5138,
 '16 crizal': 752,
 'crizal sun': 3070,
 'sun color': 5981,
 'color gris': 2780,
 'superkid': 6048,
 'lite': 4598,
 'sv': 6063,
 '15': 596,
 'durcap': 3315,
 'diametro': 3276,
 '60': 1885,
 'superkid lite': 6050,
 'lite sv': 4599,
 'sv 15': 6064,
 '15 durcap': 619,
 'durcap diametro': 3321,
 'diametro 60': 3277,
 'rh': 5577,
 '3341': 1469,
 '023': 181,
 'rh 3341': 5592,
 '3341 rh': 1470,
 'rh 023': 5579,
 'cristales': 2850,
 '2540': 1241,
 'orlite': 5117,
 '174': 1034,
 'super': 6006,
 'hmc': 4039,
 'cristales 2540': 2888,
 '2540 orlite': 1242,
 'orlite 174': 5120,
 '174 super': 1053,
 'super hmc': 6009,
 'design': 3248,
 'stylis': 5966,
 '167': 910,
 'eps': 3417,
 'varilux design': 6315,
 'design stylis': 3253,
 'stylis 167': 5967,
 '167 eps': 938,
 'precissuperb':

In [44]:
# Dense copy of the training feature matrix, displayed for inspection.
# It is not referenced again in the cells shown here.
array_of_feature = X_train.toarray()
array_of_feature

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [45]:
# Chi-square feature selection, one class at a time (class vs. rest).
# For every label, each feature gets score = 1 - p from the chi-square test
# and is kept only if that score exceeds p_value_limit.
y = data_train["y"]
X_names = vectorizer.get_feature_names()
p_value_limit = 0.95

# Collect one already-filtered frame per class and concatenate once at the end.
# Appending to a DataFrame inside the loop copies all previous rows every
# iteration, and DataFrame.append no longer exists in pandas 2.0.
selected_per_class = []
for cat in np.unique(y):
    chi2, p = feature_selection.chi2(X_train, y==cat)
    class_scores = pd.DataFrame(
                   {"feature":X_names, "score":1-p, "y":cat})
    selected_per_class.append(
        class_scores[class_scores["score"]>p_value_limit])

# Order by class, best-scoring features first within each class.
data_features = pd.concat(selected_per_class).sort_values(
                ["y","score"], ascending=[True,False])

# Distinct selected features across all classes, in order of first appearance.
# NOTE(review): no later cell shown here uses X_names (the vectorizer is not
# re-fitted with this reduced vocabulary), so the selection does not affect
# the model trained below - confirm whether that is intended.
X_names = data_features["feature"].unique().tolist()

In [46]:
# Per-class report: how many features passed the chi-square filter and the
# first ten of them (data_features is sorted best-first within each class).
for cat in np.unique(y):
    selected = data_features[data_features["y"]==cat]
    print("# " + str(cat) + ":")
    print("  . selected features:", len(selected))
    print("  . top features:", ",".join(selected["feature"].values[:10]))
    print(" ")

# 2296911:
  . selected features: 65
  . top features: 11mm,11mm 16,16 hilux,16 hset,16 hshv,active 16,am1,am1 hardhva,b001,b001 kra
 
# 18378494:
  . selected features: 30
  . top features: 0200 1800,1800,1800 200,19,19 as,200 masterlux,26629,26629 160,ad,ad indo
 
# 18378561:
  . selected features: 27
  . top features: 15 durcap,15 stock,cristales superkid,durcap diametro,durcap stock,lite,lite sv,litemonofocalorganico1500durcapsin,litemonofocalorganico1500durcapsin color,superkid
 
# 18378572:
  . selected features: 71
  . top features: 0075 indo,105 0125,1523,1523 icrmamin70,1523 icrmaminø7bifocalmineral1523sin,7008,7008 105,cristales 7008,cristales telegrand,céfir
 
# 18378781:
  . selected features: 57
  . top features: 17,17 lab,35,35 exp,35 hifi,35 lab,exp,exp mineral,flint,flint 17
 
# 19026396:
  . selected features: 72
  . top features: 16 hard,16 pol,16rx hard,16rxmonofocalorganico1600con,16rxmonofocalorganico1600con color,16rxmonofocalorganico1600hardcon,16rxmonofocalorgan

  . top features: 0000 0275,0000 airwear,0050,0050 0000,0050 airwear,0075 airwear,0275 airwear,16 kids,16 ormix,29534
 
# 86061799:
  . selected features: 19
  . top features: 3es 954100,6eu 954100,954100,954100 3es,954100 6eu,crizal easy,easy eo,easy ess,ess ideal,ideal
 
# 86804350:
  . selected features: 67
  . top features: 14 75,167 clarity,167 pasillo,75 hvll,clarity,clarity 167,clarity p11,cristales id,h26,h26 hvl
 
# 87099837:
  . selected features: 311
  . top features: basic,basic gris,indosol,indosol basic,indosoluniforme,stk,superfin,superfin indosol,superfin indosoluniforme,basic marron
 
# 87121775:
  . selected features: 84
  . top features: 174,174 trans,174 transitions,ess lineis,lineis,lineis 174,modelo lineis,essilor lineis,transitions,65200600
 
# 87999683:
  . selected features: 31
  . top features: 1317,1317 bifocal,15 hmc,2581,2581 bo,2582,2582 bo,2584,2584 bo,bifocal
 
# 88210952:
  . selected features: 113
  . top features: 160 eyas,160 fab,16rx,eyas,eyas 16rx,

# 106788109:
  . selected features: 18
  . top features: 6492,6492 7sy,6492 sy,6492 syn,6492 zsy,7sy 6492,7sy mp,mp,sy 6492,syn 6492
 
# 106789564:
  . selected features: 22
  . top features: 13103,13103 7sy,13103 ao,13103 czs,13103 czy,13103 sy,13103 syn,13103 zsy,7sy 13103,ao 13103
 
# 106790570:
  . selected features: 19
  . top features: 7489,7489 7sy,7489 czs,7489 czy,7489 syn,7489 zei,7489 zsy,7sy,7sy 7489,czs
 
# 106791193:
  . selected features: 13
  . top features: 6551,6551 hmc,6551 sy,6551 syn,cza 6551,mp,sy,sy 6551,sy mp,syn 6551
 
# 106791342:
  . selected features: 27
  . top features: 2cz,2cz 6836,2cz hm,6836,6836 2cz,6836 7sy,6836 czs,6836 sy,6836 syn,6836 zsy
 
# 106792878:
  . selected features: 22
  . top features: 6950,6950 7sy,6950 czy,6950 sy,6950 syn,6950 zsy,7sy 6950,czy,czy 6950,czy hm
 
# 106792953:
  . selected features: 17
  . top features: 6399,6399 czs,6399 hmc,6399 sy,6399 syn,ao 6399,czs,czs 6399,czs mp,mp
 
# 106793053:
  . selected features: 22
  . top

# 124606420:
  . selected features: 82
  . top features: 0250 200,030,030 0300,0300 0250,11342,11342 095,200 vx,226,226 varilux,275 vx
 
# 124606451:
  . selected features: 37
  . top features: 16 eps,4d 16,984100,984100 varilux,crizal sundegradado,deg,deg crizal,eps physiotints,physiotints,physiotints crizal
 
# 124632457:
  . selected features: 27
  . top features: 167 eps,1670,colorindice,colorindice 1670,crizalforteuvmulticapano,crizalforteuvmulticapano endurecidosin,design,design 167,design stylis,endurecidosin
 
# 124633843:
  . selected features: 28
  . top features: 167 eps,434540,434540 varilux,design,design stylis,lente 434540,stylis,stylis 167,varilux design,organico progresivo
 
# 124691773:
  . selected features: 18
  . top features: en2,en2 hld,en2 ho,en2 hol,en2 hoy,enrarp,hld,hld en2,hld enrarp,ho
 
# 124721429:
  . selected features: 44
  . top features: 0025 0075,0025 0100,0100 250,0200 300,0250 0200,032,032 0250,080 0025,150 0250,250 vx
 
# 124834949:
  . selected fe

In [47]:
## train a multinomial Naive Bayes classifier on the already-vectorised training matrix
classifier = naive_bayes.MultinomialNB()
classifier.fit(X_train, y_train)

## pipeline: wraps the fitted vectorizer and the fitted classifier so that
## raw cleaned text can be passed straight to predict()
model = pipeline.Pipeline(steps=[
    ("vectorizer", vectorizer),
    ("classifier", classifier),
])

## test (X_test holds cleaned text strings; the pipeline vectorises them)
X_test = data_test["text_clean"].values
predicted = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)

In [48]:
# Distinct labels present in the test split, and a one-hot encoding of the
# test labels (neither is used by the metrics below).
classes = np.unique(y_test)
y_test_array = pd.get_dummies(y_test, drop_first=False).values

# Micro-averaged scores pool every prediction before computing the metric.
# For single-label multiclass data precision, recall and F1 then all equal
# plain accuracy, so the three printed numbers are expected to match.
precision = precision_score(y_test, predicted, average='micro', zero_division=0)
print('Precision score: {0:0.2f}'.format(precision))

recall = recall_score(y_test, predicted, average='micro', zero_division=0)
print('Recall score: {0:0.2f}'.format(recall))

f1 = f1_score(y_test, predicted, average='micro', zero_division=0)
# Print the F1 value itself; this line previously printed `recall`.
print('f1 score: {0:0.2f}'.format(f1))

Precision score: 0.34
Recall score: 0.34
f1 score: 0.34
