In [101]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score, plot_roc_curve, plot_precision_recall_curve, plot_confusion_matrix
import nltk
from nltk import word_tokenize
import re
from sklearn import feature_extraction, feature_selection, model_selection, naive_bayes, pipeline, manifold, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [102]:
data = pd.read_excel(r"C:\Users\gog10\GfK_spectacle_lenses_data.xlsx")

In [103]:
#Replace null values in the data
data.loc[data['main_text'].isnull(), 'main_text'] = 'Not Known'
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3549 entries, 0 to 3548
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   item_id         3549 non-null   int64 
 1   main_text       3549 non-null   object
 2   country_name    3549 non-null   object
 3   retailer_pg     3549 non-null   object
 4   mdm_brand_text  3549 non-null   object
 5   mdm_model_text  3549 non-null   object
dtypes: int64(1), object(5)
memory usage: 166.5+ KB


In [104]:
#Denote country name into numerical categories
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()
categorical_col = ['country_name']
for col in categorical_col:
    data[col] = l.fit_transform(data[col])
data.head()

Unnamed: 0,item_id,main_text,country_name,retailer_pg,mdm_brand_text,mdm_model_text
0,138176095,ID1.60LIFESTL3IURBAN(RX),1,CRISTALES,HOYA,HOYALUX ID LIFESTYLE 3-I HVLL
1,138176095,ID1.60LIFESTL3IURBAN(RX),1,CRISTALES,HOYA,HOYALUX ID LIFESTYLE 3-I HVLL
2,138176095,ID1.60LIFESTL3IURBAN(RX),1,CRISTALES,HOYA,HOYALUX ID LIFESTYLE 3-I HVLL
3,138176095,ID1.60LIFESTL3IURBAN(RX),1,CRISTALES,HOYA,HOYALUX ID LIFESTYLE 3-I HVLL
4,138176095,ID1.60LIFESTL3IURBAN(RX),1,CRISTALES,HOYA,HOYALUX ID LIFESTYLE 3-I HVLL


In [105]:
#Split dataframe into countries
data_germany = data[data['country_name'] == 0]
data_spain = data[data['country_name'] == 1]
data_spain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2270 entries, 0 to 3548
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   item_id         2270 non-null   int64 
 1   main_text       2270 non-null   object
 2   country_name    2270 non-null   int32 
 3   retailer_pg     2270 non-null   object
 4   mdm_brand_text  2270 non-null   object
 5   mdm_model_text  2270 non-null   object
dtypes: int32(1), int64(1), object(4)
memory usage: 115.3+ KB


In [106]:
#Get counts for item id
data_spain['item_id'].value_counts()

82981040     290
138176095    226
87099837     116
88210952      71
79429232      50
            ... 
98426893       3
117860560      3
80508911       2
119539790      1
72145121       1
Name: item_id, Length: 128, dtype: int64

In [107]:
#Delete columns not needed for model
data_spain.drop(['mdm_model_text', 'country_name'], axis = 'columns', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [108]:
data_spain.head()

Unnamed: 0,item_id,main_text,retailer_pg,mdm_brand_text
0,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA
1,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA
2,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA
3,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA
4,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA


In [109]:
## rename columns
data_spain = data_spain.rename(columns={"mdm_brand_text":"y", "main_text":"text", "retailer_pg":"pg"})
data_spain

Unnamed: 0,item_id,text,pg,y
0,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA
1,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA
2,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA
3,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA
4,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA
...,...,...,...,...
3544,25278162,CRISTALES 34212 MI MF XPERIENCE 1.5 BASIC HV P...,CRISTALES,HOYA
3545,115519055,CRISTALES 34186 MI HILUX 1.6 SHV FAB MONOFOCAL...,CRISTALES,HOYA
3546,112904161,CRISTALES 34317 NULUX ACTIVE TF 1.5 HV ORGÁNIC...,CRISTALES,HOYA
3547,138176095,CRISTALES 34244 MI LIFESTYLE 3I 1.5 HVLL BC PR...,CRISTALES,HOYA


In [110]:
lst_stopwords = nltk.corpus.stopwords.words("english")
lst_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [111]:
'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text
'''
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [112]:
'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text
'''
def preprocess_pg(pg, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (remove punctuations and characters and then strip)
    pg = re.sub(r'[^\w\s]', '', str(pg).strip())
            
    ## Tokenize (convert from string to list)
    lst_pg = pg.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_pg = [word for word in lst_pg if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_pg = [ps.stem(word) for word in lst_pg]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_pg = [lem.lemmatize(word) for word in lst_pg]
            
    ## back to string from list
    pg = " ".join(lst_pg)
    return pg

In [113]:
data_spain["text_clean"] = data_spain["text"].apply(lambda x: 
          preprocess_text(x, flg_stemm=False, flg_lemm=True, 
          lst_stopwords=lst_stopwords))
data_spain.head()

Unnamed: 0,item_id,text,pg,y,text_clean
0,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA,ID160LIFESTL3IURBANRX
1,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA,ID160LIFESTL3IURBANRX
2,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA,ID160LIFESTL3IURBANRX
3,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA,ID160LIFESTL3IURBANRX
4,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA,ID160LIFESTL3IURBANRX


In [114]:
data_spain["pg_clean"] = data_spain["pg"].apply(lambda x: 
          preprocess_pg(x, flg_stemm=False, flg_lemm=True, 
          lst_stopwords=lst_stopwords))
data_spain.head()

Unnamed: 0,item_id,text,pg,y,text_clean,pg_clean
0,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA,ID160LIFESTL3IURBANRX,CRISTALES
1,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA,ID160LIFESTL3IURBANRX,CRISTALES
2,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA,ID160LIFESTL3IURBANRX,CRISTALES
3,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA,ID160LIFESTL3IURBANRX,CRISTALES
4,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,HOYA,ID160LIFESTL3IURBANRX,CRISTALES


In [115]:
mnb = MultinomialNB()
tf = TfidfVectorizer()

X = data_spain.text_clean + data_spain.pg_clean
y = data_spain['item_id']


In [117]:
## split dataset
data_spain_train, data_spain_test = model_selection.train_test_split(data_spain, test_size=0.3)
## get target
y_train = data_spain_train["y"].values
y_test = data_spain_test["y"].values

In [118]:
## Count (classic BoW)
vectorizer = feature_extraction.text.CountVectorizer(max_features=10000, ngram_range=(1,2))

## Tf-Idf (advanced variant of BoW)
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))

In [119]:
texts = data_spain_train["text_clean"]
vectorizer.fit(texts)
X_train = vectorizer.transform(texts)
dic_vocabulary = vectorizer.vocabulary_

In [120]:
vectorizer.get_feature_names()

['000',
 '000 0000',
 '000 5º',
 '000 ess',
 '000 loa',
 '0000',
 '0000 0000',
 '0000 0050',
 '0000 0075',
 '0000 0100',
 '0000 0125',
 '0000 0150',
 '0000 0175',
 '0000 0200',
 '0000 0225',
 '0000 0250',
 '0000 0275',
 '0000 0300',
 '0000 0325',
 '0000 0350',
 '0000 0375',
 '0000 0400',
 '0000 0750',
 '0000 200',
 '0000 2250',
 '0000 275',
 '0000 300',
 '0000 airwear',
 '0000 varilux',
 '0002',
 '000202',
 '000202 masterlux',
 '001101',
 '001101 hifin',
 '001102',
 '001102 hifin',
 '0016',
 '0025',
 '0025 0050',
 '0025 0075',
 '0025 0100',
 '0025 0125',
 '0025 0200',
 '0025 0225',
 '0025 0275',
 '0025 0300',
 '0025 0350',
 '0025 275',
 '0025 airwear',
 '0025 ormix',
 '004',
 '004 0125',
 '005',
 '005 0075',
 '005 0125',
 '005 0225',
 '005 0325',
 '0050',
 '0050 0000',
 '0050 0025',
 '0050 0050',
 '0050 0075',
 '0050 0100',
 '0050 0150',
 '0050 0175',
 '0050 0200',
 '0050 0225',
 '0050 0275',
 '0050 0300',
 '0050 0375',
 '0050 0400',
 '0050 0475',
 '0050 0750',
 '0050 125',
 '0050 250'

In [121]:
vectorizer.vocabulary_

{'798540': 1821,
 'varilux': 4881,
 '4d': 1403,
 'stylis': 4570,
 '167': 782,
 'eye': 2832,
 'protect': 4298,
 'system': 4664,
 'ojo': 3884,
 'der': 2495,
 'esf': 2678,
 '125': 429,
 'cil': 2143,
 'eje': 2596,
 '165': 773,
 'adic': 1931,
 '225': 1012,
 'diam': 2523,
 '70': 1651,
 '798540 varilux': 1822,
 'varilux 4d': 4882,
 '4d stylis': 1409,
 'stylis 167': 4571,
 '167 eye': 809,
 'eye protect': 2836,
 'protect system': 4301,
 'system ojo': 4670,
 'ojo der': 3885,
 'der esf': 2496,
 'esf 125': 2683,
 '125 cil': 434,
 'cil eje': 2150,
 'eje 165': 2607,
 '165 adic': 776,
 'adic 225': 1933,
 '225 diam': 1019,
 'diam 70': 2531,
 'comfort': 2204,
 'ne': 3794,
 'orma': 3969,
 '15': 501,
 'crizal': 2428,
 'sun': 4576,
 'uv': 4838,
 'marronc': 3629,
 'varilux comfort': 4884,
 'comfort ne': 2211,
 'ne orma': 3796,
 'orma 15': 3970,
 '15 crizal': 523,
 'crizal sun': 2443,
 'sun uv': 4591,
 'uv marronc': 4854,
 '518100': 1422,
 'ormix': 3987,
 '16': 617,
 'prevencia': 4215,
 '35': 1233,
 '95': 1

In [122]:
array_of_feature = X_train.toarray()
array_of_feature

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [123]:
y = data_spain_train["y"]
X_names = vectorizer.get_feature_names()
p_value_limit = 0.95
data_spain_features = pd.DataFrame()
for cat in np.unique(y):
    chi2, p = feature_selection.chi2(X_train, y==cat)
    data_spain_features = data_spain_features.append(pd.DataFrame(
                   {"feature":X_names, "score":1-p, "y":cat}))
    data_spain_features = data_spain_features.sort_values(["y","score"], 
                    ascending=[True,False])
    data_spain_features = data_spain_features[data_spain_features["score"]>p_value_limit]
X_names = data_spain_features["feature"].unique().tolist()

In [124]:
for cat in np.unique(y):
   print("# {}:".format(cat))
   print("  . selected features:",
         len(data_spain_features[data_spain_features["y"]==cat]))
   print("  . top features:", ",".join(
data_spain_features[data_spain_features["y"]==cat]["feature"].values[:10]))
   print(" ")

# AMERICAN OPTIC:
  . selected features: 29
  . top features: 16 hcm,16 photofg,16 photomarron,american,american optical,hcm,hcm zeiss,hmc 6570,hmc 7075,hmcczaiss
 
# BBGR:
  . selected features: 16
  . top features: 17 60,17 70,17 diams,17 stock,60 diams,65 diams,diams,diams stock,lentesbbgrunem,lentesbbgrunem 17
 
# ESSILOR:
  . selected features: 188
  . top features: ormix,crizal,prevencia,ormix 16,crizal prevencia,varilux,16 crizal,eps,orma,essilor
 
# EYEART:
  . selected features: 32
  . top features: 16 ultra,160monofocalorganico0hmcsin,160monofocalorganico0hmcsin color,161,161 hmc,174 super,174monofocalorganico0shmcsin,174monofocalorganico0shmcsin color,2540,2540 orlite
 
# HOYA:
  . selected features: 179
  . top features: 160,eyas,rx,hilux,lifestyle,id160lifestl3iurbanrx,balansis,id,balansis 160,160 rx
 
# INDO:
  . selected features: 224
  . top features: superfin,indosol,indosol basic,basic,durcap,natural10,activa_rvd,basic gris,indofin,indofin 16
 
# KODAK:
  . selected f

In [125]:
classifier = naive_bayes.MultinomialNB()

## pipeline
model = pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifier)])
## train classifier
model["classifier"].fit(X_train, y_train)
## test
X_test = data_spain_test["text_clean"].values
predicted = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)

In [126]:
classes = np.unique(y_test)
y_test_array = pd.get_dummies(y_test, drop_first=False).values

precision = precision_score(y_test, predicted, average='micro', zero_division=0)
print('Precision score: {0:0.2f}'.format(precision))

recall = recall_score(y_test, predicted, average='micro', zero_division=0)
print('Recall score: {0:0.2f}'.format(recall))

f1 = f1_score(y_test, predicted, average='micro', zero_division=0)
print('f1 score: {0:0.2f}'.format(recall))

Precision score: 0.90
Recall score: 0.90
f1 score: 0.90
