In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score, plot_roc_curve, plot_precision_recall_curve, plot_confusion_matrix
import nltk
from nltk import word_tokenize
import re
from sklearn import feature_extraction, feature_selection, model_selection, naive_bayes, pipeline, manifold, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.naive_bayes import MultinomialNB

In [2]:
data = pd.read_excel(r"C:\Users\gog10\GfK_spectacle_lenses_data.xlsx")

In [3]:
#Replace null values in the data
data.loc[data['main_text'].isnull(), 'main_text'] = 'Not Known'
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3549 entries, 0 to 3548
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   item_id         3549 non-null   int64 
 1   main_text       3549 non-null   object
 2   country_name    3549 non-null   object
 3   retailer_pg     3549 non-null   object
 4   mdm_brand_text  3549 non-null   object
 5   mdm_model_text  3549 non-null   object
dtypes: int64(1), object(5)
memory usage: 166.5+ KB


In [536]:
#Get counts for item_id - assess if it can be split into categorical data
data['item_id'].value_counts()

82981040     290
138176095    226
87099837     116
88210952      71
79429232      51
            ... 
81343196       5
122769301      5
124336576      5
119795873      5
64840970       5
Name: item_id, Length: 260, dtype: int64

In [6]:
#Get counts for retailer_pg  - messy data so needs to be processed first
data['retailer_pg'].value_counts()

GLA                                 1278
CRISTALES                            900
LENTES                               304
LENTE                                177
UNASSIGNED                           168
LENTES OFTALMICAS                    124
LENTE|ORGANICA|PROGRESIVA             83
LENTE|ORGANICA|MONOFOCAL              80
LENTES//ORGANICA MONOFOCAL            64
LENTES//ORGANICA PROGRESIVA           40
?253                                  31
LENTES ORGANICA MONOFOCAL             30
SF06                                  30
LENTE|ORGANICA PROGRESIVA             28
LENTE|ORGANICA MONOFOCAL              26
VIDRES                                25
LENTES|ORGANICA MONOFOCAL             24
LENTES OFTÂŒMICAS                     21
LENTES/CRISTALES                      19
LENTES|ORGANICA PROGRESIVA            13
MONOFOCAL                             12
LENTES ORGANICA PROGRESIVA            11
PROGRESIVO                            10
722803 - LENTES PROMOCION              6
LENTES OFTÁLMICA

In [7]:
#Get counts for main_text - messy data so needs to be processed first
data['main_text'].value_counts()

BALANSIS 1.60 (RX)                                                    24
ID1.60LIFESTL3IURBAN(RX)                                              22
HILUX 1.60 EYAS (RX)                                                  13
?ROD  L2LK?  -  ?ROD  SU?                                             12
ZEISS ES SPH 1.6  -  LOTUTEC                                          12
                                                                      ..
?ESS  GFB100?  -  ?ESS  676?                                           1
?EO  GFB540?  -  ?EO  676?                                             1
?ES-  G8E634?  -  ?ES-  675?                                           1
?NIKA 113?  -  ?NIKA 199?                                              1
CRISTALES 34244 MI LIFESTYLE 3I 1.5 BC PROGRESIVO ORGÁNICO HVLL BC     1
Name: main_text, Length: 2815, dtype: int64

In [537]:
#Counts by country
country_count = data['country_name'].value_counts()
country_count

SPAIN      2270
GERMANY    1279
Name: country_name, dtype: int64

In [13]:
#Plot counts by country
sns.set(style="darkgrid")
sns.barplot(x = country_count.index, y = country_count.values, alpha=0.9)
plt.title('Frequency Distribution of Countries')
plt.ylabel('Frequency', fontsize=12)
plt.xlabel('Country', fontsize=12)
plt.show() 

NameError: name 'country_count' is not defined

In [14]:
#Denote country name into numerical categories
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()
categorical_col = ['country_name']
for col in categorical_col:
    data[col] = l.fit_transform(data[col])
data.head()

Unnamed: 0,item_id,main_text,country_name,retailer_pg,mdm_brand_text,mdm_model_text
0,138176095,ID1.60LIFESTL3IURBAN(RX),1,CRISTALES,HOYA,HOYALUX ID LIFESTYLE 3-I HVLL
1,138176095,ID1.60LIFESTL3IURBAN(RX),1,CRISTALES,HOYA,HOYALUX ID LIFESTYLE 3-I HVLL
2,138176095,ID1.60LIFESTL3IURBAN(RX),1,CRISTALES,HOYA,HOYALUX ID LIFESTYLE 3-I HVLL
3,138176095,ID1.60LIFESTL3IURBAN(RX),1,CRISTALES,HOYA,HOYALUX ID LIFESTYLE 3-I HVLL
4,138176095,ID1.60LIFESTL3IURBAN(RX),1,CRISTALES,HOYA,HOYALUX ID LIFESTYLE 3-I HVLL


In [15]:
#Try splitting dataframe into countries
data_germany = data[data['country_name'] == 0]
data_spain = data[data['country_name'] == 1]
data_spain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2270 entries, 0 to 3548
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   item_id         2270 non-null   int64 
 1   main_text       2270 non-null   object
 2   country_name    2270 non-null   int32 
 3   retailer_pg     2270 non-null   object
 4   mdm_brand_text  2270 non-null   object
 5   mdm_model_text  2270 non-null   object
dtypes: int32(1), int64(1), object(4)
memory usage: 115.3+ KB


In [16]:
#Check retailer_pg counts for Spain only
data_spain['retailer_pg'].value_counts()

CRISTALES                           900
LENTES                              304
LENTE                               177
UNASSIGNED                          168
LENTES OFTALMICAS                   124
LENTE|ORGANICA|PROGRESIVA            83
LENTE|ORGANICA|MONOFOCAL             80
LENTES//ORGANICA MONOFOCAL           64
LENTES//ORGANICA PROGRESIVA          40
?253                                 31
LENTES ORGANICA MONOFOCAL            30
SF06                                 30
LENTE|ORGANICA PROGRESIVA            28
LENTE|ORGANICA MONOFOCAL             26
VIDRES                               25
LENTES|ORGANICA MONOFOCAL            24
LENTES OFTÂŒMICAS                    21
LENTES/CRISTALES                     19
LENTES|ORGANICA PROGRESIVA           13
MONOFOCAL                            12
LENTES ORGANICA PROGRESIVA           11
PROGRESIVO                           10
722803 - LENTES PROMOCION             6
LENTE|MONOFOCAL                       5
LENTES//MONOFOCAL//ORGÁNICA           5


In [17]:
#Check retailer_pg counts for Germany only
data_germany['retailer_pg'].value_counts()

GLA    1278
GL        1
Name: retailer_pg, dtype: int64

In [18]:
#Use Global model
#Delete columns not needed for model
data.drop(['country_name', 'mdm_brand_text', 'mdm_model_text'], axis = 'columns', inplace=True)

In [19]:
data.head()

Unnamed: 0,item_id,main_text,retailer_pg
0,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES
1,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES
2,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES
3,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES
4,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES


In [20]:
# rename columns for model
data = data.rename(columns={"item_id":"y", "main_text":"text", "retailer_pg":"pg"})
data

Unnamed: 0,y,text,pg
0,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES
1,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES
2,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES
3,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES
4,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES
...,...,...,...
3544,25278162,CRISTALES 34212 MI MF XPERIENCE 1.5 BASIC HV P...,CRISTALES
3545,115519055,CRISTALES 34186 MI HILUX 1.6 SHV FAB MONOFOCAL...,CRISTALES
3546,112904161,CRISTALES 34317 NULUX ACTIVE TF 1.5 HV ORGÁNIC...,CRISTALES
3547,138176095,CRISTALES 34244 MI LIFESTYLE 3I 1.5 HVLL BC PR...,CRISTALES


In [21]:
#list of stop words from nltk
lst_stopwords = nltk.corpus.stopwords.words("english")
lst_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [22]:
#create a function to remove these stop words, remove certain characters, stem the text and lemmatize the text in main text
'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text
'''
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [23]:
#create a function to remove these stop words, remove certain characters, stem the text and lemmatize the text for pg
'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text
'''
def preprocess_pg(pg, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (remove punctuations and characters and then strip)
    pg = re.sub(r'[^\w\s]', '', str(pg).strip())
            
    ## Tokenize (convert from string to list)
    lst_pg = pg.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_pg = [word for word in lst_pg if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_pg = [ps.stem(word) for word in lst_pg]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_pg = [lem.lemmatize(word) for word in lst_pg]
            
    ## back to string from list
    pg = " ".join(lst_pg)
    return pg

In [24]:
#Apply the function for text
data["text_clean"] = data["text"].apply(lambda x: 
          preprocess_text(x, flg_stemm=False, flg_lemm=True, 
          lst_stopwords=lst_stopwords))

In [25]:
#Apply the function for pg
data["pg_clean"] = data["pg"].apply(lambda x: 
          preprocess_pg(x, flg_stemm=False, flg_lemm=True, 
          lst_stopwords=lst_stopwords))
data.head()

Unnamed: 0,y,text,pg,text_clean,pg_clean
0,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,ID160LIFESTL3IURBANRX,CRISTALES
1,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,ID160LIFESTL3IURBANRX,CRISTALES
2,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,ID160LIFESTL3IURBANRX,CRISTALES
3,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,ID160LIFESTL3IURBANRX,CRISTALES
4,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,ID160LIFESTL3IURBANRX,CRISTALES


In [26]:
#Combine text and pg
data["text_combined"] = data.text_clean + data.pg_clean
y = data['y']
data.head()

Unnamed: 0,y,text,pg,text_clean,pg_clean,text_combined
0,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,ID160LIFESTL3IURBANRX,CRISTALES,ID160LIFESTL3IURBANRXCRISTALES
1,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,ID160LIFESTL3IURBANRX,CRISTALES,ID160LIFESTL3IURBANRXCRISTALES
2,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,ID160LIFESTL3IURBANRX,CRISTALES,ID160LIFESTL3IURBANRXCRISTALES
3,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,ID160LIFESTL3IURBANRX,CRISTALES,ID160LIFESTL3IURBANRXCRISTALES
4,138176095,ID1.60LIFESTL3IURBAN(RX),CRISTALES,ID160LIFESTL3IURBANRX,CRISTALES,ID160LIFESTL3IURBANRXCRISTALES


In [27]:
# split dataset
data_train, data_test = model_selection.train_test_split(data, test_size=0.3)
# get target
y_train = data_train["y"].values
y_test = data_test["y"].values

In [28]:
#TfIdf, fit and transform, use SelectKBest
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=None, ngram_range=(1,1))
texts = data_train["text_combined"]
x_train = vectorizer.fit_transform(texts)
selector = SelectKBest(chi2, k=min(50, x_train.shape[1]))
selector.fit(x_train, y_train)
x_train = selector.transform(x_train)
dic_vocabulary = vectorizer.vocabulary_

In [29]:
dic_vocabulary

{'zeiss': 2180,
 'officelens': 1640,
 'plus': 1749,
 'room': 1892,
 '153': 175,
 'blueprotect': 908,
 '6570': 609,
 '0unassigned': 77,
 'cristales': 980,
 '11413': 109,
 'estelux': 1145,
 '16': 185,
 'hmccristales': 1282,
 'ormix': 1674,
 'crizal': 986,
 'prevencia': 1790,
 'estoc': 1146,
 '75075275lente': 729,
 'rod': 1873,
 'l2lk': 1423,
 'sugla': 2015,
 'varilux': 2121,
 'comfort': 962,
 '3o': 420,
 'sun': 2017,
 'color': 954,
 'cris': 978,
 'verde': 2124,
 'clentes': 944,
 'ro': 1871,
 'i3iky2': 1341,
 'eyezen': 1166,
 'orma': 1672,
 '15': 167,
 'rx': 1900,
 'forte': 1186,
 'uv': 2111,
 'sin': 1968,
 'epsmonofocal': 1112,
 'rh': 1862,
 '1162': 115,
 '090gla': 71,
 'einst': 1088,
 'f360': 1169,
 'lineis': 1467,
 'ese': 1123,
 '675gla': 629,
 '98': 810,
 'eps': 1104,
 'active': 818,
 '085': 68,
 'eyecode': 1164,
 'prevenciacristales': 1791,
 'trans': 2076,
 '75000050': 697,
 'grislentes': 1238,
 '350': 406,
 'ess': 1140,
 'stockcristales': 2004,
 'lente': 1433,
 'essilor': 1142,
 'wo

In [30]:
n = x_train.size
n

377

In [31]:
classifier = naive_bayes.MultinomialNB()

model = pipeline.Pipeline([
            ("vectorizer",TfidfVectorizer()),
            ("selector",SelectKBest(chi2, k=50)),
            ("classifier",classifier)])

In [32]:
# train classifier
model["classifier"].fit(vectorizer, y_train)

ValueError: Expected 2D array, got scalar array instead:
array=TfidfVectorizer().
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [33]:
# test
X_test = data_test["text_combined"].values

In [486]:
predicted = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)

ValueError: could not convert string to float: '804307 VARILUX COMFORT 30 ORMIX 16 CRIZAL SUN UV PHYSIOTINTS GRIS C OJO IZQ ESF 125 CIL 15 EJE 90 ADIC 25 DIAM 6570LENTESORGANICA PROGRESIVA'

In [402]:
#Evaluate model
classes = np.unique(y_test)
y_test_array = pd.get_dummies(y_test, drop_first=False).values

precision = precision_score(y_test, predicted, average='micro', zero_division=0)
print('Precision score: {0:0.2f}'.format(precision))

recall = recall_score(y_test, predicted, average='micro', zero_division=0)
print('Recall score: {0:0.2f}'.format(recall))

f1 = f1_score(y_test, predicted, average='micro', zero_division=0)
print('f1 score: {0:0.2f}'.format(recall))

Precision score: 0.07
Recall score: 0.07
f1 score: 0.07
