In [378]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sklearn
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score, plot_roc_curve, plot_precision_recall_curve, plot_confusion_matrix
import nltk
from nltk import word_tokenize
import re
from sklearn import feature_extraction, feature_selection, model_selection, naive_bayes, pipeline, manifold, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [379]:
data = pd.read_excel(r"C:\Users\gog10\GfK_spectacle_lenses_data.xlsx")

In [380]:
#Replace null values in the data
data.loc[data['main_text'].isnull(), 'main_text'] = 'Not Known'
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3549 entries, 0 to 3548
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   item_id         3549 non-null   int64 
 1   main_text       3549 non-null   object
 2   country_name    3549 non-null   object
 3   retailer_pg     3549 non-null   object
 4   mdm_brand_text  3549 non-null   object
 5   mdm_model_text  3549 non-null   object
dtypes: int64(1), object(5)
memory usage: 166.5+ KB


In [381]:
#Get counts for brand text
data['mdm_brand_text'].value_counts()

ESSILOR           1266
HOYA               651
ZEISS              421
INDO               292
RODENSTOCK         279
SYNCHRONY          164
RUPP & HUBRACH     127
MAILSHOP            81
PRATS               57
SHAMIR              42
MEDICAL VISION      30
NIKA                29
KODAK               19
SEIKO               15
NIKON               11
RAY BAN             10
EYEART              10
TOKAI                9
WETZLICH             9
AMERICAN OPTIC       7
L.O.A.               7
BBGR                 7
VISALL               6
Name: mdm_brand_text, dtype: int64

In [382]:
#Delete columns not needed for model
data.drop(['item_id','country_name', 'retailer_pg', 'mdm_model_text'], axis = 'columns', inplace=True)

In [383]:
data.head()

Unnamed: 0,main_text,mdm_brand_text
0,ID1.60LIFESTL3IURBAN(RX),HOYA
1,ID1.60LIFESTL3IURBAN(RX),HOYA
2,ID1.60LIFESTL3IURBAN(RX),HOYA
3,ID1.60LIFESTL3IURBAN(RX),HOYA
4,ID1.60LIFESTL3IURBAN(RX),HOYA


In [384]:
# rename columns for model
data = data.rename(columns={"mdm_brand_text":"y", "main_text":"text"})
data

Unnamed: 0,text,y
0,ID1.60LIFESTL3IURBAN(RX),HOYA
1,ID1.60LIFESTL3IURBAN(RX),HOYA
2,ID1.60LIFESTL3IURBAN(RX),HOYA
3,ID1.60LIFESTL3IURBAN(RX),HOYA
4,ID1.60LIFESTL3IURBAN(RX),HOYA
...,...,...
3544,CRISTALES 34212 MI MF XPERIENCE 1.5 BASIC HV P...,HOYA
3545,CRISTALES 34186 MI HILUX 1.6 SHV FAB MONOFOCAL...,HOYA
3546,CRISTALES 34317 NULUX ACTIVE TF 1.5 HV ORGÁNIC...,HOYA
3547,CRISTALES 34244 MI LIFESTYLE 3I 1.5 HVLL BC PR...,HOYA


In [385]:
#list of stop words from nltk
lst_stopwords = nltk.corpus.stopwords.words("english")
lst_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [386]:
#create a function to remove these stop words, remove certain characters, stem the text and lemmatize the text
'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text
'''
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [387]:
'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text
'''
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [388]:
#Apply the function 
data["text_clean"] = data["text"].apply(lambda x: 
          preprocess_text(x, flg_stemm=False, flg_lemm=True, 
          lst_stopwords=lst_stopwords))
data.head()

Unnamed: 0,text,y,text_clean
0,ID1.60LIFESTL3IURBAN(RX),HOYA,ID160LIFESTL3IURBANRX
1,ID1.60LIFESTL3IURBAN(RX),HOYA,ID160LIFESTL3IURBANRX
2,ID1.60LIFESTL3IURBAN(RX),HOYA,ID160LIFESTL3IURBANRX
3,ID1.60LIFESTL3IURBAN(RX),HOYA,ID160LIFESTL3IURBANRX
4,ID1.60LIFESTL3IURBAN(RX),HOYA,ID160LIFESTL3IURBANRX


In [389]:
## split dataset
data_train, data_test = model_selection.train_test_split(data, test_size=0.3)
## get target
y_train = data_train["y"].values
y_test = data_test["y"].values

In [390]:
## Tf-Idf (advanced variant of BoW)
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=1000, ngram_range=(1,2))

In [392]:
texts = data_train["text_clean"]
vectorizer.fit(texts)
X_train = vectorizer.transform(texts)

dic_vocabulary = vectorizer.vocabulary_

In [401]:
from sklearn.feature_selection import SelectKBest

SelectKBest(k=50).fit_transform(X_train, y_train)

<2484x50 sparse matrix of type '<class 'numpy.float64'>'
	with 1634 stored elements in Compressed Sparse Row format>

In [402]:
feature_names = vectorizer.get_feature_names()
feature_names

['000',
 '000 0000',
 '0000',
 '0016',
 '0025',
 '0050',
 '0075',
 '010',
 '0100',
 '0125',
 '0150',
 '0200',
 '0225',
 '023',
 '025',
 '025 cil',
 '025 ess',
 '0250',
 '0275',
 '0300',
 '0325',
 '0350',
 '0375',
 '04',
 '04 orma',
 '045',
 '05',
 '05 eje',
 '050',
 '06',
 '06 ormix',
 '060',
 '075',
 '075 eje',
 '075 ess',
 '085',
 '085 f360',
 '085 stylis',
 '088',
 '088 150',
 '090',
 '095',
 '0a',
 '10',
 '100',
 '100 adic',
 '100 ess',
 '10042',
 '10045',
 '10053',
 '10053 duravision',
 '104',
 '105',
 '10679',
 '10680',
 '10680 cz',
 '11',
 '1127',
 '1127 rh',
 '1162',
 '1162 rh',
 '125',
 '125 eje',
 '125 ess',
 '13103',
 '13370',
 '13435',
 '13501',
 '13881',
 '14',
 '145205',
 '15',
 '15 algénero',
 '15 blanco',
 '15 cil',
 '15 color',
 '15 crizal',
 '15 duravision',
 '15 eje',
 '15 eps',
 '15 gris',
 '15 organico',
 '15 short',
 '15 sol',
 '15 tint',
 '15 xperio',
 '150',
 '150 ess',
 '150 trueform',
 '1523',
 '16',
 '16 6570',
 '16 active',
 '16 as',
 '16 color',
 '16 crizal

In [403]:
vectorizer.vocabulary_

{'160': 113,
 'rod': 822,
 'su': 892,
 '160 rod': 118,
 'rod su': 829,
 '2ro': 180,
 'i3gk': 551,
 'nu': 699,
 '2ro nu': 181,
 'essilor': 452,
 'varilux': 955,
 'liberty': 614,
 'short': 853,
 'ormix': 730,
 '16': 90,
 'trans': 929,
 '6570': 231,
 'essilor varilux': 459,
 'varilux liberty': 958,
 'liberty short': 617,
 'short ormix': 858,
 'ormix 16': 731,
 '16 trans': 111,
 'superfin': 902,
 'indopolar': 580,
 'gris': 504,
 'nat10': 678,
 '70': 243,
 'tratnatural10': 934,
 'cara': 325,
 'base': 307,
 'superfin indopolar': 903,
 '7sy': 260,
 '13103': 64,
 'zeiss': 989,
 'gs': 508,
 'individual': 572,
 '15': 71,
 'duravision': 400,
 'platinum': 758,
 'zeiss gs': 991,
 'gs individual': 509,
 '15 duravision': 77,
 'duravision platinum': 401,
 'lifestyle': 622,
 '167': 123,
 'super': 898,
 'hv': 547,
 'hoya': 537,
 'rh': 807,
 '1162': 59,
 '090': 40,
 'rh 1162': 812,
 '1162 rh': 60,
 'rh 090': 809,
 'es': 427,
 'sph': 874,
 'bp': 319,
 'zeiss es': 990,
 'es sph': 430,
 'sph 16': 875,
 'mod

In [404]:
array_of_feature = X_train.toarray()
array_of_feature

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [405]:
classifier = naive_bayes.MultinomialNB()

## pipeline
model = pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifier)])
## train classifier
model["classifier"].fit(X_train, y_train)
## test
X_test = data_test["text_clean"].values
predicted = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)

In [406]:
classes = np.unique(y_test)
y_test_array = pd.get_dummies(y_test, drop_first=False).values

precision = precision_score(y_test, predicted, average='micro', zero_division=0)
print('Precision score: {0:0.2f}'.format(precision))

recall = recall_score(y_test, predicted, average='micro', zero_division=0)
print('Recall score: {0:0.2f}'.format(recall))

f1 = f1_score(y_test, predicted, average='micro', zero_division=0)
print('f1 score: {0:0.2f}'.format(recall))

Precision score: 0.92
Recall score: 0.92
f1 score: 0.92


In [407]:
data_reduced

<2484x50 sparse matrix of type '<class 'numpy.float64'>'
	with 1634 stored elements in Compressed Sparse Row format>

In [408]:
X_train

<2484x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 19563 stored elements in Compressed Sparse Row format>