# Advanced ML Models

# 1)- Import key modules

In [1]:
# support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import numpy as np 
import pandas as pd 
import string 

#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [3]:
#models and evaluation

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from nltk.classify.scikitlearn import SklearnClassifier # notice its from ntlk not sklearn
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
# Evaluation packages
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [4]:
#pip install version_information
%reload_ext version_information
%version_information pandas,numpy, nltk, seaborn, matplotlib

Software,Version
Python,3.7.7 64bit [MSC v.1916 64 bit (AMD64)]
IPython,7.13.0
OS,Windows 10 10.0.17763 SP0
pandas,1.0.3
numpy,1.18.1
nltk,3.5
seaborn,0.10.1
matplotlib,3.1.3
Fri Jun 26 17:31:42 2020 W. Europe Daylight Time,Fri Jun 26 17:31:42 2020 W. Europe Daylight Time


# 2)- Loading Data

In [5]:
data=pd.read_excel('clean_3655_eng.xlsx')
data=data.rename(columns={'Unnamed: 0':'random_columns'}) # a trick to tackle random index values
data.shape

(3655, 5)

In [6]:
data.head(2)

Unnamed: 0,random_columns,clean,firstmessage,dep,firstusedtextblock
0,0,helloi tri appli voucher order receiv mail did...,Hello:<br><br>I tried to apply a voucher to th...,Shipping issues,nichtkombiwb
1,1,wow wow wow im love acryl cover pro photo book...,WOW WOW WOW! I'm so in love with my acrylic co...,Customer feedback,feedback


### Keeping response of Bot as target variable

In [7]:
# select all samples that are above 100 atleast
#counts=data['firstusedtextblock'].value_counts()
#df = data.loc[data['firstusedtextblock'].isin(counts.index[counts > 30])]
#f.shape

In [8]:
#df.firstusedtextblock.value_counts()

# 3)- Vectorization

- word2vec

In [9]:
#filling any clean values in data with other

df=data.fillna('Other')

In [10]:
df.isnull().sum()

random_columns        0
clean                 0
firstmessage          0
dep                   0
firstusedtextblock    0
dtype: int64

In [11]:
features=df['clean']
labels=df['dep']
print(features.shape)
print(labels.shape)

(3655,)
(3655,)


### Word2Vec Embedding

In [12]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import gensim
from tqdm import tqdm 
tqdm.pandas(desc="progress-bar") 
from gensim.models.doc2vec import TaggedDocument

In [13]:
tokenized_text = df['clean'].apply(lambda x: x.split()) # tokenizing

model_w2v = gensim.models.Word2Vec(
            tokenized_text,
            size=200, # desired no. of features/independent variables
            window=5, # context window size
            min_count=2,
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling i.e class with other types
            workers= 2, # no.of cores
            seed = 34) 

model_w2v.train(tokenized_text, total_examples= len(data['clean']), epochs=20)

(2167305, 2636780)

In [14]:
model_w2v.wv['saal']

array([ 7.11725056e-02,  1.97871223e-01,  1.90359533e-01, -4.12314286e-04,
        1.00462802e-01,  4.86835778e-01, -2.36125439e-01,  5.76793075e-01,
        1.99946359e-01,  2.42573321e-01, -2.06356317e-01,  4.33460660e-02,
        5.68535626e-01, -3.41432661e-01,  2.69370228e-01, -1.28365248e-01,
       -8.93084258e-02,  1.70563430e-01,  9.45942923e-02, -3.79822522e-01,
       -5.22737861e-01, -6.56142011e-02,  3.63920063e-01,  1.65488988e-01,
        2.51693487e-01,  1.05079897e-01, -1.57427803e-01,  2.17064142e-01,
       -1.92808993e-02,  5.02826497e-02, -5.31077087e-02,  5.12462795e-01,
        1.20442227e-01,  2.29578599e-01, -4.27521877e-02, -2.95929909e-01,
       -4.13798213e-01,  3.30416918e-01, -1.03047132e-01,  3.23975295e-01,
       -3.40579063e-01,  8.01496208e-02, -2.20354170e-01,  1.92271695e-01,
        2.25173414e-01,  5.27483150e-02, -2.83559501e-01,  1.33370250e-01,
        3.67361195e-02, -1.39613226e-01,  1.40060801e-02,  2.29654491e-01,
       -9.43983197e-02,  

In [15]:
len(model_w2v.wv['saal'])

200

In [16]:
type(model_w2v)

gensim.models.word2vec.Word2Vec

##### 3.1.Preparing Vectors for text data

In [17]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size))
            count += 1.
        except KeyError: # handling the case where the token is not in vocabulary           
            continue
    if count != 0:
        vec /= count
    return vec

##### 3.2.Preparing word2vec feature set

In [18]:
wordvec_arrays = np.zeros((len(tokenized_text), 200)) 
for i in range(len(tokenized_text)):
    wordvec_arrays[i,:] = word_vector(tokenized_text[i], 200)
    wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.052964,-0.3091,-0.156003,-0.273249,0.180147,0.022185,-0.1256,0.005796,-0.111066,0.255735,...,0.220305,0.09524,0.212674,-0.300422,0.14542,0.08909,0.127892,0.057989,0.103869,0.001788
1,0.126613,-0.115837,-0.098505,-0.132175,0.183805,0.117226,-0.113368,0.084798,0.042672,0.091176,...,0.203327,0.095713,0.129161,-0.267642,0.087197,0.010871,-0.159869,0.003409,-0.067836,0.049211
2,0.046056,-0.161739,-0.398194,-0.153053,0.323608,-0.052838,0.022553,-0.090688,0.06887,0.30539,...,0.081925,0.230162,-0.013826,-0.078384,0.098563,0.039796,0.085175,0.066815,-0.005773,-0.118223
3,0.049999,-0.100877,-0.135427,-0.074134,0.218544,-0.098632,0.04904,0.093452,0.07791,0.136556,...,0.119776,0.20937,0.012688,-0.385181,0.136862,0.140701,-0.002442,-0.02557,0.099062,0.059733
4,0.016818,-0.233915,-0.09078,-0.109749,0.140512,0.111652,-0.130679,-0.117094,0.028222,0.274613,...,0.304981,-0.04572,0.102768,-0.271182,0.29581,0.187507,0.195698,0.078559,-0.189366,0.055345


In [19]:
wordvec_df.shape

(3655, 200)

# 4)- Model

In [20]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score, accuracy_score, roc_curve,roc_auc_score,confusion_matrix, classification_report

In [21]:
X=wordvec_df
y=df['dep']
print(X.shape)
print(y.shape)

(3655, 200)
(3655,)


In [22]:
# splitting data into training and validation set
xtrain_word2vec, xvalid_word2vec, ytrain, yvalid = train_test_split(X, y,random_state=42,test_size=0.2)

In [23]:
print(xtrain_word2vec.shape)
print(xvalid_word2vec.shape)
print(ytrain.shape)
print(yvalid.shape)

(2924, 200)
(731, 200)
(2924,)
(731,)


### XGBoost
Extreme Gradient Boosting (xgboost) is an advanced implementation of gradient boosting algorithm. It has both linear model solver and tree learning algorithms. Its ability to do parallel computation on a single machine makes it extremely fast. It also has additional features for doing cross validation and finding important variables. There are many parameters which need to be controlled to optimize the model.

Some key benefits of XGBoost are:

Regularization - helps in reducing overfitting
Parallel Processing - XGBoost implements parallel processing and is blazingly faster as compared to GBM.
Handling Missing Values - It has an in-built routine to handle missing values.
Built-in Cross-Validation - allows user to run a cross-validation at each iteration of the boosting process

**Notice there is no sklearn ready made model therefore; I needed to use XGBoost from its main librrary**

In [24]:
from xgboost import XGBClassifier

##### XGBoost using word2vecfeatures

In [25]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_word2vec, ytrain)
prediction = xgb_model.predict_proba(xvalid_word2vec)
prediction_class = xgb_model.predict(xvalid_word2vec)
accuracy_score(yvalid, prediction_class)

0.6060191518467852

## Saving model

In [26]:
import pickle
filename = 'word2vec_xgb_model.sav'
pickle.dump(xgb_model, open(filename, 'wb'))

In [27]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

In [28]:
prediction = loaded_model.predict_proba(xvalid_word2vec)
prediction_class = loaded_model.predict(xvalid_word2vec)

In [29]:
accuracy_score(yvalid, prediction_class)

0.6060191518467852

In [30]:
print(classification_report(yvalid, prediction_class))

                                                      precision    recall  f1-score   support

                                   Customer feedback       0.64      0.59      0.61        78
                       Data protection (Datenschutz)       1.00      0.75      0.86         4
                                   Discovery voucher       0.00      0.00      0.00         4
                                           Marketing       0.73      0.48      0.58        23
                                    Order management       0.65      0.84      0.73       301
                                 Payment (Bezahlung)       0.00      0.00      0.00        12
                                   Product (Produkt)       0.67      0.21      0.32        19
                                   Production delays       1.00      0.11      0.20         9
                    Professional area (Profibereich)       0.71      0.29      0.42        17
                                   Reseller workflow       

In [31]:
print(labels.nunique())
print(yvalid.nunique())
print(ytrain.nunique())

22
18
22


Other values are also very consistent.

- accuracy = 57.8%
- precision = 58%
- recall = 58%
- f-score = 55%
- (test samples=731)
- No. of classes in test data = 18
- No. of classes in train data = 22
- Total Classes = 22

### api

- key: text
- output1: class prediction
- output2: probability of each class (higher is better)

In [32]:
texts=["I was asked to test a saal photobook and I was so delighted with the result! It arrived with in 10 days and was of such high quality, with a white leather look cover and an acrylic glass to protect the front photo. It has made a lovely lockdown gift for my best friend."]

In [33]:
wordvec_arrays = np.zeros((len(texts), 200)) 
for i in range(len(texts)):
    wordvec_arrays[i,:] = word_vector(texts[i], 200)
    texts_df = pd.DataFrame(wordvec_arrays)
texts_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.130298,-0.10718,-0.263207,-0.112456,0.150111,-0.073012,-0.06689,-0.129732,-0.109085,0.201318,...,0.239544,0.044957,0.216894,-0.271231,0.088904,0.121324,0.052762,0.001223,0.115125,0.033772


In [34]:
prediction = loaded_model.predict_proba(texts_df)
prediction_class = loaded_model.predict(texts_df)

In [35]:
print('"{}"'.format(texts))
print("  - Predicted as: '{}'".format(prediction_class))
print("")

"['I was asked to test a saal photobook and I was so delighted with the result! It arrived with in 10 days and was of such high quality, with a white leather look cover and an acrylic glass to protect the front photo. It has made a lovely lockdown gift for my best friend.']"
  - Predicted as: '['Order management']'



In [36]:
pd.DataFrame(loaded_model.predict_proba(texts_df), columns=loaded_model.classes_)

Unnamed: 0,Customer feedback,Data protection (Datenschutz),Discovery voucher,Inkasso,Marketing,Musterbuch,Order management,Payment (Bezahlung),Product (Produkt),Production delays,...,Reseller workflow,Rücksendung,Samplebook-ProLine,ShareWithSaal,Shipping issues,Software/Webshop/App,Special conditions,product complaints - colours (Reklamation Farben),product complaints - products (Reklamation Produkte),product complaints - software (Reklamation Software)
0,0.000286,0.038072,0.001403,0.001161,0.007178,0.001681,0.924992,0.000775,0.00162,0.000813,...,0.001763,0.006769,0.000461,0.000994,0.00454,0.000512,0.002329,0.000745,0.000328,0.001511


# END OF NOTEBOOK CODE