In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style("dark")
plt.style.use('ggplot')

from sklearn.metrics import classification_report

  import pandas.util.testing as tm


## 1. Load data from train

In [3]:
from sqlalchemy import create_engine

engine = create_engine('sqlite:///../data/data.db', echo=True)
conn = engine.connect()

df = pd.read_sql("SELECT * FROM train",conn)

2021-04-25 07:27:22,666 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2021-04-25 07:27:22,667 INFO sqlalchemy.engine.base.Engine ()
2021-04-25 07:27:22,668 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2021-04-25 07:27:22,668 INFO sqlalchemy.engine.base.Engine ()
2021-04-25 07:27:22,670 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("SELECT * FROM train")
2021-04-25 07:27:22,670 INFO sqlalchemy.engine.base.Engine ()
2021-04-25 07:27:22,671 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("SELECT * FROM train")
2021-04-25 07:27:22,672 INFO sqlalchemy.engine.base.Engine ()
2021-04-25 07:27:22,672 INFO sqlalchemy.engine.base.Engine SELECT * FROM train
2021-04-25 07:27:22,673 INFO sqlalchemy.engine.base.Engine ()


In [4]:
df.shape

(5698, 3)

In [5]:
df.label.value_counts()

824.company-brand.general-satisfaction.0                   2682
824.account-management.account-access.0                     847
824.company-brand.convenience.0                             799
824.online-experience.updates-versions.0                    565
824.company-brand.competitor.0                              520
824.account-management.fingerprint-facial-recognition.0     199
824.staff-support.agent-named.0                              32
824.staff-support.email.0                                    17
824.online-experience.language.0                             13
824.purchase-booking-experience.choice-variety.0             11
824.logistics-rides.speed.0                                   6
824.attributes.size-fit.0                                     3
824.logistics-rides.order-accuracy.0                          2
824.attributes.cleanliness.0                                  1
824.attributes.taste-flavour.0                                1
Name: label, dtype: int64

In [6]:
## remove the label < 20 posts
df = df[~df['label'].isin([
                      '824.staff-support.email.0',
                      '824.online-experience.language.0',
                      '824.purchase-booking-experience.choice-variety.0',
                      '824.logistics-rides.speed.0',
                      '824.attributes.size-fit.0',
                      '824.logistics-rides.order-accuracy.0',
                      '824.attributes.taste-flavour.0',
                      '824.attributes.cleanliness.0'])]

In [7]:
label_df = pd.merge(df['comment'],pd.get_dummies(df['label']),left_index=True, right_index=True)

In [8]:
print("Label number:", len(label_df.drop('comment',axis=1).columns))

Label number: 7


In [9]:
## Clean text

In [10]:
import re
def remove_punctuation_marks(text):
    return re.sub(r'[^\w\s]'," ",text)

In [12]:
label_df['clean_comment'] = label_df['comment'].apply(remove_punctuation_marks)

In [13]:
label_df['clean_comment'] = label_df['clean_comment'].map(lambda x: ' '.join(i for i in x.split() if len(i)>1))

## 2. train-val split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train, val = train_test_split(label_df,test_size=0.1,random_state=101)

In [16]:
train.shape, val.shape

((5079, 9), (565, 9))

## 3. Model Comparison

### 3.1 Baseline model: Logistic regression

In [17]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [23]:
model_base = Pipeline([
    ('tfidf',TfidfVectorizer(ngram_range=(1, 2),max_df=0.9, min_df=5,stop_words='english',lowercase=True)),
#     ('clf',MultiOutputClassifier(estimator=LogisticRegression()))
    ('clf',MultiOutputClassifier(estimator=LogisticRegression(class_weight='balanced'))) ## added class weight as the labels are imbalanced. ##
])

In [24]:
model_base.fit(train['clean_comment'],train.drop(['clean_comment','comment'],axis=1).values)
prediction = model_base.predict(val['clean_comment'])

In [22]:
print("Unweighted output:\n", classification_report(val.drop(['clean_comment','comment'],axis=1),
                                                    prediction,
                                                    target_names=val.drop(['clean_comment','comment'],axis=1).columns)
     )

Unweighted output:
                                                          precision    recall  f1-score   support

                824.account-management.account-access.0       0.78      0.31      0.44        95
824.account-management.fingerprint-facial-recognition.0       1.00      0.07      0.13        14
                         824.company-brand.competitor.0       0.67      0.21      0.31        68
                        824.company-brand.convenience.0       0.62      0.32      0.42        72
               824.company-brand.general-satisfaction.0       0.74      0.82      0.78       269
               824.online-experience.updates-versions.0       0.86      0.41      0.55        44
                        824.staff-support.agent-named.0       0.00      0.00      0.00         3

                                              micro avg       0.74      0.54      0.62       565
                                              macro avg       0.67      0.30      0.38       565
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
print("Weighted output:\n",classification_report(val.drop(['clean_comment','comment'],axis=1),
                                                 prediction,
                                                 target_names=val.drop(['clean_comment','comment'],axis=1).columns)
     )

Weighted output:
                                                          precision    recall  f1-score   support

                824.account-management.account-access.0       0.61      0.73      0.66        95
824.account-management.fingerprint-facial-recognition.0       0.39      0.86      0.53        14
                         824.company-brand.competitor.0       0.56      0.79      0.66        68
                        824.company-brand.convenience.0       0.52      0.74      0.61        72
               824.company-brand.general-satisfaction.0       0.74      0.83      0.78       269
               824.online-experience.updates-versions.0       0.69      0.95      0.80        44
                        824.staff-support.agent-named.0       0.17      0.67      0.27         3

                                              micro avg       0.63      0.81      0.71       565
                                              macro avg       0.52      0.80      0.62       565
          

  _warn_prf(average, modifier, msg_start, len(result))


## 4. Save models

In [26]:
import joblib

In [27]:
joblib.dump(model_base,"../models/model_base.pkl")

['../models/model_base.pkl']