In [1]:
import os
os.chdir('..')
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.utils import Bunch
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from aspectsent.metric import ClassificationReport

%load_ext autoreload 
%autoreload 2

# Linear SVM as baselines for Aspect detection and polarity detection 

## Read and explore

In [2]:
df_train = pd.read_csv('./Restaurants/sub_task_1/ABSA16_Restaurants_Train_SB1_v2.csv')
df_test = pd.read_csv('./Restaurants/sub_task_1/EN_REST_SB1_TEST_gold.csv')
df_train['category'] = df_train['category'].str.split('#').str[0]
df_test['category'] = df_test['category'].str.split('#').str[0]
df_train.head()

Unnamed: 0,rid,sid,sentence,category,target,from,to,polarity
0,1004293,1004293:0,Judging from previous posts this used to be a ...,RESTAURANT,place,51,56,negative
1,1004293,1004293:1,"We, there were four of us, arrived at noon - t...",SERVICE,staff,75,80,negative
2,1004293,1004293:2,"They never brought us complimentary noodles, i...",SERVICE,,0,0,negative
3,1004293,1004293:3,The food was lousy - too sweet or too salty an...,FOOD,food,4,8,negative
4,1004293,1004293:3,The food was lousy - too sweet or too salty an...,FOOD,portions,52,60,negative


In [3]:
df_train.dropna(subset=['category', 'polarity','sentence'],inplace=True)

## Encode data

In [4]:
label_encoder = LabelEncoder()
train = Bunch()
test = Bunch()

train.sentence = df_train.sentence
test.sentence = df_test.sentence
train.cat = label_encoder.fit_transform(df_train.category.astype('category'))
test.cat = label_encoder.transform(df_test.category.astype('category'))
polarity2code = {'negative':-1, 'neutral':0, 'positive':1}
train.polarity = df_train.polarity.map(polarity2code)
test.polarity = df_test.polarity.map(polarity2code)

## Fit and Test 

In [5]:
tfidf_encoder = TfidfVectorizer(stop_words='english', strip_accents='unicode',min_df=2, max_df=0.98, max_features=1000)
svm_aspect = make_pipeline(tfidf_encoder,LinearSVC())
svm_aspect.fit(train.sentence,train.cat)

report = ClassificationReport()
test_scores_aspect = report.classifiction_report(svm_aspect,test.sentence,test.cat,threshold=0.2)
test_scores_aspect.round(decimals=3)

Unnamed: 0,accuracy,precision,recall,f1_score,auc,average_precision_score
0,0.698,0.661,0.561,0.607,0.079,0.645
1,0.698,0.571,0.316,0.407,0.185,0.467
2,0.698,0.776,0.806,0.79,0.111,0.861
3,0.698,0.375,0.231,0.286,0.117,0.291
4,0.698,0.603,0.643,0.622,0.149,0.642
5,0.698,0.673,0.69,0.682,0.089,0.758
micro,,0.698,0.698,0.698,0.92,0.762


In [6]:
svm_polar = make_pipeline(tfidf_encoder,LinearSVC())
svm_polar.fit(train.sentence,train.polarity)

test_scores_polar = report.classifiction_report(svm_polar,test.sentence,test.polarity)
test_scores_polar.round(decimals=3)

Unnamed: 0,accuracy,precision,recall,f1_score,auc,average_precision_score
-1,0.749,0.51,0.598,0.551,0.181,0.557
0,0.749,1.0,0.114,0.204,0.19,0.351
1,0.749,0.839,0.845,0.842,0.157,0.93
micro,,0.749,0.749,0.749,0.907,0.842


# Inspection

In [7]:
import eli5
from eli5.lime import TextExplainer

In [8]:
eli5.show_weights(svm_aspect, top=10, target_names=label_encoder.classes_)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5
+2.102,atmosphere,,,,
+2.022,decor,,,,
+1.731,ambience,,,,
+1.724,shows,,,,
+1.685,romantic,,,,
+1.541,jazz,,,,
+1.425,space,,,,
+1.385,music,,,,
+1.381,scene,,,,
+1.379,relax,,,,

Weight?,Feature
+2.102,atmosphere
+2.022,decor
+1.731,ambience
+1.724,shows
+1.685,romantic
+1.541,jazz
+1.425,space
+1.385,music
+1.381,scene
+1.379,relax

Weight?,Feature
+2.089,wine
+2.020,beer
+1.899,drinks
+1.724,sake
+1.719,wines
+1.450,bottles
+1.258,drink
+1.135,tea
+1.068,priced
+1.048,waited

Weight?,Feature
+2.625,food
+1.968,sushi
+1.796,dessert
+1.701,cuisine
+1.635,toppings
+1.617,chicken
+1.543,creative
+1.541,bagels
+1.521,lobster
… 528 more positive …,… 528 more positive …

Weight?,Feature
+1.827,view
+1.452,watching
+1.125,location
+1.098,right
+1.057,river
+0.852,far
+0.838,east
+0.812,end
+0.754,street
… 193 more positive …,… 193 more positive …

Weight?,Feature
+1.566,unless
… 324 more positive …,… 324 more positive …
… 652 more negative …,… 652 more negative …
-1.467,dishes
-1.495,atmosphere
-1.498,staff
-1.502,ordered
-1.685,cuisine
-1.704,sushi
-1.722,wine

Weight?,Feature
+2.934,service
+2.196,staff
+1.970,waiter
+1.909,asked
+1.894,waitress
+1.724,apology
+1.649,reservation
+1.605,asking
+1.590,hostess
+1.566,delivery


In [9]:
eli5.show_weights(svm_polar, top=10, target_names=list(polarity2code.keys()))

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+2.083,worst,
+1.964,rude,
+1.895,horrible,
+1.757,asking,
+1.559,overpriced,
+1.513,bland,
… 469 more positive …,… 469 more positive …,
… 520 more negative …,… 520 more negative …,
-1.486,phenomenal,
-1.493,wonderful,

Weight?,Feature
+2.083,worst
+1.964,rude
+1.895,horrible
+1.757,asking
+1.559,overpriced
+1.513,bland
… 469 more positive …,… 469 more positive …
… 520 more negative …,… 520 more negative …
-1.486,phenomenal
-1.493,wonderful

Weight?,Feature
+1.631,ok
+1.500,packed
+1.499,scallion
+1.422,upscale
+1.409,decent
+1.371,okay
+1.244,tho
+1.169,outstanding
+1.138,request
+1.091,consider

Weight?,Feature
+1.772,lamb
+1.696,wonderful
+1.614,excellent
… 529 more positive …,… 529 more positive …
… 453 more negative …,… 453 more negative …
-1.588,ok
-1.635,overpriced
-1.709,asking
-1.719,horrible
-1.771,pay


In [10]:
i = 300
print('True Aspect:',label_encoder.inverse_transform(test.cat[i]))
print('True Polarity:',test.polarity[i])
print('Detected Polarity:',svm_polar.predict(test.sentence.iloc[[i]]))
print('Detected Aspect:')
eli5.show_prediction(svm_aspect.named_steps['linearsvc'],
                     test.sentence[i],
                     vec= svm_aspect.named_steps['tfidfvectorizer'],
                     top_targets=1, 
                     target_names=label_encoder.classes_ ,
                     top=4,
                     show_feature_values=True
                    )

True Aspect: SERVICE
True Polarity: 1
Detected Polarity: [1]
Detected Aspect:


  if diff:


Contribution?,Feature,Value
2.068,Highlighted in text (sum),
-0.679,<BIAS>,1.0


In [20]:
print('Detected Polarity')
eli5.show_prediction(svm_polar.named_steps['linearsvc'],
                     test.sentence[355],
                     vec= svm_aspect.named_steps['tfidfvectorizer'],
                     top_targets=1, 
                     target_names=list(polarity2code.keys()) ,
                     top=10,
                     #show_feature_values=True,
                     force_weights=True,
                    )

Detected Polarity


Contribution?,Feature
0.54,people
0.515,poor
0.321,customers
0.162,<BIAS>
-0.021,great
-0.058,pizza
-0.145,service
-0.263,just

Contribution?,Feature
0.889,Highlighted in text (sum)
0.162,<BIAS>


# Apect detection as a multi-label classification problem 

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report,f1_score

## From a multi-class problem to a multi-label problem 

In multi-class classification formulation, we assume that each sentence belongs to exactly one aspect category. 

By reformulating it as multi-label classification problem, each sentence can belong to many aspect categories at the same time.

For example: the 3rd sentence in review 1004293 has two categories

In [15]:
df_train = pd.read_csv('./Restaurants/sub_task_1/ABSA16_Restaurants_Train_SB1_v2.csv')
df_test = pd.read_csv('./Restaurants/sub_task_1/EN_REST_SB1_TEST_gold.csv')
df_train['category'] = df_train['category'].str.split('#').str[0]
df_test['category'] = df_test['category'].str.split('#').str[0]

def collate_categories(df):
    grp = df.groupby('sid')
    categories = grp['category'].apply(lambda x: x.str.cat(sep=',')).str.split(',')
    categories_grouped =grp[['rid','sid','sentence']].head(1).join(categories,on='sid')
    return categories_grouped.reset_index(drop=True)
df_train = collate_categories(df_train)
df_test = collate_categories(df_test)

In [16]:
mlb = MultiLabelBinarizer(sparse_output=True)
train.sentence = df_train.sentence  
train.cat = mlb.fit_transform(df_train.category)
test.sentence = df_test.sentence  
test.cat = mlb.transform(df_test.category)

In [17]:
svm_multi_aspect = make_pipeline(tfidf_encoder,OneVsRestClassifier(LinearSVC(C=1.5),n_jobs=-1))
svm_multi_aspect.fit(train.sentence,train.cat)
y_pred = svm_multi_aspect.predict(test.sentence)

In [18]:
print(classification_report(test.cat,y_pred))
print('micro avg f1:',f1_score(test.cat,y_pred,average='micro'))

             precision    recall  f1-score   support

          0       0.88      0.65      0.75        57
          1       0.68      0.43      0.53        35
          2       0.84      0.80      0.82       257
          3       1.00      0.31      0.47        13
          4       0.68      0.59      0.63       193
          5       0.89      0.80      0.84       145

avg / total       0.81      0.70      0.75       700

micro avg f1: 0.7513389441469014


In [19]:
i = 355
print('True Aspect:',mlb.inverse_transform(test.cat[i]))
print('Detected Aspect:',mlb.inverse_transform(y_pred[i]))
eli5.show_prediction(svm_multi_aspect.named_steps['onevsrestclassifier'],
                     test.sentence[i],
                     vec= svm_multi_aspect.named_steps['tfidfvectorizer'],
                     top_targets=2, 
                     target_names=mlb.classes_ ,
                     top=10,
                     #show_feature_values=True,
                     force_weights=True,
                     #show=['WEIGHTS','targets','transition_features',
                     #                         'feature_importances','INFO','ALL']
                    )

True Aspect: [('FOOD', 'SERVICE')]
Detected Aspect: [('FOOD', 'SERVICE')]


Contribution?,Feature
Contribution?,Feature
+1.631,service
+0.247,customers
+0.178,people
+0.020,great
-0.071,just
-0.080,poor
-0.116,pizza
-0.743,<BIAS>
+0.908,pizza
+0.176,just

Contribution?,Feature
1.631,service
0.247,customers
0.178,people
0.02,great
-0.071,just
-0.08,poor
-0.116,pizza
-0.743,<BIAS>

Contribution?,Feature
0.908,pizza
0.176,just
0.052,great
-0.016,people
-0.113,customers
-0.137,poor
-0.157,service
-0.576,<BIAS>

Contribution?,Feature
1.808,Highlighted in text (sum)
-0.743,<BIAS>

Contribution?,Feature
0.713,Highlighted in text (sum)
-0.576,<BIAS>
