In [1]:
import os
import warnings

# Work from the parent of the directory the kernel started in; the relative
# './Restaurants/...' paths used when loading the data are resolved from there.
# The start directory is remembered the first time this cell runs, so running
# the cell again does not climb one more level up each time.
if '_KERNEL_START_DIR' not in globals():
    _KERNEL_START_DIR = os.getcwd()
os.chdir(os.path.join(_KERNEL_START_DIR, os.pardir))

# NOTE(review): this silences every warning, including library deprecation
# notices; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.utils import Bunch
from sklearn.preprocessing import LabelEncoder, FunctionTransformer, LabelBinarizer, PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from aspectsent.metric import ClassificationReport

%load_ext autoreload 
%autoreload 2

# Linear SVM baselines for aspect detection and polarity detection

## Read and explore

In [2]:
# Load the train and test splits.
df_train = pd.read_csv('./Restaurants/sub_task_1/ABSA16_Restaurants_Train_SB1_v2.csv')
df_test = pd.read_csv('./Restaurants/sub_task_1/EN_REST_SB1_TEST_gold.csv')

# Keep only the part of each category label that precedes the first '#'
# (presumably the entity half of an ENTITY#ATTRIBUTE label — TODO confirm).
for split_frame in (df_train, df_test):
    split_frame['category'] = split_frame['category'].str.split('#').str.get(0)

df_train.head()

Unnamed: 0,rid,sid,sentence,category,target,from,to,polarity
0,1004293,1004293:0,Judging from previous posts this used to be a ...,RESTAURANT,place,51,56,negative
1,1004293,1004293:1,"We, there were four of us, arrived at noon - t...",SERVICE,staff,75,80,negative
2,1004293,1004293:2,"They never brought us complimentary noodles, i...",SERVICE,,0,0,negative
3,1004293,1004293:3,The food was lousy - too sweet or too salty an...,FOOD,food,4,8,negative
4,1004293,1004293:3,The food was lousy - too sweet or too salty an...,FOOD,portions,52,60,negative


In [3]:
# Discard training rows with a missing category, polarity or sentence.
# The name is rebound to the filtered frame instead of mutating it in place.
# NOTE(review): df_test is not filtered here — confirm it has no missing
# values in these columns.
required_columns = ['category', 'polarity', 'sentence']
df_train = df_train.dropna(subset=required_columns)

## Encode data

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
import scipy.sparse as sp
class AspectSentenceEncoder(BaseEstimator, TransformerMixin):
    """Encode (sentence, aspect category) rows as a single feature matrix.

    The ``sentence`` column is vectorised with TF-IDF and the ``category``
    column is binarised with ``LabelBinarizer``; the two matrices are stacked
    side by side. With ``interaction=True`` the stacked matrix is additionally
    expanded with ``PolynomialFeatures(interaction_only=True)``.

    Parameters
    ----------
    interaction : bool, default=True
        Whether to append the interaction expansion of the stacked features.
    min_df : default=2
    max_df : default=0.98
    max_features : default=1000
        Forwarded unchanged to ``TfidfVectorizer``.
    """

    def __init__(self, interaction=True, min_df=2, max_df=0.98, max_features=1000):
        self.min_df = min_df
        self.max_df = max_df
        self.max_features = max_features
        self.interaction = interaction

    def fit(self, data, y=None):
        """Fit the TF-IDF vocabulary and the category binarizer.

        Parameters
        ----------
        data : mapping with ``'sentence'`` and ``'category'`` entries
            Typically a DataFrame; only these two columns are read.
        y : ignored
            Accepted (and optional) so the encoder can sit in a pipeline.

        Returns
        -------
        self
        """
        self.lb = LabelBinarizer(sparse_output=False)
        self.tfidf_encoder = TfidfVectorizer(
            stop_words='english', strip_accents='unicode',
            min_df=self.min_df, max_df=self.max_df, max_features=self.max_features)
        self.tfidf_encoder.fit(data['sentence'])
        self.lb.fit(data['category'])
        if self.interaction:
            # Fit the expansion here, once, instead of re-fitting it on every
            # transform call. Per the scikit-learn docs, fitting only records
            # the input width, so the sparse stacked matrix is sufficient.
            self.add_interactions = PolynomialFeatures(interaction_only=True)
            self.add_interactions.fit(self._stack_features(data))
        return self

    def _stack_features(self, data):
        """Return TF-IDF and category indicator columns stacked horizontally."""
        tfidf_X = self.tfidf_encoder.transform(data['sentence'])
        cat_X = self.lb.transform(data['category'])
        return sp.hstack((tfidf_X, cat_X))

    def transform(self, data):
        """Encode ``data`` with the fitted encoders.

        Returns
        -------
        A dense array holding the interaction expansion when
        ``interaction=True``; otherwise the stacked scipy sparse matrix.
        """
        stacked = self._stack_features(data)
        if self.interaction:
            # toarray() gives a plain ndarray; todense() gives np.matrix,
            # which recent scikit-learn releases refuse as input.
            # NOTE(review): PolynomialFeatures also accepts CSR/CSC input; the
            # dense form is kept to preserve the original return type, at a
            # considerable memory cost for wide inputs.
            return self.add_interactions.transform(stacked.toarray())
        return stacked

In [5]:
# Integer target for the classifier: one code per polarity label.
polarity2code = dict(negative=-1, neutral=0, positive=1)
for labelled_frame in (df_train, df_test):
    # Series.map leaves any label that is missing from the dict as NaN.
    labelled_frame['y'] = labelled_frame['polarity'].map(polarity2code)

## Fit and Test 

In [57]:
# Baseline 1: TF-IDF + category indicators (no interaction terms) -> linear SVM.
# random_state is pinned so repeated runs of this cell give the same model;
# re-run the cell to refresh the displayed scores after this change.
svm_aspect = make_pipeline(
    AspectSentenceEncoder(interaction=False),
    LinearSVC(C=0.1, random_state=0),
)
svm_aspect.fit(df_train, df_train['y'])

report = ClassificationReport()

# `classifiction_report` (sic) is the method name this call has always used on
# the project's ClassificationReport; it is kept unchanged.
test_scores_aspect = report.classifiction_report(svm_aspect, df_test, df_test['y'], threshold=0.2)
test_scores_aspect.round(decimals=3)

Unnamed: 0,accuracy,precision,recall,f1_score,auc,average_precision_score
-1,0.802,0.652,0.578,0.613,0.149,0.645
0,0.802,0.0,0.0,0.0,0.175,0.345
1,0.802,0.842,0.935,0.886,0.136,0.94
micro,,0.802,0.802,0.802,0.926,0.876


In [58]:
# Baseline 2: same encoder with interaction terms enabled -> linear SVM.
# random_state is pinned so repeated runs of this cell give the same model;
# re-run the cell to refresh the displayed scores after this change.
svm_aspect = make_pipeline(
    AspectSentenceEncoder(interaction=True),
    LinearSVC(C=0.2, random_state=0),
)
svm_aspect.fit(df_train, df_train['y'])

report = ClassificationReport()

# `classifiction_report` (sic) is the method name this call has always used on
# the project's ClassificationReport; it is kept unchanged.
test_scores_aspect = report.classifiction_report(svm_aspect, df_test, df_test['y'], threshold=0.2)
test_scores_aspect.round(decimals=3)

Unnamed: 0,accuracy,precision,recall,f1_score,auc,average_precision_score
-1,0.781,0.592,0.598,0.595,0.154,0.632
0,0.781,0.0,0.0,0.0,0.182,0.344
1,0.781,0.841,0.899,0.869,0.14,0.936
micro,,0.781,0.781,0.781,0.923,0.869
