In [1]:
# import packages
import numpy as np
import pandas as pd
import spacy
import en_coref_md
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn
import string
import nltk
import csv
import os
from inflection import singularize
from collections import Counter
import itertools
import keras
import xml.etree.ElementTree as etree
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
# Load the `en_core_web_sm` spaCy pipeline once; normalize() reuses this object for every sentence.
nlp = spacy.load('en_core_web_sm')

## Build Architecture
Create subdirectories to store preprocessing models, aspect classifiers, and sentiment classifiers.

In [142]:
# used to store pre-trained transformers and classifiers
# makedirs(exist_ok=True) keeps this cell re-runnable: os.mkdir raises
# FileExistsError when the folder is left over from a previous run.
base_dir = "models"
os.makedirs(base_dir, exist_ok=True)
def build_architecture(base_dir, model_type):

    """function: generate subdirectories to store preprocessing transformers,
                 aspect classifiers and sentiment classifiers
       params: base_dir: base directory (created if it does not exist yet)
               model_type: type of models, e.g. "SVM+SVM"; used as the folder name
       return: tuple of the subdirectory paths, in the order
               (preprocessing, aspect_classifier, sentiment_classifier)"""

    # one folder per model combination: SVM+DNN, CNN+SVM,...
    model_dir = os.path.join(base_dir, model_type)

    subfolders = []
    for name in ("preprocessing", "aspect_classifier", "sentiment_classifier"):
        folder = os.path.join(model_dir, name)
        # makedirs + exist_ok: creates missing parents and does not fail when the
        # notebook is re-run against an already populated models directory
        os.makedirs(folder, exist_ok=True)
        subfolders.append(folder)

    preprocessing_folder, aspect_classifier_folder, sentiment_classifier_folder = subfolders

    return preprocessing_folder, aspect_classifier_folder, sentiment_classifier_folder

In [108]:
# create the "SVM+SVM" folder tree under base_dir and keep the three paths for the save steps below
preprocessing_folder, aspect_classifier_folder, sentiment_classifier_folder = build_architecture(base_dir, "SVM+SVM")

## Parse text

In [112]:
## pre-identified categories: AMBIENCE, FOOD, RESTAURANT, SERVICE, DRINKS, LOCATION
def parse_text(path):

    """function: read a review XML file (this project's input layout only) and
                 flatten it into one record per sentence
       params: path: location of the XML file
       return: dataframe holding the sentence text, the restaurant id and, for
               annotated sentences, the aspect flags plus target and polarity"""

    # annotation entity -> flag column; DRINKS & LOCATION are merged into OTHER
    entity_to_flag = {
        "DRINKS": "is_other",
        "LOCATION": "is_other",
        "AMBIENCE": "is_ambience",
        "FOOD": "is_food",
        "RESTAURANT": "is_restaurant",
        "SERVICE": "is_service",
    }
    flag_columns = ("is_ambience", "is_food", "is_restaurant", "is_service", "is_other")

    records = []
    for review in etree.parse(path).getroot():
        restaurant_id = review.attrib["rid"]
        for sentence_group in review:
            for sentence in sentence_group:
                record = {"sentence": sentence[0].text, "rid": restaurant_id}

                # the second child (when present) holds the opinions; sentences
                # without it keep only their text and restaurant id
                if len(sentence) > 1:
                    record.update(dict.fromkeys(flag_columns, 0))

                    for opinion in sentence[1]:
                        attributes = opinion.attrib
                        flag = entity_to_flag.get(attributes["category"].split("#")[0])
                        if flag is not None:
                            record[flag] = 1

                        # assume only one sentiment expressed towards one aspect:
                        # each opinion overwrites the previous target/polarity
                        record["target"] = attributes["target"]
                        record["polarity"] = attributes["polarity"]

                records.append(record)

    return pd.DataFrame(records)


In [113]:
def normalize(sent):

    """function: lemmatize a sentence with the notebook-level spaCy pipeline `nlp`
       params: sent: raw sentence
       return: the token lemmas joined by single spaces"""

    lemmas = (str(token.lemma_) for token in nlp(sent))
    return " ".join(lemmas)

In [114]:
# train: XML annotations parsed into train_df below (path is relative to the working directory)
train_path_sb1 = "Annotation/restaurant/ABSA16_Restaurants_Train_SB1_v2.xml"

# test: XML annotations parsed into test_df below
test_path_sb1 = "Annotation/restaurant/EN_REST_SB1_TEST.xml"

In [25]:
# train: keep only the sentences that carry a polarity annotation
gold_df = parse_text(train_path_sb1)
train_df = gold_df[gold_df.polarity.notnull()].reset_index(drop=True)

# test: same filter; note that gold_df is rebound to the parsed test set here
gold_df = parse_text(test_path_sb1)
test_df = gold_df[gold_df.polarity.notnull()].reset_index(drop=True)

In [29]:
# train: add the lemmatized text used by the vectorizer
train_df["normalize_sentence"] = train_df.sentence.apply(normalize)
train_df.head(5)

# test: same transformation; only this last head() is the cell's displayed value
test_df["normalize_sentence"] = test_df.sentence.apply(normalize)
test_df.head(5)

Unnamed: 0,is_ambience,is_food,is_other,is_restaurant,is_service,polarity,rid,sentence,target,normalize_sentence
0,0,1,0,0,0,positive,en_BlueRibbonSushi_478218171,Yum!,,yum !
1,0,1,0,0,0,positive,en_BlueRibbonSushi_478218171,Serves really good sushi.,sushi,serve really good sushi .
2,0,1,0,0,0,neutral,en_BlueRibbonSushi_478218171,Not the biggest portions but adequate.,portions,not the big portion but adequate .
3,0,1,0,0,0,positive,en_BlueRibbonSushi_478218171,Green Tea creme brulee is a must!,Green Tea creme brulee,green tea creme brulee be a must !
4,0,1,0,0,0,positive,en_BlueRibbonSushi_478218171,Don't leave the restaurant without it.,,do not leave the restaurant without -PRON- .


### Save parsed text to csv

In [30]:
import os
parsed_dir = "Annotation/restaurant/"
# exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError
# as soon as the "parsed" folder existed from an earlier run
os.makedirs(os.path.join(parsed_dir, "parsed"), exist_ok=True)

In [31]:
# write the parsed train split (including normalize_sentence) as UTF-8 CSV without the index column
train_save_path = "Annotation/restaurant/parsed/ABSA16_Restaurants_Train_SB1.csv"
train_df.to_csv(train_save_path, encoding='utf-8', index=False)

# same for the test split
test_save_path = "Annotation/restaurant/parsed/ABSA16_Restaurants_Test_SB1.csv"
test_df.to_csv(test_save_path, encoding='utf-8', index=False)

## Polarity Transformation

In [115]:
from sklearn import preprocessing
# encode the polarity strings as integer class ids; the encoder is fitted on the
# train split only and reused for test so both share one label mapping
lb = preprocessing.LabelEncoder()
y_train = lb.fit_transform(train_df.polarity)

y_test = lb.transform(test_df.polarity)

In [116]:
# per-class weight = n_samples / (3 * class_count), keyed by encoded class id
# (3 = number of polarity classes); the rarer a class, the larger its weight
weights = len(y_train) / (3 * np.bincount(y_train))
class_weight = dict(enumerate(weights))

In [117]:
# display the computed weights (keys are the LabelEncoder class ids)
class_weight

{0: 1.037037037037037, 1: 8.895833333333334, 2: 0.5199391171993911}

In [118]:
# encode sentiment polarity: one-hot encoder
# (fitted on the train split only; the test split reuses the same mapping)
enc = preprocessing.LabelBinarizer()
y_train_NN = enc.fit_transform(train_df.polarity)

y_test_NN = enc.transform(test_df.polarity)

## Text TF-IDF Transformation

In [119]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# upper bound on the vocabulary size kept by the CountVectorizer
MAX_FEATURES = 1500

count_vect = CountVectorizer(stop_words = 'english', max_features = MAX_FEATURES)
tfidf_transformer = TfidfTransformer()

# train: the vocabulary and idf weights are learned from the train split only
X_train_counts = count_vect.fit_transform(train_df["normalize_sentence"])
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print (X_train_tfidf.shape)

# test: reuse the fitted transformers (transform only, no refit)
X_test_counts = count_vect.transform(test_df["normalize_sentence"])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print (X_test_tfidf.shape)

(1708, 1500)
(587, 1500)


In [121]:
# save transformers to preprocessing folders
import pickle

# `with` closes each file as soon as the dump finishes; the previous bare
# open(...) calls left the handles open until garbage collection
filename = 'count_vect_transformer.sav'
filename_path = os.path.join(preprocessing_folder, filename)
with open(filename_path, 'wb') as transformer_file:
    pickle.dump(count_vect, transformer_file)

filename = 'tfidf_transformer.sav'
filename_path = os.path.join(preprocessing_folder, filename)
with open(filename_path, 'wb') as transformer_file:
    pickle.dump(tfidf_transformer, transformer_file)

In [122]:
# create int_to_word & word_to_int index
# Fetch the vocabulary once instead of calling the accessor twice per loop iteration.
# Newer scikit-learn releases only provide get_feature_names_out(); fall back to the
# older get_feature_names() when that method is not available.
_get_names = getattr(count_vect, "get_feature_names_out", None) or count_vect.get_feature_names
feature_names = list(_get_names())

# NOTE(review): indexing starts at 1, so the feature at position 0 is left out of both
# mappings -- confirm this is intended (e.g. 0 reserved) and not an off-by-one.
int_to_word = {i: feature_names[i] for i in range(1, len(feature_names))}

word_to_int = {word: index for index, word in int_to_word.items()}

## Aspect-specific Sentiment Analysis

### Train classifier for each aspect

In [124]:
from sklearn.svm import LinearSVC

def classify_aspect(train, target):

    """function: fit a class-weighted linear SVM that decides whether or not a
                 sentence belongs to one aspect
       params: train: train data
               target: binary target
       return: the fitted model together with its coef_ and intercept_ """

    labels = list(target)
    # n_samples / (2 * per-class count): the rarer label gets the larger weight
    per_class_weight = len(labels) / (2 * np.bincount(labels))
    aspect_class_weight = dict(enumerate(per_class_weight))

    svm = LinearSVC(fit_intercept = False, class_weight = aspect_class_weight)
    svm.fit(train, target)

    return svm, svm.coef_, svm.intercept_

In [125]:
# train classifier for each aspect
# return model, coefficients, intercept
# (each call fits one binary SVM on the same tf-idf matrix against that aspect's 0/1 flag column;
#  classify_aspect builds the SVM with fit_intercept=False, so the *_bias values carry no learned offset)
is_ambience,is_ambience_coef, is_ambience_bias =  classify_aspect(X_train_tfidf, train_df.is_ambience)

is_food,is_food_coef, is_food_bias =  classify_aspect(X_train_tfidf, train_df.is_food)

is_other, is_other_coef, is_other_bias =  classify_aspect(X_train_tfidf, train_df.is_other)

is_restaurant, is_restaurant_coef, is_restaurant_bias =  classify_aspect(X_train_tfidf, train_df.is_restaurant)

is_service, is_service_coef, is_service_bias = classify_aspect(X_train_tfidf, train_df.is_service)


In [127]:
# save trained classifiers to aspect_classifier folder
import pickle

# (name, model) pairs; the name becomes the "<name>_classifier.sav" file name
aspect_classifiers = [
    ('is_ambience', is_ambience),
    ('is_food', is_food),
    ('is_other', is_other),
    ('is_restaurant', is_restaurant),
    ('is_service', is_service),
]

for aspect_name, aspect_model in aspect_classifiers:
    filename = aspect_name + '_classifier.sav'
    file_path = os.path.join(aspect_classifier_folder, filename)
    # `with` closes the handle right after the dump instead of leaking it
    with open(file_path, 'wb') as classifier_file:
        pickle.dump(aspect_model, classifier_file)

In [128]:
def get_relevance(coef, bias, data_tfidf, index):

    """get contribution score for word in each sentence for a specific aspect

       params: coef: aspect classifier output coefficients (2-D; only the first row is used)
               bias: aspect classifier output bias (accepted but not used in the computation)
               data_tfidf: text data transformed after tfidf (sparse matrix, or a dense 2-D array)
               index: positional row index of the sentence in data_tfidf

       return: 1-D array of words' contribution scores for the sentence"""

    # Slice the requested row BEFORE densifying. The previous version called
    # data_tfidf.toarray() on the whole matrix for every sentence, which costs
    # O(n_sentences * n_features) time and memory per call.
    row = data_tfidf[index]
    if hasattr(row, "toarray"):
        row = row.toarray()
    row = np.asarray(row).ravel()

    # element-wise coefficient * tf-idf weight; [0] drops the leading axis of coef
    relevance = np.multiply(coef, row)[0]

    return relevance

In [129]:
# sample: shape of the relevance vector of the first training sentence for the
# restaurant aspect (one score per vocabulary feature)
get_relevance(is_restaurant_coef, is_restaurant_bias, X_train_tfidf, 0).shape

(1500,)

In [130]:
def get_aspect_vectors(aspect, coef, bias, df, data_tfidf, y_polarity):

    """function: collect, for every sentence flagged with the indicated aspect,
                 its contribution vector and its ground truth sentiment polarity

       params: aspect: string, name of the aspect flag column in df
               coef: aspect classifier output coefficient
               bias: aspect classifier output bias
               df: dataframe stores data points
               data_tfidf: text data after tf-idf transformation
               y_polarity: ground truth sentiment polarity for each sentence

       return: dataframe with one row per matching sentence"""

    def describe(ind):
        # the same index value addresses df, data_tfidf and y_polarity
        sentence = df.normalize_sentence[ind]
        contribution = get_relevance(coef, bias, data_tfidf, ind)
        return {
            "rid": df.rid[ind],
            "train_df_index": ind,
            "sentence": sentence,
            "vector": contribution,
            "polarity": y_polarity[ind],  # polarity after label encoding
        }

    matching = df.loc[df[aspect] == 1].index.tolist()

    return pd.DataFrame([describe(ind) for ind in matching])

In [131]:
def classify_sentiment(x, y_true, target_names=None):

    """function: fit a linear SVM as sentiment classifier
       params: x: word contributions vectors for sentences
               y_true: ground truth sentiment polarity (encoded class ids)
               target_names: unused; kept (now optional) so that both the
                             two-argument and three-argument call styles work

       return: trained model"""

    model = LinearSVC()
    model.fit(x, y_true)

    return model

In [143]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

def classify_sentiment_NN(x, y_true, target_names=None):

    """function: build a DNN model as sentiment classifier
       params: x: word contributions vectors for sentences
               y_true: ground truth sentiment polarity, one-hot encoded with 3 columns
               target_names: unused; optional so that callers passing the class
                             names as a third argument keep working

       return: trained model"""

    model = Sequential()

    model.add(Dense(128, activation = 'relu'))
    model.add(Dense(32, activation = 'relu'))
    # 3 output units = negative / neutral / positive. softmax (not sigmoid): the
    # classes are mutually exclusive and categorical_crossentropy expects the
    # outputs to form a probability distribution over them.
    model.add(Dense(3, activation = "softmax"))

    model.compile(optimizer='rmsprop',
                 loss='categorical_crossentropy',
                 metrics=['acc'])

    # NOTE: class weights are intentionally not applied here (as before)
    model.fit(x, y_true,
              epochs=10,
              batch_size=50, verbose = 0)

    return model

In [71]:
def save_model(model_folder, aspect, model):
    """function: persist a deep learning model as two files inside a folder
       params: model_folder: subdirectories
               aspect: prefix of both file names, indicates a specific model
               model: object providing save_weights(path) and to_json()
    """
    weights_path = os.path.join(model_folder, aspect + '_model_weights.h5')
    architecture_path = os.path.join(model_folder, aspect + '_model_architecture.json')

    # weights go to the .h5 file ...
    model.save_weights(weights_path)

    # ... and the architecture description to the .json file
    with open(architecture_path, 'w') as architecture_file:
        architecture_file.write(model.to_json())

    print ("model saved")


In [134]:
def pipeline(model_folder, aspect, is_aspect_coef, is_aspect_bias,
            train_df=None, X_train_tfidf=None, y_train=None,
            test_df=None, X_test_tfidf=None, y_test=None):

    """function: traditional model pipeline: train the SVM sentiment classifier for one aspect,
                 save it to disk, predict, and print train & test accuracy and classification report
       params: model_folder: directory to store the trained sentiment classifier
               aspect: string: to indicate which aspect (name of the flag column, e.g. "is_food")
               is_aspect_coef: aspect classifier output coefficients
               is_aspect_bias: aspect classifier output bias

               train_df: train dataframe
               X_train_tfidf: train text data after tfidf transformation
               y_train: train ground truth after label encoding

               test_df: test dataframe
               X_test_tfidf: test text data after tfidf transformation
               y_test: test ground truth after label encoding

               Any of the six data arguments left as None falls back to the notebook-level
               variable of the same name, so the short four-argument call style works.
       return: None (results are printed)
    """
    # local import: these metrics are not imported by any earlier cell
    from sklearn.metrics import accuracy_score, classification_report

    def _or_notebook_global(value, name):
        # use the caller's argument when given, otherwise the notebook variable
        return globals()[name] if value is None else value

    train_df = _or_notebook_global(train_df, "train_df")
    X_train_tfidf = _or_notebook_global(X_train_tfidf, "X_train_tfidf")
    y_train = _or_notebook_global(y_train, "y_train")
    test_df = _or_notebook_global(test_df, "test_df")
    X_test_tfidf = _or_notebook_global(X_test_tfidf, "X_test_tfidf")
    y_test = _or_notebook_global(y_test, "y_test")

    # report rows follow the LabelEncoder class order; passing labels= keeps the
    # report well-formed even when a class is absent from this aspect's subset
    target_names = list(lb.classes_)
    labels = list(range(len(target_names)))

    # aspect-TRAIN
    aspect_df = get_aspect_vectors(aspect, is_aspect_coef, is_aspect_bias, train_df, X_train_tfidf, y_train)
    aspect_train = np.array([list(data) for data in aspect_df.vector])
    aspect_y = np.array(list(aspect_df.polarity))

    # aspect-TEST
    aspect_df_test = get_aspect_vectors(aspect, is_aspect_coef, is_aspect_bias, test_df, X_test_tfidf, y_test)
    aspect_test = np.array([list(data) for data in aspect_df_test.vector])
    aspect_test_y = np.array(list(aspect_df_test.polarity))

    # train model (three-argument form matches classify_sentiment's signature)
    aspect_model = classify_sentiment(aspect_train, aspect_y, target_names)

    # save trained sentiment_classifier to disk; `with` closes the file handle
    filename = aspect + '_classifier.sav'
    file_path = os.path.join(model_folder, filename)
    with open(file_path, 'wb') as model_file:
        pickle.dump(aspect_model, model_file)

    # predict-TRAIN
    aspect_pred = aspect_model.predict(aspect_train)
    # predict-TEST
    aspect_test_pred = aspect_model.predict(aspect_test)

    print ("TRAIN")
    print ("Accuracy score for baseline linear SVM in predicting sentiment: {}".format(accuracy_score(aspect_y, aspect_pred)))
    print (" ")
    print (classification_report(aspect_y, aspect_pred, labels=labels, target_names=target_names))

    print ("TEST")
    print ("Accuracy score for baseline linear SVM in predicting sentiment: {}".format(accuracy_score(aspect_test_y, aspect_test_pred)))
    print (" ")
    print (classification_report(aspect_test_y, aspect_test_pred, labels=labels, target_names=target_names))
    

In [133]:
def pipeline_NN(model_folder, aspect, is_aspect_coef, is_aspect_bias, 
                train_df, X_train_tfidf, y_train_NN, test_df, X_test_tfidf, y_test_NN):

    """function: deep learning model pipeline: train the DNN sentiment classifier for one aspect,
                 save it to disk, predict, and print train & test accuracy and classification report
       params: model_folder: directories to store DNN model
               aspect: string: to indicate which aspect
               is_aspect_coef: aspect classifier output coefficients
               is_aspect_bias: aspect classifier output bias

               train_df: train dataframe
               X_train_tfidf: context data after tfidf transformation
               y_train_NN: one-hot ground truth after transformation

               test_df: test dataframe
               X_test_tfidf: context data after tfidf transformation
               y_test_NN: one-hot ground truth after transformation
       return: None (results are printed)
    """
    # local import: these metrics are not imported by any earlier cell
    from sklearn.metrics import accuracy_score, classification_report

    # report rows follow the encoder's class order; passing labels= keeps the
    # report well-formed even when a class is absent from this aspect's subset
    target_names = list(lb.classes_)
    labels = list(range(len(target_names)))

    # aspect-TRAIN (np.array instead of the pd.np alias, which newer pandas no longer provides)
    aspect_df = get_aspect_vectors(aspect, is_aspect_coef, is_aspect_bias, train_df, X_train_tfidf, y_train_NN)
    aspect_train = np.array([list(data) for data in aspect_df.vector])
    aspect_y = np.array(list(aspect_df.polarity))

    # aspect-TEST
    aspect_df_test = get_aspect_vectors(aspect, is_aspect_coef, is_aspect_bias, test_df, X_test_tfidf, y_test_NN)
    aspect_test = np.array([list(data) for data in aspect_df_test.vector])
    aspect_test_y = np.array(list(aspect_df_test.polarity))

    # train model (two-argument call: classify_sentiment_NN takes x and y_true)
    aspect_model = classify_sentiment_NN(aspect_train, aspect_y)
    # save trained sentiment_classifier_NN to disk
    save_model(model_folder, aspect, aspect_model)

    # predict-TRAIN: pick the class with the highest score
    aspect_pred = np.argmax(aspect_model.predict(aspect_train), axis=1)
    # predict-TEST
    aspect_test_pred = np.argmax(aspect_model.predict(aspect_test), axis=1)

    # convert one-hot true labels back to class ids (TRAIN, then TEST)
    aspect_y = np.argmax(aspect_y, axis = 1)
    aspect_test_y = np.argmax(aspect_test_y, axis = 1)

    print ("TRAIN")
    print ("Accuracy score for DNN in predicting sentiment: {}".format(accuracy_score(aspect_y, aspect_pred)))
    print (" ")
    print (classification_report(aspect_y, aspect_pred, labels=labels, target_names=target_names))

    print ("TEST")
    print ("Accuracy score for DNN in predicting sentiment: {}".format(accuracy_score(aspect_test_y, aspect_test_pred)))
    print (" ")
    print (classification_report(aspect_test_y, aspect_test_pred, labels=labels, target_names=target_names))
    

#### Train & Test Ambience Sentiment Analysis

In [136]:
# pass the train/test data explicitly, as pipeline's declared signature requires
pipeline(sentiment_classifier_folder, "is_ambience", is_ambience_coef, is_ambience_bias,
         train_df, X_train_tfidf, y_train, test_df, X_test_tfidf, y_test)

TRAIN
Accuracy score for baseline linear SVM in predicting sentiment: 0.8893805309734514
 
             precision    recall  f1-score   support

   negative       0.97      0.60      0.74        47
    neutral       1.00      0.44      0.62         9
   positive       0.88      0.99      0.93       170

avg / total       0.90      0.89      0.88       226

TEST
Accuracy score for baseline linear SVM in predicting sentiment: 0.7894736842105263
 
             precision    recall  f1-score   support

   negative       0.00      0.00      0.00         7
    neutral       0.00      0.00      0.00         3
   positive       0.82      0.96      0.88        47

avg / total       0.67      0.79      0.73        57



  'precision', 'predicted', average, warn_for)


#### Train & Test Food Sentiment Analysis

In [137]:
# pass the train/test data explicitly, as pipeline's declared signature requires
pipeline(sentiment_classifier_folder, "is_food", is_food_coef, is_food_bias,
         train_df, X_train_tfidf, y_train, test_df, X_test_tfidf, y_test)

TRAIN
Accuracy score for baseline linear SVM in predicting sentiment: 0.8375165125495376
 
             precision    recall  f1-score   support

   negative       0.91      0.62      0.74       225
    neutral       1.00      0.04      0.07        28
   positive       0.82      0.98      0.89       504

avg / total       0.85      0.84      0.82       757

TEST
Accuracy score for baseline linear SVM in predicting sentiment: 0.6848249027237354
 
             precision    recall  f1-score   support

   negative       0.29      0.22      0.25        54
    neutral       0.00      0.00      0.00        15
   positive       0.76      0.87      0.81       188

avg / total       0.62      0.68      0.65       257



  'precision', 'predicted', average, warn_for)


#### Train & Test Other Sentiment Analysis

In [138]:
# pass the train/test data explicitly, as pipeline's declared signature requires
pipeline(sentiment_classifier_folder, "is_other", is_other_coef, is_other_bias,
         train_df, X_train_tfidf, y_train, test_df, X_test_tfidf, y_test)

TRAIN
Accuracy score for baseline linear SVM in predicting sentiment: 0.8773584905660378
 
             precision    recall  f1-score   support

   negative       1.00      0.24      0.38        17
    neutral       1.00      1.00      1.00         2
   positive       0.87      1.00      0.93        87

avg / total       0.89      0.88      0.84       106

TEST
Accuracy score for baseline linear SVM in predicting sentiment: 0.7659574468085106
 
             precision    recall  f1-score   support

   negative       0.00      0.00      0.00         8
    neutral       0.00      0.00      0.00         1
   positive       0.80      0.95      0.87        38

avg / total       0.65      0.77      0.70        47



  'precision', 'predicted', average, warn_for)


#### Train & Test Restaurant Sentiment Analysis

In [139]:
# pass the train/test data explicitly, as pipeline's declared signature requires
pipeline(sentiment_classifier_folder, "is_restaurant", is_restaurant_coef, is_restaurant_bias,
         train_df, X_train_tfidf, y_train, test_df, X_test_tfidf, y_test)

TRAIN
Accuracy score for baseline linear SVM in predicting sentiment: 0.8404255319148937
 
             precision    recall  f1-score   support

   negative       0.92      0.56      0.69       149
    neutral       1.00      0.22      0.36        23
   positive       0.82      0.98      0.90       392

avg / total       0.86      0.84      0.82       564

TEST
Accuracy score for baseline linear SVM in predicting sentiment: 0.6839378238341969
 
             precision    recall  f1-score   support

   negative       0.62      0.17      0.27        58
    neutral       0.00      0.00      0.00         7
   positive       0.69      0.95      0.80       128

avg / total       0.64      0.68      0.61       193



  'precision', 'predicted', average, warn_for)


#### Train & Test Service Sentiment Analysis

In [140]:
# pass the train/test data explicitly, as pipeline's declared signature requires
pipeline(sentiment_classifier_folder, "is_service", is_service_coef, is_service_bias,
         train_df, X_train_tfidf, y_train, test_df, X_test_tfidf, y_test)

TRAIN
Accuracy score for baseline linear SVM in predicting sentiment: 0.883054892601432
 
             precision    recall  f1-score   support

   negative       0.89      0.90      0.90       207
    neutral       1.00      0.20      0.33        15
   positive       0.87      0.91      0.89       197

avg / total       0.89      0.88      0.88       419

TEST
Accuracy score for baseline linear SVM in predicting sentiment: 0.7034482758620689
 
             precision    recall  f1-score   support

   negative       0.69      0.83      0.75        71
    neutral       0.00      0.00      0.00         9
   positive       0.73      0.66      0.69        65

avg / total       0.66      0.70      0.68       145



  'precision', 'predicted', average, warn_for)
