In [1]:
# import packages
import numpy as np
import pandas as pd
import spacy
import en_coref_md
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn
import string
import nltk
import csv
import os
from inflection import singularize
from collections import Counter
import itertools
import keras
import pickle
import xml.etree.ElementTree as etree
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [44]:
nlp = spacy.load('en_core_web_sm')
MAXLEN = 2342

ind_to_polarity = {0:'negative', 1:"neutral", 2:"positive"}

In [3]:
## pre-identified categories: AMBIENCE, FOOD, SERVICE, PROMOTION, LOCATION
def parse_text(path):
    
    """function: parse xml inputs only suitable for specific format
       params: path: input path
       return: converted dataframe"""
    
    df = pd.read_csv(path)
    use_df = df[~df.Category.isnull()].reset_index(drop=True)
    
    all_reviews = []
    for index in range(use_df.shape[0]):
        sentence_review = {}
        sentence_review["sentence"] = use_df["Sentence"][index]
        sentence_review["is_food"] = 0
        sentence_review["is_location"] = 0
        sentence_review["is_ambience"] = 0
        sentence_review["is_service"] = 0
        sentence_review["is_promotion"] = 0
    

        string = use_df["Category"][index]
        if "FOOD" in string:
            sentence_review["is_food"] = 1
        if "LOCATION" in string:
            sentence_review["is_location"] = 1
        if "AMBIENCE" in string:
            sentence_review["is_ambience"] = 1
        if "SERVICE" in string:
            sentence_review["is_service"] = 1
        if "PROMOTION" in string:
            sentence_review["is_promotion"] = 1
    
        all_reviews.append(sentence_review)
    
    new_df = pd.DataFrame(all_reviews)  
    
    return new_df


In [4]:
# normalization
def normalize(sent):
    """function: normalize input sentence by lemmentization
       params: raw sentence
       return: normalized sentence"""
    
    sent = nlp(sent)
    return " ".join([str(token.lemma_) for token in sent])

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

def load_transformer(folder_dir, filename):
    """function: load pre-trained models
       params: folder_dir: folder directories that store models
               filename: model name
       return: model """
    
    if filename in os.listdir(folder_dir):
        tf = pickle.load(open(os.path.join(folder_dir, filename), 'rb'))
        print ("Load " + filename)
    else:
        tf = None
        
    return tf

In [33]:
def load_aspect_classifier(folder_dir, aspect):
    """function: load pre-trained models
       params: folder_dir: folder directories that store models
               aspect: string: indicate aspect
       return: model """
    
    filename = aspect + '_classifier.sav'
    
    if filename in os.listdir(folder_dir):
        classifier = pickle.load(open(os.path.join(folder_dir, filename), 'rb'))
        print ("Load " + filename)
    else:
        classifier = None
        
    return classifier

In [37]:
from keras.models import model_from_json

def load_sentiment_classifier(folder_dir, aspect):
    """function: load pre-trained models
       params: folder_dir: folder directories that store models
               aspect: string: indicate aspect
       return: model """
        
    model_weight = aspect + '_model_weights.h5'
    model_arch = aspect + '_model_architecture.json'
    
    # Model reconstruction from JSON file
    with open(os.path.join(folder_dir, model_arch), 'r') as f:
        model = model_from_json(f.read())

    # Load weights into the new model
    model.load_weights(os.path.join(folder_dir, model_weight))
    
    print ("Load sentiment classifier")
    return model

In [38]:
def transform_text(df, folder_dir, count_vect_tf, tfidf_tf):
    """function: convert reviews to vectors by tf-idf transformer
       params: df: dataframe that stores normalized reviews
               folder_dir: folder directories that store transformer
               count_vect_tf: count vectorizer file name
               tfidf_tf: tfidf transformer name
               
       return: transformed vectors """
    count_vect = load_transformer(folder_dir, count_vect_tf)
    tfidf_transformer = load_transformer(folder_dir, tfidf_tf)
    
    X_counts = count_vect.transform(df["normalize_sentence"])
    X_tfidf = tfidf_transformer.transform(X_counts)
    
    return X_tfidf

In [39]:
def get_relevance(coef, bias, tfidf_data, index):
    
    """get contribution score for word in each sentence for a specific aspect
    
       params: coef: classifier output coefficient
               bias: classifier output bias
               tfidf_data: data transformed after tfidf
               index: index of the sentence in tfidf_data
               
       return: series of words' contribution score for the sentence"""
    relevance = np.multiply(coef, tfidf_data.toarray()[index])[0]
    
    return relevance

In [40]:
def get_aspect_vectors(aspect, coef, bias, df, data_tfidf):
    
    """get sentences belong to indicated aspects and their contribution vectors
    
       params: aspect: string, indicate aspect
               coef: aspect classifier's coefficient
               bias: aspect classifier's bias
               tfidf_data: data transformed after tfidf vectorization
               
       return: dataframe"""
            
    aspect_vectors = []
    aspect_indexes = df[df[aspect] == 1].index.tolist()

    for ind in aspect_indexes:
        vectors = {}
        
        sentence = df.normalize_sentence[ind]
        vect = get_relevance(coef, bias, data_tfidf, ind)
        vectors["train_df_index"] = ind
        vectors["sentence"] = sentence
        vectors["vector"] = vect
        
        aspect_vectors.append(vectors)
        
    new_df = pd.DataFrame(aspect_vectors)
    
    return new_df

In [51]:
def pipeline(data_path, model_combos, aspect ):
    
    # parse data
    train_df = parse_text(data_path)
    # normalize reviews
    train_df["normalize_sentence"] = train_df.sentence.apply(lambda x: normalize(x))
    
    # subdirectories storing all necessary models
    # preprocessing folder
    preprocessing_folder = os.path.join(model_combos, 'preprocessing')
    # load pre-trained classifiers
    aspect_classifier_folder = os.path.join(model_combos, 'aspect_classifier')
    sentiment_classifier_folder = os.path.join(model_combos, 'sentiment_classifier')
     
    # transform text data
    count_vect_tf = 'count_vect_transformer.sav'
    tfidf_tf = 'tfidf_transformer.sav'
    X_train_tfidf  = transform_text(train_df, preprocessing_folder, count_vect_tf, tfidf_tf)
    
    # load coefficients learnt from aspect classifiers
    is_aspect = load_aspect_classifier(aspect_classifier_folder, aspect)
    is_aspect, is_aspect_coef, is_aspect_bias = is_aspect, is_aspect.coef_, is_aspect.intercept_
    
    # build new dataset for sentiment classifier
    aspect_df = get_aspect_vectors(aspect, is_ambience_coef, is_ambience_bias, train_df, X_train_tfidf )
    aspect_train = np.array([list(data) for data in aspect_df.vector])
    
    # load sentiment classifiers
    aspect_model = load_sentiment_classifier(sentiment_classifier_folder, aspect)
    aspect_pred = np.argmax(aspect_model.predict(aspect_train), axis=1)
    polarity_pred = [ind_to_polarity[i] for i in aspect_pred]
    aspect_df["polarity_pred"] = polarity_pred

    output = aspect_df[["sentence","polarity_pred"]]
    
    return output

In [55]:
def main():
    try:
        data_path = str(input('Enter your data path: '))
    except ValueError:
        print ("Not a path.")
        
    try:
        model_combos = str(input("Choose your model combinations: "))
    except ValueError:
        print ("Not a combination.")
        
    try: 
        aspect = str(input("Choose an aspect from 'ambience', 'food', 'service, or 'other': "))
        aspect = "is_" + aspect
    except ValueError:
        print ("Not a valid aspect.")

    output = pipeline(data_path, model_combos, aspect)
    print (output)

if __name__ =="__main__":
    main()

Enter your data path: Annotation/Train_new.csv
Choose your model combinations: models/SVM+DNN
Choose an aspect from 'ambience', 'food', 'service, or 'other': service
Load count_vect_transformer.sav
Load tfidf_transformer.sav
Load is_service_classifier.sav
Load sentiment classifier
                                             sentence polarity_pred
0   -PRON- , there be four of -PRON- , arrive at n...      negative
1   chow fun be dry ; pork shu mai be more than us...      negative
2           can not wait wait for -PRON- next visit .      positive
3   i happen to have a policy that go along with a...      negative
4   the food here be rather good , but only if -PR...      positive
5   the hostess be rude to the point of be offensi...      positive
6     the staff be incredibly helpful and attentive .      negative
7        -PRON- server be very helpful and friendly .      negative
8   wait staff be blantently unappreciative of -PR...      negative
9                          the staff b