In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import LabelPowerset
import numpy as np
import pickle

In [None]:
# Parse the restaurant training XML and keep its root element; the
# extraction loop further down iterates over `root`.
tree = ET.parse('../data/Restaurants_Train.xml')
root = tree.getroot()

In [2]:


# Build one record per <sentence> element: its text, the aspect terms
# annotated in it, and the aspect categories assigned to it.
labeled_reviews = []
for sentence in root.findall("sentence"):
    # find() returns None when the child is absent. Compare against None
    # explicitly: an Element's truth value reflects whether it has children,
    # and testing it directly is deprecated in ElementTree.
    terms_node = sentence.find("aspectTerms")
    aterms = (
        [aterm.get("term") for aterm in terms_node.findall("aspectTerm")]
        if terms_node is not None
        else []
    )

    categories_node = sentence.find("aspectCategories")
    aspects = (
        [aspect.get("category") for aspect in categories_node.findall("aspectCategory")]
        if categories_node is not None
        else []
    )

    labeled_reviews.append({
        # NOTE(review): assumes the sentence's first child holds the review
        # text -- confirm against the XML schema.
        "text": sentence[0].text,
        "terms": aterms,
        "aspects": aspects,
    })

# One row per sentence, with columns: text, terms, aspects.
labeled_df = pd.DataFrame(labeled_reviews)
print("there are",len(labeled_reviews),"reviews in this training set")

there are 3044 reviews in this training set


In [3]:
# Save annotated reviews to disk; a later cell reads this same pickle back
# before resolving pronouns.
labeled_df.to_pickle("../pickled_files/annotated_reviews_df.pkl")
# Preview the first rows as the cell output.
labeled_df.head()

Unnamed: 0,text,terms,aspects
0,But the staff was so horrible to us.,[staff],[service]
1,"To be completely fair, the only redeeming fact...",[food],"[food, anecdotes/miscellaneous]"
2,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]",[food]
3,Where Gabriela personaly greets you and recomm...,[],[service]
4,"For those that go once and don't enjoy it, all...",[],[anecdotes/miscellaneous]


# Resolving coreferences

In [4]:
import neuralcoref
import spacy
# Load the spaCy pipeline registered under the 'en' shortcut name.
nlp = spacy.load('en')

# Register neuralcoref on the pipeline; replace_pronouns() below relies on
# this when it reads `doc._.coref_resolved`.
neuralcoref.add_to_pipe(nlp)

# Define function for replacing pronouns using neuralcoref
def replace_pronouns(text):
    """Return ``text`` with its coreferences resolved.

    Runs the notebook-level ``nlp`` pipeline (which has neuralcoref added)
    on ``text`` and returns the ``coref_resolved`` extension attribute of
    the resulting document.
    """
    return nlp(text)._.coref_resolved

In [5]:
# Read the annotated reviews dataframe back from the pickle and preview
# the first three rows.
annotated_reviews_df = pd.read_pickle("../pickled_files/annotated_reviews_df.pkl")
annotated_reviews_df.head(3)

Unnamed: 0,text,terms,aspects
0,But the staff was so horrible to us.,[staff],[service]
1,"To be completely fair, the only redeeming fact...",[food],"[food, anecdotes/miscellaneous]"
2,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]",[food]


In [6]:
# Add a `fixed_review` column: each review text passed through replace_pronouns()
annotated_reviews_df["fixed_review"] = annotated_reviews_df["text"].map(replace_pronouns)

In [7]:
# Persist the dataframe under a new file name so the original pickle is
# left untouched.
annotated_reviews_df.to_pickle("../pickled_files/annotated_reviews_df2.pkl")

# Read pickled file with replaced pronouns
annotated_reviews_df = pd.read_pickle("../pickled_files/annotated_reviews_df2.pkl")
annotated_reviews_df.head(3)

Unnamed: 0,text,terms,aspects,fixed_review
0,But the staff was so horrible to us.,[staff],[service],But the staff was so horrible to us.
1,"To be completely fair, the only redeeming fact...",[food],"[food, anecdotes/miscellaneous]","To be completely fair, the only redeeming fact..."
2,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]",[food],"The food is uniformly exceptional, with a very..."


In [8]:
# Fit a MultiLabelBinarizer on the per-review aspect lists and encode them as y;
# the pronoun-resolved review text is the model input X.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(annotated_reviews_df["aspects"])
X = annotated_reviews_df["fixed_review"]

# Hold out 25% of the rows as a test set (75% train); random_state=0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [9]:
# Persist the fitted label binarizer and the train/test split.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes, instead of being left open.
filename = '../pickled_files/mlb.pkl'
with open(filename, 'wb') as pickle_file:
    pickle.dump(mlb, pickle_file)

# X_train / X_test are written with their own to_pickle() method.
X_train.to_pickle("../pickled_files/X_train.pkl")
X_test.to_pickle("../pickled_files/X_test.pkl")

# y_train / y_test are written with the pickle module.
filename = '../pickled_files/y_train.pkl'
with open(filename, 'wb') as pickle_file:
    pickle.dump(y_train, pickle_file)
filename = '../pickled_files/y_test.pkl'
with open(filename, 'wb') as pickle_file:
    pickle.dump(y_test, pickle_file)

# Train Best model (Multinomial Naive Bayes) using full dataset 

In [10]:
# Fit the chosen model (Multinomial Naive Bayes) on the full dataset (X, y),
# not just the training split.
# Steps: unigram counts with English stop words removed -> TfidfTransformer
# with use_idf=False -> MultinomialNB wrapped in LabelPowerset for the
# multi-label target.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words="english", ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LabelPowerset(MultinomialNB(alpha=1e-2))),
]).fit(X, y)

# Saving model

In [11]:
# Serialize the fitted pipeline to disk. The `with` block closes the file
# handle once the dump completes, so the file is fully written.
filename = '../pickled_files/NB_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(text_clf, model_file)