In [1]:
import pickle
import re
import xml.etree.ElementTree as ET

import neuralcoref
import nltk
import pandas as pd
import spacy
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import LabelPowerset

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nika\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# One-time setup: uncomment and run the lines below to download the NLTK
# resources used by the lemmatization cells (WordNet and the POS tagger).
# NOTE(review): nltk.word_tokenize is also called later and may additionally
# need the 'punkt' tokenizer models -- confirm on a fresh environment.
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

In [2]:
# Parse the training XML and keep its root element; the parsing cell below
# iterates over the <sentence> children of `root`.
tree = ET.parse('../data/Restaurants_Train.xml')
root = tree.getroot()

# Parsing the Restaurants_Train.xml file
The code below parses the XML file and creates a DataFrame with the following columns: ```text``` (the review sentence), ```terms```, and ```aspects```.

In [3]:
# Build one record per <sentence>: its text, its aspect terms and its aspect
# categories.
labeled_reviews = []
for sentence in root.findall("sentence"):
    aterms = []
    aspects = []

    # Compare against None explicitly: the truth value of an Element reflects
    # whether it has children (not whether it was found), and testing it is
    # deprecated.
    aspect_terms_node = sentence.find("aspectTerms")
    if aspect_terms_node is not None:
        for aterm in aspect_terms_node.findall("aspectTerm"):
            aterms.append(aterm.get("term"))

    aspect_categories_node = sentence.find("aspectCategories")
    if aspect_categories_node is not None:
        for aspect in aspect_categories_node.findall("aspectCategory"):
            aspects.append(aspect.get("category"))

    # The first child of <sentence> is read as the review text.
    entry = {"text": sentence[0].text, "terms": aterms, "aspects": aspects}
    labeled_reviews.append(entry)

labeled_df = pd.DataFrame(labeled_reviews)

print("there are",len(labeled_reviews),"reviews in this training set")

there are 3044 reviews in this training set


In [4]:
# Persist the parsed reviews so later cells can reload them from disk,
# then preview the first rows.
labeled_df.to_pickle("../pickled_files/annotated_reviews_df.pkl")
labeled_df.head()

Unnamed: 0,text,terms,aspects
0,But the staff was so horrible to us.,[staff],[service]
1,"To be completely fair, the only redeeming fact...",[food],"[food, anecdotes/miscellaneous]"
2,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]",[food]
3,Where Gabriela personaly greets you and recomm...,[],[service]
4,"For those that go once and don't enjoy it, all...",[],[anecdotes/miscellaneous]


# Fixing Co-referencing

In [5]:
# Load the spaCy pipeline and register neuralcoref on it; documents produced
# by `nlp` are read through `doc._.coref_resolved` in replace_pronouns.
nlp = spacy.load('en')

neuralcoref.add_to_pipe(nlp)

def replace_pronouns(text):
    """Return `text` with its coreferences resolved by neuralcoref.

    The text is run through the module-level `nlp` pipeline and the
    `coref_resolved` extension attribute of the resulting document is
    returned.
    """
    return nlp(text)._.coref_resolved

In [6]:
# Quick check of replace_pronouns on a sentence that contains a pronoun.
example_coreferencing = replace_pronouns("I drove Joe home because he lives near my apartment.")
example_coreferencing

'I drove Joe home because Joe lives near my apartment.'

In [7]:
# Reload the annotated reviews DataFrame pickled earlier and preview it.

annotated_reviews_df = pd.read_pickle("../pickled_files/annotated_reviews_df.pkl")
annotated_reviews_df.head(3)

Unnamed: 0,text,terms,aspects
0,But the staff was so horrible to us.,[staff],[service]
1,"To be completely fair, the only redeeming fact...",[food],"[food, anecdotes/miscellaneous]"
2,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]",[food]



### The code below fixes co-referencing in reviews and creates new column "fixed_review"


In [8]:
# Resolve coreferences in every review; the result is stored in the new
# "fixed_review" column.
annotated_reviews_df["fixed_review"] = annotated_reviews_df["text"].map(replace_pronouns)

# Lemmatization of the reviews

In [9]:
# Shared lemmatizer instance used by lemmatize_sentence.
lemmatizer = WordNetLemmatizer()

In [10]:
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The tag is matched by prefix: 'J' -> wordnet.ADJ, 'V' -> wordnet.VERB,
    'N' -> wordnet.NOUN, 'R' -> wordnet.ADV. Any other tag yields None.
    """
    prefix_to_constant = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, constant_name in prefix_to_constant:
        if nltk_tag.startswith(prefix):
            # Looked up only on a match, mirroring the original branches.
            return getattr(wordnet, constant_name)
    return None

def lemmatize_sentence(sentence):
    """Lemmatize every token of `sentence` and return them joined by spaces.

    The sentence is tokenized and POS-tagged with NLTK. Tokens whose tag maps
    to a WordNet POS are lemmatized with that POS; all other tokens are kept
    unchanged.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmas = []
    for token, pos in tagged_tokens:
        wn_pos = nltk_tag_to_wordnet_tag(pos)
        # Without a WordNet POS the token is passed through as is.
        lemmas.append(token if wn_pos is None else lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

In [11]:
# Quick check of lemmatize_sentence on a short sentence.
print(lemmatize_sentence("I love playing football."))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nika\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


I love play football .


## Lemmatize Sentence and fix output

In [12]:
# Lemmatize the coreference-resolved text. Reading from "text" here would
# overwrite "fixed_review" and silently discard the pronoun replacement.
annotated_reviews_df["fixed_review"] = annotated_reviews_df["fixed_review"].map(lemmatize_sentence)

In [22]:
def fix_output(text):
    """Undo the token spacing left in a lemmatized sentence.

    * Re-attaches a split negation: "do n't" becomes "don't".
    * Drops the whitespace before , ? . ! or " when that character is
      followed by whitespace or the end of the string.
    """
    rejoined = text.replace(" n't", "n't")
    spaced_punct = r'\s([,?.!"](?:\s|$))'
    return re.sub(spaced_punct, r'\1', rejoined)

In [24]:
# Tidy the spacing left by tokenization/lemmatization using fix_output.
annotated_reviews_df["fixed_review"] = annotated_reviews_df["fixed_review"].apply(fix_output)

In [25]:
# Preview the DataFrame, now including the "fixed_review" column.
annotated_reviews_df.head()

Unnamed: 0,text,terms,aspects,fixed_review
0,But the staff was so horrible to us.,[staff],[service],But the staff be so horrible to us.
1,"To be completely fair, the only redeeming fact...",[food],"[food, anecdotes/miscellaneous]","To be completely fair, the only redeeming fact..."
2,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]",[food],"The food be uniformly exceptional, with a very..."
3,Where Gabriela personaly greets you and recomm...,[],[service],Where Gabriela personaly greets you and recomm...
4,"For those that go once and don't enjoy it, all...",[],[anecdotes/miscellaneous],"For those that go once and don't enjoy it, all..."


In [26]:
# Persist the processed reviews (including "fixed_review") to disk.
annotated_reviews_df.to_pickle("../pickled_files/annotated_reviews_df2.pkl")

# Read the pickled DataFrame back and preview the first rows.

annotated_reviews_df = pd.read_pickle("../pickled_files/annotated_reviews_df2.pkl")
annotated_reviews_df.head(3)

Unnamed: 0,text,terms,aspects,fixed_review
0,But the staff was so horrible to us.,[staff],[service],But the staff be so horrible to us.
1,"To be completely fair, the only redeeming fact...",[food],"[food, anecdotes/miscellaneous]","To be completely fair, the only redeeming fact..."
2,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]",[food],"The food be uniformly exceptional, with a very..."


The code below transforms the categorical "aspects" labels into binary indicator vectors for model training.

The dataset is then split into a training set (75%) and a test set (25%).

In [27]:
# MultiLabelBinarizer turns each review's list of aspects into one row of the
# label matrix `y`; the processed review text is the model input `X`.

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(annotated_reviews_df["aspects"])
X = annotated_reviews_df["fixed_review"]


# Split the data into train (75%) and test (25%) sets; random_state is fixed
# so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

Saving training sets and test sets as pickle files.

In [28]:
# Persist the fitted binarizer and the train/test splits. Each file is opened
# in a `with` block so the handle is flushed and closed as soon as the dump
# finishes instead of being left open.
filename = '../pickled_files/mlb.pkl'
with open(filename, 'wb') as mlb_file:
    pickle.dump(mlb, mlb_file)
X_train.to_pickle("../pickled_files/X_train.pkl")
X_test.to_pickle("../pickled_files/X_test.pkl")

filename = '../pickled_files/y_train.pkl'
with open(filename, 'wb') as y_train_file:
    pickle.dump(y_train, y_train_file)
filename = '../pickled_files/y_test.pkl'
with open(filename, 'wb') as y_test_file:
    pickle.dump(y_test, y_test_file)

# Train best model (Multinomial Naive Bayes) using full data-set

In [29]:
# Train the best model (Multinomial Naive Bayes) using the full data set.
# Pipeline stages: unigram counts with English stop words removed ->
# term-frequency scaling without IDF (use_idf=False) -> LabelPowerset
# wrapper around MultinomialNB.

text_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', LabelPowerset(MultinomialNB(alpha=1e-2))),])

# Fitted on the full X, y -- this includes the rows split off as X_test, so
# scoring this fitted model on X_test would not be a held-out evaluation.
text_clf = text_clf.fit(X, y)

Saving model

In [30]:
# Persist the fitted pipeline; the `with` block closes the file handle once
# the dump has finished.
filename = '../pickled_files/NB_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(text_clf, model_file)