In [1]:
import nltk
import string
import pandas as pd
import pickle
from random import shuffle

from nltk import FreqDist
from nltk.classify import accuracy
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import spacy
import spacy.cli
from collections import defaultdict

In [2]:
# Fetch the NLTK resources this notebook relies on (each call is a no-op when
# the package is already cached locally):
# - "wordnet" backs WordNetLemmatizer; without it lemmatize() raises
#   LookupError on a machine that has never downloaded the corpus.
# - "punkt" backs word_tokenize and "stopwords" backs stopwords.words().
nltk.download("wordnet")
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\norbe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\norbe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Shared text-normalisation helpers reused by the cells below.
stemmer = PorterStemmer()
wnl = WordNetLemmatizer()

# Stored as a set: the stop-word membership test runs once per token of every
# review, and a set lookup is O(1) whereas scanning the original list is O(n).
eng_stopwords = set(stopwords.words("english"))

# Review corpus; later cells read its "restaurant", "text" and "label" columns.
dataset = pd.read_csv("updated_dataset.csv")
dataset.head()

Unnamed: 0,restaurant,text,label
0,Restaurant A,"Contrary to other reviews, I have zero complai...",positive
1,Restaurant B,Last summer I had an appointment to get new ti...,negative
2,Restaurant C,"Friendly staff, same starbucks fair you get an...",positive
3,Restaurant D,The food is good. Unfortunately the service is...,negative
4,Restaurant E,Even when we didn't have a car Filene's Baseme...,positive


In [4]:
def preprocessing(doc):
    """Turn a raw review into the bag-of-words feature dict used by the classifier.

    The text is lower-cased and tokenized. Tokens that are not purely
    alphabetic, or that are English stop words, are dropped; the survivors are
    lemmatized and then stemmed.

    Args:
        doc: Review text (str).

    Returns:
        dict mapping every remaining stem to ``True``.
    """
    features = {}
    for token in word_tokenize(doc.lower()):
        # Filter on the raw token. Stop words have to be matched before
        # normalisation: once lemmatized/stemmed, forms such as "thi" (this)
        # or "wa" (was) no longer appear in the stop-word list and would leak
        # into the features. isalpha() also rejects punctuation and numbers.
        if not token.isalpha() or token in eng_stopwords:
            continue

        stem = stemmer.stem(wnl.lemmatize(token))

        # Also drop tokens whose normalised form collapses onto a stop word.
        if stem not in eng_stopwords:
            features[stem] = True

    return features

In [5]:
def trainModel():
    """Train a Naive Bayes sentiment classifier on ``dataset`` and persist it.

    Every row's text is converted to features with ``preprocessing`` and paired
    with its label. The pairs are shuffled, the first 85% are used for
    training and the remainder for measuring accuracy. The accuracy and the
    five most informative features are printed, and the classifier is pickled
    to ``model.pickle``.

    Returns:
        The trained ``nltk.NaiveBayesClassifier``, so callers can use the model
        straight away instead of receiving ``None``.
    """
    feature_sets = [
        (preprocessing(text), label)
        for text, label in zip(dataset['text'], dataset['label'])
    ]

    shuffle(feature_sets)

    # 85/15 train/test split.
    idx = int(len(feature_sets) * 0.85)
    train_set, test_set = feature_sets[:idx], feature_sets[idx:]

    classifier = nltk.NaiveBayesClassifier.train(train_set)

    acc = accuracy(classifier, test_set)
    print(acc)

    classifier.show_most_informative_features(5)

    # The context manager closes the file even if pickling fails part-way.
    with open('model.pickle', "wb") as file:
        pickle.dump(classifier, file)

    return classifier

In [6]:
def readModel():
    """Return the sentiment classifier, loading it from ``model.pickle`` if possible.

    When the file is missing, unreadable or cannot be unpickled, a new model is
    trained via ``trainModel`` instead. After a successful load the five most
    informative features are printed.

    Returns:
        The classifier object (never ``None``).
    """
    try:
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file. Only load a model.pickle produced by this notebook.
        with open('model.pickle', "rb") as file:
            classifier = pickle.load(file)
    except (OSError, pickle.UnpicklingError, EOFError,
            AttributeError, ImportError, IndexError):
        # Only load failures trigger a retrain; KeyboardInterrupt and
        # unrelated bugs are no longer swallowed by a bare except.
        classifier = trainModel()
        if classifier is None:
            # trainModel always writes model.pickle; if it handed nothing
            # back, read the file it just produced rather than return None.
            with open('model.pickle', "rb") as file:
                classifier = pickle.load(file)
    else:
        classifier.show_most_informative_features(5)

    return classifier

In [7]:
def write():
    """Prompt on stdin until the user enters a review of at least two words.

    Words are whitespace-separated. Each rejected entry prints a short notice
    before the prompt is shown again.

    Returns:
        The accepted review exactly as typed.
    """
    review = input(">=2")

    # Keep asking while the entry has fewer than two whitespace-separated words.
    while len(review.split()) <= 1:
        print("bruh")
        review = input(">=2")

    return review

In [19]:
def analyze(classifier, review):
    """Classify the sentiment of ``review`` and print the result.

    Args:
        classifier: Object exposing ``classify(featureset)``, e.g. the model
            returned by ``readModel``.
        review: Review text (str).

    Returns:
        The label produced by ``classifier.classify``, or ``""`` when there is
        no review text to analyse (empty or whitespace-only).
    """
    if not review or not review.strip():
        print("no review")
        return ""

    # Build the features with the same pipeline used for training. The model
    # is trained on preprocessing() output: lower-cased text and {stem: True}
    # features. Tokenizing the raw-cased text let capitalised stop words
    # ("The") through, and counting with FreqDist gave repeated words values
    # such as 2, which never occur in the training data.
    features = preprocessing(review)

    result = classifier.classify(features)
    print(f"Classification: {result}")

    return result

In [24]:
def rec(review, min_similarity=0.3, top_n=10):
    """Recommend restaurants whose reviews are textually similar to ``review``.

    A TF-IDF model is fitted on ``dataset["text"]``; the query review is
    projected into the same space and compared to every dataset review by
    cosine similarity.

    Args:
        review: Query review text (str).
        min_similarity: Only rows with similarity strictly greater than this
            value are kept. Defaults to 0.3.
        top_n: Maximum number of rows to return. Defaults to 10.

    Returns:
        DataFrame with columns ``restaurant`` and ``similarity``, sorted by
        descending similarity, re-indexed from 0. It is empty when nothing
        exceeds the threshold.
    """
    vectorizer = TfidfVectorizer()
    doc_matrix = vectorizer.fit_transform(dataset["text"])
    query_vec = vectorizer.transform([review])

    # cosine_similarity yields one row per document and a single column for
    # the query; flatten it to one score per document.
    scores = cosine_similarity(doc_matrix, query_vec).ravel()

    scored = pd.DataFrame({
        "restaurant": dataset["restaurant"],
        "similarity": scores,
    })

    # Named `matches`, not `rec`, so the local does not shadow this function.
    matches = scored[scored["similarity"] > min_similarity]
    top_matches = matches.sort_values(by="similarity", ascending=False).head(top_n)

    return top_matches.reset_index(drop=True)

In [10]:
# Load the small English pipeline. spacy.load raises OSError when the model
# package is not installed, so on a fresh environment download it once and
# retry instead of failing the whole notebook run.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    spacy.cli.download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

ner_labels = nlp.get_pipe('ner').labels

# List every entity label the NER component can emit, with spaCy's description.
for label in ner_labels:
    print(f"{label}: {spacy.explain(label)}")

CARDINAL: Numerals that do not fall under another type
DATE: Absolute or relative dates or periods
EVENT: Named hurricanes, battles, wars, sports events, etc.
FAC: Buildings, airports, highways, bridges, etc.
GPE: Countries, cities, states
LANGUAGE: Any named language
LAW: Named documents made into laws.
LOC: Non-GPE locations, mountain ranges, bodies of water
MONEY: Monetary values, including unit
NORP: Nationalities or religious or political groups
ORDINAL: "first", "second", etc.
ORG: Companies, agencies, institutions, etc.
PERCENT: Percentage, including "%"
PERSON: People, including fictional
PRODUCT: Objects, vehicles, foods, etc. (not services)
QUANTITY: Measurements, as of weight or distance
TIME: Times smaller than a day
WORK_OF_ART: Titles of books, songs, etc.


In [28]:
def ner():
    """Print the distinct GPE, ORG and LANGUAGE entities found in the dataset reviews.

    Each review in ``dataset['text']`` is run through the spaCy pipeline
    ``nlp``. Entity texts are grouped by label, de-duplicated, and printed as
    one alphabetically sorted, comma-separated line per label.
    """
    wanted_labels = ("GPE", "ORG", "LANGUAGE")
    found = defaultdict(set)

    for text in dataset['text']:
        for entity in nlp(text).ents:
            entity_label = entity.label_
            if entity_label in wanted_labels:
                found[entity_label].add(entity.text)

    print("CNE:")
    for entity_label, names in found.items():
        joined_names = ', '.join(sorted(names))
        print(f"{entity_label}: {joined_names}")


In [25]:
# Interactive console menu: write a review, classify it, get recommendations,
# list named entities, or exit.
if __name__ == "__main__":
    classifier = readModel()

    # Current review and its sentiment; empty strings mean "not set yet".
    review = ""
    sentiment = ""

    while True:
        print("Review: ", "No Review" if len(review) == 0 else review)
        print("Sentiment: ", "None" if len(sentiment) == 0 else sentiment)
        print("1. Write")
        print("2. an")
        print("3. rec")
        print("4. NER")
        print("5. Exit")

        # int() raises ValueError on non-numeric input (including an empty
        # line); treat that like any other invalid choice instead of letting
        # it crash the menu.
        try:
            choice = int(input(">>"))
        except ValueError:
            print("Choose")
            continue

        if choice == 1:
            review = write()
            # The stored sentiment belonged to the previous review; clear it
            # so the header never shows a label next to text it was not
            # computed from.
            sentiment = ""
        elif choice == 2:
            sentiment = analyze(classifier, review)
        elif choice == 3:
            if review:
                recommendations = rec(review)
                print("Recommendations!")
                if recommendations.empty:
                    print("No")
                else:
                    for idx, row in recommendations.iterrows():
                        print(f"{idx+1}: {row['restaurant']}")
                        print(f"Similarity: {row['similarity']}")
            else:
                print("write a review")
        elif choice == 4:
            ner()
        elif choice == 5:
            break
        else:
            print("Choose")

Most Informative Features
                 perfect = True           positi : negati =     14.2 : 1.0
                 terribl = True           negati : positi =     13.1 : 1.0
                 horribl = True           negati : positi =     10.6 : 1.0
                  receiv = True           negati : positi =      8.0 : 1.0
                 disgust = True           negati : positi =      6.7 : 1.0
Review:  No Review
Sentiment:  None
1. Write
2. an
3. rec
4. NER
5. Exit
Review:  Contrary to other reviews, I have zero complaints about the service or the prices. I have been getting tire service here for the past 5 years now, and compared to my experience with places like Pep Boys, these guys are experienced and know what they're doing. \nAlso, this is one place that I do not feel like I am being taken advantage of, just because of my gender. Other auto mechanics have been notorious for capitalizing on my ignorance of cars, and have sucked my bank account dry. But here, my service and road