In [None]:
import pickle
import random
import string

import pandas as pd
import spacy
from IPython.display import clear_output
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import nltk

# NLTK resources fetched up front: English stopword list, punkt tokenizer
# data, the averaged-perceptron POS tagger and WordNet.
for package in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
  nltk.download(package)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
def enter_to_continue():
    """Pause until the user presses Enter; whatever was typed is discarded."""
    _ = input("Press Enter to continue...")

In [88]:
# Shared notebook state, accessed through `global` statements in the functions below.
classifier = None  # trained sentiment model; assigned by main()
review = ""        # last accepted review text; assigned by write_review()
category = ""      # label predicted for `review`; assigned by write_review()

def load_dataset(path="/content/drive/MyDrive/Colab Notebooks/NLP_Lab_Quiz/movie-review.csv"):
  """Read the movie-review CSV into a DataFrame.

  Args:
    path: Location of the CSV file. Defaults to the Google Drive path this
      notebook was written against, so existing no-argument calls are unchanged.

  Returns:
    The pandas DataFrame produced by ``pd.read_csv``.
  """
  return pd.read_csv(path)

def get_tag(tag):
  """Translate a part-of-speech tag into a single-letter WordNet POS code.

  Matching is case-insensitive and prefix-based, so the uppercase Penn
  Treebank tags produced by ``pos_tag`` (``JJ``, ``NNS``, ``VBD``, ``RBR``, ...)
  are recognised as well as the lowercase short forms handled originally.

  Args:
    tag: POS tag string (may be empty or None).

  Returns:
    'a' for adjectives, 'n' for nouns, 'v' for verbs, 'r' for adverbs, or
    None when the tag has no such mapping.
  """
  if not tag:
    return None

  normalized = tag.lower()
  # 'j' alone is kept for backward compatibility with the original short form.
  if normalized == 'j' or normalized.startswith('jj'):
    return 'a'
  if normalized.startswith(('nn', 'vb', 'rb')):
    return normalized[0]
  return None

def preprocess(word_list):
  """Filter, stem and lemmatize a list of word tokens.

  Steps, in order: drop English stopwords, drop punctuation, drop tokens that
  are not purely alphabetic, Porter-stem the remainder, POS-tag the stems and
  lemmatize each stem with the WordNet POS derived from its tag.

  Args:
    word_list: List of token strings.

  Returns:
    A new list with the processed tokens.
  """
  # Build the stopword set once; the original re-read the stopword list for
  # every token because the call sat inside the comprehension's condition.
  stop_words = set(stopwords.words('english'))
  word_list = [
      word for word in word_list
      if word not in stop_words
      and word not in string.punctuation
      and word.isalpha()
  ]

  stemming = PorterStemmer()
  word_list = [stemming.stem(word) for word in word_list]

  tagging = pos_tag(word_list)
  wnl = WordNetLemmatizer()
  # get_tag() returns None for tags without a WordNet equivalent; lemmatize()
  # expects a POS string, so fall back to its documented default, noun.
  word_list = [wnl.lemmatize(word, pos=get_tag(tag) or 'n') for word, tag in tagging]

  return word_list

In [87]:
def train_model(sample_size=3000, train_ratio=0.8):
  """Train a Naive Bayes sentiment classifier on a sample of the reviews.

  Each sampled review is tokenized, lowercased and passed through
  ``preprocess``. The vocabulary is the union of all processed tokens, and
  every review becomes a ``{word: bool}`` presence dictionary over that
  vocabulary, labelled with its ``sentimentScore`` value. The trained model is
  pickled to ``model.pickle`` in the working directory.

  Args:
    sample_size: Maximum number of rows drawn from the dataset.
    train_ratio: Fraction of the shuffled feature sets used for training; the
      rest is held out for the printed accuracy.

  Returns:
    The trained ``NaiveBayesClassifier``.
  """
  df = load_dataset()
  # DataFrame.sample raises when n exceeds the row count, so cap it.
  df = df.sample(n=min(sample_size, len(df)))

  review_list = [str(text) for text in df['review'].to_list()]
  sentiment_list = [str(label) for label in df['sentimentScore'].to_list()]

  # Preprocess every review once and keep the result as a set, so the same
  # tokens feed both the vocabulary and the membership tests below.
  processed_reviews = [
      set(preprocess([token.lower() for token in word_tokenize(text)]))
      for text in review_list
  ]

  vocabulary = set().union(*processed_reviews)

  feature_sets = [
      ({word: word in tokens for word in vocabulary}, label)
      for tokens, label in zip(processed_reviews, sentiment_list)
  ]

  random.shuffle(feature_sets)

  # Hold out part of the data; training on everything left the test set empty.
  train_count = int(len(feature_sets) * train_ratio)
  train_dataset = feature_sets[:train_count]
  test_dataset = feature_sets[train_count:]

  # NLTK builds a classifier from labelled feature sets via the train()
  # class method; the constructor expects probability distributions instead.
  classifier = NaiveBayesClassifier.train(train_dataset)
  print(f"Accuracy : {accuracy(classifier, test_dataset)}")

  with open('model.pickle', 'wb') as file:
    pickle.dump(classifier, file)

  return classifier

In [None]:
def print_menu():
  """Print the main menu with the current review and category, then prompt.

  Reads the module-level ``review`` and ``category`` without modifying them;
  a placeholder is shown for whichever one is still an empty string.

  Returns:
    The raw string the user typed at the ``>> `` prompt.
  """
  # The original assigned the placeholders back into the globals, wiping the
  # stored review on every menu draw; here the globals are only read.
  display_review = review if review != "" else "No review"
  display_category = category if category != "" else "None"

  print("Movie Recommendation Based on reviews")
  print(f"Your review : {display_review}")
  print(f"Review category ; {display_category}")
  print("1. Input review")
  print("2. View movie recommendation")
  print("3. view ner")
  print("4. Exit")
  choice = input(">> ")
  return choice

In [92]:
def write_review():
  """Prompt for a review, store it and classify its sentiment.

  Reviews shorter than 20 whitespace-separated words are rejected. An accepted
  review is saved in the global ``review``; its predicted label is saved in
  the global ``category`` and printed.
  """
  global review
  global category

  clear_output()

  print("Input your review")
  review_input = input(">> ")

  # split() with no argument collapses runs of whitespace, so repeated spaces
  # are not counted as extra words the way split(' ') counted them.
  if len(review_input.split()) < 20:
    print("review must atleast contain 20 words")
    enter_to_continue()
    return

  review = review_input

  # The classifier is assigned by main(); look it up defensively so running
  # this cell first gives a message instead of a NameError/AttributeError.
  model = globals().get('classifier')
  if model is None:
    print("Classifier is not loaded yet, run main() first")
    enter_to_continue()
    return

  words = preprocess([token.lower() for token in word_tokenize(review_input)])

  # train_model() uses boolean word-presence features, so build the same kind
  # of feature here rather than FreqDist counts.
  feature = {word: True for word in words}
  category = model.classify(feature)

  print(f"Review Classified as {category}")

  enter_to_continue()


In [93]:
def view_movie_recommendation():
  """Print the three movie titles whose reviews are most similar to the user's.

  Fits a TF-IDF vectorizer on every review in the dataset, projects the global
  ``review`` into the same space and ranks the dataset reviews by cosine
  similarity. Nothing is ranked when no review has been entered yet.
  """
  clear_output()

  # Only read here, so no `global` statement is needed.
  query = review
  if not query.strip():
    # An empty query has zero similarity to everything; ranking it is meaningless.
    print("Please input a review first")
    enter_to_continue()
    return

  df = load_dataset()

  review_list = [str(text) for text in df['review'].to_list()]
  title_list = [str(title) for title in df['title'].to_list()]

  vectorizer = TfidfVectorizer()
  matrix = vectorizer.fit_transform(review_list)

  query_matrix = vectorizer.transform([query])
  cosine_similarities = cosine_similarity(query_matrix, matrix).flatten()

  # argsort is ascending; reverse it and keep the three best matches.
  related_docs_indices = cosine_similarities.argsort()[::-1][:3]

  for i, idx in enumerate(related_docs_indices):
    print(f"{i+1}. {title_list[idx]}")
  enter_to_continue()

In [89]:
# def view_ner():
#     clear_output()

#     df = load_dataset().sample(n=3000)
#     reviews = df['review'].to_string()

#     spacy_nlp = spacy.load('en_core_web_sm')
#     doc = spacy_nlp(reviews)

#     categories = {}

#     for ent in doc.ents:
#         label = ent.label_

#         if label not in ['LANGUAGE', 'GPE']:
#             continue

#         if label not in categories:
#             categories[label] = []

#         categories[label].append(ent.text)

#     for label, entities in categories.items():
#         print(f"{label}: {', '.join(entities)}")

#     if len(categories) == 0:
#         print("No entities found")

#     enter_to_continue()

def view_ner():
  """Print the LANGUAGE and GPE entities spaCy finds in a sample of reviews.

  Up to 3000 reviews are sampled from the dataset and run through the
  ``en_core_web_sm`` pipeline. Entity texts are grouped by label and printed
  one label per line; a notice is printed when nothing matched.
  """
  clear_output()

  df = load_dataset()
  # DataFrame.sample raises when n exceeds the row count, so cap it.
  df = df.sample(n=min(3000, len(df)))
  reviews = [str(text) for text in df['review'].to_list()]

  spacy_nlp = spacy.load('en_core_web_sm')

  categories = {}

  # Process each review as its own document. Series.to_string() would mix the
  # index labels into the text and build one very large string, which can
  # exceed spaCy's per-document length limit.
  for doc in spacy_nlp.pipe(reviews):
    for ent in doc.ents:
      label = ent.label_

      if label not in ('LANGUAGE', 'GPE'):
        continue

      categories.setdefault(label, []).append(ent.text)

  for label, entities in categories.items():
    print(f"{label} : {', '.join(entities)}")

  if len(categories) == 0:
    print("No entities found")
  enter_to_continue()

In [82]:
def main():
    """Load (or train) the classifier, then run the interactive menu loop.

    The global ``classifier`` is read from ``model.pickle`` when that file
    exists and trained with ``train_model()`` otherwise. The menu repeats
    until the user picks option 4; unrecognised choices redraw the menu.
    """
    global classifier

    try:
        # `with` closes the file even if unpickling fails.
        with open('model.pickle', 'rb') as file:
            # NOTE(security): pickle.load can execute arbitrary code. Only load
            # a model.pickle that this notebook's train_model() produced.
            classifier = pickle.load(file)
    except FileNotFoundError:
        classifier = train_model()

    while True:
        clear_output()
        choice = print_menu()
        if choice == '1':
            write_review()
        elif choice == '2':
            view_movie_recommendation()
        elif choice == '3':
            view_ner()
        elif choice == '4':
            break

    print("Goodbye!")

In [None]:
# main()