In [46]:
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tag import pos_tag
from nltk.stem import SnowballStemmer,WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import pickle
import spacy
from string import punctuation
from random import shuffle

# Loading Dataset Function

In [47]:
def loadingDataset():
    data = pd.read_csv('Movie Dataset.csv').sample(n=1000)
    data.dropna(inplace=True)
    return data

# Helper Object

In [48]:
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
eng_stop_words = stopwords.words('english')
symbol = punctuation

# Preprocessing Text

In [49]:
def get_label(tag):
    if tag == 'jj':
        return 'a'
    elif tag in ['nn','rb','vb']:
        return tag[0]
    else:
        return None
    
def lemmatizing(word_list):
    tagging  = pos_tag(word_list)
    lemmatized_list = []
    for word, tag in tagging:
        label = get_label(tag)
        if label != None:
            lemmatized_list.append(lemmatizer.lemmatize(word,label))
        else:
            lemmatized_list.append(lemmatizer.lemmatize(word))
    return lemmatized_list

In [50]:
def Preprocessed_text(data): 
    reviews = data['review'].to_list()
    sentiments = data['sentimentScore'].to_list()

    word_list = []

    for sentence in reviews:
        words = word_tokenize(sentence)
        for word in words:
            word_list.append(word.lower())

    word_list = [word.lower() for word in word_list if word not in symbol and word not in eng_stop_words and word.isalpha()]
    word_list = lemmatizing(word_list)

    feature_set = []

    labeled_list = list(zip(reviews,sentiments))

    for sentence, label in labeled_list:
        words = []
        words = word_tokenize(sentence)
        words = [word.lower() for word in words if word not in symbol and word not in eng_stop_words and word.isalpha()]
        words = lemmatizing(words)

        feature = {}

        for word in words:
            feature[word] = (word in word_list)
        feature_set.append((feature,label))

    return feature_set

In [51]:
def modelling():
    data = loadingDataset()
    feature_set = Preprocessed_text(data)
    shuffle(feature_set)
    training_count = int(0.8*len(feature_set))
    training_data = feature_set[:training_count]
    testing_data = feature_set[training_count:]
    
    model = NaiveBayesClassifier.train(training_data)

    file = open('model.pickle','wb')
    pickle.dump(model,file)
    file.close()

    print(model.show_most_informative_features(n=10))
    print(accuracy(model,testing_data))
    return model

In [52]:
def checkModel():
    try:
        file = open('model.pickle','rb')
        model = pickle.load(file)
        file.close()
    except:
        model = modelling()
    return model

In [53]:
def writeReview(model):
    while True:
        review = input('Write your review: ')
        reviews = word_tokenize(review)
        if (len(reviews)>=20):
            break
    category = model.classify(FreqDist(reviews))
    return review,category

In [54]:
def movieRecommendation(data,user_review):
    review = list(data['review'])
    vectorizer  = TfidfVectorizer()
    data_vec = vectorizer.fit_transform(review)
    user_vec = vectorizer.transform([user_review])

    similarities = cosine_similarity(user_vec,data_vec).flatten()
    top_indices = similarities.argsort()[-2:][::-1]

    counter = 1
    for index in top_indices:
        print(f"{counter}: {data.iloc[index]['title']}")
        counter +=1

In [55]:
def NER(data):
    nlp = spacy.load('en_core_web_sm')

    category = {
        'LOC' : set(),
        'LANGUAGE': set()
    }

    for review in data['review']:
        doc = nlp(review)

        for ent in doc.ents:
            if ent.label_ == 'LOC':
                category['LOC'].add(ent.text)
            elif ent.label_ == 'LANGUAGE':
                category['LANGUAGE'].add(ent.text)
            else:
                pass
    
    for label, text in category.items():
        print(f"{label}: {', '.join(text)}")

In [56]:
user_review = 0
category = 0

model = checkModel()
data = loadingDataset()

while True:
    print('MOVIE RECOMMENDATION APPLICATION BASED ON REVIEWS')
    print(f"YOUR REVIEW: {'NO REVIEW' if user_review == 0 else user_review}")
    print(f"YOUR REVIEW CATEGORY: {'UNKNOWN' if category == 0 else category}")
    print('1. WRITE YOUR REVIEW')
    print('2. MOVIE RECOMMENDATION')
    print('3. NER')
    print('4. EXIT')
    option = int(input('>> '))

    if option == 1:
        user_review, category = writeReview(model)
    elif option == 2:
        movieRecommendation(data,user_review)
    elif option == 3:
        NER(data)
    elif option == 4:
        break
    

MOVIE RECOMMENDATION APPLICATION BASED ON REVIEWS
YOUR REVIEW: NO REVIEW
YOUR REVIEW CATEGORY: UNKNOWN
1. WRITE YOUR REVIEW
2. MOVIE RECOMMENDATION
3. NER
4. EXIT
LOC: The Big Short, Berry and Stone, Northern California, Wind River, Asia, Hudson, Paradise, Soho, The Wild Bunch&#44
LANGUAGE: English
MOVIE RECOMMENDATION APPLICATION BASED ON REVIEWS
YOUR REVIEW: NO REVIEW
YOUR REVIEW CATEGORY: UNKNOWN
1. WRITE YOUR REVIEW
2. MOVIE RECOMMENDATION
3. NER
4. EXIT


ValueError: invalid literal for int() with base 10: ''