In [3]:
import pandas as pd
import pickle
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from random import shuffle
from string import punctuation
import spacy


In [5]:
en_stop_words = stopwords.words('english')
nlp = spacy.load('en_core_web_sm')
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
symbol = punctuation

In [6]:
def loadingDataset():
    data = pd.read_csv('Movie Dataset.csv').sample(n=1000)
    data.dropna(inplace=True)
    return data

In [7]:
def get_label(tag):
    if tag == 'jj':
        return 'a'
    elif tag in ['nn','rb','vb']:
        return tag[0]
    else:
        return None

In [8]:
def lemmatizing(word_list):
    tagging = pos_tag(word_list)
    lemmatized_list = []

    for word, tag in tagging:
        label = get_label(tag)
        if label!= None:
            lemmatized_list.append(lemmatizer.lemmatize(word,label))
        else:
            lemmatized_list.append(lemmatizer.lemmatize(word))
    return lemmatized_list

In [9]:
def preprocessingText(data):
    reviews = data['review'].to_list()
    sentiments = data['sentimentScore'].to_list()

    word_list = []
    for sentence in reviews:
        words = word_tokenize(sentence)
        for word in words:
            word_list.append(word.lower())
    
    word_list = [word.lower() for word in word_list if word not in en_stop_words and word not in symbol and word.isalpha()]
    word_list = lemmatizing(word_list)

    feature_set = []

    labeled_list = list(zip(reviews,sentiments))

    for sentence, label in labeled_list:
        words = []
        words = word_tokenize(sentence)
        words = [word.lower() for word in words if word not in en_stop_words and word not in symbol and word.isalpha()]
        words = lemmatizing(words)

        feature = {}

        for word in words:
            feature[word] = (word in word_list)
        feature_set.append((feature,label))
    
    return feature_set

    

In [10]:
def modelling():
    data = loadingDataset()
    feature_set = preprocessingText(data)
    shuffle(feature_set)

    count = int(0.8*len(feature_set))
    train_data = feature_set[:count]
    test_data = feature_set[count:]

    model = NaiveBayesClassifier.train(train_data)
    file = open('model.pickle','wb')
    pickle.dump(model,file)
    file.close()
    
    print(model.show_most_informative_features(n=10))
    print(accuracy(model,test_data))


    return model

In [12]:
def get_model():
    try:
        file = open('model.pickle','rb')
        model = pickle.load(file)
        file.close()
    except:
        model = modelling()
    return model

In [14]:
def writeReview(model):
    while True:
        review = input('WRITE YOUR REVIEW: ')
        reviews = word_tokenize(review)
        if len(reviews) > 20:
            break
    category = model.classify(FreqDist(reviews))
    return review,category


In [18]:
def movieRecommendation(data,user_review):
    data_reviews = list(data['review'])
    
    vectorizer = TfidfVectorizer()
    data_vec = vectorizer.fit_transform(data_reviews)
    user_vec = vectorizer.transform([user_review])

    similarity = cosine_similarity(data_vec,user_vec).flatten()
    top_indices = similarity.argsort()[-2:][::-1]

    counter = 1

    for index in top_indices:
        print(f"{counter}: {data.iloc[index]['title']}")
        counter+=1


In [19]:
def NER(data):
    reviews = list(data['review'])

    doc = nlp(str(reviews))

    category = {}

    for ent in doc.ents:
        if ent.label_ not in category:
            category[ent.label_] = []
        category[ent.label_].append(ent.text)

    for label,text in category.items():
        print(f"{label}: {', '.join(text)}")

In [20]:
user_review = 0
category = 0

model = get_model()
data = loadingDataset()

while True:
    print('MOVIE RECOMMENDATION MENU')
    print(f"YOUR REVIEW: {'NO REVIEW' if user_review == 0 else user_review}")
    print(f"CATEGORY: {'UNKNOWN' if category == 0 else category}")
    print('1. WRITE REVIEW')
    print('2. MOVIE RECOMMENDATION')
    print('3. NER')
    option = int(input('>> '))
    if option == 1:
        user_review, category = writeReview(model)
    elif option == 2:
        movieRecommendation(data,user_review)
    elif option == 3:
        NER(data)        



MOVIE RECOMMENDATION MENU
YOUR REVIEW: NO REVIEW
CATEGORY: UNKNOWN
1. WRITE REVIEW
2. MOVIE RECOMMENDATION
3. NER
WORK_OF_ART: 'More, Green Hornet, The House With a Clock, The Devil Made Me Do, 'Getting, The Chronicles of Narnia: the Lion, Despite Nolfi's, Snow White and the Huntsman, Thor: The Dark World, Beauty and the Beast, 'Lost World, Fascinatingly, Catch Me If You Can resembles, For Your Consideration, Hangover, LL Cool J, Due Date, 'The Chadwick Boseman&#47;T&#8217;Challa, 'The Conjuring 2, Kicking and Screaming, Pleasant Surprises Abound, Hidden Figures, Bad News Bears, 'Thor: Ragnarok, This Is Where I Leave You drops us, If Night of the Living Dead, The Descent, The Shining, 'As Fantine&#44, Although Valdemort, The Green Knight, Tarantino, 'Black Adam, The Light Between Oceans, Mary and the Witch's Flower, 'Shame', 'In The Mood For Love, 'Plays, Squish" and "Aargh, Depp's Wonka, 'The Darkest Hour, Hidden Figures, Witherspoon, The Founder, Exodus: Gods and Kings, Snow White an

ValueError: invalid literal for int() with base 10: ''