In [1]:
import pandas as pd
import pickle
import spacy
import os

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv('Latihan Soal UAP/imdb-movies-dataset.csv')
model = 'model.pickle'

In [2]:
df.isnull().sum()

Title     4971
Rating    5151
Review    5184
dtype: int64

In [3]:
df = df.dropna()
df.isnull().sum()

Title     0
Rating    0
Review    0
dtype: int64

In [4]:
nlp = spacy.load("en_core_web_sm")
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
stopword = stopwords.words('english')

In [15]:
reviews = []
categories = []
# movie_df =
vectorizer = TfidfVectorizer()
tfidf_matrix = None
NB = None
last_review = 'no review yet'
last_cat = 'no data'

In [6]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    clean_text = [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in tokens
        if word.isalpha() and word not in stopword
    ]
    return " ".join(clean_text)

In [7]:
def load_train_model():
    global NB, vectorizer, df, tfidf_matrix

    if os.path.exists(model):
        with open(model, 'rb') as file:
            NB, vectorizer, df, tfidf_matrix = pickle.load(file)
            print('Model loaded')

    else:
        print('Training new model')
        df = df[['Title', 'Review', 'Rating']].dropna()
        df['Processed'] = df['Review'].apply(preprocess_text)
        df['Label'] = df['Rating'].apply(lambda x: 'positive' if float(x) > 7 else 'negative')

        x_train, _, y_train, _ = train_test_split(df['Processed'], df['Label'], test_size=0.2)
        x_train_vector = vectorizer.fit_transform(x_train)
        NB = MultinomialNB()
        NB.fit(x_train_vector, y_train)

        tfidf_matrix = vectorizer.transform(df['Processed'])

        with open(model, 'wb') as file:
            pickle.dump((NB, vectorizer, df, tfidf_matrix), file)
        print('Model trained and saved')
        

In [58]:
tfidf_matrix

<4786x22579 sparse matrix of type '<class 'numpy.float64'>'
	with 519886 stored elements in Compressed Sparse Row format>

In [8]:
def classify_review(review):
    processed =  preprocess_text(review)
    vector = vectorizer.transform([processed])
    return NB.predict(vector)[0]

In [24]:
def recommen_movies(review):
    if not review:
        print('no reviews')
        return
    
    query_vector = vectorizer.transform([preprocess_text(review)])
    sim = cosine_similarity(query_vector, tfidf_matrix).flatten()
    top = sim.argsort()[-2:][::1]
    print('RECOMMENDED MOVIES')
    for i in top:
        print(f"{df.iloc[i]['Title']} -> {sim[i]:.2f} similarity")

In [57]:
def extract_entitites(review):
    doc = nlp(review)
    entities = {}

    for ent in doc.ents:
        entities.setdefault(ent.label_, set()).add(ent.text.strip())

    print('Named Entities:')
    for label, texts in entities.items():
        print(f'{label}: {", ".join(texts)}')

test = "This English movie is so good, i watched at London"
extract_entitites(test)

Named Entities:
LANGUAGE: English
GPE: London


In [56]:
def write_review():
    global last_review, last_cat
    while True:
        review = input('Write your review (min 20 words): ')
        if len(review.split()) < 20:
            print('Your review must be at leats 20 words')

        else:
            category = classify_review(review)
            reviews.append(review)
            categories.append(category)
            print(review)
            print(f'Your review has been saved and classified as {category}')
            last_review = review
            last_cat = category
            break

In [54]:
def menu():
    while True:
        print('\nMOVIE RECOMMENDATION APPLICATION')
        print(f'Your review: {last_review}')
        print(f'Your category: {last_cat}')
        print('1. Write your review')
        print('2. View movie recommendation')
        print('3. View named entity recognition')
        print('4. Exit')
        choice = input('Select a menu (1-4): ')
        print('Your input:', choice)

        if choice == '1':
            write_review()
        elif choice == '2':
            if not reviews:
                print('No review yet')
            else:
                recommen_movies(reviews[-1])
        elif choice == '3':
            if not reviews:
                print('No review yet')
            else:
                extract_entitites(reviews[-1])
        elif choice == '4':
            print('Exiting...')
            break
        else:
            print('Invalid input')

In [55]:
if __name__ == "__main__":
    load_train_model()
    menu()

Model loaded

MOVIE RECOMMENDATION APPLICATION
Your review: i really like this movie, it reminds me of my dog who passed away last year. my family cried in the middle of the movie, it was so meaningful
Your category: positive
1. Write your review
2. View movie recommendation
3. View named entity recognition
4. Exit
Your input: 1
i watched it in London and it was amazing. this English movie was really meaningful and warm to watch, it really teaches me something about life and thankful for what you have
Your review has been saved and classified as positive

MOVIE RECOMMENDATION APPLICATION
Your review: i watched it in London and it was amazing. this English movie was really meaningful and warm to watch, it really teaches me something about life and thankful for what you have
Your category: positive
1. Write your review
2. View movie recommendation
3. View named entity recognition
4. Exit
Your input: 2
RECOMMENDED MOVIES
The Courier -> 0.18 similarity
White Bird -> 0.23 similarity

MOVIE 