In [20]:
import os
import pickle # buat create model kita
import string # buat ilangin tanda baca
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from random import shuffle # buat ngacak urutan dari dataset kita
# nltk.download('punkt') # biar stemmer udah keload bahasanya
# nltk.download('stopwords') # buat stopwordsnya
import spacy  # Import spaCy for NER
# Load spaCy's English model
#python -m spacy download en_core_web_sm

In [21]:
nlp = spacy.load('en_core_web_sm')
stop_words = set(stopwords.words('english'))
punc_list = set(string.punctuation)
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

In [22]:
def preprocess(text):
    word_list = word_tokenize(text.lower())
    word_list = [word for word in word_list if word not in stop_words]
    word_list = [word for word in word_list if word not in punc_list]
    word_list = [word for word in word_list if word.isalpha()]
    word_list = [stemmer.stem(word) for word in word_list]
    word_list = [lemmatizer.lemmatize(word) for word in word_list]
    
    # freq_dist = FreqDist(word_list)
    # words_features = [word for word, freq in freq_dist.most_common(100)]

    return {word: True for word in word_list}

def training_model(dataset):
    features_set = [(preprocess(review), label) for title, review, label in zip(dataset['title'], dataset['review'], dataset['sentimentScore'])]
    
    shuffle(features_set)
    split_index = int(len(features_set)*0.8)
    train_set = features_set[:split_index]      
    test_set = features_set[split_index:]
    
    classifier = NaiveBayesClassifier.train(train_set)
    accuracy_train = accuracy(classifier, test_set)
    print(f"Accuracy: {accuracy_train*100}%")
    
    file = open('model.pickle', 'wb')
    pickle.dump(classifier, file)
    file.close()

    return classifier

In [23]:
def load_model(dataset):
    if os.path.exists('model.pickle'):
        file = open('model.pickle', 'rb')
        model = pickle.load(file)
        file.close()
    else:
        model = training_model(dataset)
    return model

In [29]:
def movie_recommendation(query, dataset):
    vectorizer = TfidfVectorizer()
    title_list = list(dataset['title'])
    review_list = list(dataset['review'])
    
    matrix = vectorizer.fit_transform(review_list)
    query_v = vectorizer.transform([query])
    
    similarities = cosine_similarity(query_v, matrix)
    print(similarities.argsort()[-2:][::-1])
    data = {'Title': title_list, 'Review': review_list, 'Similarity': similarities[0]}
    df = pd.DataFrame(data)
    df_sorted = df.sort_values(by='Similarity', ascending=False).reset_index(drop=True)
    
    top_two_movies = []
    
    for index, row in df_sorted.iterrows():
        title = row['Title']
        
        if title not in top_two_movies:
            top_two_movies.append(title)
        
        if len(top_two_movies) == 2:
            break
    
    for movie in top_two_movies:
        print(movie)


In [25]:
def categorize_named_entities(dataset):
    all_entities = {
        "LOC": set(),
        "LANGUAGE": set()
    }
    
    for review in dataset['review']:
        doc = nlp(review)
        for ent in doc.ents:
            if ent.label_ == "LOC":
                all_entities["LOC"].add(ent.text)
            elif ent.label_ == "LANGUAGE":
                all_entities["LANGUAGE"].add(ent.text)
    
    
    print("Summary of Named Entities:")
    for label, entities in all_entities.items():
        entity = ", ".join(entities)
        print(f"{label}: {entity}")

In [30]:
dataset = pd.read_csv('Movie Dataset.csv').sample(n=1000)
dataset.dropna(inplace=True)
classifier = load_model(dataset)

review = "this is my honest review, its actually good"
review_tokenized = word_tokenize(review)
category = classifier.classify(FreqDist(review_tokenized))
print("YOUR REVIEW : ", review)
print("YOUR REVIEW CATEGORY : ", category, end="\n\n")

categorize_named_entities(dataset)
movie_recommendation(review, dataset)

YOUR REVIEW :  this is my honest review, its actually good
YOUR REVIEW CATEGORY :  POSITIVE

Summary of Named Entities:
LOC: Africa, Fantasy Island, Bay, Hippocratic, the Middle East, Rock Hudson, Red Planet, the North Pole, Isle of Dogs
LANGUAGE: Hebrew
[[784 750 410 409 408 751 405 753 754 401 399 398 397 391 413 758 387 760
  385 384 383 381 764 766 377 376 373 769 371 388 369 414 749 463 462 461
  460 458 457 909 453 735 736 448 447 737 416 738 739 740 438 437 742 430
  429 428 743 746 747 420 418 442 733 367 774 797 800 804 299 298 297 296
  295 290 807 808 809 813 307 283 815 279 816 274 273 272 819 267 266 263
  262 259 820 281 361 795 310 355 354 777 778 351 350 348 347 346 341 339
  338 782 309 783 333 330 659 328 785 325 323 320 319 790 315 313 793 334
  257 729 468 608 677 678 605 680 603 601 600 681 598 597 596 595 609 594
  590 589 588 686 687 581 578 577 691 573 571 568 567 683 699 675 616 656
  655 662 653 650 648 646 645 644 643 641 664 665 615 638 636 635 634 633
  632