In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from nltk import NaiveBayesClassifier, classify, accuracy
import string
import random
import nltk
import pickle

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-

True

# 1. Preprocess the Data

In [2]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('iphone_se_review.csv')

print(df.columns)

# Extract the relevant columns
df = df[['Ratings', 'Reviews;;;;']]

# Convert the 'Ratings' column to numeric values
df['Ratings'] = pd.to_numeric(df['Ratings'], errors='coerce')

# Separate positive and negative reviews
positive_reviews = df[df['Ratings'] >= 4]['Reviews;;;;'].tolist()
negative_reviews = df[df['Ratings'] < 4]['Reviews;;;;'].tolist()

# Write the reviews to separate files
with open('positive.txt', 'w', encoding='utf-8') as pos_file:
    for review in positive_reviews:
        pos_file.write(review + '\n')

with open('negative.txt', 'w', encoding='utf-8') as neg_file:
    for review in negative_reviews:
        neg_file.write(review + '\n')


Index(['Ratings', 'Comment', 'Reviews;;;;'], dtype='object')


In [3]:
# Load the reviews from files
positive = open("positive.txt", "r", encoding="utf-8").read()
negative = open("negative.txt", "r", encoding="utf-8").read()

# 2. Stopwords, Lemmatizing, Stemming

In [4]:
# Preprocess the text
stopwords_list = stopwords.words("english")
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [5]:
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.lower() not in stopwords_list]
    tokens = [word for word in tokens if word not in string.punctuation]
    tokens = [word for word in tokens if word.isalpha()]
    stems = [stemmer.stem(word) for word in tokens]
    lemmas = [lemmatizer.lemmatize(word) for word in tokens]
    return stems + lemmas

In [6]:
# Create a labeled dataset
labeled_sentences = [(preprocess_text(s), 'pos') for s in positive.split('\n')] + [(preprocess_text(s), 'neg') for s in negative.split('\n')]


# 3. Pos Tag and NER

In [7]:
from nltk.tag import pos_tag

In [8]:
list_words = word_tokenize(positive) + word_tokenize(negative)
tagged = pos_tag(list_words)
print(tagged)

[('Great', 'NNP'), ('camera', 'NN'), ('for', 'IN'), ('pics', 'NNS'), ('and', 'CC'), ('videos', 'NNS'), ('Battery', 'NNP'), ('life', 'NN'), ('is', 'VBZ'), ('good', 'JJ'), ('so', 'RB'), ('far', 'RB'), ('with', 'IN'), ('some', 'DT'), ('setting', 'VBG'), ('turn', 'NN'), ('of', 'IN'), ('which', 'WDT'), ('i', 'VBP'), ('never', 'RB'), ('use', 'NN'), ('and', 'CC'), ('when', 'WRB'), ('i', 'NN'), ('use', 'VBP'), ('i', 'JJ'), ('turn', 'VBP'), ('those', 'DT'), ('on', 'IN'), ('and', 'CC'), ('i', 'JJ'), ('use', 'NN'), ('it', 'PRP'), ('in', 'IN'), ('power', 'NN'), ('saving', 'VBG'), ('mode', 'NN'), ('all', 'PDT'), ('the', 'DT'), ('time', 'NN'), ('so', 'IN'), ('a', 'DT'), ('full', 'JJ'), ('day', 'NN'), ('with', 'IN'), ('light', 'JJ'), ('gaming', 'NN'), ('of', 'IN'), ('1hr', 'CD'), ('or', 'CC'), ('more', 'JJR'), ('using', 'VBG'), ('camera', 'NN'), ('for', 'IN'), ('1hr', 'CD'), ('or', 'CC'), ('more', 'JJR'), ('listening', 'JJ'), ('music', 'NN'), ('in', 'IN'), ('my', 'PRP$'), ('car', 'NN'), ('on', 'IN'),

# 4. WordNet

In [9]:
from nltk.corpus import wordnet

In [10]:
word = "Great"
synsets = wordnet.synsets(word)

In [11]:
for synset in synsets:
    print(f"{synset}: {synset.definition()}")
    for lemma in synset.lemmas():
        print(f"Synonim: {lemma.name()}")
        for antonym in lemma.antonyms():
            print(f"Antonym: {antonym.name()}")

Synset('great.n.01'): a person who has achieved distinction and honor in some field
Synonim: great
Synset('great.s.01'): relatively large in size or number or extent; larger than others of its kind
Synonim: great
Synset('great.s.02'): of major significance or importance
Synonim: great
Synonim: outstanding
Synset('great.s.03'): remarkable or out of the ordinary in degree or magnitude or effect
Synonim: great
Synset('bang-up.s.01'): very good
Synonim: bang-up
Synonim: bully
Synonim: corking
Synonim: cracking
Synonim: dandy
Synonim: great
Synonim: groovy
Synonim: keen
Synonim: neat
Synonim: nifty
Synonim: not_bad
Synonim: peachy
Synonim: slap-up
Synonim: swell
Synonim: smashing
Synset('capital.s.03'): uppercase
Synonim: capital
Synonim: great
Synonim: majuscule
Synset('big.s.13'): in an advanced stage of pregnancy
Synonim: big
Synonim: enceinte
Synonim: expectant
Synonim: gravid
Synonim: great
Synonim: large
Synonim: heavy
Synonim: with_child


# 5. Corpora

In [12]:
from nltk.corpus import gutenberg

import nltk
nltk.download('gutenberg') # download file dari nltk

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [13]:
# load corpora from web
from urllib import request

url = "https://www.gutenberg.org/files/63919/63919.txt"
corpus = request.urlopen(url).read().decode('utf8')
print(corpus)

The Project Gutenberg EBook of Captain Chaos, by D. Allen Morrissey

This eBook is for the use of anyone anywhere in the United States and most
other parts of the world at no cost and with almost no restrictions
whatsoever.  You may copy it, give it away or re-use it under the terms of
the Project Gutenberg License included with this eBook or online at
www.gutenberg.org.  If you are not located in the United States, you'll have
to check the laws of the country where you are located before using this ebook.

Title: Captain Chaos

Author: D. Allen Morrissey

Release Date: November 29, 2020 [EBook #63919]

Language: English

Character set encoding: ASCII

*** START OF THIS PROJECT GUTENBERG EBOOK CAPTAIN CHAOS ***




Produced by Greg Weeks, Mary Meehan and the Online
Distributed Proofreading Team at http://www.pgdp.net









                             CAPTAIN CHAOS

                         By D. ALLEN MORRISSEY

          _Science equipped David Corbin with borrowed time;
        s

# 6. Training with Naive Bayes Classification

In [14]:
# Shuffle and split the dataset into training and testing sets
random.shuffle(labeled_sentences)
split_index = int(len(labeled_sentences) * 0.7)
training_data = labeled_sentences[:split_index]
testing_data = labeled_sentences[split_index:]

In [15]:
# Create a dictionary of word frequencies
all_words = [word for words, label in labeled_sentences for word in words]
all_words_freq = FreqDist(all_words)

In [16]:
# Use the 5000 most common words as features
common_words = list(all_words_freq.keys())[:5000]

In [17]:
# Convert the dataset into a feature set
def extract_features(words):
    features = {word: (word in words) for word in common_words}
    return features

training_features = [(extract_features(words), label) for words, label in training_data]
testing_features = [(extract_features(words), label) for words, label in testing_data]

In [18]:
# Train the classifier
classifier = NaiveBayesClassifier.train(training_features)

# Predict the labels for the testing features
test_labels = [label for features, label in testing_features]
predicted_labels = [classifier.classify(features) for features, label in testing_features]

# Evaluate the classifier
print(f"Accuracy: {classify.accuracy(classifier, testing_features) * 100:.2f}%")

Accuracy: 88.44%


# 7. Load Model with Pickle

In [19]:
file = open('model.pickle', 'wb')
pickle.dump(classifier, file)
file.close()

In [21]:
file = open("model.pickle", "rb")
classifier = pickle.load(file)
file.close()

review = input("Input review here: ")
word = word_tokenize(review)
result = classifier.classify(FreqDist(word))
print(result)

pos
