In [2]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('./Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
dataset.head(10)


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [3]:
# Cleaning the texts

import re  # regular expression
import nltk  # natural language toolkit

nltk.download('wordnet')  # Named Entities (The White House, Istana Negara, etc)
nltk.download('stopwords')  # words with no meaning

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Pre-process our reviews
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])  # remove punctuation and numbers
    review = review.lower()  # Lowercase
    review = review.split()  # tokenize
    stemmer = PorterStemmer()  # stemming
    review = [stemmer.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)


In [5]:
# Creating the Bag of Words model

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)  # vocabulary size
X = cv.fit_transform(corpus).toarray()

y = dataset.iloc[:, 1].values


In [6]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)


In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

cm = confusion_matrix(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred, normalize=True)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')


In [9]:
print('Accuracy = ', accuracy)
print('Precision = ', precision)
print('Recall = ', recall)
print('F1 = ', f1)



Accuracy =  0.73
Precision =  0.6842105263157895
Recall =  0.883495145631068
F1 =  0.7711864406779662


SPAM CSV

In [10]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords

# To remove HTML tags from the document
from bs4 import BeautifulSoup

# Removing numbers, punctuations, i.e., regular expressions from the document
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB


In [11]:
train_data = pd.read_csv('spam.csv')
train_data.head(16)


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [12]:
train_data.shape

(5572, 2)

In [14]:
train_data.text[0]

sample1 = BeautifulSoup(train_data.text[0], "html.parser")
letters_only = re.sub("[^a-zA-Z]", " ", sample1.get_text())
print(letters_only)
lower_case = letters_only.lower()
words = lower_case.split()


Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   


In [15]:
# Removing stopwords from sample1 so that relevant words can be filtered out and stored in words

words = [w for w in words if w not in stopwords.words("english")]
print(words)


['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat']


In [16]:
# the above code cleans only one review, let's make a function to clean all the reviews
def review_to_words(raw_review):

    # remove html using BeautifulSoup
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # removing raw letters, numbers, punctuations
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # creating an array, resolving whitespaces
    words = letters_only.lower().split()

    # create an array of stopwords so that we don't have to access corpus to search for a stopword
    stop = set(stopwords.words("english"))

    # removing stopwords from the raw_review
    meaningful_words = [w for w in words if w not in stop]

    # return a string with only the words that are important
    return " ".join(meaningful_words)


In [17]:
num_rev = train_data.text.size
print(num_rev)

5572


In [18]:
cleaned_rev = []
for i in range(num_rev):
    cleaned_rev.append(review_to_words(train_data.text[i]))

# Creating a function, vectorizer to convert the words into vectors
vectorizer = CountVectorizer(
    analyzer="word",
    preprocessor=None,
    stop_words="english",
    max_features=5000
)


  review_text = BeautifulSoup(raw_review, "html.parser").get_text()


In [19]:
# converting reviews from text into features
train_data_features = vectorizer.fit_transform(cleaned_rev)

# change the classifier into array
train_data_features = train_data_features.toarray()

X = train_data_features

# dependent variable, y will be 1 for positive and 0 for negative review
y = train_data.label

print(X.shape)
print(y.shape)


(5572, 5000)
(5572,)


In [20]:
# splitting the training data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)
naive = MultinomialNB()
classifier = naive.fit(X_train, y_train)
predict = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(predict, y_test)

cm
# Calculate accuracy
accuracy = cm.trace() / cm.sum()
print(accuracy)


0.9798994974874372


Eksperimen

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [22]:
# Membaca dataset
data = pd.read_csv('spam.csv')

# Memisahkan fitur dan label
X = data['text']  # Ganti dengan nama kolom teks
y = data['label']  # Ganti dengan nama kolom label


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)


In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Membaca dataset (pastikan file 'spam.csv' tersedia)
data = pd.read_csv('spam.csv')

# Memisahkan fitur (teks) dan label
X = data['text']  # Ganti dengan nama kolom teks
y = data['label']  # Ganti dengan nama kolom label

# Vectorizing teks menggunakan TfidfVectorizer
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Definisi model
models = {
    "Naive Bayes": MultinomialNB(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier()
}

# Definisi pembagian data
splits = {
    "80%:20%": 0.2,
    "70%:30%": 0.3
}

# Menjalankan eksperimen
results = []

for model_name, model in models.items():
    for split_name, test_size in splits.items():
        # Membagi data
        X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=test_size, random_state=42)

        # Melatih model
        model.fit(X_train, y_train)

        # Memprediksi
        y_pred = model.predict(X_test)

        # Menghitung metrik evaluasi dengan 2 angka di belakang koma
        precision = round(precision_score(y_test, y_pred, average='weighted', zero_division=1), 2)
        recall = round(recall_score(y_test, y_pred, average='weighted', zero_division=1), 2)
        f1 = round(f1_score(y_test, y_pred, average='weighted', zero_division=1), 2)
        acc = round(accuracy_score(y_test, y_pred), 2)

        # Menyimpan hasil untuk setiap skenario
        results.append({
            "Model": model_name,
            "Split": split_name,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1,
            "Accuracy": acc  # Accuracy ditempatkan setelah F1-Score
        })

# Menampilkan hasil
results_df = pd.DataFrame(results)

# Menampilkan hasil evaluasi keseluruhan
print(results_df)

# Menyimpan ke file CSV jika diperlukan
results_df.to_csv("experiment_results_with_metrics.csv", index=False)


           Model    Split  Precision  Recall  F1-Score  Accuracy
0    Naive Bayes  80%:20%       0.96    0.96      0.96      0.96
1    Naive Bayes  70%:30%       0.96    0.96      0.95      0.96
2            SVM  80%:20%       0.98    0.98      0.98      0.98
3            SVM  70%:30%       0.98    0.98      0.97      0.98
4            KNN  80%:20%       0.92    0.91      0.89      0.91
5            KNN  70%:30%       0.92    0.91      0.89      0.91
6  Decision Tree  80%:20%       0.97    0.97      0.97      0.97
7  Decision Tree  70%:30%       0.97    0.97      0.97      0.97
