# Sentiment Analysis dengan NLP dan KNN

Mata Kuliah: Advance Machine Learning

Nama: Muhammad Ikhwan Fathulloh

In [1]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/209.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [2]:
# Sentiment Analysis dengan NLP dan KNN menggunakan Sastrawi dan TF-IDF

# Import library
import pandas as pd
import numpy as np
import re
import string
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [3]:
# Inisialisasi Sastrawi Stopword Remover dan Stemmer
factory = StopWordRemoverFactory()
stopword_remover = factory.create_stop_word_remover()
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

# Load dataset
url = "https://raw.githubusercontent.com/Muhammad-Ikhwan-Fathulloh/Advanced-Machine-Learning-Course/refs/heads/main/KNN/Datasets/sentiment_cellular.csv"
data = pd.read_csv(url, encoding='latin-1')

In [4]:
# Case folding
def casefolding(text):
    return text.lower()

data['Text Tweet'] = data['Text Tweet'].apply(casefolding)

# Cleansing
def cleansing(text):
    text = re.sub(r'[?|$|.|!_:")(-+,]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\b[a-zA-Z]\b', '', text)  # Remove single characters
    text = re.sub('\s+', ' ', text)  # Remove multiple spaces
    return text.strip()

data['Text Tweet'] = data['Text Tweet'].apply(cleansing)

# Tokenization and Stopword removal using Sastrawi
def sastrawi_tokenization(text):
    # Tokenisasi dan hapus stopwords menggunakan Sastrawi
    text = stopword_remover.remove(text)
    return text.split()

data['Text Tweet'] = data['Text Tweet'].apply(sastrawi_tokenization)

# Stemming with Sastrawi
def stemming(tokens):
    # Lemmatization (Stemming) menggunakan Sastrawi
    return ' '.join([stemmer.stem(token) for token in tokens])

data['Text Tweet'] = data['Text Tweet'].apply(stemming)

# Save cleaned data to CSV
data.to_csv('cleaned_sentiment_data.csv', index=False)

In [5]:
data = pd.read_csv('cleaned_sentiment_data.csv')
data

Unnamed: 0,Sentiment,Text Tweet
0,positive,usermention boikot providername guna produk ba...
1,positive,sakti balik alhamdulillah providername
2,negative,selamat pagi providername bantu di kamar sinya...
3,negative,dear providername akhir jaring data lot banget...
4,negative,selamat malam dusta providername
...,...,...
295,positive,pantesan lancar sinyal providername lancar sek...
296,positive,alhamdulillah lancar pakai providername
297,positive,untung pakai internet providername lancar jadi...
298,positive,tempat ramai di lokasi wisata providername tet...


In [7]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['Text Tweet'])

# Memeriksa bentuk dari matriks hasil TF-IDF
print("Bentuk TF-IDF:", X.shape)

Bentuk TF-IDF: (300, 819)


In [17]:
# Mengonversi matriks TF-IDF ke dalam DataFrame
tfidf_df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Menyimpan hasil TF-IDF ke CSV
tfidf_df.to_csv('tfidf_matrix.csv', index=False)

In [18]:
tfidf_df = pd.read_csv('tfidf_matrix.csv')
tfidf_df

Unnamed: 0,acara,aceh,ada,adhan,aja,ajaib,ajar,akan,akhir,akses,...,yah,yang,yess,yg,yonder,youtube,youtubenya,youtubetahun,yuk,zalim
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.399489,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Target variable
y = data['Sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

# KNN Model
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

# Predictions
y_pred = knn.predict(X_test)

In [9]:
# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="weighted"))
print("Recall:", recall_score(y_test, y_pred, average="weighted"))
print("F1 Score:", f1_score(y_test, y_pred, average="weighted"))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8
Precision: 0.800976800976801
Recall: 0.8
F1 Score: 0.7967097532314924

Classification Report:
               precision    recall  f1-score   support

    negative       0.79      0.89      0.84        35
    positive       0.81      0.68      0.74        25

    accuracy                           0.80        60
   macro avg       0.80      0.78      0.79        60
weighted avg       0.80      0.80      0.80        60



In [14]:
import joblib

# Menyimpan model yang telah dilatih
joblib.dump(knn, 'knn_model.pkl')

# Menyimpan TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [16]:
# Memuat model dan vectorizer
model = joblib.load('knn_model.pkl')
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Prediksi untuk teks baru
new_text = ["Bagus nih"]
new_text_tfidf = tfidf_vectorizer.transform(new_text)
prediction = model.predict(new_text_tfidf)

print("Prediksi Sentimen:", prediction)

Prediksi Sentimen: ['positive']
