In [26]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk


In [27]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\taufi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
df = pd.read_csv('https://raw.githubusercontent.com/Neldi30/Analisis-Sentimen/refs/heads/main/Final_Combined_Dataset.csv')
df.head(10)

Unnamed: 0,content,score,label
0,babi emng update terus ya sebulan mau 2 atau 3...,3,positif
1,"Still liking this game, please continue suppor...",5,positif
2,kebanyakan apdet brayyy jadi mals mainnya kalo...,3,positif
3,seru bangett,5,positif
4,bgus,5,positif
5,"Tolong dong chat global di kembalikan lagi, ja...",3,positif
6,gamenya bagus tapi sayang kalo di hapus akun h...,4,positif
7,keren,4,positif
8,mantab,5,positif
9,cukup santai dan membuat otak berpikir keras k...,5,positif


In [29]:
print(df.columns)


Index(['content', 'score', 'label'], dtype='object')


In [30]:
# Membersihkan teks
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = str(text).lower()                          # Ubah ke huruf kecil
    text = re.sub(r'http\S+|www\S+', '', text)        # Hapus URL
    text = re.sub(r'[^a-zA-Z\s]', '', text)           # Hapus karakter non-huruf
    text = re.sub(r'\s+', ' ', text).strip()          # Hapus spasi berlebih

    # Tokenisasi, stopwords removal, dan stemming
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Terapkan pembersihan ke kolom 'content' pada dataframe
df['clean_content'] = df['content'].apply(clean_text)

In [46]:
# Ekstraksi fitur TF-IDF
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Menggunakan bigrams
X = tfidf.fit_transform(df['clean_content'])

# Target
y = df['label']

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Menampilkan dimensi data
print(f"Shape of X: {X.shape}")
print(f"Shape of X_train: {X_train.shape}, X_test: {X_test.shape}")

Shape of X: (10000, 5000)
Shape of X_train: (8000, 5000), X_test: (2000, 5000)


In [47]:
# Model 1: Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
lr_acc = accuracy_score(y_test, lr_preds)

# Model 2: Multinomial Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)
nb_acc = accuracy_score(y_test, nb_preds)

In [48]:
# Menampilkan akurasi
print("Logistic Regression Accuracy: ", lr_acc)
print("Naive Bayes Accuracy: ", nb_acc)

# Menampilkan classification report untuk kedua model
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, lr_preds))

print("\nNaive Bayes Classification Report:")
print(classification_report(y_test, nb_preds))


Logistic Regression Accuracy:  0.8415
Naive Bayes Accuracy:  0.854

Logistic Regression Classification Report:
              precision    recall  f1-score   support

     negatif       0.77      0.65      0.70       579
     positif       0.87      0.92      0.89      1421

    accuracy                           0.84      2000
   macro avg       0.82      0.78      0.80      2000
weighted avg       0.84      0.84      0.84      2000


Naive Bayes Classification Report:
              precision    recall  f1-score   support

     negatif       0.77      0.71      0.74       579
     positif       0.89      0.91      0.90      1421

    accuracy                           0.85      2000
   macro avg       0.83      0.81      0.82      2000
weighted avg       0.85      0.85      0.85      2000

