In [1]:
# Basic
import pandas as pd
import numpy as np
import re

# ML
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Imbalance handling
from sklearn.utils.class_weight import compute_class_weight


In [2]:
from google.colab import files
uploaded = files.upload()


Saving 20191002-reviews.csv to 20191002-reviews.csv


In [3]:
df = pd.read_csv('20191002-reviews.csv')

df = df[['rating', 'reviewContent']]
df = df.dropna()
df.head()


Unnamed: 0,rating,reviewContent
0,5,bagus mantap dah sesui pesanan
1,4,"Bagus, sesuai foto"
2,5,okkkkk mantaaaaaaapppp ... goood
3,4,bagus sesuai
7,1,bima


In [4]:
df = df[df['rating'].isin([1, 2, 5])]

def label_sentiment(r):
    if r == 5:
        return 1   # positif
    else:
        return 0   # negatif

df['label'] = df['rating'].apply(label_sentiment)

df['label'].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,82896
0,9135


In [5]:
def clean_text(text):
    text = text.lower()

    # gabung negasi
    text = re.sub(r'tidak\s+bagus', 'tidak_bagus', text)
    text = re.sub(r'tidak\s+sesuai', 'tidak_sesuai', text)
    text = re.sub(r'tidak\s+puas', 'tidak_puas', text)
    text = re.sub(r'tidak\s+rekomendasi', 'tidak_rekomendasi', text)

    # hapus karakter aneh
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

df['clean_review'] = df['reviewContent'].apply(clean_text)
df[['reviewContent','clean_review']].head()


Unnamed: 0,reviewContent,clean_review
0,bagus mantap dah sesui pesanan,bagus mantap dah sesui pesanan
2,okkkkk mantaaaaaaapppp ... goood,okkkkk mantaaaaaaapppp goood
7,bima,bima
8,baru 10 bulan layarnya dah bergaris,baru bulan layarnya dah bergaris
9,"Pesan rabu sore,minggu sore sampe,,barang sesu...",pesan rabu sore minggu sore sampe barang sesua...


In [6]:
X = df['clean_review']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [7]:
tfidf = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1,2),
    min_df=5
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [8]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weight_dict = {
    0: class_weights[0],
    1: class_weights[1]
}

class_weight_dict


{0: np.float64(5.037219485495347), 1: np.float64(0.5550998250799204)}

In [9]:
model = LogisticRegression(
    class_weight=class_weight_dict,
    max_iter=1000
)

model.fit(X_train_tfidf, y_train)


In [10]:
y_pred = model.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.68      0.96      0.79      1827
           1       1.00      0.95      0.97     16580

    accuracy                           0.95     18407
   macro avg       0.84      0.95      0.88     18407
weighted avg       0.96      0.95      0.95     18407

[[ 1750    77]
 [  827 15753]]


In [11]:
test_reviews = [
    "barang rusak dan tidak sesuai",
    "sangat kecewa dengan produk ini",
    "pengiriman lama dan kualitas buruk",
    "tidak bagus sama sekali",
    "produk bagus dan sesuai deskripsi"
]

test_clean = [clean_text(x) for x in test_reviews]
test_vec = tfidf.transform(test_clean)

pred = model.predict(test_vec)

for t, p in zip(test_reviews, pred):
    print(t, "→", "POSITIF" if p==1 else "NEGATIF")


barang rusak dan tidak sesuai → NEGATIF
sangat kecewa dengan produk ini → NEGATIF
pengiriman lama dan kualitas buruk → NEGATIF
tidak bagus sama sekali → NEGATIF
produk bagus dan sesuai deskripsi → POSITIF


In [12]:
import joblib

# simpan model
joblib.dump(model, 'sentiment_model.pkl')

# simpan vectorizer
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

print("Model dan TF-IDF berhasil disimpan")


Model dan TF-IDF berhasil disimpan


In [13]:
from google.colab import files

files.download('sentiment_model.pkl')
files.download('tfidf_vectorizer.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>