In [16]:
# Nama:Rafif Zahran Haryadi 
# NIM: 2109106029
# Kelas: Informatika A 2023 

#------------
# POSTTEST 5
#------------

# Dataset Books
# Dataset ini berisi informasi tentang buku-buku, termasuk judul, penulis, genre, dan peringkat

In [17]:
#Import Library yang akan digunakan

import re, pickle, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

np.random.seed(42)
tf.random.set_seed(42)


In [18]:
#Input data dari csv

df = pd.read_csv('IMDB_Dataset.csv')

#label biner: positive->1, negative->0
df['label'] = (df['sentiment'].str.lower()=='positive').astype(int)
df = df[['review','label']]

df.head()

Unnamed: 0,review,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [19]:
#Cleansing text data yang tidak berkaitan

def clean_text(s):
    s = re.sub(r"<br\s*/?>", " ", str(s))
    s = re.sub(r"<.*?>", " ", s)
    s = re.sub(r"&\w+;", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s.lower()

df['review'] = df['review'].apply(clean_text)
df.head()


Unnamed: 0,review,label
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


In [20]:
#Memisahkan data latih dan data uji

X_temp, X_test, y_temp, y_test = train_test_split(
    df['review'].values, df['label'].values,
    test_size=0.10, random_state=42, stratify=df['label'].values
)

#Val = 10% dari total → proporsi terhadap sisa
val_ratio = 0.10 / (1 - 0.10)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_ratio, random_state=42, stratify=y_temp
)

len(X_train), len(X_val), len(X_test)


(40000, 5000, 5000)

In [21]:
#Tokenisasi dan Padding

VOCAB_SIZE = 30000
MAX_LEN    = 300

tok = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tok.fit_on_texts(X_train)

def to_seq(texts):
    return pad_sequences(tok.texts_to_sequences(texts), maxlen=MAX_LEN, padding='post', truncating='post')

X_train_seq = to_seq(X_train)
X_val_seq   = to_seq(X_val)
X_test_seq  = to_seq(X_test)


In [22]:
#Membuat sequence model sederhana

EMBED_DIM   = 128
DENSE_UNITS = 64
DROPOUT     = 0.3

model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM, input_length=MAX_LEN),
    GlobalAveragePooling1D(),
    Dropout(DROPOUT),
    Dense(DENSE_UNITS, activation='relu'),
    Dropout(DROPOUT),
    Dense(1, activation='sigmoid')
])

model.summary()




In [23]:
#Complile model

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [24]:
#Callback (untuk overfitting)

early_stop = EarlyStopping(monitor='val_loss',
                           mode='min',
                           patience=2,
                           restore_best_weights=True)


In [25]:
#Training model

history = model.fit(
    X_train_seq, y_train,
    validation_data=(X_val_seq, y_val),
    epochs=8,           # boleh naikkan ke 10 jika perlu
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

print("\nTarget: train acc ≥ 0.70 dan val acc ≥ 0.70")


Epoch 1/8
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 23ms/step - accuracy: 0.7578 - loss: 0.4831 - val_accuracy: 0.8634 - val_loss: 0.3448
Epoch 2/8
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.8810 - loss: 0.2873 - val_accuracy: 0.8478 - val_loss: 0.3373
Epoch 3/8
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.9101 - loss: 0.2280 - val_accuracy: 0.8318 - val_loss: 0.3747
Epoch 4/8
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 22ms/step - accuracy: 0.9214 - loss: 0.2063 - val_accuracy: 0.8678 - val_loss: 0.3266
Epoch 5/8
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.9318 - loss: 0.1835 - val_accuracy: 0.8938 - val_loss: 0.3076
Epoch 6/8
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.9346 - loss: 0.1752 - val_accuracy: 0.8812 - val_loss: 0.3133
Epoch 7/8
[1m625/625

In [26]:
#Cek Akurasi pada data uji (Minimal 70% untuk train dan val)

print("\nEvaluasi pada data uji:")
train_loss, train_acc = model.evaluate(X_train_seq, y_train, verbose=0)
val_loss,   val_acc   = model.evaluate(X_val_seq,   y_val,   verbose=0)
print(f"Train Acc : {train_acc:.4f}")
print(f"Val   Acc : {val_acc:.4f}")



Evaluasi pada data uji:
Train Acc : 0.9584
Val   Acc : 0.8938


In [27]:
#Prediksi pada data uji

#Contoh prediksi kalimat
samples = [
    "I absolutely loved this movie. Great acting and a compelling story!",
    "This was a waste of time, boring and predictable."
]
samples_seq = pad_sequences(tok.texts_to_sequences(samples), maxlen=MAX_LEN, padding='post', truncating='post')
probs = model.predict(samples_seq, verbose=0).ravel()
for s, p in zip(samples, probs):
    print(f"{s}\n→ Prob(positive): {p:.3f}\n")


I absolutely loved this movie. Great acting and a compelling story!
→ Prob(positive): 0.562

This was a waste of time, boring and predictable.
→ Prob(positive): 0.317



In [28]:
#simpan model dan tokenizer
model.save('sentiment_model.h5')

newModel = tf.keras.models.load_model('sentiment_model.h5')
newModel.summary()



In [29]:
#Kesimpulan dari model ini:
#Model ini berhasil melakukan klasifikasi sentimen pada ulasan film dengan akurasi yang memadai pada data latih dan data validasi, menunjukkan kemampuannya dalam mengenali pola dalam teks. Namun, untuk aplikasi nyata, perlu dilakukan evaluasi lebih lanjut pada data yang lebih beragam dan kompleks untuk memastikan keandalannya.
#Presentase akurasi yang dicapai pada data latih dan data validasi sudah memenuhi target minimal 70%.
#Model sudah bisa digunakan untuk prediksi sentimen pada ulasan film baru.