In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [3]:
Factory = StemmerFactory()
stemmer = Factory.create_stemmer()
list_stopwords = set(stopwords.words('indonesian'))

## A.Data Preparation

### A.1 Membaca 5 data teratas

In [4]:
df = pd.read_csv("Corpus Bahasa Indonesia Label.csv")
df

Unnamed: 0,kalimat,sentiment
0,Kalimat-kalimat tersebut adalah sebagai berikut,0
1,Manusia membutuhkan makanan dan air supaya men...,1
2,Tanaman hijau menggunakan air untuk membuat ma...,0
3,Tanaman yang tidak mendapat air akan layu dan ...,1
4,Keberhasilan belajar murid tidak hanya bergant...,1
...,...,...
12604,Saya suka menggunakan kendaraan ini untuk perj...,1
12605,Kendaraan ini memiliki desain interior yang me...,1
12606,Kendaraan tersebut sering mengalami masalah ke...,2
12607,Saya memilih kendaraan ini karena harganya kom...,1


### A.2 Cek data duplikat

In [5]:
df.duplicated().sum()

np.int64(1516)

### A.3 Melihat Dimensi Data

In [6]:
df.shape

(12609, 2)

### A.4 Cek Nilai Hilang / Missing Values

In [7]:
df.isnull().sum()

kalimat      0
sentiment    0
dtype: int64

## B. Data Preprocessing

### B.1 Handling Duplicate data

In [8]:
df.columns

Index(['kalimat', 'sentiment'], dtype='object')

In [9]:
df.drop_duplicates(subset=['kalimat', 'sentiment'], keep='first', inplace=True)

### B.2 Case Normalization

In [10]:
df['kalimat'] = df['kalimat'].str.lower()

### B.3 Text Cleaning

1. Hapus URL

In [11]:
df['kalimat'] = df['kalimat'].str.replace(r'http\S+|www\S+|https\S+', '', regex=True)

2. Hapus Tanda Baca

In [12]:
df['kalimat'] = df['kalimat'].str.replace(r'[^a-zA-Z\s]', '', regex=True)

3. Tokenization

In [13]:
df['Tokenized_text'] = df['kalimat'].str.split()

### B.5 Stopword

In [14]:
negation_words = {'tidak', 'bukan', 'tak'}

list_stopwords = list_stopwords - negation_words
print(f'Total Stopwords setelah Modifikasi {len(list_stopwords)}')

Total Stopwords setelah Modifikasi 754


In [15]:
def remove_sopwords(tokens):
 return[word for word in tokens if word not in list_stopwords]

df['Stopword_Removed_Text'] = df['Tokenized_text'].apply(remove_sopwords)

print(df[['Tokenized_text', 'Stopword_Removed_Text']].head())

                                      Tokenized_text  \
0  [kalimatkalimat, tersebut, adalah, sebagai, be...   
1  [manusia, membutuhkan, makanan, dan, air, supa...   
2  [tanaman, hijau, menggunakan, air, untuk, memb...   
3  [tanaman, yang, tidak, mendapat, air, akan, la...   
4  [keberhasilan, belajar, murid, tidak, hanya, b...   

                               Stopword_Removed_Text  
0                                   [kalimatkalimat]  
1  [manusia, membutuhkan, makanan, air, kuat, seh...  
2                  [tanaman, hijau, air, makanannya]  
3                [tanaman, tidak, air, layu, kering]  
4  [keberhasilan, belajar, murid, tidak, bergantu...  


### B.6 Stemming

In [16]:
def stemming_text(tokens):
 # melakukan stemming pada setiap kata di dalam list
 stemmed_words = [stemmer.stem(word) for word in tokens]

 # gabungkan lagi menjadi string untuk Fetaure Extaction Selanjutnya
 return ' '.join(stemmed_words)

df['Clean_final'] = df['Stopword_Removed_Text'].apply(stemming_text)

print(df[['Stopword_Removed_Text', 'Clean_final']])

                                   Stopword_Removed_Text  \
0                                       [kalimatkalimat]   
1      [manusia, membutuhkan, makanan, air, kuat, seh...   
2                      [tanaman, hijau, air, makanannya]   
3                    [tanaman, tidak, air, layu, kering]   
4      [keberhasilan, belajar, murid, tidak, bergantu...   
...                                                  ...   
12604                      [suka, kendaraan, perjalanan]   
12605     [kendaraan, memiliki, desain, interior, mewah]   
12606                [kendaraan, mengalami, kelistrikan]   
12607         [memilih, kendaraan, harganya, kompetitif]   
12608                    [kendaraan, nyaman, perjalanan]   

                                         Clean_final  
0                                     kalimatkalimat  
1           manusia butuh makan air kuat sehat tanam  
2                              tanam hijau air makan  
3                        tanam tidak air layu kering  
4   

## C.Exploratory Data Analysis (EDA)

In [17]:
df['sentiment'].value_counts()

sentiment
0    5593
2    2972
1    2528
Name: count, dtype: int64

In [18]:
df['sentiment'].value_counts(normalize=True) * 100

sentiment
0    50.419183
2    26.791670
1    22.789146
Name: proportion, dtype: float64

## D.Modeling

### D.1 Standarisasi

D.2 Split data stratfied

In [19]:
x_raw = df['Clean_final']
y_raw = df['sentiment']

In [20]:
def stratified_split(x_raw, y_raw, train_size = 0.8, random_state = 42):
 np.random.seed(random_state)

 x_train_list, x_test_list = [], []
 y_train_list, y_test_list = [], []

 for class_value in np.unique(y_raw):
  class_idx = np.where(y_raw == class_value)[0]

  idx = np.random.permutation(class_idx)
  split_ratio = int(len(idx) * train_size)

  x_train_list.append(x_raw.iloc[idx[:split_ratio]])
  x_test_list.append(x_raw.iloc[idx[split_ratio:]])
  y_train_list.append(y_raw.iloc[idx[:split_ratio]])
  y_test_list.append(y_raw.iloc[idx[split_ratio:]])

 x_train = pd.concat(x_train_list).reset_index(drop = True)
 x_test = pd.concat(x_test_list).reset_index(drop = True)
 y_train = pd.concat(y_train_list).reset_index(drop = True)
 y_test = pd.concat(y_test_list).reset_index(drop = True)

 return x_train, x_test, y_train, y_test

x_train, x_test, y_train, y_test = stratified_split(x_raw, y_raw)

In [21]:
print(f"Train size: {len(x_train)}")
print(f"Test size: {len(x_test)}")

Train size: 8873
Test size: 2220


In [22]:
# --- A. PROSES DATA TRAIN (Membangun Rumus) ---
# Input: x_train (dari hasil split sebelumnya)

N_train = len(x_train)
DF = {} 
vocabulary = set()
Doc_TF_Train = [] 

# 1. Hitung TF dan DF hanya dari Train
for doc in x_train:
    tokens = doc.split()
    doc_word_count = len(tokens)
    
    tf_doc = {}
    for word in tokens:
        tf_doc[word] = tf_doc.get(word, 0) + 1
        vocabulary.add(word) # Catat kata unik
    
    # Normalisasi TF
    for word, count in tf_doc.items():
        tf_doc[word] = count / doc_word_count
    
    Doc_TF_Train.append(tf_doc)

    # Update DF
    for word in set(tokens):
        DF[word] = DF.get(word, 0) + 1

# 2. Urutkan Vocab & Buat Peta Index (Biar Cepat!)
sorted_vocab_items = sorted(DF.items(), key=lambda item: item[1], reverse=True)
LIMIT_FEATURES = 1000  # Kurangi parameter fitur di sini (bisa 500 atau 1000)

final_vocab = [k for k, v in sorted_vocab_items[:LIMIT_FEATURES]] 
vocab_index = {word: i for i, word in enumerate(final_vocab)}

# 3. Hitung IDF (Rumus dari Data Train)
IDF = {}
smoothing_factor = 1

for word in final_vocab:
    df_val = DF.get(word, 0) + smoothing_factor
    IDF[word] = np.log10(N_train / df_val)

# 4. Buat Matriks x_train_features
x_train_features = np.zeros((N_train, len(final_vocab)))

for i, tf_doc in enumerate(Doc_TF_Train):
    for word, tf_val in tf_doc.items():
        # Pakai vocab_index biar tidak looping manual (Jauh lebih cepat)
        if word in vocab_index:
            j = vocab_index[word] 
            tfidf_score = tf_val * IDF[word]
            x_train_features[i, j] = tfidf_score

print(f"Dimensi x_train_features: {x_train_features.shape}")

Dimensi x_train_features: (8873, 1000)


In [23]:
# --- B. PROSES DATA TEST (Hanya Mengubah Bentuk) ---
# Input: x_test

N_test = len(x_test)
x_test_features = np.zeros((N_test, len(final_vocab)))

for i, doc in enumerate(x_test):
    tokens = doc.split()
    doc_word_count = len(tokens)
    
    if doc_word_count == 0: continue 

    # Hitung TF Lokal untuk Test
    tf_doc_test = {}
    for word in tokens:
        tf_doc_test[word] = tf_doc_test.get(word, 0) + 1
    
    # Masukkan ke Matriks
    for word, count in tf_doc_test.items():
        # Cek: Apakah kata ini ada di Vocab Train?
        if word in vocab_index:
            # Normalisasi TF
            tf_val = count / doc_word_count
            
            # Ambil IDF dari Train (JANGAN hitung ulang!)
            idf_val = IDF[word]
            
            # Ambil nomor kolom
            j = vocab_index[word]
            
            # Masukkan nilai
            x_test_features[i, j] = tf_val * idf_val
        # Jika kata tidak ada di vocab_index, biarkan 0 (diabaikan)

print(f"Dimensi x_test_features: {x_test_features.shape}")

Dimensi x_test_features: (2220, 1000)


In [24]:
# Pastikan y_label adalah numpy array
y_train_array = y_train if isinstance(y_train, np.ndarray) else y_train.values

# 1. Hitung jumlah sampel di kelas mayoritas (Target Oversampling)
unique_classes, counts = np.unique(y_train_array, return_counts=True)
max_count = counts.max() # Ini akan jadi target jumlah (5593)

print(f"Target jumlah sampel per kelas: {max_count}")

Target jumlah sampel per kelas: 4474


In [25]:
# List untuk menampung semua index (baris) yang akan kita ambil
final_indices = []

# 2. Iterasi setiap kelas untuk mengambil index
for cls in unique_classes:
    # Ambil index (nomor baris) yang memiliki label 'cls'
    cls_indices = np.where(y_train_array == cls)[0]
    n_current = len(cls_indices)
    
    # Masukkan index asli ke list final
    final_indices.extend(cls_indices)
    
    # Hitung kekurangan sampel
    n_needed = max_count - n_current
    
    if n_needed > 0:
        print(f"Kelas {cls}: Menambah {n_needed} sampel sintetik.")
        # Ambil index acak dari kelas tersebut (Resampling)
        random_indices = np.random.choice(cls_indices, size=n_needed, replace=True)
        final_indices.extend(random_indices)
    else:
        print(f"Kelas {cls}: Sudah mayoritas (tidak ditambah).")

# 3. Buat Array Index Final dan Acak Urutannya (Shuffle)
final_indices = np.array(final_indices)
np.random.seed(42)
np.random.shuffle(final_indices)

# 4. Ambil Data X dan y berdasarkan Index Final
# Ini langsung memotong array X_features tanpa mengubahnya jadi DataFrame
X_resampled = x_train_features[final_indices]
y_resampled = y_train_array[final_indices]

print("\n=== Oversampling Selesai ===")
print(f"Dimensi X_resampled: {X_resampled.shape}")
print(f"Dimensi y_resampled: {y_resampled.shape}")

# Cek distribusi baru
unique, counts = np.unique(y_resampled, return_counts=True)
print(f"Distribusi Akhir: {dict(zip(unique, counts))}")

Kelas 0: Sudah mayoritas (tidak ditambah).
Kelas 1: Menambah 2452 sampel sintetik.
Kelas 2: Menambah 2097 sampel sintetik.

=== Oversampling Selesai ===
Dimensi X_resampled: (13422, 1000)
Dimensi y_resampled: (13422,)
Distribusi Akhir: {np.int64(0): np.int64(4474), np.int64(1): np.int64(4474), np.int64(2): np.int64(4474)}


In [26]:
df_dataset = pd.DataFrame(X_resampled)

In [27]:
df_dataset['label'] = y_resampled

### 1. Gini Index

In [28]:
def hitung_gini(groups, classes):
 n_instance = float(sum(len(group) for group in groups))
 gini = 0.0

 for group in groups:
  size = float(len(group))
  if size == 0:
   continue

  score = 0.0
  
  y_group = group.iloc[:, -1]

  for class_val in classes:
   p = (y_group == class_val).sum() / size
   score += p ** 2
  gini += (1.0 - score) * (size / n_instance)
 return gini

In [29]:
def hitung_split(df_dataset, feat_idx, threshold):
 left = df_dataset[df_dataset.iloc[:,feat_idx] < threshold]
 right = df_dataset[df_dataset.iloc[:, feat_idx] >= threshold]
 return left, right

In [30]:
def get_best_split(df_dataset, classes):
    best_gini = float('inf')
    best_split = {}
    n_features = df_dataset.shape[1] - 1
    
    # Tips: Untuk percobaan cepat, batasi fitur (misal: range(min(100, n_features)))
    # Jika ingin full, biarkan range(n_features)
    for feat_idx in range(n_features):
        
        # --- PERBAIKAN: Pakai .unique() biar cepat ---
        thresholds = df_dataset.iloc[:, feat_idx].unique()
        
        for threshold in thresholds:
            groups = hitung_split(df_dataset, feat_idx, threshold)
            gini = hitung_gini(groups, classes)
            
            if gini < best_gini:
                best_gini = gini
                best_split = {
                    'feat_idx' : feat_idx,
                    'val' : threshold,
                    'groups' : groups
                }
    return best_split

In [31]:
# 1. Fungsi untuk menentukan hasil akhir (Label) jika sudah di ujung pohon (daun)
def to_terminal(group):
    # Mengambil modus (label yang paling sering muncul) di kolom terakhir
    return group.iloc[:, -1].mode()[0]

# 2. Fungsi Rekursif Membangun Pohon
def build_tree(df_dataset, classes, depth, max_depth):
    # Cari split terbaik untuk node saat ini
    root = get_best_split(df_dataset, classes)
    
    # Jika tidak ditemukan split (misal data sudah murni), jadikan terminal node
    if not root:
        return to_terminal(df_dataset) 
    
    # Ambil hasil split
    left, right = root['groups']
    del(root['groups']) # Hapus data dari dictionary agar hemat memori
    
    # Jika salah satu sisi kosong, berarti tidak bisa di-split lagi -> buat terminal
    if len(left) == 0 or len(right) == 0:
        no_split_data = pd.concat([left, right])
        root['left'] = root['right'] = to_terminal(no_split_data)
        return root
    
    # Jika sudah mencapai kedalaman maksimum, stop dan buat terminal
    if depth >= max_depth:
        root['left'] = to_terminal(left)
        root['right'] = to_terminal(right)
        return root
    
    # Jika belum stop, panggil lagi fungsi ini (rekursif) untuk anak kiri dan kanan
    root['left'] = build_tree(left, classes, depth + 1, max_depth)
    root['right'] = build_tree(right, classes, depth + 1, max_depth)
    
    return root

In [32]:
def predict_row(node, row):
    if not isinstance(node, dict):
        return node
    
    if row.iloc[node['feat_idx']] < node['val']:
        return predict_row(node['left'], row)
    else:
        return predict_row(node['right'], row)
    
def predict_batch(tree, df_test):
    predictions = []
    for index, row in df_test.iterrows():
        prediction = predict_row(tree, row)
        predictions.append(prediction)
    return predictions

In [33]:
# 1. Training (Gunakan data hasil oversampling & TF-IDF yang sudah kamu buat sebelumnya)
# df_dataset ini sudah berisi fitur X_resampled dan kolom 'label' di akhir
my_tree = build_tree(df_dataset, classes=[0, 1, 2], depth=0, max_depth=5)

# 2. Testing (Siapkan data test agar formatnya sama dengan data train)
# Kamu harus menjadikan x_test_features sebagai DataFrame dulu
test_data = pd.DataFrame(x_test_features)
# (Opsional) Tambahkan label jika ingin mengecek akurasi nanti, tapi untuk prediksi tidak wajib
# test_data['label'] = y_test.values 

# 3. Prediksi
predictions_test = predict_batch(my_tree, test_data)
predictions_train = predict_batch(my_tree, df_dataset) # Prediksi ke data train sendiri