In [130]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# --- 1. SETUP LIBRARY (ENGLISH VERSION) ---
nltk.download('stopwords')
nltk.download('wordnet') # Untuk Lemmatization
nltk.download('omw-1.4')

# Load Stopwords Inggris
list_stopwords = set(stopwords.words('english'))

# Pilih salah satu: Stemmer (Potong imbuhan) atau Lemmatizer (Cari kata dasar baku)
# Rekomendasi: Lemmatizer lebih bagus untuk analisis sentimen/konteks
lemmatizer = WordNetLemmatizer()
# stemmer = PorterStemmer() # Opsi alternatif kalau mau cepat & kasar

# Kamus Singkatan (Bahasa Inggris banyak singkatan chat)
kamus_singkatan = {
    "u": "you",
    "ur": "your",
    "r": "are",
    "n": "and",
    "d": "the",
    "b4": "before",
    "c": "see",
    "lol": "laughing out loud",
    "txt": "text",
    "2": "to",
    "4": "for",
    # Tambahkan lagi sesuai temuan di data
}

# --- 2. FUNGSI LINGUISTIC PROCESSING ---
def linguistic_processing_english(list_of_words):
    clean_words = []
    
    for word in list_of_words:
        # A. Normalisasi Singkatan (Chat/Alay Inggris)
        word = kamus_singkatan.get(word, word)
        
        # B. Stopword Removal
        if word not in list_stopwords:
            # C. Lemmatization (Lebih bagus dari Stemming untuk Inggris)
            # Contoh: "better" -> "good", "running" -> "run"
            lemma_word = lemmatizer.lemmatize(word)
            clean_words.append(lemma_word)
            
    return " ".join(clean_words)

# --- 3. EKSEKUSI PADA DATAFRAME ---

# Load Data
df = pd.read_csv('email.csv')

# --- TAMBAHAN WAJIB: BERSIHKAN DATA SAMPAH ---
# Cek apakah ada label aneh
print("Kategori sebelum dibersihkan:", df['Category'].unique())

# Lakukan Mapping
df['label_enc'] = df['Category'].map({'ham': 0, 'spam': 1})

# Hapus baris yang labelnya NaN (Sampah tadi)
df.dropna(subset=['label_enc'], inplace=True)

# Pastikan tipe data integer
df['label_enc'] = df['label_enc'].astype(int)

print("Jumlah data bersih:", len(df))

# TAHAP 1: Cleaning Cepat (Pandas)
# Hapus angka, tanda baca, lower, dan split
df['temp_tokens'] = df['Message'].str.lower() \
                                 .str.replace(r'[^a-z\s]', ' ', regex=True) \
                                 .str.split()

# TAHAP 2: Linguistic Processing (NLTK)
print("Sedang memproses NLTK (English)...")
df['clean_message'] = df['temp_tokens'].apply(linguistic_processing_english)

# Hapus kolom bantuan
df.drop(columns=['temp_tokens'], inplace=True)

# Cek Hasil
print(df[['Message', 'clean_message']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nahls\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nahls\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nahls\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Kategori sebelum dibersihkan: ['ham' 'spam' '{"mode":"full"']
Jumlah data bersih: 5572
Sedang memproses NLTK (English)...
                                             Message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                       clean_message  
0  go jurong point crazy available bugis great wo...  
1                              ok lar joking wif oni  
2  free entry wkly comp win fa cup final tkts st ...  
3                  dun say early hor see already say  
4                nah think go usf life around though  


In [131]:
# 1. Pisahkan Data Ham dan Spam
df_ham = df[df['label_enc'] == 0]
df_spam = df[df['label_enc'] == 1]

print(f"Jumlah Awal -> Ham: {len(df_ham)}, Spam: {len(df_spam)}")

# 2. Oversample Spam (Duplikasi data spam agar sama banyak dengan Ham)
# Kita ambil sampel spam secara acak berulang-ulang sampai jumlahnya sama dengan Ham
df_spam_oversampled = df_spam.sample(n=len(df_ham), replace=True, random_state=42)

# 3. Gabungkan Kembali
df_balanced = pd.concat([df_ham, df_spam_oversampled])

# 4. Acak (Shuffle) agar posisi Ham dan Spam tercampur
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Jumlah Setelah Oversampling -> Ham: {len(df_balanced[df_balanced['label_enc']==0])}, Spam: {len(df_balanced[df_balanced['label_enc']==1])}")

# 5. Gunakan df_balanced untuk Split Data selanjutnya
split_index = int(len(df_balanced) * 0.8)
df_train = df_balanced.iloc[:split_index]
df_test = df_balanced.iloc[split_index:]

# ... (Lanjut ke TF-IDF dan Training seperti biasa) ...

Jumlah Awal -> Ham: 4825, Spam: 747
Jumlah Setelah Oversampling -> Ham: 4825, Spam: 4825


In [132]:
def fit_transform_tfidf(text_series):
    # --- A. BANGUN VOCABULARY (Kamus Kata) ---
    # Ambil semua kata unik di data training
    print("Membangun Vocabulary...")
    all_words = " ".join(text_series).split()
    vocab = sorted(list(set(all_words)))
    
    # Mapping kata ke index biar cepat (word: index)
    vocab_index = {word: i for i, word in enumerate(vocab)}
    n_docs = len(text_series)
    n_vocab = len(vocab)
    
    # --- B. HITUNG DF (Document Frequency) ---
    # Berapa banyak dokumen yang mengandung kata X?
    print("Menghitung IDF...")
    df_counts = dict.fromkeys(vocab, 0)
    
    for text in text_series:
        # Set(text.split()) biar kata yang muncul 2x dalam 1 SMS tetap dihitung 1
        unique_words_in_doc = set(text.split())
        for word in unique_words_in_doc:
            if word in df_counts:
                df_counts[word] += 1
                
    # --- C. HITUNG IDF (Inverse Document Frequency) ---
    # Rumus: log(Total Dokumen / (Jumlah Dokumen yg ada kata itu + 1))
    idf_values = {}
    for word, count in df_counts.items():
        # +1 di penyebut untuk smoothing (biar ga error bagi 0)
        idf_values[word] = np.log(n_docs / (count + 1))
        
    # --- D. HITUNG TF-IDF (TF * IDF) ---
    print("Menghitung TF-IDF Matrix...")
    # Kita pakai Numpy Array kosong dulu
    tfidf_matrix = np.zeros((n_docs, n_vocab))
    
    for row, text in enumerate(text_series):
        words = text.split()
        if len(words) == 0: continue
            
        # Hitung TF (Frekuensi kata di SMS ini)
        word_counts = {}
        for w in words:
            word_counts[w] = word_counts.get(w, 0) + 1
            
        # Isi Matrix
        for word, count in word_counts.items():
            if word in vocab_index:
                # Rumus TF Biasa: Jumlah Kemunculan
                tf = count 
                # (Opsional: Bisa pakai tf = count / len(words) untuk normalisasi)
                
                col_idx = vocab_index[word]
                tfidf_matrix[row, col_idx] = tf * idf_values[word]
                
    return tfidf_matrix, vocab, vocab_index, idf_values

def transform_tfidf(text_series, vocab_index, idf_values):
    # Fungsi ini untuk DATA TEST. Tidak belajar vocab baru, cuma pakai yang ada.
    n_docs = len(text_series)
    n_vocab = len(vocab_index)
    tfidf_matrix = np.zeros((n_docs, n_vocab))
    
    for row, text in enumerate(text_series):
        words = text.split()
        word_counts = {}
        for w in words:
            word_counts[w] = word_counts.get(w, 0) + 1
            
        for word, count in word_counts.items():
            # Cuma hitung kata yang ada di Vocabulary Training
            if word in vocab_index:
                tf = count
                col_idx = vocab_index[word]
                tfidf_matrix[row, col_idx] = tf * idf_values[word]
                
    return tfidf_matrix

In [133]:
# 1. Latih pada Data Train
X_train_matrix, vocab, vocab_index, idf_model = fit_transform_tfidf(df_train['clean_message'])

# 2. Terapkan pada Data Test (Jangan intip data test!)
X_test_matrix = transform_tfidf(df_test['clean_message'], vocab_index, idf_model)

# 3. Siapkan Label (y)
y_train = df_train['label_enc'].values
y_test = df_test['label_enc'].values

print("Selesai!")
print(f"Ukuran Matrix Train: {X_train_matrix.shape}")
print(f"Ukuran Matrix Test : {X_test_matrix.shape}")
# Harusnya (Jumlah Data, Jumlah Kata Unik)

Membangun Vocabulary...
Menghitung IDF...
Menghitung TF-IDF Matrix...
Selesai!
Ukuran Matrix Train: (7720, 6466)
Ukuran Matrix Test : (1930, 6466)


In [134]:
# --- Tambahkan Fungsi Ini ---
def normalize_l2(matrix):
    # Hitung akar kuadrat dari jumlah kuadrat setiap baris (Magnitude)
    norms = np.sqrt(np.sum(matrix**2, axis=1, keepdims=True))
    
    # Hindari pembagian dengan 0 (kalau ada baris kosong)
    norms[norms == 0] = 1
    
    # Bagi setiap nilai dengan magnitude-nya agar range-nya 0-1
    return matrix / norms

# --- Panggil Fungsi Ini Sebelum Training ---
# (Lakukan setelah cell TF-IDF selesai)

print("Melakukan Normalisasi L2...")
X_train_matrix = normalize_l2(X_train_matrix)
X_test_matrix = normalize_l2(X_test_matrix)

print("Nilai max sekarang:", np.max(X_train_matrix)) # Harusnya maks 1.0

Melakukan Normalisasi L2...
Nilai max sekarang: 1.0


In [135]:
def linear_equation(X, w, b):
    # X: Data fitur
    # w: Bobot (weights)
    # b: Bias
    z = np.dot(X, w) + b
    return z

In [136]:
def sigmoid(z):
    # z: Hasil dari linear equation
    return 1 / (1 + np.exp(-z))

In [137]:
def log_loss(y_true, y_pred):
    # y_true: Kunci jawaban asli (0 atau 1)
    # y_pred: Prediksi probabilitas dari sigmoid
    
    epsilon = 1e-15
    # Jepit nilai biar gak 0 murni atau 1 murni
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    loss = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return loss

In [138]:
def grad_desc(X, y_true, y_pred):
    m = X.shape[0] # Jumlah data
    
    # Hitung selisih tebakan (Error term)
    error = y_pred - y_true
    
    # Turunan terhadap bobot (w)
    dw = (1 / m) * np.dot(X.T, error)
    
    # Turunan terhadap bias (b)
    db = (1 / m) * np.sum(error)
    
    return dw, db   

In [139]:
def update_parameters(w, b, dw, db, learning_rate):
    
    w_new = w - learning_rate * dw
    b_new = b - learning_rate * db
    
    return w_new, b_new

In [140]:
# 1. Inisialisasi Parameter Awal (Semuanya nol)
# Kita butuh w sebanyak jumlah kolom fitur (X_train.shape[1])
w = np.zeros(X_train_matrix.shape[1])
b = 0
learning_rate = 10
epochs = 1001 # Jumlah putaran belajar
loss_history = []

# 2. Loop Belajar
print("Mulai Training:")
for i in range(epochs):
    # A. Maju (Forward Pass)
    z = linear_equation(X_train_matrix, w, b)       # Rumus 1
    y_pred = sigmoid(z)                      # Rumus 2
    
    # B. Hitung Error (Opsional, buat grafik)
    loss = log_loss(y_train, y_pred)     # Rumus 3
    loss_history.append(loss)
    
    # C. Mundur (Backward Pass - Cari arah perbaikan)
    dw, db = grad_desc(X_train_matrix, y_train, y_pred) # Rumus 4
    
    # D. Update (Perbaiki bobot)
    w, b = update_parameters(w, b, dw, db, learning_rate) # Rumus 5
    
    # Print progress setiap 100 putaran
    if i % 50 == 0:
        print(f"Epoch {i}: Loss = {loss:.4f}")

print("Training Selesai")

Mulai Training:
Epoch 0: Loss = 0.6931
Epoch 50: Loss = 0.3191
Epoch 100: Loss = 0.2298
Epoch 150: Loss = 0.1851
Epoch 200: Loss = 0.1571
Epoch 250: Loss = 0.1374
Epoch 300: Loss = 0.1226
Epoch 350: Loss = 0.1111
Epoch 400: Loss = 0.1017
Epoch 450: Loss = 0.0940
Epoch 500: Loss = 0.0874
Epoch 550: Loss = 0.0818
Epoch 600: Loss = 0.0769
Epoch 650: Loss = 0.0727
Epoch 700: Loss = 0.0689
Epoch 750: Loss = 0.0655
Epoch 800: Loss = 0.0624
Epoch 850: Loss = 0.0597
Epoch 900: Loss = 0.0572
Epoch 950: Loss = 0.0549
Epoch 1000: Loss = 0.0528
Training Selesai


In [191]:
def predict(X, w, b, threshold=0.6):
    # 1. Hitung probabilitas (sama kayak waktu training)
    z = linear_equation(X, w, b)
    y_prob = sigmoid(z)
    
    # 2. Ubah probabilitas jadi kelas (0 atau 1)
    # List comprehension: Kalau p > 0.5 jadi 1, selain itu 0
    y_class = [1 if p > threshold else 0 for p in y_prob]
    
    return np.array(y_class)

In [196]:
predictions = predict(X_test_matrix, w, b)
akurasi_test = np.sum(predictions == y_test) / len(X_test_matrix)
print(f"{akurasi_test:.2f}")

0.99


In [197]:
prediction_train = predict(X_train_matrix, w, b)
akurasi_train = np.sum(prediction_train == y_train) / len(X_train_matrix)
print(f"{akurasi_train:.2f}")

0.99


In [198]:
TP = np.sum((predictions == 1) & (y_test == 1))
FP = np.sum((predictions == 1) & (y_test == 0))
TN = np.sum((predictions == 0) & (y_test == 0))
FN = np.sum((predictions == 0) & (y_test == 1))

print(f"True Positive: {TP}")
print(f"False Positive: {FP}")
print(f"True Negative: {TN}")
print(f"False Negative: {FN}")

True Positive: 944
False Positive: 8
True Negative: 965
False Negative: 13


In [200]:
precision = TP / (TP + FP)
print(f"{precision:.2f}")

0.99
