# **[Klasifikasi] Submission Akhir BMLP_Ananta Boemi Adji**

# **1. Import Library**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

import joblib
import os
import gdown

# **2. Memuat Dataset dari Hasil Clustering**

In [2]:
# Source of the clustering result (Google Drive) and the local file it is saved to.
gdrive_url = 'https://drive.google.com/uc?id=1uzY_agf0RohaSHA0EJxMaYLRYueTzAZI'
download_path = 'data_clustering_inverse.csv'

# Reuse an existing local copy so re-running the notebook does not hit the
# network again; only download when the file is missing.
if not os.path.isfile(download_path):
    downloaded_file = gdown.download(gdrive_url, download_path, quiet=False)
    # Some gdown versions return None instead of raising when the download
    # fails; stop here rather than letting read_csv fail with a confusing error.
    if downloaded_file is None:
        raise RuntimeError(
            f"Failed to download dataset from {gdrive_url} to {download_path}"
        )

df = pd.read_csv(download_path)

Downloading...
From: https://drive.google.com/uc?id=1uzY_agf0RohaSHA0EJxMaYLRYueTzAZI
To: /content/data_clustering_inverse.csv
100%|██████████| 260k/260k [00:00<00:00, 7.96MB/s]


**Sedikit Cleaning Dataset**

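Kolom `TransactionDate` dan `PreviousTransactionDate` harus diubah terlebih dahulu menjadi fitur numerik sebelum dapat digunakan. Karena itu, di sini saya mengonversi kedua kolom tersebut ke tipe datetime, mengekstrak fitur numerik (jam, hari dalam minggu, bulan, dan selisih hari dari transaksi sebelumnya), lalu menerapkan one-hot encoding pada kolom kategorikal.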

In [3]:
# Parse both timestamp columns once; every derived feature below is built from these.
txn_time = pd.to_datetime(df['TransactionDate'])
prev_txn_time = pd.to_datetime(df['PreviousTransactionDate'])

# Swap the raw timestamp columns for numeric features derived from them.
df = df.drop(columns=['TransactionDate', 'PreviousTransactionDate']).assign(
    TransactionHour=txn_time.dt.hour,
    TransactionDayOfWeek=txn_time.dt.dayofweek,
    TransactionMonth=txn_time.dt.month,
    # Whole days elapsed between the previous and the current transaction.
    DaysSincePreviousTransaction=(txn_time - prev_txn_time).dt.days,
)

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column.
categorical_cols = ['TransactionType', 'Location', 'Channel', 'CustomerOccupation', 'AgeGroup']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

Selanjutnya, saya memeriksa dan mengisi nilai kosong (missing values) pada dataset agar model Logistic Regression dan KNN dapat dilatih.

In [4]:
# Number of missing values in each column after feature engineering.
missing_per_column = df.isna().sum()
print(missing_per_column)

TransactionAmount              0
CustomerAge                    0
TransactionDuration            0
LoginAttempts                  0
AccountBalance                 0
                              ..
CustomerOccupation_Engineer    0
CustomerOccupation_Retired     0
CustomerOccupation_Student     0
AgeGroup_1                     0
AgeGroup_2                     0
Length: 61, dtype: int64


In [5]:
# Column groups by dtype: numeric columns get the median, object columns the mode.
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Numeric columns that actually contain gaps are filled with their own median.
for col in [c for c in num_cols if df[c].isna().any()]:
    column_median = df[col].median()
    df[col] = df[col].fillna(column_median)

# Object columns that actually contain gaps are filled with their most frequent value.
for col in [c for c in cat_cols if df[c].isna().any()]:
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)

In [6]:
# Preview the first rows of the encoded and imputed frame.
df.head()

Unnamed: 0,TransactionAmount,CustomerAge,TransactionDuration,LoginAttempts,AccountBalance,LoginFreq,Target,TransactionHour,TransactionDayOfWeek,TransactionMonth,...,Location_Tucson,Location_Virginia Beach,Location_Washington,Channel_Branch,Channel_Online,CustomerOccupation_Engineer,CustomerOccupation_Retired,CustomerOccupation_Student,AgeGroup_1,AgeGroup_2
0,14.09,70.0,81.0,1.0,5112.21,0,2,16.0,1.0,4.0,...,False,False,False,False,False,False,False,False,False,True
1,376.24,68.0,141.0,1.0,13758.91,0,2,16.0,1.0,6.0,...,False,False,False,False,False,False,False,False,False,True
2,126.29,19.0,56.0,1.0,1122.35,0,1,18.0,0.0,7.0,...,False,False,False,False,True,False,False,True,True,False
3,184.5,26.0,25.0,1.0,8569.06,0,2,16.0,4.0,5.0,...,False,False,False,False,True,False,False,True,True,False
4,13.45,45.0,198.0,1.0,7429.4,0,3,17.0,0.0,10.0,...,False,False,False,False,True,False,False,True,False,False


# **3. Data Splitting**
Bagian ini memisahkan dataset menjadi dua bagian, yaitu data latih (training set) dan data uji (test set), yang akan digunakan dalam pembuatan model.

In [7]:
# Separate the label column from the feature matrix.
target_column = 'Target'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; stratify on the label so both
# partitions keep the same class proportions, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# **4. Membangun Model Klasifikasi**
Setelah memilih algoritma klasifikasi yang sesuai, langkah selanjutnya adalah melatih model menggunakan data latih.

Berikut adalah rekomendasi tahapannya.
1. Menggunakan algoritma klasifikasi yaitu Decision Tree.
2. Latih model menggunakan data yang sudah dipisah.

In [8]:
# Baseline classifier: a decision tree with a fixed seed, trained on the
# unscaled training features.
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X=X_train, y=y_train)

In [9]:
# Persist the fitted decision tree to disk.
# NOTE(review): '.h5' usually denotes HDF5, but joblib.dump writes joblib's own
# pickle-based format; reload this file with joblib.load.
joblib.dump(decision_tree_model, 'decision_tree_model.h5')

['decision_tree_model.h5']

# **5. Memenuhi Kriteria Skilled dan Advanced dalam Membangun Model Klasifikasi**



In [10]:
# Fit the scaler on the training features only, then apply the same
# transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The random forest is trained on the unscaled features.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Logistic regression and KNN are trained on the standardized features.
logreg_model = LogisticRegression(max_iter=2000).fit(X_train_scaled, y_train)

knn_model = KNeighborsClassifier()
knn_model.fit(X_train_scaled, y_train)

In [11]:
def evaluate_model(model, X_test, y_test, scaled=False, fitted_scaler=None):
    """Print accuracy and weighted precision/recall/F1 of ``model`` on a test set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : unscaled feature matrix to evaluate on.
    y_test : true labels matching ``X_test``.
    scaled : bool, default False
        Set to True for models that were trained on standardized features;
        ``X_test`` is then transformed before prediction.
    fitted_scaler : optional transformer exposing ``transform``
        Scaler used when ``scaled`` is True. When omitted, the notebook-level
        ``scaler`` (fitted on the training data) is used.

    Returns
    -------
    None. The metrics are printed.
    """
    if scaled:
        # Transform the data that was actually passed in instead of silently
        # reading the global X_test_scaled, so any evaluation set can be scored.
        active_scaler = scaler if fitted_scaler is None else fitted_scaler
        X_eval = active_scaler.transform(X_test)
    else:
        X_eval = X_test

    y_pred = model.predict(X_eval)
    print(f"Model: {model.__class__.__name__}")
    print("Accuracy :", accuracy_score(y_test, y_pred))
    # Weighted averaging weights each class's score by its support.
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall   :", recall_score(y_test, y_pred, average='weighted'))
    print("F1-Score :", f1_score(y_test, y_pred, average='weighted'))
    print("-" * 40)

evaluate_model(rf_model, X_test, y_test)
evaluate_model(logreg_model, X_test, y_test, scaled=True, fitted_scaler=scaler)
evaluate_model(knn_model, X_test, y_test, scaled=True, fitted_scaler=scaler)

Model: RandomForestClassifier
Accuracy : 0.972972972972973
Precision: 0.9744251234859465
Recall   : 0.972972972972973
F1-Score : 0.9731288367083528
----------------------------------------
Model: LogisticRegression
Accuracy : 0.9168399168399168
Precision: 0.9178086450256429
Recall   : 0.9168399168399168
F1-Score : 0.9171207505923771
----------------------------------------
Model: KNeighborsClassifier
Accuracy : 0.46153846153846156
Precision: 0.4757414002296112
Recall   : 0.46153846153846156
F1-Score : 0.4570656782485503
----------------------------------------


In [12]:
# Persist the three exploratory models to disk, one file per model.
# NOTE(review): '.h5' usually denotes HDF5, but joblib.dump writes joblib's own
# pickle-based format; reload these files with joblib.load.
joblib.dump(rf_model, 'explore_RandomForest_classification.h5')
joblib.dump(logreg_model, 'explore_LogisticRegression_classification.h5')
joblib.dump(knn_model, 'explore_KNN_classification.h5')

['explore_KNN_classification.h5']

Di sini saya melakukan hyperparameter tuning untuk algoritma Random Forest menggunakan GridSearchCV.

In [13]:
# Candidate hyperparameters: 2 * 3 * 2 * 2 = 24 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Exhaustive search with 5-fold cross-validation, scored by weighted F1 and
# run on all available cores.
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    base_forest,
    param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the estimator built from the best-scoring combination.
best_rf_model = grid_search.best_estimator_

In [14]:
# Score the tuned random forest on the held-out test set (unscaled features).
evaluate_model(best_rf_model, X_test, y_test)

Model: RandomForestClassifier
Accuracy : 0.975051975051975
Precision: 0.9765937221477577
Recall   : 0.975051975051975
F1-Score : 0.9752419053907887
----------------------------------------


In [15]:
# Persist the tuned random forest to disk.
# NOTE(review): '.h5' usually denotes HDF5, but joblib.dump writes joblib's own
# pickle-based format; reload this file with joblib.load.
joblib.dump(best_rf_model, 'tuning_classification.h5')

['tuning_classification.h5']

**Insight**: Terdapat langkah tambahan pada bagian klasifikasi ini karena `data_clustering_inverse.csv` perlu disesuaikan terlebih dahulu agar beberapa kolom kategorikal dapat digunakan untuk klasifikasi, daripada saya melakukan drop pada beberapa kolom yang sebenarnya bisa digunakan untuk training model.