# **1. Import Library**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
import joblib

# **2. Load Dataset from Clustering Result**

In [2]:
# Read the labelled dataset (per the section title, the clustering result)
# from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data_clustering_inverse.csv")

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,TransactionAmount,TransactionType,Location,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,AgeGroup,Target
0,14.09,Debit,San Diego,ATM,70.0,Doctor,81.0,1.0,5112.21,Old,1
1,376.24,Debit,Houston,ATM,68.0,Doctor,141.0,1.0,13758.91,Old,0
2,126.29,Debit,Mesa,Online,19.0,Student,56.0,1.0,1122.35,Young,1
3,184.5,Debit,Raleigh,Online,26.0,Student,25.0,1.0,8569.06,Young,1
4,92.15,Debit,Oklahoma City,ATM,18.0,Student,172.0,1.0,781.68,Young,1


## **Feature Encoding: One Hot Encoding**

In [3]:
# Every column stored with the generic `object` dtype is treated as categorical.
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# One-hot encode those columns with 'pd.get_dummies'. `drop_first=True` drops
# the first level of each column, so a column with k categories becomes k-1
# indicator columns.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Preview the encoded frame as the cell output.
df_encoded.head()

Unnamed: 0,TransactionAmount,CustomerAge,TransactionDuration,LoginAttempts,AccountBalance,Target,TransactionType_Debit,Location_Atlanta,Location_Austin,Location_Baltimore,...,Location_Tucson,Location_Virginia Beach,Location_Washington,Channel_Branch,Channel_Online,CustomerOccupation_Engineer,CustomerOccupation_Retired,CustomerOccupation_Student,AgeGroup_Old,AgeGroup_Young
0,14.09,70.0,81.0,1.0,5112.21,1,True,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,376.24,68.0,141.0,1.0,13758.91,0,True,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,126.29,19.0,56.0,1.0,1122.35,1,True,False,False,False,...,False,False,False,False,True,False,False,True,False,True
3,184.5,26.0,25.0,1.0,8569.06,1,True,False,False,False,...,False,False,False,False,True,False,False,True,False,True
4,92.15,18.0,172.0,1.0,781.68,1,True,False,False,False,...,False,False,False,False,False,False,False,True,False,True


# **3. Data Splitting**
The Data Splitting stage separates the dataset into two parts: training data (training set, 80%) and test data (test set, 20%), stratified on the target so that both parts keep the same class proportions.

In [4]:
# Separate the label column from the feature matrix.
y = df_encoded['Target']
X = df_encoded.drop(columns='Target')

# Split with train_test_split(): 20% of the rows are held out as the test set.
# Stratifying on y keeps the class proportions equal in both parts, and the
# fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Report the number of rows in the full data and in each part.
for _label, _part in (
    ("Total data amount: ", X),
    ("Amount of training data: ", X_train),
    ("Number of test data: ", X_test),
):
    print(_label, len(_part))

Total data amount:  1945
Amount of training data:  1556
Number of test data:  389


# **4. Building a Classification Model**
After selecting a suitable classification algorithm, the next step is to train the model on the training data.

1. Choose a classification algorithm, namely Decision Tree.
2. Train the model on the training split produced above.

In [5]:
# Decision Tree classifier: build the estimator (seeded so the fitted tree is
# repeatable) and fit it on the training split in one step. fit() returns the
# estimator itself, so the assignment holds the fitted model.
decision_tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Show the fitted estimator as the cell output.
decision_tree_model

In [6]:
# Persist the fitted Decision Tree to disk.
# The file is written by joblib, so it has to be read back with joblib.load()
# regardless of the '.h5' suffix in its name.
joblib.dump(value=decision_tree_model, filename='decision_tree_model.h5')

['decision_tree_model.h5']

# **5. Building a Random Forest Model**

In [7]:
# Second classifier, using a scikit-learn algorithm other than Decision Tree:
# a Random Forest, seeded so the fitted ensemble is repeatable.
# Build the estimator and fit it on the training data (X_train and y_train);
# fit() returns the estimator itself.
new_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Show the fitted estimator as the cell output.
new_model

In [8]:
# Evaluate every model trained so far on the held-out test set.
# classification_report prints precision, recall and F1-score per class,
# together with the overall accuracy.

# Predict the labels of 'X_test' with both models
y_pred_dt = decision_tree_model.predict(X_test)
y_pred_new = new_model.predict(X_test)

# One report per model, with a divider line between consecutive reports
_reports = [
    ("Decision Tree Performance", y_pred_dt),
    ("New Model Performance", y_pred_new),
]
for _position, (_title, _predictions) in enumerate(_reports):
    if _position > 0:
        print("="*50)
    print(_title)
    print(classification_report(y_test, _predictions))

Decision Tree Performance
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       196
           1       1.00      1.00      1.00       193

    accuracy                           1.00       389
   macro avg       1.00      1.00      1.00       389
weighted avg       1.00      1.00      1.00       389

New Model Performance
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       196
           1       1.00      1.00      1.00       193

    accuracy                           1.00       389
   macro avg       1.00      1.00      1.00       389
weighted avg       1.00      1.00      1.00       389



In [9]:
# Persist the fitted Random Forest (the non-Decision-Tree model) to disk.
# The file is written by joblib, so it has to be read back with joblib.load()
# regardless of the '.h5' suffix in its name.
joblib.dump(value=new_model, filename='explore_RandomForest_classification.h5')

['explore_RandomForest_classification.h5']

## **Hyperparameter Tuning Model**

In [10]:
# Perform Hyperparameter Tuning and Retrain.

# Hyperparameter grid to search: 3 x 3 x 3 x 2 = 54 combinations.
params = {'n_estimators': [100, 200, 300],
          'max_depth': [10, 20, 30],
          'min_samples_split': [2, 5, 10],
          'criterion': ['gini', 'entropy']
          }

# Exhaustive grid search over a seeded Random Forest, scored by accuracy with
# 5-fold cross-validation (54 combinations x 5 folds = 270 fits).
# n_jobs=-1 spreads those independent fits over all available CPU cores; the
# estimator is seeded, so the selected parameters and scores are unchanged.
new_model_tuned = GridSearchCV(
    estimator = RandomForestClassifier(random_state=42),
    param_grid = params,
    cv = 5,
    scoring = 'accuracy',
    n_jobs = -1
)

# Run the search on the training data (X_train and y_train); the fitted search
# object is shown as the cell output.
new_model_tuned.fit(X_train, y_train)

In [11]:
# Evaluate the tuned model on the held-out test set: precision, recall and
# F1-score per class, together with the overall accuracy.

# Predict the labels of 'X_test' with the fitted grid-search object
y_pred_tuning = new_model_tuned.predict(X_test)

# Print the heading followed by the classification report
print(
    "Tuned Model Performance",
    classification_report(y_test, y_pred_tuning),
    sep="\n",
)

Tuned Model Performance
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       196
           1       1.00      1.00      1.00       193

    accuracy                           1.00       389
   macro avg       1.00      1.00      1.00       389
weighted avg       1.00      1.00      1.00       389



In [12]:
# Persist the tuning result to disk. The object saved here is the whole fitted
# GridSearchCV instance, not only the best estimator it found.
# The file is written by joblib, so it has to be read back with joblib.load()
# regardless of the '.h5' suffix in its name.
joblib.dump(value=new_model_tuned, filename='tuning_classification.h5')

['tuning_classification.h5']