In [1]:
import pandas as pd

training_dataset = pd.read_csv("./dataset/cleaned_dataset.csv")
training_dataset.drop(columns=['Cluster_DBSCAN'],inplace=True)
training_dataset.head()

Unnamed: 0,X_LOC,Y_LOC,DEPT,NPHI,DTC,SP,RHOB,GR,CALI,Lithology_code
0,455221.34375,6533321.5,2712.460002,0.218318,71.954613,115.249199,2.257217,58.31152,14.66645,30000.0
1,455221.34375,6533321.5,2726.596002,0.085455,59.167652,107.675835,2.548615,88.030212,12.50916,30000.0
2,455221.34375,6533321.5,2726.748002,0.114017,60.975471,106.604393,2.518298,86.302811,12.547709,30000.0
3,455221.34375,6533321.5,2726.900002,0.147875,62.972198,105.224342,2.49236,79.648369,12.620093,30000.0
4,455221.34375,6533321.5,2727.052002,0.180865,64.157539,103.775002,2.470284,67.485962,12.53736,30000.0


In [2]:
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
import pandas as pd

# Assume dataset_df_updated is your dataframe with 'Lithology_code' and feature columns

# Define the list of majority classes to undersample
majority_classes = [65000, 65030, 30000, 70000]  # Ganti dengan kode kelas mayoritas yang sesuai

# Get class frequencies to understand the initial distribution
class_counts = training_dataset['Lithology_code'].value_counts()

# Display initial distribution
print("Jumlah sampel sebelum undersampling:")
for cls in class_counts.index:
    print(f"Kelas {cls}: {class_counts[cls]} sampel")

# Identify minority classes (all classes not in majority_classes)
minority_classes = [cls for cls in class_counts.index if cls not in majority_classes]

target_size = 50000

# Features used for clustering
features_for_clustering = training_dataset.columns.to_list()

# List to hold the processed dataframes
df_list = []

# Process each class group
for cls, group in training_dataset.groupby('Lithology_code'):
    if cls in majority_classes and len(group) > target_size:
        # Perform clustering-based undersampling for majority class es exceeding target_size
        n_clusters = target_size
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        group['Cluster'] = kmeans.fit_predict(group[features_for_clustering])
        # Select one sample per cluster to maintain variation
        group_undersampled = group.groupby('Cluster').apply(
            lambda x: x.sample(1, random_state=42)
        ).reset_index(drop=True)
        df_list.append(group_undersampled.drop(columns=['Cluster']))
    else:
        # Keep all samples for minority classes or majority classes with size <= target_size
        df_list.append(group)

# Combine all processed dataframes
df_balanced = pd.concat(df_list, ignore_index=True)
df_balanced = shuffle(df_balanced, random_state=42).reset_index(drop=True)

# Display the class distribution after undersampling
print("\nDistribusi kelas setelah undersampling:")
print(df_balanced['Lithology_code'].value_counts())

Jumlah sampel sebelum undersampling:
Kelas 65000.0: 144966 sampel
Kelas 30000.0: 43855 sampel
Kelas 65030.0: 20284 sampel
Kelas 70000.0: 16767 sampel
Kelas 80000.0: 8245 sampel
Kelas 70032.0: 5343 sampel
Kelas 88000.0: 3919 sampel
Kelas 99000.0: 3824 sampel
Kelas 74000.0: 1109 sampel
Kelas 90000.0: 1027 sampel
Kelas 86000.0: 920 sampel
Kelas 93000.0: 141 sampel

Distribusi kelas setelah undersampling:
Lithology_code
65000.0    50000
30000.0    43855
65030.0    20284
70000.0    16767
80000.0     8245
70032.0     5343
88000.0     3919
99000.0     3824
74000.0     1109
90000.0     1027
86000.0      920
93000.0      141
Name: count, dtype: int64


In [4]:
df_balanced.to_csv("./dataset/undersampling_dataset_50.csv",index=False)

In [3]:
import pandas as pd

df_balanced = pd.read_csv("./dataset/undersampling_dataset_50.csv")
df_balanced.drop(columns=['RHOB','SP'], inplace=True)

FileNotFoundError: [Errno 2] No such file or directory: './dataset/undersampling_dataset_50.csv'

In [10]:
df_balanced.drop(columns=['SP'],inplace=True)

In [11]:
from pycaret.classification import *
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset (ganti dengan dataset yang sesuai)
data = df_balanced.copy()

# Pisahkan fitur dan target
target_column = "Lithology_code"  # Ganti dengan nama kolom target

# Split dataset into train and test
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Setup PyCaret
clf = setup(train_data, target=target_column, normalize=True, session_id=42)

# Compare models and find the best one
best_model = compare_models()

# Evaluate best model on training data
evaluate_model(best_model)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Lithology_code
2,Target type,Multiclass
3,Target mapping,"30000.0: 0, 65000.0: 1, 65030.0: 2, 70000.0: 3, 70032.0: 4, 74000.0: 5, 80000.0: 6, 86000.0: 7, 88000.0: 8, 90000.0: 9, 93000.0: 10, 99000.0: 11"
4,Original data shape,"(124347, 9)"
5,Transformed data shape,"(124347, 9)"
6,Transformed train set shape,"(87042, 9)"
7,Transformed test set shape,"(37305, 9)"
8,Numeric features,8
9,Rows with missing values,0.1%


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9595,0.9978,0.9595,0.9595,0.9594,0.9482,0.9482,0.681
rf,Random Forest Classifier,0.9563,0.9974,0.9563,0.9563,0.9562,0.9441,0.9441,2.182
knn,K Neighbors Classifier,0.9269,0.9858,0.9269,0.9268,0.9266,0.9065,0.9065,0.401
dt,Decision Tree Classifier,0.9263,0.953,0.9263,0.9263,0.9262,0.9058,0.9058,0.178
gbc,Gradient Boosting Classifier,0.8876,0.0,0.8876,0.8867,0.8857,0.8553,0.8558,43.305
lightgbm,Light Gradient Boosting Machine,0.8833,0.9476,0.8833,0.8867,0.8842,0.851,0.8511,2.549
qda,Quadratic Discriminant Analysis,0.6704,0.0,0.6704,0.6912,0.642,0.5746,0.5807,0.034
lr,Logistic Regression,0.6572,0.0,0.6572,0.6042,0.6143,0.543,0.5507,1.291
lda,Linear Discriminant Analysis,0.6366,0.0,0.6366,0.6024,0.6012,0.52,0.5253,0.052
svm,SVM - Linear Kernel,0.6216,0.0,0.6216,0.582,0.5861,0.501,0.5079,0.107


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [12]:
from sklearn.metrics import classification_report

# Evaluate best model on test data
test_predictions = predict_model(best_model, data=test_data)
print("Evaluation on test data:")
# print(test_predictions)

# Measure prediction quality
y_true = test_data[target_column]
y_pred = test_predictions['Lithology_code']
print("Classification Report:")
print(classification_report(y_true, y_pred))

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9622,0.998,0.9622,0.9621,0.962,0.9517,0.9517


Evaluation on test data:
Classification Report:
              precision    recall  f1-score   support

     30000.0       1.00      1.00      1.00      8813
     65000.0       1.00      1.00      1.00      9885
     65030.0       1.00      1.00      1.00      4147
     70000.0       1.00      1.00      1.00      3343
     70032.0       1.00      1.00      1.00      1034
     74000.0       1.00      1.00      1.00       222
     80000.0       1.00      1.00      1.00      1686
     86000.0       1.00      1.00      1.00       181
     88000.0       1.00      1.00      1.00       789
     90000.0       1.00      1.00      1.00       201
     93000.0       1.00      1.00      1.00        29
     99000.0       1.00      1.00      1.00       757

    accuracy                           1.00     31087
   macro avg       1.00      1.00      1.00     31087
weighted avg       1.00      1.00      1.00     31087



In [13]:
from pycaret.classification import save_model

# Simpan model
save_model(best_model, 'best_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['X_LOC', 'Y_LOC', 'DEPT', 'NPHI',
                                              'DTC', 'RHOB', 'GR', 'CALI'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_f...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini',
                                       max_depth=None, max_f