In [6]:
import pandas as pd

training_dataset = pd.read_csv("./dataset/undersampling_dataset_50_2.csv")
training_dataset.drop(columns=['PHI_S'],inplace=True)

In [6]:
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
import pandas as pd

# Assume dataset_df_updated is your dataframe with 'Lithology_code' and feature columns

# Define the list of majority classes to undersample
majority_classes = [65000, 65030, 30000, 70000]  # Ganti dengan kode kelas mayoritas yang sesuai

# Get class frequencies to understand the initial distribution
class_counts = training_dataset['Lithology_code'].value_counts()

# Display initial distribution
print("Jumlah sampel sebelum undersampling:")
for cls in class_counts.index:
    print(f"Kelas {cls}: {class_counts[cls]} sampel")

# Identify minority classes (all classes not in majority_classes)
minority_classes = [cls for cls in class_counts.index if cls not in majority_classes]

target_size = 50000

# Features used for clustering
features_for_clustering = training_dataset.columns.to_list()

# List to hold the processed dataframes
df_list = []

# Process each class group
for cls, group in training_dataset.groupby('Lithology_code'):
    if cls in majority_classes and len(group) > target_size:
        # Perform clustering-based undersampling for majority class es exceeding target_size
        n_clusters = target_size
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        group['Cluster'] = kmeans.fit_predict(group[features_for_clustering])
        # Select one sample per cluster to maintain variation
        group_undersampled = group.groupby('Cluster').apply(
            lambda x: x.sample(1, random_state=42)
        ).reset_index(drop=True)
        df_list.append(group_undersampled.drop(columns=['Cluster']))
    else:
        # Keep all samples for minority classes or majority classes with size <= target_size
        df_list.append(group)

# Combine all processed dataframes
df_balanced = pd.concat(df_list, ignore_index=True)
df_balanced = shuffle(df_balanced, random_state=42).reset_index(drop=True)

# Display the class distribution after undersampling
print("\nDistribusi kelas setelah undersampling:")
print(df_balanced['Lithology_code'].value_counts())

Jumlah sampel sebelum undersampling:
Kelas 65000.0: 144966 sampel
Kelas 30000.0: 43855 sampel
Kelas 65030.0: 20284 sampel
Kelas 70000.0: 16767 sampel
Kelas 80000.0: 8245 sampel
Kelas 70032.0: 5343 sampel
Kelas 88000.0: 3919 sampel
Kelas 99000.0: 3824 sampel
Kelas 74000.0: 1109 sampel
Kelas 90000.0: 1027 sampel
Kelas 86000.0: 920 sampel
Kelas 93000.0: 141 sampel

Distribusi kelas setelah undersampling:
Lithology_code
65000.0    50000
30000.0    43855
65030.0    20284
70000.0    16767
80000.0     8245
70032.0     5343
88000.0     3919
99000.0     3824
74000.0     1109
90000.0     1027
86000.0      920
93000.0      141
Name: count, dtype: int64


In [7]:
df_balanced.to_csv("./dataset/undersampling_dataset_50_2.csv",index=False)

In [7]:
from pycaret.classification import *
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset (ganti dengan dataset yang sesuai)
data = training_dataset.copy()

# Pisahkan fitur dan target
target_column = "Lithology_code"  # Ganti dengan nama kolom target

# Split dataset into train and test
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Setup PyCaret
clf = setup(train_data, target=target_column, normalize=True, session_id=42)

# Compare models and find the best one
best_model = compare_models()

# Evaluate best model on training data
evaluate_model(best_model)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Lithology_code
2,Target type,Multiclass
3,Target mapping,"30000.0: 0, 65000.0: 1, 65030.0: 2, 70000.0: 3, 70032.0: 4, 74000.0: 5, 80000.0: 6, 86000.0: 7, 88000.0: 8, 90000.0: 9, 93000.0: 10, 99000.0: 11"
4,Original data shape,"(124347, 13)"
5,Transformed data shape,"(124347, 13)"
6,Transformed train set shape,"(87042, 13)"
7,Transformed test set shape,"(37305, 13)"
8,Numeric features,12
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9681,0.9984,0.9681,0.9682,0.968,0.9592,0.9592,1.049
rf,Random Forest Classifier,0.9653,0.998,0.9653,0.9653,0.9651,0.9555,0.9556,3.778
xgboost,Extreme Gradient Boosting,0.9614,0.9978,0.9614,0.9614,0.9613,0.9506,0.9506,2.044
dt,Decision Tree Classifier,0.9359,0.9591,0.9359,0.936,0.9359,0.9181,0.9181,0.327
knn,K Neighbors Classifier,0.9348,0.988,0.9348,0.9348,0.9345,0.9166,0.9166,0.507
gbc,Gradient Boosting Classifier,0.903,0.0,0.903,0.9028,0.9014,0.875,0.8756,74.319
lightgbm,Light Gradient Boosting Machine,0.8428,0.9087,0.8428,0.8482,0.8441,0.7994,0.7997,2.732
lr,Logistic Regression,0.6762,0.0,0.6762,0.6417,0.6417,0.57,0.5769,1.512
lda,Linear Discriminant Analysis,0.6557,0.0,0.6557,0.6304,0.617,0.5432,0.5503,0.074
svm,SVM - Linear Kernel,0.6469,0.0,0.6469,0.5963,0.6067,0.5315,0.5395,0.147


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [8]:
from sklearn.metrics import classification_report

# Evaluate best model on test data
test_predictions = predict_model(best_model, data=test_data)
print("Evaluation on test data:")
# print(test_predictions)

# Measure prediction quality
y_true = test_data[target_column]
y_pred = test_predictions['Lithology_code']
print("Classification Report:")
print(classification_report(y_true, y_pred))

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9684,0.9985,0.9684,0.9684,0.9683,0.9596,0.9596


Evaluation on test data:
Classification Report:
              precision    recall  f1-score   support

     30000.0       1.00      1.00      1.00      8813
     65000.0       1.00      1.00      1.00      9885
     65030.0       1.00      1.00      1.00      4147
     70000.0       1.00      1.00      1.00      3343
     70032.0       1.00      1.00      1.00      1034
     74000.0       1.00      1.00      1.00       222
     80000.0       1.00      1.00      1.00      1686
     86000.0       1.00      1.00      1.00       181
     88000.0       1.00      1.00      1.00       789
     90000.0       1.00      1.00      1.00       201
     93000.0       1.00      1.00      1.00        29
     99000.0       1.00      1.00      1.00       757

    accuracy                           1.00     31087
   macro avg       1.00      1.00      1.00     31087
weighted avg       1.00      1.00      1.00     31087



In [10]:
y_pred.value_counts()

Lithology_code
65000.0    9885
30000.0    8813
65030.0    4147
70000.0    3343
80000.0    1686
70032.0    1034
88000.0     789
99000.0     757
74000.0     222
90000.0     201
86000.0     181
93000.0      29
Name: count, dtype: int64

In [11]:
from pycaret.classification import save_model

# Simpan model
save_model(best_model, 'best_model_1')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['SP', 'GR', 'DTC', 'Y_LOC', 'DRHO',
                                              'RHOB', 'NPHI', 'X_LOC', 'CALI',
                                              'DEPT', 'Vclay', 'NDPD'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               f...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini',
                                       max_depth=None, max_features='sqrt'

---
## EXTRA TREES CLASSFIER

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [18]:
dataset_full = pd.read_csv("./dataset/ready_training_dataset.csv")
dataset_full.shape

(250400, 13)

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

X = dataset_full.drop(columns=['Lithology_code'])
y = dataset_full['Lithology_code']

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train Extra Trees Classifier
model = ExtraTreesClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:\n', classification_report(y_test, y_pred))
# Perform hyperparameter tuning using GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create GridSearchCV object
grid_search = GridSearchCV(
    ExtraTreesClassifier(random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy'
)

# Fit grid search
grid_search.fit(X_train_scaled, y_train)

# Print best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Use best model for final prediction
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

# Print final accuracy
final_accuracy = accuracy_score(y_test, y_pred)
print(f'Final accuracy with tuned model: {final_accuracy:.4f}')

Accuracy: 0.9808
Classification Report:
               precision    recall  f1-score   support

     30000.0       0.98      0.98      0.98      8677
     65000.0       0.98      0.99      0.99     29001
     65030.0       0.96      0.93      0.95      4095
     70000.0       0.98      0.94      0.96      3394
     70032.0       0.99      0.98      0.98      1033
     74000.0       0.99      0.82      0.90       227
     80000.0       0.97      0.96      0.96      1654
     86000.0       0.98      0.99      0.99       174
     88000.0       1.00      1.00      1.00       815
     90000.0       0.98      0.94      0.96       202
     93000.0       1.00      1.00      1.00        20
     99000.0       0.98      0.97      0.98       788

    accuracy                           0.98     50080
   macro avg       0.98      0.96      0.97     50080
weighted avg       0.98      0.98      0.98     50080

Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estim

In [29]:
dataset_full.columns

Index(['Lithology_code', 'SP', 'GR', 'DTC', 'Y_LOC', 'DRHO', 'RHOB', 'NPHI',
       'X_LOC', 'CALI', 'DEPT', 'Vclay', 'NDPD'],
      dtype='object')

In [None]:
from sklearn.preprocessing import StandardScaler

X = dataset_full.drop(columns=['Lithology_code'])
y = dataset_full['Lithology_code']

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train Extra Trees Classifier
model = ExtraTreesClassifier(n_estimators=300, random_state=42, min_samples_split = 2, min_samples_leaf = 1, max_depth = None)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:\n', classification_report(y_test, y_pred))

Accuracy: 0.9814
Classification Report:
               precision    recall  f1-score   support

     30000.0       0.98      0.98      0.98      8677
     65000.0       0.98      0.99      0.99     29001
     65030.0       0.96      0.94      0.95      4095
     70000.0       0.98      0.95      0.96      3394
     70032.0       0.99      0.98      0.99      1033
     74000.0       0.99      0.81      0.89       227
     80000.0       0.97      0.96      0.96      1654
     86000.0       0.98      0.99      0.99       174
     88000.0       1.00      1.00      1.00       815
     90000.0       0.97      0.93      0.95       202
     93000.0       0.95      1.00      0.98        20
     99000.0       0.98      0.97      0.98       788

    accuracy                           0.98     50080
   macro avg       0.98      0.96      0.97     50080
weighted avg       0.98      0.98      0.98     50080



In [27]:
import joblib

# Save the model
joblib.dump(model, 'extra_trees_model.pkl')

['extra_trees_model.pkl']