In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN

In [2]:
# Read the test CSV and keep only the listed columns, in this order.
# A missing column raises KeyError, same as plain list indexing would.
test_df = pd.read_csv('./dataset/Test_dataset.csv').loc[
    :,
    ['X_LOC', 'Y_LOC', 'DEPT', 'NPHI', 'DTC', 'SP', 'RHOB', 'GR', 'CALI'],
]
test_df.head()

Unnamed: 0,X_LOC,Y_LOC,DEPT,NPHI,DTC,SP,RHOB,GR,CALI
0,459853.34375,6560993.0,1348.3104,0.398266,133.537033,,2.131319,61.177399,12.301304
1,459853.34375,6560993.0,1348.4624,0.38946,133.525543,,2.129777,63.512333,12.299226
2,459853.34375,6560993.0,1348.6144,0.394868,130.739624,,2.138082,63.515835,12.297644
3,459853.34375,6560993.0,1348.7664,0.389355,128.074249,,2.153999,63.153057,12.276056
4,459853.34375,6560993.0,1348.9184,0.365808,121.454926,,2.14092,60.224148,12.299644


In [4]:
# (rows, columns) of test_df after the column selection.
test_df.shape

(121797, 9)

In [8]:
# Read the cleaned dataset and discard the Cluster_DBSCAN column that comes
# with the file; chaining .drop avoids mutating the frame in place.
cleaned_dataset = (
    pd.read_csv('./dataset/cleaned_dataset.csv')
    .drop(columns=['Cluster_DBSCAN'])
)
cleaned_dataset.head()

Unnamed: 0,X_LOC,Y_LOC,Z_LOC,NPHI,DTC,SP,RHOB,GR,CALI,Lithology_code
0,455221.34375,6533321.5,-2883.996094,0.148734,68.589714,79.002701,2.346344,61.986121,7.918733,30000.0
1,455221.34375,6533321.5,-2883.843994,0.139775,68.00721,78.982742,2.350433,61.986166,7.893227,30000.0
2,455221.34375,6533321.5,-2883.691895,0.136017,68.325829,78.962296,2.36463,61.98621,7.876397,30000.0
3,455221.34375,6533321.5,-2883.540039,0.136147,69.102173,78.941872,2.388633,61.986253,7.88751,30000.0
4,455221.34375,6533321.5,-2883.387939,0.135847,69.374542,78.921913,2.411212,61.986298,7.914729,30000.0


In [10]:
# Columns carried over from both frames into the combined set.
common_columns = ['X_LOC', 'Y_LOC', 'NPHI', 'DTC', 'SP', 'RHOB', 'GR', 'CALI']

# Independent copies of each frame restricted to those columns.
cleaned_dataset_subset = cleaned_dataset.loc[:, common_columns].copy()
test_df_subset = test_df.loc[:, common_columns].copy()

# Stack the cleaned rows first, then the test rows. ignore_index=True already
# yields a fresh 0..n-1 index, so no separate reset_index call is required.
combined_dataset = pd.concat(
    (cleaned_dataset_subset, test_df_subset),
    axis=0,
    ignore_index=True,
)

print(f"Combined dataset shape: {combined_dataset.shape}")
combined_dataset

Combined dataset shape: (1123397, 8)


Unnamed: 0,X_LOC,Y_LOC,NPHI,DTC,SP,RHOB,GR,CALI
0,455221.34375,6533321.5,0.148734,68.589714,79.002701,2.346344,61.986121,7.918733
1,455221.34375,6533321.5,0.139775,68.007210,78.982742,2.350433,61.986166,7.893227
2,455221.34375,6533321.5,0.136017,68.325829,78.962296,2.364630,61.986210,7.876397
3,455221.34375,6533321.5,0.136147,69.102173,78.941872,2.388633,61.986253,7.887510
4,455221.34375,6533321.5,0.135847,69.374542,78.921913,2.411212,61.986298,7.914729
...,...,...,...,...,...,...,...,...
1123392,,,,,,2.515146,65.925987,8.647468
1123393,,,,,,2.543073,66.297127,8.636636
1123394,,,,,,2.576337,64.853714,8.592650
1123395,,,,,,2.606787,62.779541,8.546233


In [11]:
# Strip the 'val:' and '[UNIT]' markers from a value and convert it to float.
def clean_value(value):
    """Return ``value`` as a float when possible, after removing text markers.

    For strings, every occurrence of 'val:' and then of '[UNIT]' is removed;
    surrounding whitespace is stripped only after a marker was actually found.
    The result (or a non-string input as given) is then passed to ``float``.

    Returns:
        The float conversion on success. If conversion fails, the value is
        returned unconverted: for strings that is the text with any markers
        already removed, for other objects the input itself.
    """
    if isinstance(value, str):
        for marker in ('val:', '[UNIT]'):
            if marker in value:
                value = value.replace(marker, '').strip()

    # One conversion attempt serves both strings and non-strings: float()
    # raises ValueError for unparsable text and TypeError for unsupported types.
    try:
        return float(value)
    except (ValueError, TypeError):
        return value

# Run every DTC entry through clean_value so entries carrying 'val:' / '[UNIT]'
# text become floats; entries that still cannot be converted are kept as they are.
combined_dataset['DTC'] = combined_dataset['DTC'].apply(clean_value)

In [None]:
# Rows that have both coordinates; rows missing X_LOC or Y_LOC are not
# clustered and keep NaN in Cluster_DBSCAN.
valid_indices = combined_dataset[['X_LOC', 'Y_LOC']].dropna().index

# Integer coordinates, one row per valid sample.
# NOTE(review): astype(int) discards the fractional part of the coordinates
# before clustering - confirm that this truncation is intended.
coords = combined_dataset.loc[valid_indices, ['X_LOC', 'Y_LOC']].astype(int)
X = coords.values

# Many rows repeat the same (X_LOC, Y_LOC) pair. DBSCAN bulk-computes the
# neighbourhood of every input point, so fitting on all rows makes memory and
# run time grow with the square of each duplicate group. Instead, cluster each
# distinct location once and pass its row count as sample_weight: the weighted
# neighbourhood mass then equals the number of original rows within eps, which
# keeps the core/noise decision the same as fitting on every row.
# sort=False numbers the locations in order of first appearance.
by_location = coords.groupby(['X_LOC', 'Y_LOC'], sort=False)
row_to_location = by_location.ngroup().to_numpy()
locations = by_location.size().reset_index(name='n_rows')
unique_xy = locations[['X_LOC', 'Y_LOC']].to_numpy()

# eps is stated to come from a k-distance plot (not shown in this section).
# Note: dbscan.labels_ / core_sample_indices_ now refer to unique locations;
# the per-row labels are in `clusters`.
dbscan = DBSCAN(eps=1, min_samples=5)
location_labels = dbscan.fit_predict(
    unique_xy,
    sample_weight=locations['n_rows'].to_numpy(),
)

# Broadcast each location's label back to every row at that location.
clusters = location_labels[row_to_location]

# Store the per-row cluster label on the combined dataset.
combined_dataset.loc[valid_indices, 'Cluster_DBSCAN'] = clusters

# Plot one marker per distinct location, coloured by cluster label
# (-1 marks noise). Duplicate rows would only overdraw the same marker.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(
    unique_xy[:, 0],
    unique_xy[:, 1],
    c=location_labels,
    cmap='viridis',
    marker='o',
    alpha=0.6,
)
ax.set_xlabel('X_LOC')
ax.set_ylabel('Y_LOC')
ax.set_title('DBSCAN Clustering')
fig.colorbar(points, ax=ax, label='Cluster')
ax.grid()
plt.show()