In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw test set and discard the listed log columns in one step.
test_df = pd.read_csv('./dataset/Test_dataset.csv').drop(
    columns=[
        'DTS', 'SGR', 'ROPA', 'RMIC', 'RXO', 'DCAL',
        'RSHA', 'DEPTH_MD', 'Z_LOC', 'MUDWEIGHT', 'PEF',
    ]
)
test_df.shape

In [None]:
# Rows that have both coordinates; only these can be clustered.
has_location = test_df[['X_LOC', 'Y_LOC']].notna().all(axis=1)
valid_indices = test_df.index[has_location]

# Coordinates are truncated to integers before clustering.
X = test_df.loc[valid_indices, ['X_LOC', 'Y_LOC']].astype(int).to_numpy()

# eps was chosen from the k-distance plot.
dbscan = DBSCAN(eps=2, min_samples=5)
clusters = dbscan.fit_predict(X)

# Write the labels back; rows without coordinates keep NaN in this column.
test_df.loc[valid_indices, 'Cluster_DBSCAN'] = clusters

# Scatter plot of the locations coloured by cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', marker='o', alpha=0.6)
ax.set_xlabel('X_LOC')
ax.set_ylabel('Y_LOC')
ax.set_title('DBSCAN Clustering')
fig.colorbar(points, ax=ax, label='Cluster')
ax.grid()
plt.show()

## **SPLIT DATASET BY CLUSTER**

In [None]:
# Distinct DBSCAN labels present in the data (rows without a label are ignored).
clusters = test_df['Cluster_DBSCAN'].dropna().unique()

# Map every label to an independent copy of the rows that carry it.
cluster_dfs = {
    cluster: test_df.loc[test_df['Cluster_DBSCAN'] == cluster].copy()
    for cluster in clusters
}

In [None]:
# Root folder that receives one CSV per cluster; created on first use.
output_folder = './test_dataset/dataset_cluster'
os.makedirs(output_folder, exist_ok=True)

# Write each cluster frame to "<cluster>_dataset.csv" inside that folder.
for cluster, df in cluster_dfs.items():
    filename = f"{cluster}_dataset.csv"
    df.to_csv(os.path.join(output_folder, filename), index=False)

## **LABELLING OUTLIER**

In [None]:
# 1. Choose the features to analyse: every numeric column except the ones listed here.
exclude_cols = ['X_LOC', 'Y_LOC', 'DEPT', 'Cluster_DBSCAN','uuid']
numeric_cols = []
for col in test_df.select_dtypes('number').columns:
    if col in exclude_cols:
        continue
    numeric_cols.append(col)

# Features that must NOT be negative.
cannot_be_negative = ['NPHI', 'RHOB', 'GR', 'PEF', 'CALI', 'BS', 'ROP', 'MUDWEIGHT', 'RDEP', 'RMED', 'DTC', 'DEPT', 'DEPTH_MD']

# Features that MAY be negative.
can_be_negative = ['SP', 'DRHO', 'Z_LOC', 'X_LOC', 'Y_LOC', 'Cluster_DBSCAN']

# 2. Outlier detection helper
def detect_outliers(df, column):
    """Flag the values of ``column`` that fall outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column to inspect.

    Returns
    -------
    tuple
        ``(is_outlier, lower_bound, upper_bound)`` where ``is_outlier`` is a
        boolean Series aligned with ``df``. NaN values are never flagged
        because both comparisons evaluate to False for them.
    """
    first_quartile = df[column].quantile(0.25)
    third_quartile = df[column].quantile(0.75)
    spread = third_quartile - first_quartile

    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    # Only columns explicitly listed as signed keep a negative lower fence.
    # Columns listed as non-negative, and any unknown column, are clamped at 0.
    negatives_allowed = column not in cannot_be_negative and column in can_be_negative
    if not negatives_allowed:
        lower_fence = max(lower_fence, 0)

    values = df[column]
    flagged = values.lt(lower_fence) | values.gt(upper_fence)
    return flagged, lower_fence, upper_fence

# 3. Flag outliers inside every cluster and write the flagged frame back to disk.
output_folder = './test_dataset/dataset_cluster'
for cluster, df_cluster in cluster_dfs.items():
    # One boolean "<col>_is_outlier" column per numeric feature that has data.
    outlier_flags = {
        f'{col}_is_outlier': detect_outliers(df_cluster, col)[0]
        for col in numeric_cols
        if df_cluster[col].notna().any()
    }
    outlier_mask = pd.DataFrame(outlier_flags, index=df_cluster.index)

    # Attach the flags to the cluster frame and keep the enriched version.
    cluster_dfs[cluster] = df_cluster.join(outlier_mask)

    # Overwrite the cluster CSV with the enriched frame.
    filename = f"{cluster}_dataset.csv"
    filepath = os.path.join(output_folder, filename)
    cluster_dfs[cluster].to_csv(filepath, index=False)

## **INTERPOLATE OUTLIER**

In [None]:
test_df.info()

In [None]:
# Load and display the estimate test dataset.
# NOTE(review): `df` is reassigned inside the loop of the following cell, so this
# frame appears to be used for inspection only - confirm it is still needed.
df = pd.read_csv('dataset/estimate_test_dataset.csv')
df

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Features used to split every location cluster into sub-clusters.
most_feature_importance = ['GR','DEPT']

# Folder that holds one CSV per location cluster.
main_folder = './test_dataset/dataset_cluster'

# Process every cluster CSV found directly inside the folder.
for cluster_file in os.listdir(main_folder):
    if not cluster_file.endswith('.csv'):
        continue

    cluster_path = os.path.join(main_folder, cluster_file)
    print(f"Reading file: {cluster_path}")
    df = pd.read_csv(cluster_path)

    # Only rows with every clustering feature present can be assigned a sub-cluster.
    X = df[most_feature_importance].dropna()

    # Rows that cannot be clustered keep NaN in this column.
    df['SubCluster'] = np.nan

    if X.empty:
        # Nothing to cluster: scaling an empty frame would raise, so just persist the NaN column.
        print(f"Skipping sub-clustering for {cluster_path}: no complete rows.")
        df.to_csv(cluster_path, index=False)
        continue

    # Standardize the features so both contribute equally to the distance.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Candidate k values: 1..11, capped at the sample count because KMeans
    # rejects n_clusters > n_samples.
    k_values = range(1, min(11, len(X)) + 1)
    inertia_values = []

    # Fit KMeans for each candidate k and record its inertia.
    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X_scaled)
        inertia_values.append(kmeans.inertia_)

    # Pick k from the second difference of the inertia curve.
    # NOTE(review): argmin selects the smallest second difference; elbow
    # heuristics usually take the largest one (argmax) - confirm this is intended.
    if len(inertia_values) >= 3:
        elbow_point = k_values[np.diff(inertia_values, 2).argmin() + 1]
    else:
        # Too few candidates for a second difference; fall back to a single sub-cluster.
        elbow_point = 1

    # Fit the final model with the selected number of sub-clusters.
    kmeans = KMeans(n_clusters=elbow_point, random_state=42, n_init=10)
    kmeans.fit(X_scaled)

    # Store the labels on the rows that took part in the clustering.
    df.loc[X.index, 'SubCluster'] = kmeans.labels_

    # Overwrite the cluster CSV with the labelled frame.
    df.to_csv(cluster_path, index=False)

In [None]:
# Folder that holds one CSV per location cluster.
main_folder = './test_dataset/dataset_cluster'

# Split every cluster CSV into one file per sub-cluster.
for cluster_file in os.listdir(main_folder):
    if not cluster_file.endswith('.csv'):
        continue

    cluster_path = os.path.join(main_folder, cluster_file)
    print(f"Reading file: {cluster_path}")
    df = pd.read_csv(cluster_path)

    # The main cluster id is the part of the file name before the first underscore.
    main_cluster = cluster_file.split('_')[0]

    # Nothing to split when the column is missing or entirely empty.
    if 'SubCluster' not in df.columns or df['SubCluster'].isna().all():
        continue

    # One folder per main cluster.
    subcluster_folder = os.path.join(main_folder, f'cluster_{main_cluster}_subclusters')
    os.makedirs(subcluster_folder, exist_ok=True)

    # groupby skips rows whose SubCluster is NaN, so only labelled rows are written.
    for subcluster, subcluster_df in df.groupby('SubCluster'):
        filename = f"cluster_{main_cluster}_subcluster_{int(subcluster)}.csv"
        subcluster_df.to_csv(os.path.join(subcluster_folder, filename), index=False)

## **IMPUTE FEATURE IN DATASET SUB CLUSTER**

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Maps every target column to the predictor columns used to impute it.
imputation_dict = {
    'DTC': ['RHOB', 'DEPT'],
    'RHOB': ['DTC', 'DEPT'],
}

def impute_feature(df, target_feature, predictor_features):
    """Fill the NaNs of ``target_feature`` with a distance-weighted KNN imputer.

    The imputer sees the standardized target together with those predictors
    that have at least one value; the result is transformed back to the
    original scale and written into ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target and predictor columns; modified in place.
    target_feature : str
        Column whose missing values are imputed.
    predictor_features : list of str
        Candidate predictor columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. It is returned unchanged when the target is
        entirely NaN, when every predictor is entirely NaN, or when the target
        has no NaN at all.
    """
    target = df[target_feature]

    # A fully empty target gives the imputer nothing to learn from.
    if target.isna().all():
        print(f"Skipping {target_feature}: 100% NaN.")
        return df

    # Keep only predictors that carry at least one value.
    valid_predictors = [col for col in predictor_features if df[col].notna().any()]
    if not valid_predictors:
        print(f"Skipping {target_feature}: All predictor columns are 100% NaN.")
        return df

    if not target.isna().any():
        print(f"No NaN values in {target_feature}. Skipping imputation.")
        return df

    print(f"Imputing {target_feature} using {valid_predictors}...")
    columns = valid_predictors + [target_feature]

    # Standardize so that every column weighs equally in the neighbour distance.
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df[columns].copy())

    filled_scaled = KNNImputer(n_neighbors=5, weights='distance').fit_transform(scaled)

    # Back to the original units, aligned with the rows of df.
    filled = pd.DataFrame(scaler.inverse_transform(filled_scaled), columns=columns, index=df.index)

    # Only the target column is written back; predictors stay untouched.
    df[target_feature] = filled[target_feature]
    return df

In [None]:
# Folder that holds the cluster CSVs and the per-cluster sub-cluster folders.
main_folder = './test_dataset/dataset_cluster'

# Visit every "*subclusters*" folder and impute each CSV it contains in place.
for cluster_folder in os.listdir(main_folder):
    cluster_folder_path = os.path.join(main_folder, cluster_folder)

    # Skip plain files and folders that do not hold sub-clusters.
    if 'subclusters' not in cluster_folder or not os.path.isdir(cluster_folder_path):
        continue

    csv_names = [name for name in os.listdir(cluster_folder_path) if name.endswith('.csv')]
    for sub_cluster_file in csv_names:
        sub_cluster_path = os.path.join(cluster_folder_path, sub_cluster_file)
        print(f"Processing: {sub_cluster_path}")
        try:
            df = pd.read_csv(sub_cluster_path)
            # Impute every configured target whose predictors all exist in this file.
            for target_feature, predictor_features in imputation_dict.items():
                predictors_present = all(feature in df.columns for feature in predictor_features)
                if not predictors_present:
                    print(f"Skipping imputation for {target_feature}")
                    continue
                df = impute_feature(df, target_feature, predictor_features)
            df.to_csv(sub_cluster_path, index=False)
            print(f"Saved to: {sub_cluster_path}")
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error processing {sub_cluster_path}: {str(e)}")

## **MERGE SUB CLUSTER TO CLUSTER**

In [None]:
import os
import pandas as pd

# Folder that holds the cluster CSVs and the per-cluster sub-cluster folders.
main_folder = './test_dataset/dataset_cluster'

# Merge the sub-cluster CSVs of every "*subclusters*" folder into one file per cluster.
for cluster_folder in os.listdir(main_folder):
    cluster_folder_path = os.path.join(main_folder, cluster_folder)

    # Only folders that hold sub-cluster files are merged.
    if not (os.path.isdir(cluster_folder_path) and 'subclusters' in cluster_folder):
        continue

    # Collect the frames first and concatenate once: avoids re-copying the
    # growing result on every iteration and avoids concatenating with an
    # empty placeholder frame.
    frames = []
    for sub_cluster_file in os.listdir(cluster_folder_path):
        if not sub_cluster_file.endswith('.csv'):
            continue
        sub_cluster_path = os.path.join(cluster_folder_path, sub_cluster_file)
        print(f"Merging: {sub_cluster_path}")
        try:
            df = pd.read_csv(sub_cluster_path)
            frames.append(df)
        except Exception as e:
            # Best effort: report the unreadable file and merge the rest.
            print(f"Error merging {sub_cluster_path}: {str(e)}")

    # Keep the original behaviour of writing an empty frame when nothing was read.
    merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Save the merged frame in the main folder, named after the sub-cluster folder.
    merged_cluster_path = os.path.join(main_folder, f"merged_{cluster_folder}.csv")
    merged_df.to_csv(merged_cluster_path, index=False)
    print(f"Merged cluster saved to: {merged_cluster_path}")

## **IMPUTATION IN CLUSTER LEVEL**

In [None]:
import os
import pandas as pd

# Folder that holds the merged cluster CSVs.
main_folder = './test_dataset/dataset_cluster'

# Run a second imputation pass on every merged cluster file, in place.
for merged_file in os.listdir(main_folder):
    if not (merged_file.startswith('merged_') and merged_file.endswith('.csv')):
        continue

    merged_file_path = os.path.join(main_folder, merged_file)
    print(f"Processing merged file: {merged_file_path}")
    try:
        df = pd.read_csv(merged_file_path)

        for target_feature, predictor_features in imputation_dict.items():
            # Impute only when every predictor column exists in this file.
            if all(feature in df.columns for feature in predictor_features):
                df = impute_feature(df, target_feature, predictor_features)
            else:
                print(f"Skipping imputation for {target_feature}")
        df.to_csv(merged_file_path, index=False)
        # Report the file that was actually written (previously printed a stale
        # sub-cluster path left over from an earlier cell).
        print(f"Saved to: {merged_file_path}")
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing {merged_file_path}: {str(e)}")

## **MERGE TO FULL DATASET**

In [None]:
import os
import pandas as pd

# Folder that holds the merged cluster CSVs, and the folder that receives the full dataset.
main_folder = './test_dataset/dataset_cluster'
output_folder = './test_dataset'

# Read every merged cluster file; frames are collected and concatenated once
# instead of growing a DataFrame inside the loop.
merged_frames = []
for merged_file in os.listdir(main_folder):
    if not (merged_file.startswith('merged_') and merged_file.endswith('.csv')):
        continue
    merged_file_path = os.path.join(main_folder, merged_file)
    print(f"Processing merged file: {merged_file_path}")
    try:
        df = pd.read_csv(merged_file_path)
        merged_frames.append(df)
    except Exception as e:
        # Best effort: report the unreadable file and merge the rest.
        print(f"Error processing {merged_file_path}: {str(e)}")

# An empty frame is kept as the result when no merged file could be read.
full_merged_df = pd.concat(merged_frames, ignore_index=True) if merged_frames else pd.DataFrame()

# Save the combined dataset into the test_dataset folder.
full_merged_path = os.path.join(output_folder, 'full_merged_dataset.csv')
full_merged_df.to_csv(full_merged_path, index=False)
print(f"Full merged dataset saved to: {full_merged_path}")

In [None]:
# Distinct uuids on each side.
test_uuids = set(test_df['uuid'].unique())
merged_uuids = set(full_merged_df['uuid'].unique())

# uuids present in test_df that did not make it into the merged dataset.
missing_uuids = test_uuids.difference(merged_uuids)

print(f"Number of uuids missing in merged dataset: {len(missing_uuids)}")

# Show a small sample when anything is missing.
if len(missing_uuids) > 0:
    print("\nFirst few missing uuids:")
    print(list(missing_uuids)[:5])

In [None]:
# Rows of test_df whose uuid is absent from the merged dataset.
missing_data = test_df.loc[test_df['uuid'].isin(missing_uuids)].copy()

# Append them to the merged rows and order the result by uuid.
df = (
    pd.concat([full_merged_df, missing_data], ignore_index=True)
    .sort_values('uuid')
    .reset_index(drop=True)
)

# Print both shapes so the row counts can be compared by eye.
print(f"Original test_df shape: {test_df.shape}")
print(f"Final merged df shape: {df.shape}")

# Overwrite the full merged dataset with the completed frame.
df.to_csv("./test_dataset/full_merged_dataset.csv", index=False)

In [None]:
# Final imputation pass on `df` (the complete merged frame built in the previous cell).
for target_feature, predictor_features in imputation_dict.items():
    # Impute only when every predictor column exists in the frame.
    if all(feature in df.columns for feature in predictor_features):
        # A single call is enough: a second call on the same target finds no NaN
        # left (or the same skip condition) and only repeats the log message.
        df = impute_feature(df, target_feature, predictor_features)
    else:
        print(f"Skipping imputation for {target_feature}")

In [None]:
class FeatureEngineering:
    """Derive extra log features on a private copy of the supplied DataFrame.

    Every ``compute_*`` method adds its column only when the columns it needs
    are present, and is silently a no-op otherwise.
    """

    def __init__(self, df):
        """Keep a copy of ``df`` so the caller's frame is never modified."""
        self.df = df.copy()

    def compute_vclay(self):
        """Add ``Vclay``: GR min-max scaled over the whole frame."""
        if 'GR' not in self.df.columns:
            return
        gamma = self.df['GR']
        lowest, highest = gamma.min(), gamma.max()
        self.df['Vclay'] = (gamma - lowest) / (highest - lowest)

    def compute_density_porosity(self):
        """Add ``PHI_D``: density porosity computed from RHOB."""
        # Assumed matrix and fluid densities.
        rho_matrix = 2.65
        rho_fluid = 1.0
        if 'RHOB' not in self.df.columns:
            return
        self.df['PHI_D'] = (rho_matrix - self.df['RHOB']) / (rho_matrix - rho_fluid)

    def compute_sonic_porosity(self):
        """Add ``PHI_S``: sonic porosity computed from DTC."""
        # Assumed matrix and fluid travel times.
        dt_matrix = 55.5
        dt_fluid = 189.0
        if 'DTC' not in self.df.columns:
            return
        self.df['PHI_S'] = (self.df['DTC'] - dt_matrix) / (dt_fluid - dt_matrix)

    def compute_ndpd(self):
        """Add ``NDPD``: NPHI minus PHI_D (requires PHI_D to exist already)."""
        available = self.df.columns
        if 'NPHI' not in available or 'PHI_D' not in available:
            return
        self.df['NDPD'] = self.df['NPHI'] - self.df['PHI_D']

    def compute_water_saturation(self):
        """Add ``Sw``: water saturation from RDEP and PHI_D using Archie's equation."""
        for required in ('RDEP', 'PHI_D'):
            if required not in self.df.columns:
                return
        # Archie's parameters.
        a, m, n, rw = 1.0, 2.0, 2.0, 0.1
        resistivity_term = (a * rw / self.df['RDEP']) ** (1/n)
        porosity_term = self.df['PHI_D'] ** (m/n)
        self.df['Sw'] = resistivity_term / porosity_term

    def apply_feature_engineering(self):
        """Run every step in dependency order and return the enriched frame."""
        # PHI_D must be computed before NDPD and Sw, which both read it.
        steps = (
            self.compute_vclay,
            self.compute_density_porosity,
            self.compute_sonic_porosity,
            self.compute_ndpd,
            self.compute_water_saturation,
        )
        for step in steps:
            step()
        print("Feature engineering completed successfully!")
        return self.df

# Run the feature-engineering pipeline on the imputed frame.
feature_engineer = FeatureEngineering(df)
df_engineered = feature_engineer.apply_feature_engineering()

# Keep the identifier plus the selected feature columns, then persist them.
ready_columns = [
    'uuid', 'SP', 'GR', 'DTC', 'Y_LOC', 'DRHO', 'RHOB',
    'NPHI', 'X_LOC', 'CALI', 'DEPT', 'Vclay', 'NDPD',
]
df_engineered = df_engineered[ready_columns]
df_engineered.to_csv("./test_dataset/ready_test.csv",index=False)

In [None]:
# uuids in the row order of test_df.
ordered_uuids = test_df['uuid'].tolist()

# Put df_engineered back into that order. `.loc` with a list of labels raises
# KeyError if any uuid is missing, so a silent mismatch is impossible here.
# (The unused uuid -> position dictionary that used to be built here was removed.)
df_engineered = df_engineered.set_index('uuid').loc[ordered_uuids].reset_index()

In [None]:
df_engineered

In [None]:
df_engineered = df_engineered[['SP', 'GR', 'DTC', 'Y_LOC', 'DRHO', 'RHOB', 'NPHI','X_LOC', 'CALI', 'DEPT', 'Vclay', 'NDPD']]

In [None]:
# Replace the remaining NaNs with 0. Reassigning instead of using inplace=True
# avoids mutating a frame that was produced by a column selection.
df_engineered = df_engineered.fillna(0)

In [None]:
import pandas as pd

# Load the cleaned test frame that is passed to the model in the prediction cell below.
# NOTE(review): df_engineered built above is not used for prediction in the visible
# cells - confirm that result_cleaning2.csv is the intended model input.
df_test = pd.read_csv("./test_dataset/result_cleaning2.csv")
df_test

In [None]:
from pycaret.classification import load_model
# NOTE(review): the star import is what brings predict_model into scope here;
# an explicit `from pycaret.classification import predict_model` would be clearer.
from pycaret.classification import *

# Reload the saved model
loaded_model = load_model('final_lithology_model')

# Use the reloaded model to predict on the test frame
test_predictions = predict_model(loaded_model, data=df_test)

In [None]:
test_predictions

In [None]:
# import joblib

# scaler_loaded = joblib.load('scaler.pkl')
# X_new_scaled = scaler_loaded.transform(df_engineered)  # Transform data baru pakai scaler yang sama

# # Load the model later
# loaded_model = joblib.load('extra_trees_model.pkl')

# # Use the loaded model for predictions
# y_pred_loaded = loaded_model.predict(X_new_scaled)

In [None]:
# Number of distinct predicted classes. `y_pred_loaded` only exists in the
# commented-out joblib cell above, so the count is taken from the PyCaret output.
test_predictions['prediction_label'].nunique()

In [None]:
# Lookup from lithology code to lithology name.
lithology_mapping = {
    30000: "Sandstone",
    65030: "Sandstone/Shale",
    65000: "Shale",
    80000: "Marl",
    74000: "Dolomite",
    70000: "Limestone",
    70032: "Chalk",
    88000: "Halite",
    86000: "Anhydrite",
    99000: "Tuff",
    90000: "Coal",
    93000: "Basement"
}

# Keep only the predicted label, attach the uuid taken from df_test
# (aligned on the index) and put the uuid first.
predictions_df = (
    pd.DataFrame(test_predictions, columns=['prediction_label'])
    .assign(uuid=df_test['uuid'])
    .loc[:, ['uuid', 'prediction_label']]
)

# Translate the predicted codes into names; codes missing from the mapping become NaN.
predictions_df = predictions_df.assign(
    label_name=predictions_df['prediction_label'].map(lithology_mapping)
)

# Final submission frame: uuid and label_name only.
submission = predictions_df[['uuid', 'label_name']]

In [None]:
submission['label_name'].value_counts()

In [None]:
submission.to_csv("submission_4.0.csv",index=False)

In [None]:
import joblib

# Replace with the path to your label-encoder file
file_path = "./model/label_encoder.pkl"

# Load the label encoder
# NOTE: joblib.load unpickles the file - only load files from a trusted source.
label_encoder = joblib.load(file_path)

# Inverse transform the predictions to get original labels
# NOTE(review): `prediction` is not defined in any visible cell, so this line raises
# NameError on a fresh kernel - confirm which predictions should be decoded.
decoded_predictions = label_encoder.inverse_transform(prediction)
print(decoded_predictions)

In [None]:
# Lookup from lithology code to lithology name.
lithology_mapping = {
    30000: "Sandstone",
    65030: "Sandstone/Shale",
    65000: "Shale",
    80000: "Marl",
    74000: "Dolomite",
    70000: "Limestone",
    70032: "Chalk",
    88000: "Halite",
    86000: "Anhydrite",
    99000: "Tuff",
    90000: "Coal",
    93000: "Basement"
}

# uuid and predicted label, taken directly from the prediction output.
results_df = test_predictions[['uuid', 'prediction_label']].copy()

# Translate the predicted codes into names; codes missing from the mapping become NaN.
results_df['label_name'] = results_df['prediction_label'].map(lithology_mapping)

# Final submission frame: uuid and label_name only.
submission = results_df[['uuid', 'label_name']]
submission
submission

In [None]:
submission = test_predictions[['uuid','label_name']]

In [None]:
submission_1 = pd.read_csv("./submission_1.0.csv")
submission_1['label_name'].value_counts()

In [None]:
submission_2 = pd.read_csv("./submission_2.0.csv")
submission_2['label_name'].value_counts()

---

In [None]:
import pandas as pd

In [None]:
test_df = pd.read_csv("./dataset/Test_dataset.csv")
clean_train_df = pd.read_csv("./dataset/cleaned_dataset.csv")

In [None]:
merge_df = pd.concat([test_df,clean_train_df])
merge_df

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Maps every target column to the predictor columns used to impute it.
imputation_dict = {
    'DTC': ['RHOB', 'NPHI', 'DEPT'],
    'RHOB': ['NPHI', 'DTC', 'DEPT'],
    'GR': ['DEPT'],
    'CALI': ['DEPT', 'BS'],
    'DRHO' : ['NPHI', 'DEPT'],
    'ROP' : ['NPHI','DEPT','CALI'],
    'NPHI': ['RHOB', 'DTC', 'DEPT','CALI'],
}

def _impute_cali_from_bs(df):
    """Fill missing CALI values from BS with a one-variable linear regression (in place)."""
    known_data = df.dropna(subset=['CALI', 'BS'])
    to_fill = df['CALI'].isna() & df['BS'].notna()

    # Regression needs both training rows and rows to predict.
    if known_data.empty or not to_fill.any():
        return

    model = LinearRegression()
    model.fit(known_data[['BS']], known_data['CALI'])
    df.loc[to_fill, 'CALI'] = model.predict(df.loc[to_fill, ['BS']])

def impute_feature(df, target_feature, predictor_features):
    """Fill the NaNs of ``target_feature`` in place and return ``df``.

    CALI is first filled from BS with a linear regression where BS is known;
    whatever is still missing (for any target) is filled with a
    distance-weighted KNN imputer on the standardized target and predictors.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target and predictor columns; modified in place.
    target_feature : str
        Column whose missing values are imputed.
    predictor_features : list of str
        Candidate predictor columns; entirely-NaN predictors are ignored.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. It is returned unchanged when the target is
        entirely NaN or when every predictor is entirely NaN.
    """
    # A fully empty target gives the imputers nothing to learn from.
    if df[target_feature].isna().all():
        print(f"Skipping {target_feature}: 100% NaN.")
        return df

    # Keep only predictors that carry at least one value.
    valid_predictors = [col for col in predictor_features if df[col].notna().any()]
    if not valid_predictors:
        print(f"Skipping {target_feature}: All predictor columns are 100% NaN.")
        return df

    # The regression shortcut is specific to CALI/BS. The former extra NPHI
    # clause compared a list against column names, so it was always False, and
    # the branch body only ever handled CALI; it has been dropped.
    if target_feature == 'CALI' and 'BS' in valid_predictors:
        print("Using Linear Regression for CALI imputations with BS...")
        _impute_cali_from_bs(df)

    # Anything still missing goes through KNN.
    if not df[target_feature].isna().any():
        print(f"No NaN values in {target_feature}. Skipping imputation.")
        return df

    print(f"Imputing {target_feature} using {valid_predictors} with KNN...")
    columns = valid_predictors + [target_feature]

    # Standardize so that every column weighs equally in the neighbour distance.
    scaler = StandardScaler()
    imputation_data_scaled = scaler.fit_transform(df[columns].copy())

    imputer = KNNImputer(n_neighbors=5, weights='distance')
    imputed_data_scaled = imputer.fit_transform(imputation_data_scaled)

    # Back to the original units, aligned with the rows of df.
    imputed_data = scaler.inverse_transform(imputed_data_scaled)
    imputed_df = pd.DataFrame(imputed_data, columns=columns, index=df.index)

    # Only the target column is written back; predictors stay untouched.
    df[target_feature] = imputed_df[target_feature]
    return df

In [None]:
# Impute every configured target on the combined test + train frame.
for target_feature, predictor_features in imputation_dict.items():
    # Both the target and all of its predictors must exist in the frame.
    required_columns = [target_feature, *predictor_features]
    missing_columns = [col for col in required_columns if col not in merge_df.columns]
    if not missing_columns:
        # impute_feature fills merge_df in place and returns the same object.
        df = impute_feature(merge_df, target_feature, predictor_features)
    else:
        # The old message interpolated `filename`, which is not defined in this notebook.
        print(f"Skipping imputation for {target_feature}: missing columns {missing_columns}")

---

In [11]:
test_df = pd.read_csv("./dataset/cleaned_location_test_dataset.csv")
test_df

Unnamed: 0,uuid,file_name,ROPA,BS,PEF,RXO,DCAL,RMIC,ROP,SP,...,RHOB,NPHI,SGR,X_LOC,CALI,RSHA,Z_LOC,RDEP,DEPT,Cluster_DBSCAN
0,14812.0,25_10-10,53.066914,12.250001,3.125396,,0.051304,,45.224689,,...,2.131319,0.398266,,459853.34375,61.177399,0.985707,-1308.193359,1.019758,1348.310400,0
1,72908.0,25_10-10,53.109482,12.250001,3.292384,,0.049227,,49.948738,,...,2.129777,0.389460,,459853.34375,63.512333,0.888855,-1308.345337,0.924186,1348.462400,0
2,103104.0,25_10-10,53.152004,12.250001,3.693287,,0.047644,,52.813866,,...,2.138082,0.394868,,459853.34375,63.515835,0.870774,-1308.497314,0.895908,1348.614400,0
3,23480.0,25_10-10,53.194557,12.250001,4.051715,,0.026056,,,,...,2.153999,0.389355,,459853.34375,63.153057,0.913030,-1308.649292,0.955895,1348.766400,0
4,104267.0,25_10-10,53.237164,12.250001,4.299806,,0.049645,,,,...,2.140920,0.365808,,459853.34375,60.224148,0.959649,-1308.801392,1.022451,1348.918400,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121792,10928.0,34_3-3 A,12.779722,,8.879548,,,,15.756256,,...,2.515146,0.266054,,486205.50000,65.925987,,,,5006.809976,11
121793,26727.0,34_3-3 A,11.651907,,10.237462,,,,15.760216,,...,2.543073,0.264554,,486205.50000,66.297127,,,,5006.961976,11
121794,83995.0,34_3-3 A,11.266949,,,,,,15.990236,,...,2.576337,0.259230,,486205.50000,64.853714,,,,5007.113976,11
121795,67375.0,34_3-3 A,11.116782,,10.468700,,,,15.314774,,...,2.606787,0.258194,,486205.50000,62.779541,,,,5007.265976,11


In [12]:
# Distinct DBSCAN labels present in the data (rows without a label are ignored).
clusters = test_df['Cluster_DBSCAN'].dropna().unique()

# Map every label to an independent copy of the rows that carry it.
cluster_dfs = {
    cluster: test_df.loc[test_df['Cluster_DBSCAN'] == cluster].copy()
    for cluster in clusters
}

In [None]:
clusters

0