In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw test set and keep only the identifier, location, depth and
# log-curve columns that the rest of the notebook works with.
test_df = pd.read_csv('./dataset/Test_dataset.csv').loc[
    :, ['uuid', 'X_LOC', 'Y_LOC', 'DEPT', 'NPHI', 'DTC', 'RHOB', 'GR', 'CALI']
]
test_df

Unnamed: 0,uuid,X_LOC,Y_LOC,DEPT,NPHI,DTC,RHOB,GR,CALI
0,14812,459853.34375,6560993.0,1348.310400,0.398266,133.537033,2.131319,61.177399,12.301304
1,72908,459853.34375,6560993.0,1348.462400,0.389460,133.525543,2.129777,63.512333,12.299226
2,103104,459853.34375,6560993.0,1348.614400,0.394868,130.739624,2.138082,63.515835,12.297644
3,23480,459853.34375,6560993.0,1348.766400,0.389355,128.074249,2.153999,63.153057,12.276056
4,104267,459853.34375,6560993.0,1348.918400,0.365808,121.454926,2.140920,60.224148,12.299644
...,...,...,...,...,...,...,...,...,...
121792,10928,,,5006.809976,,,2.515146,65.925987,8.647468
121793,26727,,,5006.961976,,,2.543073,66.297127,8.636636
121794,83995,,,5007.113976,,,2.576337,64.853714,8.592650
121795,67375,,,5007.265976,,,2.606787,62.779541,8.546233


In [3]:
# Feature relationships used for imputation: each key is a column to impute,
# each value lists the columns used as predictors for it. Insertion order is
# the order in which the targets are imputed.
imputation_dict = dict(
    NPHI=['RHOB', 'DTC', 'DEPT'],
    DTC=['RHOB', 'NPHI', 'DEPT'],
    GR=['DEPT'],
    CALI=['DEPT'],
    RHOB=['NPHI', 'DTC', 'DEPT'],
)

def impute_feature(df, target_feature, predictor_features):
    """Fill the NaNs of one column with a distance-weighted KNN imputer.

    The target and its usable predictors are standardized, imputed with
    ``KNNImputer(n_neighbors=5, weights='distance')``, transformed back to the
    original scale, and only the target column is written back into ``df``.

    Args:
        df: DataFrame containing the target and predictor columns. It is
            modified in place when imputation is performed.
        target_feature: Name of the column whose missing values are filled.
        predictor_features: Names of the columns used alongside the target
            when searching for neighbours.

    Returns:
        The same ``df`` object that was passed in (unchanged whenever the
        imputation is skipped).
    """
    row_count = len(df)
    missing_in_target = df[target_feature].isna().sum()

    # A target with no observed value at all cannot be imputed.
    if missing_in_target == row_count:
        print(f"Skipping {target_feature}: 100% NaN.")
        return df

    # Keep only predictors that have at least one observed value.
    valid_predictors = [
        name for name in predictor_features
        if df[name].isna().sum() < row_count
    ]
    if not valid_predictors:
        print(f"Skipping {target_feature}: All predictor columns are 100% NaN.")
        return df

    # Nothing to fill in.
    if missing_in_target == 0:
        print(f"No NaN values in {target_feature}. Skipping imputation.")
        return df

    print(f"Imputing {target_feature} using {valid_predictors}...")

    # Predictors first, target last; the same order labels the result below.
    columns = valid_predictors + [target_feature]

    # Standardize so every column contributes on a comparable scale to the
    # neighbour distances.
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df[columns].copy())

    imputer = KNNImputer(n_neighbors=5, weights='distance')
    filled_scaled = imputer.fit_transform(scaled_values)

    # Undo the standardization and re-attach the original row index.
    restored = pd.DataFrame(
        scaler.inverse_transform(filled_scaled),
        columns=columns,
        index=df.index,
    )

    # Only the target column is written back; predictors keep their own NaNs.
    df[target_feature] = restored[target_feature]
    return df

In [4]:
# Work on a copy so the raw test_df stays untouched.
df = test_df.copy()

# Impute every target column, in the order declared in imputation_dict.
for target_feature, predictor_features in imputation_dict.items():
    # Both the target and all of its predictors must exist in the frame;
    # checking only the predictors would let a missing target raise a
    # KeyError inside impute_feature instead of being skipped.
    if target_feature in df.columns and all(
        feature in df.columns for feature in predictor_features
    ):
        df = impute_feature(df, target_feature, predictor_features)
    else:
        print(f"Skipping imputation for {target_feature}")

Imputing NPHI using ['RHOB', 'DTC', 'DEPT']...
Imputing DTC using ['RHOB', 'NPHI', 'DEPT']...
Imputing GR using ['DEPT']...
Imputing CALI using ['DEPT']...
Imputing RHOB using ['NPHI', 'DTC', 'DEPT']...


In [5]:
# Replace every NaN that is still left with 0. Columns that are not imputation
# targets (X_LOC, Y_LOC, DEPT) receive 0 for their missing entries here.
df = df.fillna(0)

In [7]:
class FeatureEngineering:
    """Derive additional features from well-log columns of a DataFrame.

    Each ``compute_*`` method adds one column to ``self.df`` when its input
    column(s) are present and silently does nothing otherwise. The constants
    used by the formulas are keyword parameters whose defaults reproduce the
    values that were previously hard-coded.
    """

    def __init__(self, df):
        """
        Initialize the FeatureEngineering class with a DataFrame.

        The frame is copied, so the caller's DataFrame is never modified.
        """
        self.df = df.copy()

    def compute_vclay(self, gr_min=None, gr_max=None):
        """Compute Vclay from the Gamma Ray (GR) log by min-max scaling.

        Args:
            gr_min: GR value mapped to 0. Defaults to the column minimum.
            gr_max: GR value mapped to 1. Defaults to the column maximum.

        Note:
            When ``gr_max == gr_min`` the denominator is zero and the result
            is not finite; no special handling is applied.
        """
        if 'GR' not in self.df.columns:
            return
        gr = self.df['GR']
        if gr_min is None:
            gr_min = gr.min()
        if gr_max is None:
            gr_max = gr.max()
        self.df['Vclay'] = (gr - gr_min) / (gr_max - gr_min)

    def compute_density_porosity(self, rho_matrix=2.65, rho_fluid=1.0):
        """Compute Density Porosity (PHI_D) from bulk density (RHOB).

        Args:
            rho_matrix: Assumed matrix density.
            rho_fluid: Assumed fluid density.
        """
        if 'RHOB' in self.df.columns:
            self.df['PHI_D'] = (rho_matrix - self.df['RHOB']) / (rho_matrix - rho_fluid)

    def compute_sonic_porosity(self, dt_matrix=55.5, dt_fluid=189.0):
        """Compute Sonic Porosity (PHI_S) from sonic transit time (DTC).

        Args:
            dt_matrix: Assumed matrix travel time.
            dt_fluid: Assumed fluid travel time.
        """
        if 'DTC' in self.df.columns:
            self.df['PHI_S'] = (self.df['DTC'] - dt_matrix) / (dt_fluid - dt_matrix)

    def compute_ndpd(self):
        """Compute Neutron-Density Porosity Difference (NDPD = NPHI - PHI_D).

        Requires PHI_D, so ``compute_density_porosity`` must run first.
        """
        if 'NPHI' in self.df.columns and 'PHI_D' in self.df.columns:
            self.df['NDPD'] = self.df['NPHI'] - self.df['PHI_D']

    def compute_water_saturation(self, a=1.0, m=2.0, n=2.0, rw=0.1):
        """Compute Water Saturation (Sw) using Archie's Equation.

        Only runs when both RDEP and PHI_D are present.

        Args:
            a: Archie parameter ``a``.
            m: Archie parameter ``m`` (applied to PHI_D as ``m / n``).
            n: Archie parameter ``n``.
            rw: Archie parameter ``rw``.

        Note:
            Values are not clipped: a PHI_D or RDEP of zero divides by zero.
        """
        if all(col in self.df.columns for col in ['RDEP', 'PHI_D']):
            self.df['Sw'] = (a * rw / self.df['RDEP']) ** (1 / n) / self.df['PHI_D'] ** (m / n)

    def apply_feature_engineering(self):
        """Run all feature engineering functions with their default constants.

        Returns:
            The internal DataFrame with the engineered columns added.
        """
        self.compute_vclay()
        # PHI_D must exist before NDPD and Sw are derived from it.
        self.compute_density_porosity()
        self.compute_sonic_porosity()
        self.compute_ndpd()
        self.compute_water_saturation()
        print("Feature engineering completed successfully!")
        return self.df

# Wrap the imputed, zero-filled frame and run every feature-engineering step.
feature_engineer = FeatureEngineering(df=df)
df_engineered = feature_engineer.apply_feature_engineering()

# Bare last expression so the notebook renders the engineered table.
df_engineered

Feature engineering completed successfully!


Unnamed: 0,uuid,X_LOC,Y_LOC,DEPT,NPHI,DTC,RHOB,GR,CALI,Vclay,PHI_D,PHI_S,NDPD
0,14812,459853.34375,6560993.0,1348.310400,0.398266,133.537033,2.131319,61.177399,12.301304,0.056710,0.314352,0.584547,0.083914
1,72908,459853.34375,6560993.0,1348.462400,0.389460,133.525543,2.129777,63.512333,12.299226,0.058878,0.315287,0.584461,0.074174
2,103104,459853.34375,6560993.0,1348.614400,0.394868,130.739624,2.138082,63.515835,12.297644,0.058881,0.310254,0.563593,0.084614
3,23480,459853.34375,6560993.0,1348.766400,0.389355,128.074249,2.153999,63.153057,12.276056,0.058544,0.300606,0.543627,0.088749
4,104267,459853.34375,6560993.0,1348.918400,0.365808,121.454926,2.140920,60.224148,12.299644,0.055824,0.308534,0.494044,0.057275
...,...,...,...,...,...,...,...,...,...,...,...,...,...
121792,10928,0.00000,0.0,5006.809976,0.231205,80.453865,2.515146,65.925987,8.647468,0.061119,0.081729,0.186920,0.149476
121793,26727,0.00000,0.0,5006.961976,0.232819,80.927140,2.543073,66.297127,8.636636,0.061464,0.064804,0.190465,0.168015
121794,83995,0.00000,0.0,5007.113976,0.260863,80.394626,2.576337,64.853714,8.592650,0.060124,0.044644,0.186477,0.216218
121795,67375,0.00000,0.0,5007.265976,0.212640,83.122766,2.606787,62.779541,8.546233,0.058198,0.026189,0.206912,0.186451


In [12]:
import joblib
import xgboost as xgb

# Path to the saved model; replace with the name of your own file.
file_path = "./model//best_xgb_model (1).joblib"

# Feature columns, in the order they are handed to the model
# (presumably the order used at training time - verify against training code).
feature_order = ['NDPD', 'Y_LOC', 'X_LOC', 'DEPT', 'NPHI', 'DTC', 'GR', 'RHOB', 'Vclay', 'CALI']

# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# model files that come from a trusted source.
model = joblib.load(file_path)

# Select the model inputs in the required column order.
X_test = df_engineered.loc[:, feature_order]

# Predict a class for every row.
prediction = model.predict(X_test)

In [16]:
import joblib

# Path to the saved label encoder; replace with the name of your own file.
# Note: this rebinds `file_path`, which earlier pointed at the model file.
file_path = "./model/label_encoder.pkl"

# Load the label encoder.
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# files that come from a trusted source.
label_encoder = joblib.load(file_path)

# Inverse transform the predictions to get original labels
decoded_predictions = label_encoder.inverse_transform(prediction)
print(decoded_predictions)

[65000. 65000. 65000. ... 65000. 65000. 65000.]


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [17]:
# Lookup from numeric lithology code to lithology name.
lithology_mapping = {
    30000: "Sandstone",
    65030: "Sandstone/Shale",
    65000: "Shale",
    80000: "Marl",
    74000: "Dolomite",
    70000: "Limestone",
    70032: "Chalk",
    88000: "Halite",
    86000: "Anhydrite",
    99000: "Tuff",
    90000: "Coal",
    93000: "Basement"
}

# Pair each sample's uuid with its decoded prediction, then translate the
# numeric code into a lithology name (codes absent from the mapping give NaN).
results_df = (
    df_engineered[['uuid']]
    .assign(prediction_label=decoded_predictions)
    .assign(label_name=lambda frame: frame['prediction_label'].map(lithology_mapping))
)

# Final submission table: only uuid and label_name.
submission = results_df.loc[:, ['uuid', 'label_name']]
submission

Unnamed: 0,uuid,label_name
0,14812,Shale
1,72908,Shale
2,103104,Shale
3,23480,Limestone
4,104267,Limestone
...,...,...
121792,10928,Shale
121793,26727,Shale
121794,83995,Shale
121795,67375,Shale


In [24]:
# Load submission_1.0.csv (presumably an earlier submission - confirm) so its
# labels can be inspected.
submission_1 = pd.read_csv("./submission_1.0.csv")

# Distinct label names present in that file.
submission_1['label_name'].unique()

array(['Sandstone', 'Shale', 'Tuff', 'Limestone', 'Marl',
       'Sandstone/Shale', 'Chalk', 'Dolomite', 'Anhydrite', 'Halite',
       'Coal'], dtype=object)

In [26]:
# Number of distinct label names in the current submission (NaN counts as one).
submission['label_name'].nunique(dropna=False)

12

In [18]:
# Write `submission` to submission.csv, leaving out the DataFrame index.
submission.to_csv("submission.csv",index=False)

In [18]:
from pycaret.classification import load_model
from pycaret.classification import *

# Reload the saved PyCaret model named 'best_model'.
loaded_model = load_model('best_model')

# Use the reloaded model for prediction. The input is `df` (the imputed,
# zero-filled logs), not `df_engineered`, so the engineered columns are not
# passed to this model.
test_predictions = predict_model(loaded_model, data=df)

Transformation Pipeline and Model Successfully Loaded


In [22]:
# Display the prediction table returned by predict_model.
test_predictions

Unnamed: 0,uuid,X_LOC,Y_LOC,DEPT,NPHI,DTC,RHOB,GR,CALI,prediction_label,prediction_score
0,14812,459853.34375,6560993.0,1348.310425,0.398266,133.537033,2.131319,61.177399,12.301304,30000,0.34
1,72908,459853.34375,6560993.0,1348.462402,0.389460,133.525543,2.129777,63.512333,12.299226,30000,0.37
2,103104,459853.34375,6560993.0,1348.614380,0.394868,130.739624,2.138082,63.515835,12.297644,30000,0.39
3,23480,459853.34375,6560993.0,1348.766357,0.389355,128.074249,2.153999,63.153057,12.276056,30000,0.36
4,104267,459853.34375,6560993.0,1348.918457,0.365808,121.454926,2.140920,60.224148,12.299644,30000,0.31
...,...,...,...,...,...,...,...,...,...,...,...
121792,10928,0.00000,0.0,5006.810059,0.231205,80.453865,2.515146,65.925987,8.647468,65030,0.50
121793,26727,0.00000,0.0,5006.961914,0.232819,80.927139,2.543073,66.297127,8.636636,65030,0.48
121794,83995,0.00000,0.0,5007.113770,0.260863,80.394623,2.576337,64.853714,8.592650,65030,0.50
121795,67375,0.00000,0.0,5007.266113,0.212640,83.122765,2.606787,62.779541,8.546233,65030,0.49


In [24]:
import pandas as pd

# Mapping from label code to label name.
# This redefines the same lithology_mapping that is declared earlier in the
# notebook.
lithology_mapping = {
    30000: "Sandstone",
    65030: "Sandstone/Shale",
    65000: "Shale",
    80000: "Marl",
    74000: "Dolomite",
    70000: "Limestone",
    70032: "Chalk",
    88000: "Halite",
    86000: "Anhydrite",
    99000: "Tuff",
    90000: "Coal",
    93000: "Basement"
}


# Create a new column from the mapping; this adds `label_name` to
# test_predictions in place.
test_predictions['label_name'] = test_predictions['prediction_label'].map(lithology_mapping)

test_predictions

Unnamed: 0,uuid,X_LOC,Y_LOC,DEPT,NPHI,DTC,RHOB,GR,CALI,prediction_label,prediction_score,lithology_name,label_name
0,14812,459853.34375,6560993.0,1348.310425,0.398266,133.537033,2.131319,61.177399,12.301304,30000,0.34,Sandstone,Sandstone
1,72908,459853.34375,6560993.0,1348.462402,0.389460,133.525543,2.129777,63.512333,12.299226,30000,0.37,Sandstone,Sandstone
2,103104,459853.34375,6560993.0,1348.614380,0.394868,130.739624,2.138082,63.515835,12.297644,30000,0.39,Sandstone,Sandstone
3,23480,459853.34375,6560993.0,1348.766357,0.389355,128.074249,2.153999,63.153057,12.276056,30000,0.36,Sandstone,Sandstone
4,104267,459853.34375,6560993.0,1348.918457,0.365808,121.454926,2.140920,60.224148,12.299644,30000,0.31,Sandstone,Sandstone
...,...,...,...,...,...,...,...,...,...,...,...,...,...
121792,10928,0.00000,0.0,5006.810059,0.231205,80.453865,2.515146,65.925987,8.647468,65030,0.50,Sandstone/Shale,Sandstone/Shale
121793,26727,0.00000,0.0,5006.961914,0.232819,80.927139,2.543073,66.297127,8.636636,65030,0.48,Sandstone/Shale,Sandstone/Shale
121794,83995,0.00000,0.0,5007.113770,0.260863,80.394623,2.576337,64.853714,8.592650,65030,0.50,Sandstone/Shale,Sandstone/Shale
121795,67375,0.00000,0.0,5007.266113,0.212640,83.122765,2.606787,62.779541,8.546233,65030,0.49,Sandstone/Shale,Sandstone/Shale


In [25]:
# Rebind `submission` to the uuid/label_name columns of the PyCaret predictions.
submission = test_predictions.loc[:, ['uuid', 'label_name']]

In [26]:
# Write `submission` to submission.csv without the index. This is the same
# path used by the earlier export cell, so that file is overwritten.
submission.to_csv("submission.csv",index=False)