In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tqdm import tqdm
import mlflow
from mlflow.models import infer_signature
import sys
sys.path.append("..")

from src.score import score
from src.utility_functions import preprocess_data

# model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

#mlflow.set_experiment("lithofacies_prediction")

2024-08-04 06:21:11.322123: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-04 06:21:11.331221: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-04 06:21:11.355310: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-04 06:21:11.404415: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-04 06:21:11.415131: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-04 06:21:11.445516: I tensorflow/core/platform/cpu_feature_gu

In [2]:
# Read the semicolon-delimited training set, then print its schema summary
# (column names, non-null counts and dtypes).
df = pd.read_csv(
    "../data/train.csv",
    sep=";",
)
# test = pd.read_csv("../data/test.csv", sep=";")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1170511 entries, 0 to 1170510
Data columns (total 29 columns):
 #   Column                             Non-Null Count    Dtype  
---  ------                             --------------    -----  
 0   WELL                               1170511 non-null  object 
 1   DEPTH_MD                           1170511 non-null  float64
 2   X_LOC                              1159736 non-null  float64
 3   Y_LOC                              1159736 non-null  float64
 4   Z_LOC                              1159736 non-null  float64
 5   GROUP                              1169233 non-null  object 
 6   FORMATION                          1033517 non-null  object 
 7   CALI                               1082634 non-null  float64
 8   RSHA                               630650 non-null   float64
 9   RMED                               1131518 non-null  float64
 10  RDEP                               1159496 non-null  float64
 11  RHOB                    

In [3]:
def return_missing_values(frame=None):
    """Print the share of missing values for every column of a DataFrame.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used, so
        existing zero-argument calls keep working.

    Returns
    -------
    None
        One line per column is written to stdout, for example
        ``% missing in CALI: 7.5%``.
    """
    if frame is None:
        # Looked up at call time, so the report reflects the current ``df``.
        frame = df
    # The mean of the boolean NaN mask is the missing fraction of each column.
    missing_share = frame.isna().mean()
    for col, share in missing_share.items():
        print(f"% missing in {col}: {share:.1%}")

In [4]:
def drop_colums(df, cols):
    """Return a new frame equal to ``df`` without the columns in ``cols``.

    The frame passed in is not modified. pandas raises ``KeyError`` when one
    of the requested columns does not exist.
    """
    return df.drop(columns=cols)

# Columns removed from the feature frame. The two FORCE_2020_* entries are
# dropped as well; the lithology one is kept separately as `target` below.
cols_to_drop = [
    "SGR",
    "DTS",
    "ROP",
    "DCAL",
    "MUDWEIGHT",
    "RMIC",
    "ROPA",
    "RXO",
    "BS",
    'FORCE_2020_LITHOFACIES_LITHOLOGY',
    'FORCE_2020_LITHOFACIES_CONFIDENCE',
]

# Pull out the label column before it is dropped from the features.
# NOTE(review): `df` is reassigned here, so running this cell a second time
# raises a KeyError on the label column; re-run from the loading cell instead.
target = df['FORCE_2020_LITHOFACIES_LITHOLOGY']
df = drop_colums(df, cols_to_drop)

In [13]:
# Numeric lithology code -> rock-type name (used to decode the label column).
lithology_keys = {
    30000: 'Sandstone',
    65030: 'Sandstone/Shale',
    65000: 'Shale',
    80000: 'Marl',
    74000: 'Dolomite',
    70000: 'Limestone',
    70032: 'Chalk',
    88000: 'Halite',
    86000: 'Anhydrite',
    99000: 'Tuff',
    90000: 'Coal',
    93000: 'Basement',
}

# Rock-type name -> consecutive class index, numbered in the order the names
# appear in `lithology_keys` (Sandstone = 0, ..., Basement = 11).
lithology_num = {
    name: index for index, name in enumerate(lithology_keys.values())
}

# Translate the lithology codes in `target` into rock-type names; values that
# are not keys of `lithology_keys` become NaN.
# NOTE(review): not idempotent. Running this cell twice pushes the names
# themselves through the dict and turns every label into NaN.
target = target.map(lithology_keys)

In [5]:
# Print the share of missing values per column of the current `df`.
return_missing_values()

% missing in WELL: 0.0%
% missing in DEPTH_MD: 0.0%
% missing in X_LOC: 0.9%
% missing in Y_LOC: 0.9%
% missing in Z_LOC: 0.9%
% missing in GROUP: 0.1%
% missing in FORMATION: 11.7%
% missing in CALI: 7.5%
% missing in RSHA: 46.1%
% missing in RMED: 3.3%
% missing in RDEP: 0.9%
% missing in RHOB: 13.8%
% missing in GR: 0.0%
% missing in NPHI: 34.6%
% missing in PEF: 42.6%
% missing in DTC: 6.9%
% missing in SP: 26.2%
% missing in DRHO: 15.6%


## Experimentation


In [6]:
# Split the remaining columns by dtype: int/float columns feed the numeric
# branch of the pipeline, object-dtype columns feed the categorical branch.
num_attribs = df.select_dtypes(include=[int, float]).columns
cat_attribs = df.select_dtypes(include=object).columns

In [7]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps a fixed set of DataFrame columns."""

    def __init__(self, attrib_names):
        # Column labels to keep, stored exactly as given.
        self.attrib_names = attrib_names

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured column labels."""
        selected = X[self.attrib_names]
        return selected
    
    
class FillCategoricalMissingValues(BaseEstimator, TransformerMixin):
    """Replace missing values in a DataFrame with a placeholder label.

    Parameters
    ----------
    fill_value : str, default='unkwn'
        Label written wherever a value is missing.
    """

    def __init__(self, fill_value='unkwn'):
        self.fill_value = fill_value

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every missing value filled.

        The frame passed in is left untouched. Writing the result back through
        ``X.loc`` would modify the caller's data, or a slice of it, in place.

        Parameters
        ----------
        X : pandas.DataFrame
            Columns whose missing entries should be replaced.

        Returns
        -------
        pandas.DataFrame
            New frame with the same columns and ``fill_value`` in place of
            missing entries.
        """
        return X.fillna(self.fill_value)

In [8]:
# Numeric branch: select the numeric columns, fill gaps with the column
# median, then standardise with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ("Select dataframe", DataFrameSelector(num_attribs)),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)


In [9]:
class KerasCategoryTransformer(BaseEstimator, TransformerMixin):
    """Encode string columns as fixed-size embedding vectors.

    A single ``StringLookup`` vocabulary is shared by all columns and each
    token is mapped to an ``output_dim``-sized vector by an ``Embedding``
    layer. The embedding weights are only initialised, never trained, so the
    encoding is a seeded random projection of the vocabulary.

    The vocabulary and the embedding are created once in ``fit`` and reused by
    every ``transform`` call. Building them inside ``transform`` would give a
    different vocabulary and different random vectors on each call, so data
    transformed separately (for example train vs. inference) would not share
    an encoding.

    Parameters
    ----------
    output_dim : int, default=8
        Size of the vector produced for each input column.
    num_oov_indices : int, default=2
        Number of out-of-vocabulary buckets of the lookup layer.
    seed : int or None, default=42
        Seed of the embedding initialiser; ``None`` gives unseeded weights.
    """

    def __init__(self, output_dim=8, num_oov_indices=2, seed=42):
        self.output_dim = output_dim
        self.num_oov_indices = num_oov_indices
        self.seed = seed

    def fit(self, X, y=None):
        """Build the vocabulary from ``X`` and create the embedding layer.

        Parameters
        ----------
        X : pandas.DataFrame or array-like of str
            Training values, one column per categorical attribute.
        y : ignored

        Returns
        -------
        self
        """
        values = np.asarray(X)
        str_lookup = keras.layers.StringLookup(num_oov_indices=self.num_oov_indices)
        str_lookup.adapt(values)
        self.lookup_and_embed_ = keras.Sequential([
            str_lookup,
            keras.layers.Embedding(
                input_dim=str_lookup.vocabulary_size(),
                output_dim=self.output_dim,
                # Same distribution as the layer's default "uniform"
                # initialiser, but seeded so re-runs give the same vectors.
                embeddings_initializer=keras.initializers.RandomUniform(seed=self.seed),
            ),
        ])
        return self

    def transform(self, X):
        """Encode ``X`` with the vocabulary and embedding learned in ``fit``.

        Returns
        -------
        numpy.ndarray
            Array with ``X.shape[0]`` rows; the embedding vectors of all
            columns are flattened into one row per sample.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "lookup_and_embed_")
        values = np.asarray(X)
        return self.lookup_and_embed_(values).numpy().reshape(values.shape[0], -1)

In [10]:
# Categorical branch: select the object-dtype columns, replace missing
# entries with a placeholder label, then turn the strings into embeddings.
cat_pipeline = Pipeline(
    steps=[
        ("Select dataframe", DataFrameSelector(cat_attribs)),
        ("Fill Missing Values", FillCategoricalMissingValues()),
        ("Embed categorical values", KerasCategoryTransformer()),
    ]
)

In [11]:
# Run both branches on the same input and concatenate their outputs
# column-wise: numeric features first, categorical embeddings second.
full_pipeline = FeatureUnion(
    transformer_list=[
        ('numerical', num_pipeline),
        ('cat_pipeline', cat_pipeline),
    ]
)

In [12]:
# NOTE(review): the pipeline is fitted on every row before the split, so the
# imputer medians and scaler statistics also see the rows that end up in the
# test set. Consider splitting first and fitting on the training part only.
train_full = full_pipeline.fit_transform(df)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    train_full,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

In [15]:
# Random forest with default hyper-parameters; the fixed seed keeps repeated
# runs on the same data comparable.
rnd_clf = RandomForestClassifier(random_state=42)
rnd_clf.fit(X_train, y_train)

In [16]:
pred = rnd_clf.predict(X_test)

# Weighted F1 first, then plain accuracy, one value per line.
print(
    f1_score(y_test, pred, average="weighted"),
    accuracy_score(y_test, pred),
    sep="\n",
)

0.9619759904671766
0.9623499058106901


In [17]:
# Convert true (`a`) and predicted (`b`) rock-type names to their class
# indices from `lithology_num`, as plain NumPy arrays.
a = y_test.map(lithology_num).to_numpy()
b = pd.Series(pred).map(lithology_num).to_numpy()

In [18]:
# Project metric from src.score, applied to the index-encoded true and
# predicted labels.
# NOTE(review): the result is negative although accuracy is about 0.96;
# presumably the metric is penalty-based. Confirm against src/score.py that
# the class indices from `lithology_num` match the ordering it expects.
score(a, b)

-0.10324141510360824

In [19]:
import pickle

# Persist the fitted random forest only; the preprocessing pipeline
# (`full_pipeline`) is not written by this cell.
with open("../model/random_forest_1.pkl", "wb") as file:
    pickle.dump(rnd_clf, file)