In [104]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import numpy as np
import matplotlib.pyplot as plt
# Read the training table and the test table from the working directory.
df, test = (pd.read_csv(csv_path) for csv_path in ('training.csv', 'test.csv'))

In [106]:
# Keep the test-set 'segment_id' column aside; it is paired with the model
# predictions when the submission file is assembled.
test_id = test['segment_id']

In [107]:
from numpy import percentile
def remove_outliers(column, df, factor=1.5):
    """Drop rows whose ``column`` value lies outside the IQR fences.

    The fences are ``q25 - factor * IQR`` and ``q75 + factor * IQR`` where
    ``IQR = q75 - q25`` is computed from ``df[column]``.

    Parameters
    ----------
    column : str
        Name of the numeric column to screen.
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    factor : float, default 1.5
        Multiple of the inter-quartile range added beyond each quartile to
        place the fences.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` with ``lower < df[column] < upper`` (strict on both
        sides), keeping their original index labels. Rows where ``column``
        is NaN never satisfy the comparison and are therefore dropped.
    """
    # Series.quantile skips NaN, whereas numpy.percentile returns NaN as soon
    # as one is present, which turned both fences into NaN and silently
    # filtered out every row.
    q25, q75 = df[column].quantile([0.25, 0.75])
    cut_off = factor * (q75 - q25)
    lower, upper = q25 - cut_off, q75 + cut_off
    return df[(df[column] > lower) & (df[column] < upper)]

# Report the row count, filter the target column's outliers, report again.
# The filtered frame gets a fresh 0..n-1 index in the same statement.
print(f"Dataset size before removing outliers: {df.shape[0]}")
df = remove_outliers('rate_of_penetration', df).reset_index(drop=True)
print(f"Dataset size after removing outliers: {df.shape[0]}")

Dataset size before removing outliers: 6838
Dataset size after removing outliers: 6735


In [110]:
# Remove 'segment_id' from the training frame. The drop is not in-place and
# tolerates an already-missing column, so re-running this cell does not raise
# a KeyError.
df = df.drop(columns=['segment_id'], errors='ignore')

# Stack training rows first, then test rows (without 'segment_id'; `test`
# itself is left untouched). ignore_index gives the combined frame unique
# 0..n-1 row labels instead of repeating each source frame's own labels.
XX = pd.concat([df, test.drop(columns=['segment_id'])], ignore_index=True)

In [111]:
# Sanity check: dimensions of the stacked frame, then of the training frame.
print(XX.shape, df.shape, sep='\n')

(8302, 10)
(6735, 10)


In [112]:
# Columns listed in `nominal` are one-hot encoded; columns listed in
# `numerical` are standardised.
nominal = ['wellbore_chev_no_id', 'area_id', 'formation_id', 'bit_model_id']
numerical = ['drillbit_size', 'min_depth', 'max_depth', 'surface_weight_on_bit', 'surface_rpm']

# Target column taken from the stacked frame, before XX is replaced below.
y = XX['rate_of_penetration']

# NOTE(review): XX holds the stacked training and test rows, so the encoder
# and the scaler below are both fitted on test rows as well - confirm this is
# intended.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output` and later
# releases drop the old name; switch the keyword when upgrading.
enc = OneHotEncoder(handle_unknown='error', sparse=False)
X_nominal = enc.fit_transform(XX[nominal])
nominal_cols = enc.get_feature_names_out(nominal)

# StandardScaler standardises every column independently, so one
# fit_transform over all numerical columns yields the same values as
# refitting it column by column, and it leaves `scaler` fitted on every
# numerical column rather than only on the last one visited by a loop.
scaler = StandardScaler()
X_num = pd.DataFrame(
    scaler.fit_transform(XX[numerical]),
    columns=numerical,
    index=XX.index,
)

# From here on XX is a NumPy feature matrix: one-hot columns first, then the
# scaled numerical columns. Re-running this cell alone therefore fails,
# because XX is no longer a DataFrame.
XX = np.concatenate((X_nominal, X_num), axis=1)
print(XX.shape)


(8302, 319)


In [113]:
# Training rows were stacked ahead of the test rows, so the first len(df)
# rows of XX / y are the training portion and the rest is the test portion.
# Deriving the boundary from `df` keeps the split correct when the outlier
# filter removes a different number of rows than in a previous run.
n_train = len(df)
X_train, X_test = XX[:n_train], XX[n_train:]
# .iloc makes the positional slicing explicit, independent of y's index labels.
y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]

In [114]:
# Gradient-boosted tree regressor with fixed hyper-parameters, one per line
# so individual values are easy to spot and diff.
XGB = XGBRegressor(
    objective='reg:squarederror',
    tree_method='gpu_hist',
    learning_rate=0.14,
    max_depth=12,
    min_child_weight=0.1121,
    gamma=2.715,
    subsample=0.95,
    colsample_bytree=0.756,
    colsample_bylevel=1.0,
)
# Last expression of the cell: fit() returns the estimator, whose repr is displayed.
XGB.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
             colsample_bynode=1, colsample_bytree=0.756,
             enable_categorical=False, gamma=2.715, gpu_id=0,
             importance_type=None, interaction_constraints='',
             learning_rate=0.14, max_delta_step=0, max_depth=12,
             min_child_weight=0.1121, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=0.95, tree_method='gpu_hist', validate_parameters=1,
             verbosity=None)

In [127]:
# Predict the target for the test portion of the feature matrix.
y_pred = XGB.predict(X_test)

In [131]:
# Pair each test segment id with its prediction by position. Building the
# frame from plain arrays avoids relying on index alignment between `test_id`
# and a freshly indexed Series of predictions.
# The prediction column is named after the training target rather than the
# default integer label 0 that an unnamed pd.Series would produce.
# NOTE(review): confirm the header the submission format expects.
result = pd.DataFrame({
    'segment_id': test_id.to_numpy(),
    'rate_of_penetration': y_pred,
})
result.to_csv('submission.csv', index=False)