In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [68]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, AdaBoostRegressor
import xgboost

In [69]:
df = pd.read_csv("../data/geometry.csv")
df = df.copy()

# Geometry imputation by D_h

In [70]:
df_1 = df.loc[:,["D_h [mm]", "geometry_corrected"]]

df_1 = df_1[df_1["D_h [mm]"].notna()]

In [71]:
mapping = {'tube': 1, 'annulus': 2, 'plate': 3}
df_1['geometry_corrected'] = df_1['geometry_corrected'].map(mapping)

In [72]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27055 entries, 0 to 31643
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   D_h [mm]            27055 non-null  float64
 1   geometry_corrected  22382 non-null  float64
dtypes: float64(2)
memory usage: 634.1 KB


In [73]:
df_imputed = df_1.copy()

imputer = KNNImputer(n_neighbors=3)
df_imputed_1 = pd.DataFrame(imputer.fit_transform(df_imputed), columns=df_1.columns)

In [74]:
remapping = {1: 'tube', 2: 'annulus', 3: 'plate'}
df_imputed_1['geometry_corrected'] = df_imputed_1['geometry_corrected'].map(remapping)

# Geometry imputation by D_e

In [75]:
df_2 = df.loc[:,["D_e [mm]", "geometry_corrected"]]

df_2 = df_2[df_2["D_e [mm]"].notna()]

mapping = {'tube': 1, 'annulus': 2, 'plate': 3}
df_2['geometry_corrected'] = df_2['geometry_corrected'].map(mapping)

df_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26156 entries, 1 to 31643
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   D_e [mm]            26156 non-null  float64
 1   geometry_corrected  22382 non-null  float64
dtypes: float64(2)
memory usage: 613.0 KB


In [76]:
df_imputed = df_2.copy()

imputer = KNNImputer(n_neighbors=3)
df_imputed_2 = pd.DataFrame(imputer.fit_transform(df_imputed), columns=df_2.columns)

In [77]:
remapping = {1: 'tube', 2: 'annulus', 3: 'plate'}
df_imputed_2['geometry_corrected'] = df_imputed_2['geometry_corrected'].map(remapping)

# Geometry update

In [78]:
merged_df = df.merge(df_imputed_1[['D_h [mm]', 'geometry_corrected']], left_on='id', right_index=True, how='left')
merged_df = merged_df.rename(columns={"geometry_corrected_y":"geometry_corrected"})
df.update(merged_df[['geometry_corrected']])

merged_df = df.merge(df_imputed_2[['D_e [mm]', 'geometry_corrected']], left_on='id', right_index=True, how='left')
merged_df = merged_df.rename(columns={"geometry_corrected_y":"geometry_corrected"})
df.update(merged_df[['geometry_corrected']])

df

Unnamed: 0,id,D_h [mm],D_e [mm],geometry_corrected
0,0,10.8,,tube
1,1,10.3,10.3,tube
2,2,7.7,7.7,annulus
3,3,15.2,5.6,tube
4,4,11.1,11.1,tube
...,...,...,...,...
31639,31639,7.8,,
31640,31640,4.7,4.7,tube
31641,31641,3.0,3.0,tube
31642,31642,23.6,23.6,tube


In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31644 entries, 0 to 31643
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  31644 non-null  int64  
 1   D_h [mm]            27055 non-null  float64
 2   D_e [mm]            26156 non-null  float64
 3   geometry_corrected  30254 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 989.0+ KB


# D_e & D_h imputation

In [80]:
df_dh = df.copy()
# df_dh.drop(columns="id", inplace=True)

### Target

In [81]:
target = df_dh[df_dh["D_h [mm]"].isnull()]
target = target[(target["D_e [mm]"].notna()) & (target["geometry_corrected"].notna())]
target.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3207 entries, 11 to 27046
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  3207 non-null   int64  
 1   D_h [mm]            0 non-null      float64
 2   D_e [mm]            3207 non-null   float64
 3   geometry_corrected  3207 non-null   object 
dtypes: float64(2), int64(1), object(1)
memory usage: 125.3+ KB


### Training set

In [82]:
mapping = {'tube': 1, 'annulus': 2, 'plate': 3}
df_dh['geometry_corrected'] = df_dh['geometry_corrected'].map(mapping)

In [83]:
df_dh.dropna(how="any", inplace=True)
df_dh.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22382 entries, 1 to 31643
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  22382 non-null  int64  
 1   D_h [mm]            22382 non-null  float64
 2   D_e [mm]            22382 non-null  float64
 3   geometry_corrected  22382 non-null  float64
dtypes: float64(3), int64(1)
memory usage: 874.3 KB


In [84]:
scaler = StandardScaler()
d_h = df_dh["D_e [mm]"].values.reshape(-1, 1)
d_h_scaled = scaler.fit_transform(d_h)
df_dh["D_e [mm]"] = d_h_scaled

In [85]:
X = df_dh[["D_e [mm]", "geometry_corrected"]]
y = df_dh["D_h [mm]"]

In [86]:
X.describe()

Unnamed: 0,D_e [mm],geometry_corrected
count,22382.0,22382.0
mean,-1.165084e-16,1.216871
std,1.000022,0.46663
min,-1.473376,1.0
25%,-0.7035784,1.0
50%,-0.1647203,1.0
75%,0.4126276,1.0
max,5.551024,3.0


## D_h train

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [88]:
rand_forest = RandomForestRegressor(random_state=42)

svm = SVR()

ada_reg = AdaBoostRegressor(n_estimators=200,
                            random_state=42)

gbr_reg = GradientBoostingRegressor(random_state=42)

vot_reg = VotingRegressor(estimators=200)

# Definimos sus hiperparametros

rand_forest_param = {
    'n_estimators': [10, 100, 1000],
    'min_samples_leaf': [2, 4],
    'max_features': [1, 2]
    }

svm_param = {
    'C': [0.1, 1, 10],
    'kernel': ["linear", "rbf", "poly"],
    'gamma': ["scale", "auto"]
}

ada_param = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 1]
}

gbr_param = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 5, 7]
}



# GridSearch

gs_rand_forest = GridSearchCV(rand_forest,
                            rand_forest_param,
                            cv=10,
                            scoring=["neg_mean_squared_error", "r2"],
                            verbose=1,
                            n_jobs=-1,refit="neg_mean_squared_error")

gs_svm = GridSearchCV(svm,
                        svm_param,
                        cv=10,
                        scoring=["neg_mean_squared_error", "r2"],
                        verbose=1,
                        n_jobs=-1,refit="neg_mean_squared_error")

gs_ada = GridSearchCV(ada_reg,
                        ada_param,
                        cv=10,
                        scoring=["neg_mean_squared_error", "r2"],
                        verbose=1,
                        n_jobs=-1,refit="neg_mean_squared_error")

gs_gbr = GridSearchCV(gbr_reg,
                        gbr_param,
                        cv=10,
                        scoring=["neg_mean_squared_error", "r2"],
                        verbose=1,
                        n_jobs=-1,
                        refit="neg_mean_squared_error")

grids = {"gs_rand_forest":gs_rand_forest,
         
         "gs_ada":gs_ada,
         "gs_gbr":gs_gbr,
         }

        #  "gs_svm":gs_svm,

In [89]:
%%time

for nombre, grid_search in grids.items():
    grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Fitting 10 folds for each of 9 candidates, totalling 90 fits
Fitting 10 folds for each of 27 candidates, totalling 270 fits
CPU times: total: 3.97 s
Wall time: 44.6 s


In [90]:
best_grids = [(i, j.best_score_) for i, j in grids.items()]

best_grids = pd.DataFrame(best_grids, columns=["Grid", "Best score"]).sort_values(by="Best score", ascending=False)
best_grids

Unnamed: 0,Grid,Best score
0,gs_rand_forest,-7.692597
2,gs_gbr,-7.774858
1,gs_ada,-27.993805


In [91]:
# El mejor modelo ha sido
best_model = grids["gs_rand_forest"]
mejor_modelo = best_model.best_estimator_
mejor_modelo.fit(X_train, y_train)
mejor_modelo.score(X_test, y_test)

0.9810738437277992

In [92]:
best_model = pd.DataFrame(gs_rand_forest.cv_results_)
model_ranking = best_model[["params", "rank_test_r2", "rank_test_neg_mean_squared_error"]].sort_values(by="rank_test_r2")
model_ranking.loc[8, "params"]

{'max_features': 2, 'min_samples_leaf': 2, 'n_estimators': 1000}

In [93]:
forest_reg = RandomForestRegressor(n_estimators=1000, min_samples_leaf=2, max_features=2)

forest_reg.fit(X, y)

In [94]:
target = target[["D_e [mm]", "geometry_corrected"]]

In [95]:
mapping = {'tube': 1, 'annulus': 2, 'plate': 3}
target['geometry_corrected'] = target['geometry_corrected'].map(mapping)

In [96]:
final = forest_reg.predict(target)

In [97]:
final.shape

(3207,)

In [98]:
df_dh = df.copy()

In [99]:
final_target = df_dh[df_dh["D_h [mm]"].isnull()]
final_target = final_target[(final_target["D_e [mm]"].notna()) & (final_target["geometry_corrected"].notna())]

final_target["D_h [mm]"] = final

In [100]:
final_target

Unnamed: 0,id,D_h [mm],D_e [mm],geometry_corrected
11,11,37.614335,12.7,tube
13,13,37.614335,10.0,tube
25,25,37.500000,4.7,annulus
26,26,19.560001,1.9,tube
33,33,37.614335,5.6,tube
...,...,...,...,...
27024,27024,37.614335,7.7,tube
27031,27031,37.614335,5.6,tube
27033,27033,37.614335,10.8,tube
27035,27035,37.614335,5.6,tube


## D_e train

In [101]:
df_de = df.copy()

In [102]:
target = df_de[df_de["D_e [mm]"].isnull()]
target = target[(target["D_h [mm]"].notna()) & (target["geometry_corrected"].notna())]
target.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3969 entries, 0 to 27054
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  3969 non-null   int64  
 1   D_h [mm]            3969 non-null   float64
 2   D_e [mm]            0 non-null      float64
 3   geometry_corrected  3969 non-null   object 
dtypes: float64(2), int64(1), object(1)
memory usage: 155.0+ KB


In [103]:
mapping = {'tube': 1, 'annulus': 2, 'plate': 3}
df_de['geometry_corrected'] = df_de['geometry_corrected'].map(mapping)
df_de.dropna(how="any", inplace=True)
df_de.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22382 entries, 1 to 31643
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  22382 non-null  int64  
 1   D_h [mm]            22382 non-null  float64
 2   D_e [mm]            22382 non-null  float64
 3   geometry_corrected  22382 non-null  float64
dtypes: float64(3), int64(1)
memory usage: 874.3 KB


In [104]:
X = df_de[["D_h [mm]", "geometry_corrected"]]
y = df_de["D_e [mm]"]

In [105]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22382 entries, 1 to 31643
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   D_h [mm]            22382 non-null  float64
 1   geometry_corrected  22382 non-null  float64
dtypes: float64(2)
memory usage: 524.6 KB


In [106]:
forest_reg_2 = RandomForestRegressor(n_estimators=1000, min_samples_leaf=2, max_features=2)

forest_reg_2.fit(X, y)

In [107]:
target = target[["D_h [mm]", "geometry_corrected"]]
mapping = {'tube': 1, 'annulus': 2, 'plate': 3}
target['geometry_corrected'] = target['geometry_corrected'].map(mapping)
final = forest_reg_2.predict(target)

In [108]:
df_de = df.copy()

In [109]:
final_target_2 = df_de[df_de["D_e [mm]"].isnull()]
final_target_2 = final_target_2[(final_target_2["D_h [mm]"].notna()) & (final_target_2["geometry_corrected"].notna())]

final_target_2["D_e [mm]"] = final

In [110]:
final_target_2

Unnamed: 0,id,D_h [mm],D_e [mm],geometry_corrected
0,0,10.8,10.799992,tube
5,5,1.9,1.899995,tube
12,12,12.8,12.799715,tube
18,18,10.8,10.800000,annulus
34,34,7.8,7.800000,tube
...,...,...,...,...
27025,27025,11.5,11.500000,annulus
27044,27044,5.6,5.648015,tube
27048,27048,10.3,10.300009,tube
27051,27051,4.6,4.615768,tube


# D_e & D_h imputation

In [111]:
df

Unnamed: 0,id,D_h [mm],D_e [mm],geometry_corrected
0,0,10.8,,tube
1,1,10.3,10.3,tube
2,2,7.7,7.7,annulus
3,3,15.2,5.6,tube
4,4,11.1,11.1,tube
...,...,...,...,...
31639,31639,7.8,,
31640,31640,4.7,4.7,tube
31641,31641,3.0,3.0,tube
31642,31642,23.6,23.6,tube


In [126]:
df.update(final_target.set_index('id').combine_first(df.set_index('id')))
df.update(final_target_2.set_index('id').combine_first(df.set_index('id')))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31644 entries, 0 to 31643
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  31644 non-null  int64  
 1   D_h [mm]            30262 non-null  float64
 2   D_e [mm]            30125 non-null  float64
 3   geometry_corrected  30254 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 989.0+ KB


### Last NaN values imputation

In [129]:
df["D_e [mm]"] = df["D_e [mm]"].fillna(df["D_e [mm]"].mean())
df["D_h [mm]"] = df["D_h [mm]"].fillna(df["D_h [mm]"].mean())
df["geometry_corrected"] = df["geometry_corrected"].fillna(df["geometry_corrected"].mode())

In [130]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31644 entries, 0 to 31643
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  31644 non-null  int64  
 1   D_h [mm]            31644 non-null  float64
 2   D_e [mm]            31644 non-null  float64
 3   geometry_corrected  30254 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 989.0+ KB


In [132]:
df.to_csv("../data/imputed_geometry.csv", index=False)