In [29]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline



In [30]:
# Relative path of the raw data file read by the loading cell.
DATA_PATH = "welddb/welddb.data"

# Column names, in file order, assigned to the data file when it is loaded.
# The list is assembled group by group so each family of columns is easy to spot.
COLS = (
    # composition columns named *_wt_pct
    [
        "C_wt_pct", "Si_wt_pct", "Mn_wt_pct", "S_wt_pct", "P_wt_pct", "Ni_wt_pct",
        "Cr_wt_pct", "Mo_wt_pct", "V_wt_pct", "Cu_wt_pct", "Co_wt_pct", "W_wt_pct",
    ]
    # composition columns named *_ppm
    + [
        "O_ppm", "Ti_ppm", "N_ppm", "Al_ppm", "B_ppm",
        "Nb_ppm", "Sn_ppm", "As_ppm", "Sb_ppm",
    ]
    # welding process parameters
    + [
        "Current_A", "Voltage_V", "AC_or_DC", "Electrode_polarity",
        "HeatInput_kJ_per_mm", "InterpassTemp_C", "WeldType",
    ]
    # post-weld heat treatment (PWHT)
    + ["PWHT_Temp_C", "PWHT_Time_h"]
    # mechanical properties
    + [
        "YieldStrength_MPa", "UTS_MPa", "Elongation_pct", "ReductionArea_pct",
        "CharpyTemp_C", "CharpyJ", "Hardness_kg_per_mm2",
        "FATT_50pct",
    ]
    # microstructure fractions
    + [
        "PrimaryFerrite_pct", "FerriteSecondPhase_pct", "AcicularFerrite_pct",
        "Martensite_pct", "FerriteCarbideAgg_pct",
    ]
    # record identifier
    + ["WeldID"]
)



In [31]:
# Load the whitespace-delimited data file. It is read without a header row, so
# COLS supplies the column names; the tokens "N" and "n" are parsed as missing.
df_raw = pd.read_csv(
    DATA_PATH,
    sep=r"\s+",
    header=None,
    names=COLS,
    na_values=["N", "n"],
    engine="python",
)

In [32]:
# Quick sanity check of the parsed frame: show its first three rows.
display(df_raw.iloc[:3])

Unnamed: 0,C_wt_pct,Si_wt_pct,Mn_wt_pct,S_wt_pct,P_wt_pct,Ni_wt_pct,Cr_wt_pct,Mo_wt_pct,V_wt_pct,Cu_wt_pct,...,CharpyTemp_C,CharpyJ,Hardness_kg_per_mm2,FATT_50pct,PrimaryFerrite_pct,FerriteSecondPhase_pct,AcicularFerrite_pct,Martensite_pct,FerriteCarbideAgg_pct,WeldID
0,0.037,0.3,0.65,0.008,0.012,0.0,,,,,...,,,,,,,,,,Evans-Ni/CMn-1990/1991-0Aaw
1,0.037,0.3,0.65,0.008,0.012,0.0,,,,,...,-28.0,100.0,,,,,,,,Evans-Ni/CMn-1990/1991-0Aawch
2,0.037,0.3,0.65,0.008,0.012,0.0,,,,,...,-38.0,100.0,,,,,,,,Evans-Ni/CMn-1990/1991-0Aht


In [33]:
# Fraction of missing values per column of the raw data, most incomplete first.
miss = df_raw.isna().mean().sort_values(ascending=False)

# Same information as a two-column table: "column" (name) and "missing_ratio".
miss_df = miss.rename_axis("column").reset_index(name="missing_ratio")
miss_df


Unnamed: 0,column,missing_ratio
0,FATT_50pct,0.981235
1,W_wt_pct,0.9546
2,FerriteCarbideAgg_pct,0.946126
3,Martensite_pct,0.946126
4,FerriteSecondPhase_pct,0.945521
5,AcicularFerrite_pct,0.945521
6,PrimaryFerrite_pct,0.940678
7,Co_wt_pct,0.921913
8,Hardness_kg_per_mm2,0.916465
9,As_ppm,0.858354


D'après la littérature, les variables les plus à même de caractériser la qualité d'une soudure sont l'énergie de rupture Charpy (`CharpyJ`), la résistance à la traction (`UTS_MPa`) et l'allongement (`Elongation_pct`).

In [44]:
target_vars = ["CharpyJ", "Elongation_pct", "UTS_MPa"]

# Drop rows with missing targets (modeling step; df_raw itself is left untouched for EDA)
df = df_raw.copy()
df = df.dropna(subset=target_vars)

# Identify features: every column except the targets and the weld identifier
feature_cols = [c for c in df.columns if c not in (target_vars + ["WeldID"])]
X = df[feature_cols].copy()
y = df[target_vars].copy()

# Signed Pearson correlation of every numeric column of the raw data with each
# target. DataFrame.corr excludes missing values pair by pair, so a NaN here means
# the two columns do not share enough observations to compute a coefficient.
num_cols = df_raw.select_dtypes(include=[np.number]).columns.tolist()
corrs = df_raw[num_cols].corr(method='pearson')[target_vars]

# Order the rows from the most to the least incomplete column. Only columns that
# are actually in the correlation matrix are kept: reindexing on every column of
# miss_df would add all-NaN rows for non-numeric columns, which cannot be told
# apart from a genuinely undefined correlation.
ordered_cols = miss_df.loc[miss_df['column'].isin(corrs.index), 'column']
corrs_sorted = corrs.reindex(ordered_cols)


# The values are signed (no abs() is applied), so the labels say "Pearson"
# rather than "absolue".
print("=== Corrélation de Pearson avec les variables de qualité ===")
print(corrs_sorted.round(3))
fig = px.imshow(
    corrs_sorted,
    aspect="auto",
    color_continuous_scale="RdBu_r",
    # Pin the diverging colour scale to [-1, 1] so that its neutral midpoint is r = 0.
    zmin=-1,
    zmax=1,
    title="Corrélation de Pearson entre les variables numériques et les variables de qualité"
)
fig.show()

=== Corrélation absolue avec les variables de qualité ===
                        CharpyJ  Elongation_pct  UTS_MPa
column                                                  
FATT_50pct                  NaN          -0.321   -0.547
W_wt_pct                    NaN             NaN      NaN
FerriteCarbideAgg_pct       NaN             NaN      NaN
Martensite_pct              NaN             NaN      NaN
FerriteSecondPhase_pct      NaN             NaN      NaN
AcicularFerrite_pct         NaN             NaN      NaN
PrimaryFerrite_pct          NaN             NaN      NaN
Co_wt_pct                   NaN             NaN      NaN
Hardness_kg_per_mm2         NaN             NaN      NaN
As_ppm                      NaN             NaN      NaN
Sb_ppm                      NaN             NaN      NaN
Sn_ppm                      NaN             NaN      NaN
B_ppm                       NaN             NaN      NaN
Cu_wt_pct                   NaN             NaN      NaN
Ni_wt_pct                -0.01

On va donc retirer toutes les variables qui ont un taux de valeurs manquantes élevé ainsi qu'une corrélation faible ou inexistante avec les variables cibles.

In [36]:
# Hard-coded list of the columns removed from the working frame.
COLS_TO_DROP = [
    "FATT_50pct",
    "W_wt_pct",
    "FerriteCarbideAgg_pct",
    "Martensite_pct",
    "FerriteSecondPhase_pct",
    "AcicularFerrite_pct",
    "PrimaryFerrite_pct",
    "Co_wt_pct",
    "Hardness_kg_per_mm2",
    "As_ppm",
    "Sb_ppm",
    "Sn_ppm",
    "B_ppm",
    "Cu_wt_pct",
]

# drop() returns a new frame, so df_raw keeps all of its columns.
df = df_raw.drop(COLS_TO_DROP, axis="columns")

In [37]:
# Summary statistics of the numeric columns of the working frame,
# transposed so that each variable is a row.
num_cols = list(df.select_dtypes(include=[np.number]).columns)
desc = df.loc[:, num_cols].describe().transpose()
desc

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
C_wt_pct,1652.0,0.075521,0.023898,0.029,0.06175,0.074,0.086,0.18
Si_wt_pct,1652.0,0.328577,0.112455,0.04,0.27,0.32,0.36,1.14
Mn_wt_pct,1652.0,1.202821,0.382137,0.27,0.94,1.27,1.44,2.25
P_wt_pct,1642.0,0.012952,0.019627,0.002,0.007,0.01,0.014,0.25
Ni_wt_pct,697.0,0.415034,0.786951,0.0,0.0,0.067,0.26,3.5
Cr_wt_pct,784.0,2.101273,3.026548,0.0,0.0,0.53,2.3,10.2
O_ppm,1256.0,441.967357,147.483825,132.0,370.0,423.0,471.0,1650.0
Current_A,1404.0,283.843661,192.560955,115.0,170.0,170.0,300.0,900.0
Voltage_V,1404.0,27.607379,12.555629,11.5,21.0,21.0,30.0,75.36
HeatInput_kJ_per_mm,1652.0,1.700987,1.298465,0.6,1.0,1.0,2.0,7.9


In [38]:
# Share of missing cells in each row of the working frame.
row_missing_ratio = df.isna().mean(axis="columns")

# Worst case: the largest share of missing cells found in any single row.
print(max(row_missing_ratio))

fig = px.histogram(
    row_missing_ratio,
    nbins=12,
    labels={"value": "Taux de valeurs manquantes"},
    title="Distribution du pourcentage de valeurs manquantes par ligne",
)
fig.show()

0.5


Aucune ligne n'a plus de la moitié de ses valeurs manquantes (maximum observé : 0.5) ; on les conserve donc toutes.

In [None]:
target_vars = ["CharpyJ", "Elongation_pct", "UTS_MPa"]

# Modeling subset, rebuilt from df_raw so the cell gives the same result however
# many times it is run:
#   1. remove COLS_TO_DROP again -- starting from a bare df_raw.copy() would
#      silently bring back the columns that were discarded earlier;
#   2. keep only the rows where all three targets are observed.
# df_raw itself is never modified and stays available for EDA.
df = df_raw.drop(columns=COLS_TO_DROP).dropna(subset=target_vars)

# Features: every remaining column except the targets and the weld identifier
feature_cols = [c for c in df.columns if c not in (target_vars + ["WeldID"])]
X = df[feature_cols].copy()
y = df[target_vars].copy()

Maintenant, on va compléter les lignes qui contiennent des NaN

In [39]:
# Domain-specific handling of missing chemical compositions.
# The two groups and their fill rules follow the paper the author refers to
# (presumably: an unreported impurity is still present at a typical level, while
# an unreported deliberate addition was simply not added -- TODO confirm against
# the paper).
# NOTE(review): the fills are written to X only; cells that read `df` do not see them.

impurity_elements = ["P_wt_pct", "S_wt_pct"]
deliberate_elements = ["Mn_wt_pct", "Ni_wt_pct", "Cr_wt_pct", "Mo_wt_pct"]

for col in impurity_elements + deliberate_elements:
    # Entries that cannot be parsed as numbers become NaN and are treated as missing.
    X[col] = pd.to_numeric(X[col], errors='coerce')

    # Missingness indicator, recorded BEFORE filling. It is created only once:
    # after the fills below the column has no NaN left, so recomputing it when the
    # cell is re-run would overwrite the flags with zeros.
    flag_col = f"{col}_was_na"
    if flag_col not in X.columns:
        X[flag_col] = X[col].isna().astype(int)

# Apply domain-specific fills
for col in impurity_elements:
    # Impurities: replace missing values with the mean of the observed values.
    X[col] = X[col].fillna(X[col].mean())
for col in deliberate_elements:
    # Deliberate additions: replace missing values with 0.
    X[col] = X[col].fillna(0)

In [40]:
# Fraction of missing values per column of `df`, highest first;
# only the columns that still contain gaps are printed.
remaining_missing = df.isna().mean().sort_values(ascending=False)
print(remaining_missing.loc[remaining_missing.gt(0)])

Elongation_pct        0.576271
ReductionArea_pct     0.573245
UTS_MPa               0.553269
Nb_ppm                0.544794
YieldStrength_MPa     0.527845
CharpyJ               0.467918
CharpyTemp_C          0.467918
Al_ppm                0.452179
V_wt_pct              0.438257
Ti_ppm                0.434019
N_ppm                 0.248184
O_ppm                 0.239709
Current_A             0.150121
Voltage_V             0.150121
AC_or_DC              0.130145
Electrode_polarity    0.094431
PWHT_Time_h           0.007869
PWHT_Temp_C           0.007869
dtype: float64


In [41]:
# UTS before imputation (reference distribution, original units).
px.histogram(df, x="UTS_MPa", nbins=30, title="UTS before imputation").show()

num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(exclude=[np.number]).columns

# NOTE(review): every numeric column of `df` is imputed here. If `df` still holds
# rows with missing targets, the targets are imputed as well -- confirm that this
# is intended before using the result to train or evaluate a model.

# KNNImputer discards columns without a single observed value, which would make
# the assignment back into `df` fail on a shape mismatch; such columns are skipped.
imputable_cols = [col for col in num_cols if df[col].notna().any()]

# Scale first, then impute: KNN neighbours are found with a Euclidean-type
# distance, so on unscaled data the columns with the largest magnitudes would
# dominate the neighbour search. StandardScaler ignores NaN when fitting and keeps
# them when transforming, so the imputer still sees every gap.
# A preliminary imputation on the raw data must NOT be run before this pipeline:
# it would fill every gap and leave the pipeline's imputer with nothing to do.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', KNNImputer(n_neighbors=5))
])
knn = pipeline.named_steps['imputer']

if imputable_cols:
    imputed_scaled = pipeline.fit_transform(df[imputable_cols])
    # Map the result back to the original units so that `df` (and the "after"
    # histogram below) stays comparable with the "before" histogram. Scaling for
    # a model belongs to the modeling pipeline, not to the imputation step.
    df[imputable_cols] = pipeline.named_steps['scaler'].inverse_transform(imputed_scaled)

# Non-numeric columns: fill the gaps with the most frequent value.
for col in cat_cols:
    modes = df[col].mode()
    # mode() is empty when the column has no observed value; nothing to fill with.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Number of cells still missing after imputation (0 unless a column was entirely empty).
print(df.isna().sum().sum())
px.histogram(df, x="UTS_MPa", nbins=30, title="UTS after imputation").show()

0
