In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import RobustScaler # it is not affected by outliers.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, learning_curve, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score, make_scorer

from sklearn.feature_selection import f_regression, SelectKBest, RFECV
# from imputer import create_full_pipeline
from imputer import create_full_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.semi_supervised import SelfTrainingClassifier

data_original = pd.read_csv('Assets/Data/welddb.csv', delimiter='\s+', header=None)

data = data_original.copy().replace({"N": np.nan})

# Name the columns
data.columns = [
    'Carbon concentration (weight%)', 
    'Silicon concentration (weight%)', 
    'Manganese concentration (weight%)', 
    'Sulphur concentration (weight%)', 
    'Phosphorus concentration (weight%)', 
    'Nickel concentration (weight%)', 
    'Chromium concentration (weight%)', 
    'Molybdenum concentration (weight%)', 
    'Vanadium concentration (weight%)', 
    'Copper concentration (weight%)', 
    'Cobalt concentration (weight%)', 
    'Tungsten concentration (weight%)', 
    'Oxygen concentration (ppm by weight)', 
    'Titanium concentration (ppm by weight)', 
    'Nitrogen concentration (ppm by weight)', 
    'Aluminium concentration (ppm by weight)', 
    'Boron concentration (ppm by weight)', 
    'Niobium concentration (ppm by weight)', 
    'Tin concentration (ppm by weight)', 
    'Arsenic concentration (ppm by weight)', 
    'Antimony concentration (ppm by weight)', 
    'Current (A)', 
    'Voltage (V)', 
    'AC or DC', 
    'Electrode positive or negative', 
    'Heat input (kJ/mm)', 
    'Interpass temperature (°C)', 
    'Type of weld', 
    'Post weld heat treatment temperature (°C)', 
    'Post weld heat treatment time (hours)', 
    'Yield strength (MPa)', 
    'Ultimate tensile strength (MPa)', 
    'Elongation (%)', 
    'Reduction of Area (%)', 
    'Charpy temperature (°C)', 
    'Charpy impact toughness (J)', 
    'Hardness (kg/mm2)', 
    '50% FATT', 
    'Primary ferrite in microstructure (%)', 
    'Ferrite with second phase (%)', 
    'Acicular ferrite (%)', 
    'Martensite (%)', 
    'Ferrite with carbide aggregate (%)', 
    'Weld ID'
]

# Definição das colunas categóricas e numéricas
categoric_features = ['AC or DC', 'Electrode positive or negative', 'Type of weld']
numeric_features = ['Sulphur concentration (weight%)', 'Nickel concentration (weight%)', 
                    'Silicon concentration (weight%)', 'Phosphorus concentration (weight%)', 
                    'Titanium concentration (ppm by weight)', 'Nitrogen concentration (ppm by weight)', 
                    'Oxygen concentration (ppm by weight)', 'Voltage (V)', 'Heat input (kJ/mm)']

# Separação dos dados em treino e teste
X = data.drop(columns = ["Yield strength (MPa)", "Weld ID"])
y = data["Yield strength (MPa)"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Criando o pipeline com as features necessárias
full_pipeline = create_full_pipeline()

# Dropping missing values in test
X_test_clean = X_test[~y_test.isna()]
y_test_clean = y_test.dropna().astype(float)

# Aplicando o pipeline nos dados de treino
X_train_transformed = X_train.copy()
X_train_transformed = full_pipeline.fit_transform(X_train_transformed)

# Aplicando o pipeline nos dados de teste
X_test_transformed = full_pipeline.transform(X_test_clean)

# hotencoder = HotEncoderCategorical(X_train_transformed)
# X_train_transformed = hotencoder.fit_transform(X_train_transformed)
# X_test_transformed = hotencoder.transform(X_test_transformed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[self.numeric_features] = X[self.numeric_features].apply(pd.to_numeric, errors='coerce')


# Preprocessing

In [2]:
# Criar um modelo base (nesse caso, um RandomForest)
# base_model = RandomForestClassifier()
base_model = LabelSpreading(kernel='knn', n_neighbors=7)

# Criar o modelo de auto-treinamento
self_training_model = SelfTrainingClassifier(base_model)

# X_train contém todas as variáveis explicativas
# y_train contém os labels, mas com valores NaN para os rótulos faltantes
# Importante: O SelfTrainingClassifier trata os valores faltantes como -1, então convertemos NaN para -1
y_train_imputed = y_train.fillna(-1)

# Treinar o modelo de auto-treinamento
self_training_model.fit(X_train_transformed, y_train_imputed)


# Preencher somente os valores faltantes no y_train
missing_mask = y_train.isna()

# Assegurar que os valores previstos sejam do tipo float
y_predicted = self_training_model.predict(X_train_transformed).astype(float)

# Apenas preencher os valores que estavam como NaN no conjunto original
y_train_completed = y_train.copy()

# Preencher somente os valores faltantes no y_train
missing_mask = y_train.isna()
y_train_completed[missing_mask] = y_predicted[missing_mask]

In [3]:
# # Categoric features 

# ## Linear Regression
# def print_metrics(y_pred_lr, y_test_clean):
#     print("MAPE :", mean_absolute_percentage_error(y_pred_lr, y_test_clean))
#     print("R2 :", r2_score(y_pred_lr, y_test_clean))
#     print("MSE :", mean_squared_error(y_pred_lr, y_test_clean))

# def train_test_models(X_train_transformed, y_train_completed):

#     print("\n -------------------- Linear Regression -------------------- \n")
#     lr_pipeline = Pipeline(
#         [
#             ("Regressor", LinearRegression())
#         ]
#     )

#     lr_pipeline.fit(X_train_transformed, y_train_completed)
#     y_pred_lr = lr_pipeline.predict(X_test_transformed)
#     print_metrics(y_pred_lr, y_test_clean)

#     ## Ridge Regression

#     print("\n -------------------- Ridge Regression -------------------- \n")
#     ridge_pipeline = Pipeline(
#         [
#             ("Ridge Regressor", Ridge())
#         ]
#     )

#     ridge_pipeline.fit(X_train_transformed, y_train_completed)
#     y_pred_ridge = ridge_pipeline.predict(X_test_transformed)
#     print_metrics(y_pred_ridge, y_test_clean)

#     ## Lasso Regression

#     print("\n -------------------- Lasso Regression -------------------- \n")
#     lasso_pipeline = Pipeline(
#         [
#             ("Lasso Regressor", Lasso())
#         ]
#     )

#     lasso_pipeline.fit(X_train_transformed, y_train_completed)
#     y_pred_lasso = lasso_pipeline.predict(X_test_transformed)

#     print_metrics(y_pred_lasso, y_test_clean)

#     ## ElasticNet Regression

#     print("\n -------------------- ElasticNet Regression -------------------- \n")
#     ElasticNet_pipeline = Pipeline(
#         [
#             ("Regressor", ElasticNet())
#         ]
#     )

#     ElasticNet_pipeline.fit(X_train_transformed, y_train_completed)
#     y_pred_ElasticNet = ElasticNet_pipeline.predict(X_test_transformed)

#     print_metrics(y_pred_ElasticNet, y_test_clean)

#     ## Decision Tree Regression

#     print("\n -------------------- Decision Tree Regression -------------------- \n")
#     tree_pipeline = Pipeline(
#         [
#             ("Regressor", DecisionTreeRegressor())
#         ]
#     )

#     tree_pipeline.fit(X_train_transformed, y_train_completed)
#     y_pred_tree = tree_pipeline.predict(X_test_transformed)

#     print_metrics(y_pred_tree, y_test_clean)

#     ## Random Forest Regression

#     print("\n -------------------- Random Forest Regression -------------------- \n")
#     RF_pipeline = Pipeline(
#         [
#             ("Regressor", RandomForestRegressor())
#         ]
#     )

#     RF_pipeline.fit(X_train_transformed, y_train_completed)
#     y_pred_RF = RF_pipeline.predict(X_test_transformed)

#     print_metrics(y_pred_RF, y_test_clean)

#     ## Gradient Boosting Regression

#     print("\n -------------------- Gradient Boosting Regression -------------------- \n")
#     gb_pipeline = Pipeline(
#         [
#             ("Regressor", GradientBoostingRegressor())
#         ]
#     )

#     gb_pipeline.fit(X_train_transformed, y_train_completed)
#     y_pred_gb = gb_pipeline.predict(X_test_transformed)

#     print_metrics(y_pred_gb, y_test_clean)

# train_test_models(X_train_transformed, y_train_completed)

In [4]:
model = xgb.XGBRegressor(random_state=42)

model.fit(X_train_transformed, y_train_completed)

y_pred = model.predict(X_test_transformed)

print(mean_squared_error(y_test_clean, y_pred))

1028.9364518465923


In [5]:
# Criar o Pipeline para cada modelo
def create_pipeline(estimator):
    return Pipeline(steps=[
        ('feature_selection', RFECV(estimator=estimator, step=1, cv=KFold(5), scoring='neg_mean_squared_error')),
        ('regressor', estimator)
    ])

# Definir os modelos de regressão
models = {
    # 'RandomForest': RandomForestRegressor(random_state=42),
    'XGB': xgb.XGBRegressor(random_state=42),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42)
}

# Parâmetros para a busca em grid (ajuste conforme necessário)
param_grid = {
    'pipeline__feature_selection__min_features_to_select': [5, 10, 15],
    
    'pipeline__regressor__n_estimators': [100, 200],  # Para RandomForest e GradientBoosting
    'pipeline__regressor__max_depth': [3, 5, 10],     # Para DecisionTree e GradientBoosting
    'pipeline__regressor__learning_rate': [0.01, 0.1, 0.2],  # Para GradientBoosting
    'pipeline__regressor__n_estimators': [100, 200],  # Para XGB
}

# Scoring para regressão (Mean Squared Error)
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Dicionário para armazenar os melhores modelos
best_models = {}

# Fazer a busca em grid para cada modelo
for name, model in models.items():
    print(f"Treinando o modelo {name}...")
    
    # Criar o pipeline para o modelo
    pipeline = create_pipeline(model)
    
    # Fazer a busca em grid para encontrar os melhores parâmetros
    grid_search = GridSearchCV(pipeline, param_grid={f'pipeline__regressor__{param}': value for param, value in param_grid.items() if param.startswith(name)},
                               scoring=scorer, cv=5, verbose=2, n_jobs=-1)
    
    # Ajustar o modelo
    grid_search.fit(X_train_transformed, y_train_completed)
    
    # Guardar o melhor modelo e seus parâmetros
    best_models[name] = grid_search.best_estimator_
    print(f"Melhor modelo para {name}: {grid_search.best_params_}")
    
    # Fazer previsões nos dados de teste
    y_pred = grid_search.best_estimator_.predict(X_test_transformed)
    
    # Avaliar o modelo no conjunto de teste
    test_mse = mean_squared_error(y_test_clean, y_pred)
    print(f"Erro quadrado médio no teste para {name}: {test_mse}")

Treinando o modelo XGB...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .................................................... total time=  47.3s
[CV] END .................................................... total time=  48.4s
[CV] END .................................................... total time=  48.8s
[CV] END .................................................... total time=  48.9s
[CV] END .................................................... total time=  49.0s
Melhor modelo para XGB: {}
Erro quadrado médio no teste para XGB: 1172.3513182383833
Treinando o modelo DecisionTree...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .................................................... total time=   2.7s
[CV] END .................................................... total time=   3.0s
[CV] END .................................................... total time=   3.2s
[CV] END .................................................... total time=   3.2s
[CV] EN