In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import RobustScaler # it is not affected by outliers.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.semi_supervised import LabelSpreading

from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score

from sklearn.feature_selection import f_regression, SelectKBest, RFECV
# from imputer import create_full_pipeline
from imputer_old import create_full_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.semi_supervised import SelfTrainingClassifier

# Load the weld database: whitespace-separated values, no header row.
# The separator is a raw string so that `\s+` reaches pandas as a regular expression
# instead of being parsed as an (invalid) Python string escape, which triggers a
# warning on recent Python versions.
data_original = pd.read_csv('Assets/Data/welddb.csv', delimiter=r'\s+', header=None)

# Work on a separate frame in which every literal "N" cell is replaced by NaN;
# data_original keeps the values exactly as read from disk.
data = data_original.copy().replace({"N": np.nan})

# Give the 44 positional columns of the weld table descriptive names.
# The order must match the column order of the file that was read.
data.columns = [
    # Composition expressed in weight %
    "Carbon concentration (weight%)",
    "Silicon concentration (weight%)",
    "Manganese concentration (weight%)",
    "Sulphur concentration (weight%)",
    "Phosphorus concentration (weight%)",
    "Nickel concentration (weight%)",
    "Chromium concentration (weight%)",
    "Molybdenum concentration (weight%)",
    "Vanadium concentration (weight%)",
    "Copper concentration (weight%)",
    "Cobalt concentration (weight%)",
    "Tungsten concentration (weight%)",
    # Composition expressed in ppm by weight
    "Oxygen concentration (ppm by weight)",
    "Titanium concentration (ppm by weight)",
    "Nitrogen concentration (ppm by weight)",
    "Aluminium concentration (ppm by weight)",
    "Boron concentration (ppm by weight)",
    "Niobium concentration (ppm by weight)",
    "Tin concentration (ppm by weight)",
    "Arsenic concentration (ppm by weight)",
    "Antimony concentration (ppm by weight)",
    # Welding parameters and heat treatment
    "Current (A)",
    "Voltage (V)",
    "AC or DC",
    "Electrode positive or negative",
    "Heat input (kJ/mm)",
    "Interpass temperature (°C)",
    "Type of weld",
    "Post weld heat treatment temperature (°C)",
    "Post weld heat treatment time (hours)",
    # Measured properties
    "Yield strength (MPa)",
    "Ultimate tensile strength (MPa)",
    "Elongation (%)",
    "Reduction of Area (%)",
    "Charpy temperature (°C)",
    "Charpy impact toughness (J)",
    "Hardness (kg/mm2)",
    "50% FATT",
    # Microstructure fractions
    "Primary ferrite in microstructure (%)",
    "Ferrite with second phase (%)",
    "Acicular ferrite (%)",
    "Martensite (%)",
    "Ferrite with carbide aggregate (%)",
    # Record identifier
    "Weld ID",
]

# Column groups: categorical columns first, then the numeric ones
categoric_features = [
    "AC or DC",
    "Electrode positive or negative",
    "Type of weld",
]
numeric_features = [
    "Sulphur concentration (weight%)",
    "Nickel concentration (weight%)",
    "Silicon concentration (weight%)",
    "Phosphorus concentration (weight%)",
    "Titanium concentration (ppm by weight)",
    "Nitrogen concentration (ppm by weight)",
    "Oxygen concentration (ppm by weight)",
    "Voltage (V)",
    "Heat input (kJ/mm)",
]

# Split the data into features and target, then into train and test sets.
# The target and the record identifier are removed from the feature frame.
X = data.drop(columns = ["Yield strength (MPa)", "Weld ID"])
y = data["Yield strength (MPa)"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fraction of missing values per training column, highest first (kept for inspection;
# create_full_pipeline() is called without arguments and does not receive it).
percent_n_sorted = X_train.isna().mean().sort_values(ascending=False)
full_pipeline = create_full_pipeline()

# Evaluate only on test rows whose target is known.
# The filtered frame is copied so that in-place column assignments performed inside the
# pipeline act on an independent frame rather than on a slice of X_test (the train frame
# below is copied for the same reason).
X_test_clean = X_test[y_test.notna()].copy()
y_test_clean = y_test.dropna().astype(float)

# Fit the pipeline on the training features and transform them
X_train_transformed = full_pipeline.fit_transform(X_train.copy())

# Apply the already-fitted pipeline to the test features.
# NOTE(review): the recorded output of this cell shows 40 columns after the train transform
# but 39 after the test transform, followed by a KeyError. That mismatch originates inside
# create_full_pipeline (imputer_old), which is not visible here — confirm that its
# categorical encoding is fitted on train only and reused for test.
X_test_transformed = full_pipeline.transform(X_test_clean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[self.numeric_features] = X[self.numeric_features].apply(pd.to_numeric, errors='coerce')


(1156, 28)
3
(1156, 40)
(244, 28)
3
(244, 39)


KeyError: "['39'] not in index"

In [None]:
# Categorical columns, declared again here with the same three names as in the setup cell
categoric_features = [
    "AC or DC",
    "Electrode positive or negative",
    "Type of weld",
]

# Show the distinct values found in the weld-type column
data.loc[:, "Type of weld"].unique()

In [None]:
# Display the dimensions of the transformed test features
X_test_transformed.shape

# Preprocessing

In [None]:
# Base estimator for self-training: label spreading with a k-nearest-neighbours kernel (k=7)
base_model = LabelSpreading(kernel='knn', n_neighbors=7)

# Wrap the base estimator in a self-training meta-estimator
self_training_model = SelfTrainingClassifier(base_model)

# X_train_transformed holds the explanatory variables; y_train holds the labels, with NaN
# wherever the label is missing. SelfTrainingClassifier marks unlabeled samples with -1,
# so NaN is converted to -1 before fitting.
y_train_imputed = y_train.fillna(-1)

# Fit the self-training model
self_training_model.fit(X_train_transformed, y_train_imputed)

y_train_imputed = y_train_imputed.astype(float)

# Complete the training target: keep every observed label as measured and use the model's
# prediction only for rows whose label was missing. Taking the prediction for every row
# would overwrite known ground-truth values with model output.
y_train_completed = np.where(
    y_train.isna().to_numpy(),
    self_training_model.predict(X_train_transformed).astype(float),
    y_train_imputed.to_numpy(),
)

In [None]:
# Categoric features 

## Linear Regression
def print_metrics(y_pred_lr, y_test_clean):
    """Print MAPE, R2 and MSE for a set of predictions.

    Parameters
    ----------
    y_pred_lr : array-like
        Values predicted by the model.
    y_test_clean : array-like
        Ground-truth target values, aligned with ``y_pred_lr``.

    Returns
    -------
    None
        The three scores are written to standard output.
    """
    # scikit-learn metrics take (y_true, y_pred) in that order. MAPE divides by the true
    # values and R2 measures variance around the true mean, so swapping the arguments
    # yields different (wrong) numbers; only MSE is symmetric.
    print("MAPE :", mean_absolute_percentage_error(y_test_clean, y_pred_lr))
    print("R2 :", r2_score(y_test_clean, y_pred_lr))
    print("MSE :", mean_squared_error(y_test_clean, y_pred_lr))

def train_test_models(X_train_transformed, y_train_completed, X_test=None, y_test=None):
    """Fit a set of baseline regressors and print their test metrics.

    For each model a banner with its name is printed, the model is fitted on the training
    data, predictions are made on the test features, and the scores are reported through
    ``print_metrics``.

    Parameters
    ----------
    X_train_transformed : array-like
        Preprocessed training features.
    y_train_completed : array-like
        Training target aligned with ``X_train_transformed``.
    X_test : array-like, optional
        Preprocessed test features. Defaults to the notebook-level
        ``X_test_transformed`` so existing two-argument calls keep working.
    y_test : array-like, optional
        Test target. Defaults to the notebook-level ``y_test_clean``.

    Returns
    -------
    None
    """
    # Fall back to the notebook globals the original implementation read implicitly
    if X_test is None:
        X_test = X_test_transformed
    if y_test is None:
        y_test = y_test_clean

    # (banner title, estimator) pairs, evaluated in this order.
    # The tree-based models get a fixed seed so reruns give the same scores,
    # matching the seed used for the XGBoost model elsewhere in the notebook.
    models = [
        ("Linear Regression", LinearRegression()),
        ("Ridge Regression", Ridge()),
        ("Lasso Regression", Lasso()),
        ("ElasticNet Regression", ElasticNet()),
        ("Decision Tree Regression", DecisionTreeRegressor(random_state=42)),
        ("Random Forest Regression", RandomForestRegressor(random_state=42)),
        ("Gradient Boosting Regression", GradientBoostingRegressor(random_state=42)),
    ]

    for title, model in models:
        print(f"\n -------------------- {title} -------------------- \n")
        model.fit(X_train_transformed, y_train_completed)
        # print_metrics takes the predictions first and the ground truth second
        print_metrics(model.predict(X_test), y_test)

# Fit the models on the transformed training features and the completed training target
train_test_models(X_train_transformed, y_train_completed)

In [None]:
# Train an XGBoost regressor (squared-error objective, fixed seed) wrapped in a one-step
# pipeline, then report its metrics on the test set
xbg_pipeline = Pipeline(
    steps=[
        ("XGB", xgb.XGBRegressor(random_state=42, objective='reg:squarederror')),
    ]
)
xbg_pipeline.fit(X_train_transformed, y_train_completed)

y_pred_xgb = xbg_pipeline.predict(X_test_transformed)
print_metrics(y_pred_xgb, y_test_clean)