In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, ShuffleSplit, KFold, cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error

## 1. Obtenção de Dados

In [2]:
df = pd.read_csv('../data/raw/train.csv')
df_dict = pd.read_csv("../data/external/dictionary.csv")
df_dict

Unnamed: 0,variavel,descricao,tipo,subtipo
0,Survival,"Sobrevivente 0 = No, 1 = Yes",qualitativa,nominal
1,Pclass,"Ticket Class 1 = 1st, 2 = 2nd, 3 = 3rd",qualitativa,ordinal
2,Sex,Gender,qualitativa,nominal
3,Age,Age in years,quantitativa,contínua
4,Sibsp,# of siblings / spouses aboard the Titanic,qualitativa,nominal
5,Parch,# of parents / children aboard the Titanic,qualitativa,nominal
6,Ticket,Ticket Number,qualitativa,nominal
7,Fare,Passenger fare,quantitiva,discreta
8,Cabin,Cabin Number,qualitativa,ordinal
9,Embarked,Port of Embarkation,qualitativa,nominal


2. Tratamento de dados Faltantes

In [3]:
print(f'A quantidades de dado null é: {df.isnull().sum().sum()}')
print(f'A quantidades de dado NAN é: {df.isna().sum().sum()}')

A quantidades de dado null é: 866
A quantidades de dado NAN é: 866


In [4]:
target_column = 'Survived'
target_variable = target_column
useless_variables =  (
    df_dict
    .query("tipo == 'Inutil'")
    .variavel
    .to_list()
)
nominal_variables = (
    df_dict
    .query("subtipo == 'Nominal' and variavel != @target_variable")
    .variavel
    .to_list()
)
ordinal_variables = (
    df_dict
    .query("subtipo == 'Ordinal' and variavel != @target_variable")
    .variavel
    .to_list()
)
continuous_variables = (
    df_dict
    .query("subtipo == 'Continua' and variavel != @target_variable")
    .variavel
    .to_list()
)
discrete_variables = (
    df_dict
    .query("subtipo == 'Discreta' and variavel != @target_variable")
    .variavel
    .to_list()
)

X = df.drop(columns=[target_variable] + useless_variables)
y = df[target_variable]

## 3. Pre processamento

In [5]:
# Definindo os transformadores para cada tipo de variável
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Criando o ColumnTransformer para aplicar as transformações
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_variables + discrete_variables),
        ('cat', categorical_transformer, nominal_variables + ordinal_variables)
    ]
)

# Ajustando e transformando os dados
X_preprocessed = preprocessor.fit_transform(X)

## 4. Modelos

In [10]:
X = df.drop(columns=[target_variable] + useless_variables)
y = df[target_variable]

In [11]:
models = [DummyRegressor(strategy='mean'), LinearRegression(), KNeighborsRegressor(n_neighbors=5), SVR()]
metrics = [
    'neg_mean_absolute_error',
    'neg_mean_squared_error',
    'neg_mean_absolute_percentage_error',
    'r2',
]
monte_carlo = ShuffleSplit(n_splits=10, test_size=.3, random_state=42)
# hold_out = ShuffleSplit(n_splits=1, test_size=.3, random_state=42)
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [12]:
results_total = None
for model in models:
    model_name = model.__class__.__name__
    print(f"rodando para o modelo: {model_name}")
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])

    scores = cross_validate(
        approach, X, y,
        scoring=metrics,
        cv=monte_carlo
    )
    results_model = pd.DataFrame(scores)
    results_model['model'] = model_name
    if results_total is None:
        results_total = results_model
    else:
        results_total = pd.concat([results_total, results_model])

rodando para o modelo: DummyRegressor
rodando para o modelo: LinearRegression


ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\myle_\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\myle_\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\myle_\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\myle_\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\myle_\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_base.py", line 609, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\myle_\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\myle_\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 1301, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\myle_\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py", line 1096, in check_array
    raise ValueError(
ValueError: Found array with 0 feature(s) (shape=(623, 0)) while a minimum of 1 is required by LinearRegression.


In [13]:
results_total.groupby('model').agg(['mean', 'std']).T

Unnamed: 0,model,DummyRegressor
fit_time,mean,0.004816246
fit_time,std,0.00242038
score_time,mean,0.005108261
score_time,std,0.00193634
test_neg_mean_absolute_error,mean,-0.4745706
test_neg_mean_absolute_error,std,0.004181112
test_neg_mean_squared_error,mean,-0.2388608
test_neg_mean_squared_error,std,0.007157341
test_neg_mean_absolute_percentage_error,mean,-1047833000000000.0
test_neg_mean_absolute_percentage_error,std,85207750000000.0
