In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model  import Ridge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
import os
from datetime import datetime
from scipy import stats
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import ElasticNet
import xgboost as xgb
import lightgbm as lgb
from sklearn.neighbors import KNeighborsRegressor

### REPENSANDO OS PASSOS DO PROCESSO

1. ~~Preparação do dataset~~
2. ~~Seleção dos atributos~~
3. ~~Avaliação dos possíveis algoritmos~~
4. Otimização dos Hiperparâmetros
5. Predição da base de teste e submissão

In [2]:
def data_preparation_1(df):
    # Pré-processando as variáveis
    df_prep = df.copy()
    df_prep['Compartments'] = df_prep['Compartments'].astype(int).astype(object)

    cat_columns = df_prep.select_dtypes(include=['object']).columns
    for column in cat_columns:
        # Mostrando a distribuição antes do preenchimento
        category_percent = df_prep[column].value_counts(normalize=True)
        
        # Preenchendo valores NaN com base nas proporções existentes entre as categorias
        n_nan = df_prep[column].isna().sum()
        categories = category_percent.index
        proportions = category_percent.values

        new_values = np.random.choice(categories, size=n_nan, p=proportions)

        df_prep.loc[df_prep[column].isna(), column] = new_values

    # Preenchendo valores NaN de 'Weight Capacity (kg)' com a média
    df_prep['Weight Capacity (kg)'] = df_prep['Weight Capacity (kg)'].fillna(df_prep['Weight Capacity (kg)'].mean())
    
    # Criando a coluna categórica 'weight_categorical'
    bins = [5, 10, 15, 20, 25, 31]
    labels = ['5-10', '10-15', '15-20', '20-25', '25-30']
    df_prep['weight_category'] = pd.cut(df_prep['Weight Capacity (kg)'], bins=bins, labels=labels, right=False).astype('object')

    # Dropando a coluna numérica 'Weight Capacity (kg)'
    df_prep = df_prep.drop(columns='Weight Capacity (kg)')
    
    # Engenharia de atributos: get dummies para TODAS variáveis categóricas
    df_eng = df_prep.copy()
    cat_columns = df_eng.select_dtypes(include=['object']).columns
    for column in cat_columns:
        df_eng = pd.get_dummies(df_eng, columns=[column], dtype=int)
    
    return df_eng

In [3]:
def avaliar_relevancia_colunas(df):
    colunas_nao_relevantes = []
    
    for column in df.columns:
        
        # Verifica se a coluna é categórica
        if df[column].dtype == 'int32' or df[column].dtype == 'int64':
            
            if df[column].nunique() == 2:
                # Teste de Mann-Whitney U para variáveis binárias
                grupo_1 = df[df[column] == df[column].unique()[0]]['Price']
                grupo_2 = df[df[column] == df[column].unique()[1]]['Price']
                u_stat, p_value = stats.mannwhitneyu(grupo_1, grupo_2)
                
                if p_value > 0.05:
                    colunas_nao_relevantes.append(column)                    

            # Caso a coluna seja multiclasse (mais de 2 categorias)
            else:
                # Teste de Kruskal-Wallis para variáveis multiclasse
                categorias = [df[df[column] == cat]['Price'] for cat in df[column].unique()]
                h_stat, p_value = stats.kruskal(*categorias)
                
                if p_value > 0.05:
                    colunas_nao_relevantes.append(column)       
            
    # Retorna a lista de colunas não relevantes
    return colunas_nao_relevantes

In [4]:
def avaliar_modelos(df, target_column, frac=0.1, random_state=42):
    # Passo 1: Pegar um sample do dataset
    df_sample = df.sample(frac=frac, random_state=random_state)
    
    # Passo 2: Separar as variáveis independentes e o target
    X = df_sample.drop(columns=[target_column])
    y = df_sample[target_column]
    
    # Lista de modelos a serem testados
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest Regressor': RandomForestRegressor(random_state=random_state),
        'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=random_state),
        'CatBoost Regressor': CatBoostRegressor(iterations=1000, task_type='CPU', random_seed=random_state, verbose=False),
        'Ridge Regression': Ridge(),
        'ElasticNet Regression': ElasticNet(),
        'XGBoost Regressor': xgb.XGBRegressor(n_estimators=100, random_state=random_state),
        'LightGBM Regressor': lgb.LGBMRegressor(n_estimators=100, random_state=random_state)
    }
    
    # Passo 3: Testar os modelos com Cross-Validation e calcular o RMSE
    results = []
    for model_name, model in models.items():
        # Realiza a validação cruzada (K-Fold Cross-Validation)
        cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')  # Usamos MSE negativo para maximizar o score
        
        # Converte de volta para o RMSE
        rmse_scores = np.sqrt(-cv_scores)  # O valor retornado é negativo, por isso precisamos inverter
        
        # Calcula a média e o desvio padrão dos RMSEs
        mean_rmse = np.mean(rmse_scores)
        std_rmse = np.std(rmse_scores)
        
        results.append((model_name, mean_rmse, std_rmse))
    
    # Passo 4: Printar o RMSE médio e o desvio padrão de cada algoritmo
    print("RMSE Médio e Desvio Padrão por Algoritmo (Cross-Validation):")
    for model_name, mean_rmse, std_rmse in results:
        print(f"{model_name}: {mean_rmse:.5f}")

In [5]:
# Carregamento do dataset de treino e o extra
df_train = pd.read_csv("../data/train.csv")
df_extra = pd.read_csv("../data/training_extra.csv")
df = pd.concat([df_train, df_extra], axis=0)
df = df.drop(columns='id')

# Preparação do dataset
df_prep = data_preparation_1(df)

# Avaliar a relevância das colunas para o target e excluir as colunas sem relevância
cols_to_exclude = avaliar_relevancia_colunas(df_prep)
df_prep = df_prep.drop(columns=cols_to_exclude)

# Testar os algoritmos de ML
avaliar_modelos(df_prep, 'Price', 0.1)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005005 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 319545, number of used features: 33
[LightGBM] [Info] Start training from score 81.471151
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003130 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 319545, number of used features: 33
[LightGBM] [Info] Start training from score 81.391124
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004586 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

### OTIMIZAR OS HIPERPARÂMETROS

In [7]:
# Passo 1: Pegar um sample do dataset
df_sample = df_prep.sample(frac=0.1, random_state=42)
    
# Passo 2: Separar as variáveis independentes e o target
X = df_sample.drop(columns=['Price'])
y = df_sample['Price']

In [8]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Definir o modelo
ridge = Ridge()

# Definir o espaço de busca de hiperparâmetros
param_dist = {
    'alpha': np.logspace(-6, 6, 13),
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'saga']
}

# Configurar o RandomizedSearchCV
random_search = RandomizedSearchCV(ridge, param_distributions=param_dist, n_iter=100, cv=5, n_jobs=-1, verbose=2)

# Ajustar o modelo
random_search.fit(X, y)

# Ver o melhor modelo
print(random_search.best_params_)




Fitting 5 folds for each of 65 candidates, totalling 325 fits
{'solver': 'lsqr', 'alpha': 1000.0}


In [9]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Definir o modelo LightGBM
lgb_regressor = lgb.LGBMRegressor()

# Definir o espaço de hiperparâmetros para busca aleatória
param_dist = {
    'num_leaves': [31, 50, 100, 200],  # Número máximo de folhas
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Taxa de aprendizado
    'n_estimators': [100, 200, 300, 500],  # Número de árvores (estágios)
    'max_depth': [-1, 5, 10, 20],  # Profundidade máxima das árvores
    'min_child_samples': [20, 30, 50],  # Número mínimo de amostras para dividir um nó
    'subsample': [0.6, 0.8, 1.0],  # Subamostragem dos dados para cada árvore
    'colsample_bytree': [0.6, 0.8, 1.0],  # Subamostragem das features
    'reg_alpha': [0, 0.1, 0.5, 1],  # Regularização L1
    'reg_lambda': [0, 0.1, 0.5, 1]  # Regularização L2
}

# Configurar o RandomizedSearchCV com 5 dobras de validação cruzada
random_search = RandomizedSearchCV(
    estimator=lgb_regressor,
    param_distributions=param_dist,
    n_iter=100,  # Número de tentativas aleatórias
    cv=5,  # Validação cruzada com 5 dobras
    n_jobs=-1,  # Usar todos os núcleos da CPU
    verbose=2,  # Mostrar progresso
    random_state=42  # Semente para reprodutibilidade
)

# Ajustar o modelo aos dados (X_train e y_train são seus dados de treino)
random_search.fit(X, y)

# Mostrar os melhores parâmetros encontrados
print("Melhores parâmetros encontrados: ", random_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003541 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 399432, number of used features: 33
[LightGBM] [Info] Start training from score 81.425188
Melhores parâmetros encontrados:  {'subsample': 1.0, 'reg_lambda': 1, 'reg_alpha': 0, 'num_leaves': 50, 'n_estimators': 300, 'min_child_samples': 20, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.6}


In [11]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Definir o modelo Gradient Boosting Regressor
gbr_regressor = GradientBoostingRegressor()

# Definir o espaço de hiperparâmetros para busca aleatória
param_dist = {
    'n_estimators': [100, 200, 300, 500],  # Número de árvores (estágios)
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Taxa de aprendizado
    'max_depth': [3, 5, 7, 10],  # Profundidade máxima das árvores
    'min_samples_split': [2, 5, 10],  # Número mínimo de amostras para dividir um nó
    'min_samples_leaf': [1, 2, 4],  # Número mínimo de amostras por folha
    'subsample': [0.7, 0.8, 0.9, 1.0],  # Fração de amostras para cada árvore
    'max_features': ['auto', 'sqrt', 'log2', None],  # Número máximo de features para dividir um nó
    'loss': ['ls', 'lad', 'huber']  # Função de perda a ser usada
}

# Configurar o RandomizedSearchCV com 5 dobras de validação cruzada
random_search = RandomizedSearchCV(
    estimator=gbr_regressor,
    param_distributions=param_dist,
    n_iter=100,  # Número de tentativas aleatórias
    cv=5,  # Validação cruzada com 5 dobras
    n_jobs=-1,  # Usar todos os núcleos da CPU
    verbose=2,  # Mostrar progresso
    random_state=42,  # Semente para reprodutibilidade
    scoring='neg_mean_squared_error',  # Escolher a métrica de avaliação
    refit=True  # Refit no melhor modelo
)

# Ajustar o modelo aos dados de treino (X_train e y_train)
random_search.fit(X, y)

# Mostrar os melhores parâmetros encontrados
print("Melhores parâmetros encontrados: ", random_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


405 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\julia\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\julia\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "C:\Users\julia\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\Lo

Melhores parâmetros encontrados:  {'subsample': 0.9, 'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 5, 'loss': 'huber', 'learning_rate': 0.01}


### GERANDO NOVOS ARQUIVOS DE SUBMISSÃO COM BASE NOS ALGORITMOS TUNADOS

In [15]:
def csv_create(y_pred, ids):
      
    # Creating the submission DataFrame with 'id' and 'Transported'
    submission_data = pd.DataFrame({
        'id': ids,
        'Price': y_pred
    })
    
    # Checking for the 'submission' folder
    current_directory = os.getcwd()
    all_items = os.listdir(current_directory)
    folders = [item for item in all_items if os.path.isdir(os.path.join(current_directory, item))]
    
    # Checking if the 'submissions' folder exists, if not, create it
    if 'submissions' not in folders:
        submission_folder = os.path.join(current_directory, 'submissions')
        os.makedirs(submission_folder)
        
    # Getting the current date and time to create a unique filename
    now = datetime.now()
    filename = now.strftime("submission_%d_%m_%y_%H_%M.csv")
    
    # Saving the CSV in the 'submissions' folder with the date and time in the filename
    submission_data.to_csv(f'submissions/{filename}', index=False)
    
    print(f"File '{filename}' has been created in the 'submissions' folder!")

In [17]:
# Carregamento do dataset de treino e o extra
df_train = pd.read_csv("../data/train.csv")
df_extra = pd.read_csv("../data/training_extra.csv")
df = pd.concat([df_train, df_extra], axis=0)
df = df.drop(columns='id')

# Preparando os dados de treino
df_prep = data_preparation_1(df)

# Separando features do target
X = df_prep.drop(columns='Price')
y = df_prep['Price']

# Treinando modelo
ridge = Ridge(alpha=1000, solver='lsqr')
ridge.fit(X, y)

# Carregando o dataset de teste
df_test = pd.read_csv("../data/test.csv")

# Guardando os ids para submissão
ids = df_test['id']
ids
df_test = df_test.drop(columns='id')

# Preparando dados de teste
df_test_prep = data_preparation_1(df_test)

# Prevendo os resultados
y_pred = ridge.predict(df_test_prep)

# Criando o arquivo '.csv' para submissão
csv_create(y_pred, ids)

File 'submission_24_02_25_11_44.csv' has been created in the 'submissions' folder!


In [18]:
# Carregamento do dataset de treino e o extra
df_train = pd.read_csv("../data/train.csv")
df_extra = pd.read_csv("../data/training_extra.csv")
df = pd.concat([df_train, df_extra], axis=0)
df = df.drop(columns='id')

# Preparando os dados de treino
df_prep = data_preparation_1(df)

# Separando features do target
X = df_prep.drop(columns='Price')
y = df_prep['Price']

# Treinando modelo
light = lgb.LGBMRegressor(
    subsample=1.0,
    reg_lambda=1,
    reg_alpha=0,
    num_leaves=50,
    n_estimators=300,
    min_child_samples=20,
    max_depth=5,
    learning_rate=0.01,
    colsample_bytree=0.6
)
light.fit(X, y)

# Carregando o dataset de teste
df_test = pd.read_csv("../data/test.csv")

# Guardando os ids para submissão
ids = df_test['id']
ids
df_test = df_test.drop(columns='id')

# Preparando dados de teste
df_test_prep = data_preparation_1(df_test)

# Prevendo os resultados
y_pred = light.predict(df_test_prep)

# Criando o arquivo '.csv' para submissão
csv_create(y_pred, ids)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034498 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 80
[LightGBM] [Info] Number of data points in the train set: 3994318, number of used features: 40
[LightGBM] [Info] Start training from score 81.362175
File 'submission_24_02_25_11_48.csv' has been created in the 'submissions' folder!


In [19]:
# Carregamento do dataset de treino e o extra
df_train = pd.read_csv("../data/train.csv")
df_extra = pd.read_csv("../data/training_extra.csv")
df = pd.concat([df_train, df_extra], axis=0)
df = df.drop(columns='id')

# Preparando os dados de treino
df_prep = data_preparation_1(df)

# Separando features do target
X = df_prep.drop(columns='Price')
y = df_prep['Price']

# Treinando modelo
gbr = GradientBoostingRegressor(
    subsample=0.9,
    n_estimators=500,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features='sqrt',
    max_depth=5,
    loss='huber',
    learning_rate=0.01
)

gbr.fit(X, y)

# Carregando o dataset de teste
df_test = pd.read_csv("../data/test.csv")

# Guardando os ids para submissão
ids = df_test['id']
ids
df_test = df_test.drop(columns='id')

# Preparando dados de teste
df_test_prep = data_preparation_1(df_test)

# Prevendo os resultados
y_pred = gbr.predict(df_test_prep)

# Criando o arquivo '.csv' para submissão
csv_create(y_pred, ids)

File 'submission_24_02_25_12_16.csv' has been created in the 'submissions' folder!
