In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy.stats import shapiro
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
import itertools
import os
from datetime import datetime

# Disables all deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

### CARREGANDO OS DATASETS E CONCATENANDO-OS

In [2]:
df_1 = pd.read_csv("../data/train.csv", index_col='id')
df_2 = pd.read_csv("../data/training_extra.csv", index_col='id')
df = pd.concat([df_1, df_2], axis=0)
df

Unnamed: 0_level_0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.643760,39.17320
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.937220,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312
...,...,...,...,...,...,...,...,...,...,...
4194313,Nike,Canvas,,3.0,Yes,Yes,Messenger,Blue,28.098120,104.74460
4194314,Puma,Leather,Small,10.0,Yes,Yes,Tote,Blue,17.379531,122.39043
4194315,Jansport,Canvas,Large,10.0,No,No,Backpack,Red,17.037708,148.18470
4194316,Puma,Canvas,,2.0,No,No,Backpack,Gray,28.783339,22.32269


### TRANSFORMANDO 'COMPARTMENTS' EM CATEGÓRICA

In [3]:
df_prep = df.copy()
df_prep['Compartments'] = df_prep['Compartments'].astype(int).astype(object)
df_prep.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3994318 entries, 0 to 4194317
Data columns (total 10 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Brand                 object 
 1   Material              object 
 2   Size                  object 
 3   Compartments          object 
 4   Laptop Compartment    object 
 5   Waterproof            object 
 6   Style                 object 
 7   Color                 object 
 8   Weight Capacity (kg)  float64
 9   Price                 float64
dtypes: float64(2), object(8)
memory usage: 335.2+ MB


### PREENCHENDO OS VALORES NAN COM A MESMA DISTRIBUIÇÃO DAS CATEGORIAS

In [4]:
cat_columns = ['Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']

for column in cat_columns:
    # Mostrando a distribuição antes do preenchimento
    category_percent = df_prep[column].value_counts(normalize=True)
    print("Distriubuição antes do preenchimento")
    print(category_percent * 100)
    print("-------------------------------------------")

    # Preenchendo valores NaN com base nas proporções existentes entre as categorias
    n_nan = df_prep[column].isna().sum()
    categories = category_percent.index
    proportions = category_percent.values

    new_values = np.random.choice(categories, size=n_nan, p=proportions)

    df_prep.loc[df_prep[column].isna(), column] = new_values

    # Mostrando a distribuição depois do preenchimento
    category_percent = df_prep[column].value_counts(normalize=True)
    print("Distriubuição depois do preenchimento")
    print(category_percent * 100)
    print("-------------------------------------------")

Distriubuição antes do preenchimento
Brand
Under Armour    20.711637
Adidas          20.607308
Nike            19.764580
Puma            19.541468
Jansport        19.375006
Name: proportion, dtype: float64
-------------------------------------------
Distriubuição depois do preenchimento
Brand
Under Armour    20.712647
Adidas          20.606221
Nike            19.763799
Puma            19.544463
Jansport        19.372869
Name: proportion, dtype: float64
-------------------------------------------
Distriubuição antes do preenchimento
Material
Polyester    27.318690
Leather      25.137690
Nylon        24.274262
Canvas       23.269358
Name: proportion, dtype: float64
-------------------------------------------
Distriubuição depois do preenchimento
Material
Polyester    27.312297
Leather      25.142665
Nylon        24.275333
Canvas       23.269705
Name: proportion, dtype: float64
-------------------------------------------
Distriubuição antes do preenchimento
Size
Medium    34.672355
Large 

### PREENCHENDO 'WEIGHT CAPACITY (KG)' COM A MÉDIA

In [5]:
df_prep['Weight Capacity (kg)'] = df_prep['Weight Capacity (kg)'].fillna(df_prep['Weight Capacity (kg)'].mean())
df_prep.isna().sum()

Brand                   0
Material                0
Size                    0
Compartments            0
Laptop Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight Capacity (kg)    0
Price                   0
dtype: int64

### ENGENHARIA DE ATRIBUTOS: TRANSFORMAR 'SIZE' EM NUMÉRICA ORDINAL

In [6]:
df_eng_1 = df_prep.copy()

# Criando um dicionário para fazer a substituição
size_mapping = {
    'Small': 1,
    'Medium': 2,
    'Large': 3
}

# Substituindo os valores da coluna 'size'
df_eng_1['Size'] = df_eng_1['Size'].replace(size_mapping)

### ENGENHARIA DE ATRIBUTOS: TRANSFORMANDO AS VARIÁVEIS BOOLEAAS EM 0 E 1

In [7]:
df_eng_2 = df_eng_1.copy()

df_eng_2['Laptop Compartment'] = df_eng_2['Laptop Compartment'].replace({'Yes': 1, 'No': 0})
df_eng_2['Waterproof'] = df_eng_2['Waterproof'].replace({'Yes': 1, 'No': 0})

### ENGENHARIA DE ATRIBUTOS: TRANSFORMANDO 'COMPARTMENTS' EM INTEGER

In [8]:
df_eng_3 = df_eng_2.copy()
df_eng_3['Compartments'] = df_eng_3['Compartments'].astype(int)

In [9]:
df_eng_3

Unnamed: 0_level_0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Jansport,Leather,2,7,1,0,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,1,10,1,1,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,1,2,1,0,Messenger,Red,16.643760,39.17320
3,Nike,Nylon,1,8,1,0,Messenger,Green,12.937220,80.60793
4,Adidas,Canvas,2,1,1,1,Messenger,Green,17.749338,86.02312
...,...,...,...,...,...,...,...,...,...,...
4194313,Nike,Canvas,1,3,1,1,Messenger,Blue,28.098120,104.74460
4194314,Puma,Leather,1,10,1,1,Tote,Blue,17.379531,122.39043
4194315,Jansport,Canvas,3,10,0,0,Backpack,Red,17.037708,148.18470
4194316,Puma,Canvas,2,2,0,0,Backpack,Gray,28.783339,22.32269


### ENGENHARIA DE ATRIBUTOS: APLICANDO O ONE HOT ENCODING NAS VARIÁVEIS CATEGÓRICAS

In [10]:
df_eng_4 = df_eng_3.copy()

cat_columns = df_eng_4.select_dtypes(include=['object']).columns
for column in cat_columns:
    df_eng_4 = pd.get_dummies(df_eng_4, columns=[column], prefix=['dum'], dtype=int)


### ENGENHARIA DE ATRIBUTOS: NORMALIZAR OS VALORES DA COLUNA 'WEIGHT CAPACITY (KG)'

In [11]:
df_eng_5 = df_eng_4.copy()

scaler = MinMaxScaler()
df_eng_5['Weight Capacity (kg)'] = scaler.fit_transform(df_eng_5[['Weight Capacity (kg)']])
df_eng_5

Unnamed: 0_level_0,Size,Compartments,Laptop Compartment,Waterproof,Weight Capacity (kg),Price,dum_Adidas,dum_Jansport,dum_Nike,dum_Puma,...,dum_Polyester,dum_Backpack,dum_Messenger,dum_Tote,dum_Black,dum_Blue,dum_Gray,dum_Green,dum_Pink,dum_Red
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,7,1,0,0.264469,112.15875,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
1,1,10,1,1,0.883141,68.88056,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
2,1,2,1,0,0.465750,39.17320,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,1,8,1,0,0.317489,80.60793,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0
4,2,1,1,1,0.509974,86.02312,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4194313,1,3,1,1,0.923925,104.74460,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0
4194314,1,10,1,1,0.495181,122.39043,0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
4194315,3,10,0,0,0.481508,148.18470,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
4194316,2,2,0,0,0.951334,22.32269,0,0,0,1,...,0,1,0,0,0,0,1,0,0,0


### TREINAMENTO DOS MODELOS

In [12]:
df_train = df_eng_5.copy()

# Dividindo as variáveis independentes (X) e o target (y)
X = df_train.drop('Price', axis=1)  # Considerando 'Price' como o target
y = df_train['Price']

# Dividindo o dataset em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dicionário de modelos a serem testados
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso()
}

# Função para calcular o RMSE
def calculate_rmse(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return rmse

# Testando cada modelo e imprimindo o RMSE
for name, model in models.items():
    rmse = calculate_rmse(model, X_train, X_test, y_train, y_test)
    print(f'{name} - RMSE: {rmse:.4f}')

Linear Regression - RMSE: 38.8975
Ridge - RMSE: 38.8975
Lasso - RMSE: 38.9157


In [13]:
# Defina os parâmetros a serem afinados
ridge_params = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Defina os modelos
ridge = Ridge()

# GridSearch para Ridge
ridge_search = GridSearchCV(ridge, ridge_params, cv=5, scoring='neg_root_mean_squared_error', verbose=1)
ridge_search.fit(X_train, y_train)
print("Melhores parâmetros para Ridge:", ridge_search.best_params_)
print("Melhor RMSE para Ridge:", -ridge_search.best_score_)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Melhores parâmetros para Ridge: {'alpha': 100}
Melhor RMSE para Ridge: 38.92473251856673


In [14]:
# Defina os parâmetros a serem afinados
lasso_params = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Defina os modelos
lasso = Lasso()

# GridSearch para Lasso
lasso_search = GridSearchCV(lasso, lasso_params, cv=5, scoring='neg_root_mean_squared_error', verbose=1)
lasso_search.fit(X_train, y_train)
print("Melhores parâmetros para Lasso:", lasso_search.best_params_)
print("Melhor RMSE para Lasso:", -lasso_search.best_score_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Melhores parâmetros para Lasso: {'alpha': 0.01}
Melhor RMSE para Lasso: 38.92484300700449


In [15]:
ridge_tuned = Ridge(alpha=100)
ridge_tuned.fit(X_train, y_train)
y_pred = ridge_tuned.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Ridge: {rmse:.4f}")

Ridge: 38.8975


In [16]:
lasso_tuned = Lasso(alpha=0.01)
lasso_tuned.fit(X_train, y_train)
y_pred = lasso_tuned.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Lasso: {rmse:.4f}")

Lasso: 38.8975


### CRIAR OS ARQUIVOS DE SUBMISSÃO, UTILIZANDO OS DOIS TRATAMENTOS DAS FEATURES

### 1. DUMMIES PARA TODAS AS VARIÁVEIS CATEGÓRICAS

In [2]:
def data_preparation(df):
    # Pré-processando as variáveis
    df_prep = df.copy()
    df_prep['Compartments'] = df_prep['Compartments'].astype(int).astype(object)

    cat_columns = ['Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
    for column in cat_columns:
        # Mostrando a distribuição antes do preenchimento
        category_percent = df_prep[column].value_counts(normalize=True)
        
        # Preenchendo valores NaN com base nas proporções existentes entre as categorias
        n_nan = df_prep[column].isna().sum()
        categories = category_percent.index
        proportions = category_percent.values

        new_values = np.random.choice(categories, size=n_nan, p=proportions)

        df_prep.loc[df_prep[column].isna(), column] = new_values

    df_prep['Weight Capacity (kg)'] = df_prep['Weight Capacity (kg)'].fillna(df_prep['Weight Capacity (kg)'].mean())

    # Engenharia de atributos: get dummies para TODAS variáveis categóricas
    df_eng = df_prep.copy()
    cat_columns = df_eng.select_dtypes(include=['object']).columns
    for column in cat_columns:
        df_eng = pd.get_dummies(df_eng, columns=[column], prefix=['dum'], dtype=int)

    scaler = MinMaxScaler()
    df_eng['Weight Capacity (kg)'] = scaler.fit_transform(df_eng[['Weight Capacity (kg)']])

    return df_eng

def model_training(df_eng):
    # Dividindo as variáveis independentes (X) e o target (y)
    X = df_eng.drop('Price', axis=1)
    y = df_eng['Price']

    ridge = Ridge(alpha=100)
    ridge.fit(X, y)

    return ridge

def csv_create(y_pred, ids):
      
    # Creating the submission DataFrame with 'id' and 'Transported'
    submission_data = pd.DataFrame({
        'id': ids,
        'Price': y_pred
    })
    
    # Checking for the 'submission' folder
    current_directory = os.getcwd()
    all_items = os.listdir(current_directory)
    folders = [item for item in all_items if os.path.isdir(os.path.join(current_directory, item))]
    
    # Checking if the 'submissions' folder exists, if not, create it
    if 'submissions' not in folders:
        submission_folder = os.path.join(current_directory, 'submissions')
        os.makedirs(submission_folder)
        
    # Getting the current date and time to create a unique filename
    now = datetime.now()
    filename = now.strftime("submission_%d_%m_%y_%H_%M.csv")
    
    # Saving the CSV in the 'submissions' folder with the date and time in the filename
    submission_data.to_csv(f'submissions/{filename}', index=False)
    
    print(f"File '{filename}' has been created in the 'submissions' folder!")

In [3]:
# Carregando os datasets para o treinamento e concantenando-os
df_1 = pd.read_csv("../data/train.csv")
df_2 = pd.read_csv("../data/training_extra.csv")
df = pd.concat([df_1, df_2], axis=0)

# Preparando os dados de treino
df_prep = data_preparation(df)
df_prep = df_prep.drop(columns='id')

# Treinando o modelo
model = model_training(df_prep)

# Carregando o data set de teste
df_test = pd.read_csv("../data/test.csv")

# Guardando os 'ids'
ids = df_test['id']

# Preparando os dados de teste
df_test_prep = data_preparation(df_test)
df_to_predict = df_test_prep.drop(columns='id')

# Prevendo os valores de 'Price' no dataset de teste
y_pred = model.predict(df_to_predict)

# Criando o arquivo '.csv' para submissão
csv_create(y_pred, ids)


File 'submission_14_02_25_15_20.csv' has been created in the 'submissions' folder!


### 2. 'SIZE' TRANSFORMADA EM NUMÉRICA ORIDNAL E 'LAPTOP COMPARTMENT' E 'WATERPROOF' COM TRATAMENTOS ESPECÍFICOS

In [4]:
def data_preparation_2(df):
    # Pré-processando as variáveis
    df_prep = df.copy()
    df_prep['Compartments'] = df_prep['Compartments'].astype(int).astype(object)

    cat_columns = ['Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
    for column in cat_columns:
        # Mostrando a distribuição antes do preenchimento
        category_percent = df_prep[column].value_counts(normalize=True)
        
        # Preenchendo valores NaN com base nas proporções existentes entre as categorias
        n_nan = df_prep[column].isna().sum()
        categories = category_percent.index
        proportions = category_percent.values

        new_values = np.random.choice(categories, size=n_nan, p=proportions)

        df_prep.loc[df_prep[column].isna(), column] = new_values

    df_prep['Weight Capacity (kg)'] = df_prep['Weight Capacity (kg)'].fillna(df_prep['Weight Capacity (kg)'].mean())

    # Engenharia de Atributos:
    df_eng = df_prep.copy()

    # Engenharia de Atributos: transformando 'Size' em numérica ordinal
    size_mapping = {
        'Small': 1,
        'Medium': 2,
        'Large': 3
    }
    df_eng['Size'] = df_eng['Size'].replace(size_mapping)
    
    # Engenharia de Atributos: convertendo 'Yes' e 'No' em 1 e 0 nas variáveis 'Laptop Compartment' e 'Waterproof'
    df_eng['Laptop Compartment'] = df_eng['Laptop Compartment'].replace({'Yes': 1, 'No': 0})
    df_eng['Waterproof'] = df_eng['Waterproof'].replace({'Yes': 1, 'No': 0})


    # Engenharia de atributos: get dummies para as variáveis categóricas
    cat_columns = df_eng.select_dtypes(include=['object']).columns
    for column in cat_columns:
        df_eng = pd.get_dummies(df_eng, columns=[column], prefix=['dum'], dtype=int)

    # Engenharia de atributos: normalizar os valores da colna 'Weight Capacity (kg)'
    scaler = MinMaxScaler()
    df_eng['Weight Capacity (kg)'] = scaler.fit_transform(df_eng[['Weight Capacity (kg)']])

    return df_eng

In [60]:
# Carregando os datasets para o treinamento e concantenando-os
df_1 = pd.read_csv("../data/train.csv")
df_2 = pd.read_csv("../data/training_extra.csv")
df = pd.concat([df_1, df_2], axis=0)

# Preparando os dados de treino
df_prep = data_preparation_2(df)
df_prep = df_prep.drop(columns='id')

# Treinando o modelo
model = model_training(df_prep)

# Carregando o data set de teste
df_test = pd.read_csv("../data/test.csv")

# Guardando os 'ids'
ids = df_test['id']

# Preparando os dados de teste
df_test_prep = data_preparation_2(df_test)
df_to_predict = df_test_prep.drop(columns='id')

# Prevendo os valores de 'Price' no dataset de teste
y_pred = model.predict(df_to_predict)

# Criando o arquivo '.csv' para submissão
csv_create(y_pred, ids)

File 'submission_14_02_25_10_07.csv' has been created in the 'submissions' folder!


### A ESTRATÉGIA AGORA É REDUZIR O TAMANHO DO DATASET PARA VER SE OUTROS ALGORITMOS TRAZEM RESULTADOS MAIS INTERESSATES

In [6]:
# Carregando os datasets para o treinamento e concantenando-os
df_1 = pd.read_csv("../data/train.csv")
df_2 = pd.read_csv("../data/training_extra.csv")
df = pd.concat([df_1, df_2], axis=0)

# Tirando uma amostra do dataset
df_sample = df.sample(frac=0.1, random_state=42)

# Preparando os dados de treino
df_prep = data_preparation_2(df)
df_prep = df_prep.drop(columns='id')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import numpy as np

# Dividindo as variáveis independentes (X) e o target (y)
X = df_prep.drop('Price', axis=1)  # Considerando 'Price' como o target
y = df_prep['Price']

# Dividindo o dataset em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dicionário de modelos a serem testados
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Support Vector Regressor': SVR(),
    'Gradient Boosting': GradientBoostingRegressor()
}

# Função para calcular o RMSE
def calculate_rmse(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return rmse

# Testando cada modelo e imprimindo o RMSE
for name, model in models.items():
    rmse = calculate_rmse(model, X_train, X_test, y_train, y_test)
    print(f'{name}: {rmse:.4f}')


Linear Regression: 38.8948
Ridge: 38.8948
Lasso: 38.9157
K-Nearest Neighbors: 42.5974
Decision Tree: 55.6676
Random Forest: 40.4343
