# Análise comparativa de modelos

In [74]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

## 1. Obtenção de dados

In [75]:
df = pd.read_csv("../data/raw/data.csv")
display(df.head())
display(df.describe())

df_dict = pd.read_csv("../data/external/dictionary.csv")
display(df_dict)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


Unnamed: 0,variavel,descricao,tipo,subtipo
0,survived,Sobreviventes ou não do passageiro,quantitativa,discreta
1,pclass,Número da classe do passageiro,qualitativa,ordinal
2,sex,Sexo do passageiro,qualitativa,nominal
3,age,Idade do passageiro,quantitativa,discreta
4,sibsp,Quantidade de irmãos/cônjuges a bordo,quantitativa,discreta
5,parch,Quantidade de pais / crianças a bordo,quantitativa,discreta
6,fare,Tarifa paga pelo passageiro,quantitativa,discreta
7,embarked,Sigla do porto de Embarque,qualitativa,nominal
8,class,Nome da classe do passageiro,qualitativa,ordinal
9,who,Classificação do passageiro,qualitativa,nominal


## 2.Tratamento dos dados

In [76]:
#Número de dados faltantes
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [77]:
# Modelos, quantitativa
imputer = IterativeImputer(estimator=LinearRegression())
imputer.fit(df[quantitative_variables])
df[quantitative_variables] = imputer.transform(df[quantitative_variables])
display(df.describe())

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.638277,0.523008,0.381594,32.204208
std,0.486592,0.836071,13.450228,1.102743,0.806057,49.693429
min,0.0,1.0,-5.383199,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.9104
50%,0.0,3.0,30.0,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [78]:
# Imputação, qualitativa
imputer = SimpleImputer(strategy='constant', fill_value="MISSING")
imputer.fit(df[qualitative_variables])
df[qualitative_variables] = imputer.transform(df[qualitative_variables])
display(df['deck'].describe())

count         891
unique          8
top       MISSING
freq          688
Name: deck, dtype: object

In [79]:
#Número atual de dados faltantes
df.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

## 3. Limpeza de Dados

In [80]:
def tratar_outliers(df):
    colunas_numericas = df.select_dtypes(include=[np.number]).columns
    
    for coluna in colunas_numericas:
        Q1 = df[coluna].quantile(0.25)
        Q3 = df[coluna].quantile(0.75)
        IQR = Q3 - Q1
        
        df[coluna] = np.where((df[coluna] < (Q1 - 1.5 * IQR)) | (df[coluna] > (Q3 + 1.5 * IQR)), np.nan, df[coluna])
        df[coluna].fillna(df[coluna].median(), inplace=True)

    return df

df_cleaned = tratar_outliers(df)

print(df_cleaned.head())

   survived pclass     sex   age  sibsp  parch    fare embarked  class    who  \
0       0.0      3    male  22.0    1.0    0.0   7.250        S  Third    man   
1       1.0      1  female  38.0    1.0    0.0  13.000        C  First  woman   
2       1.0      3  female  26.0    0.0    0.0   7.925        S  Third  woman   
3       1.0      1  female  35.0    1.0    0.0  53.100        S  First  woman   
4       0.0      3    male  35.0    0.0    0.0   8.050        S  Third    man   

  adult_male     deck  embark_town alive  alone  
0       True  MISSING  Southampton    no  False  
1      False        C    Cherbourg   yes  False  
2      False  MISSING  Southampton   yes   True  
3      False        C  Southampton   yes  False  
4       True  MISSING  Southampton    no   True  


## 4. Preparação de dados

Normalização, codificação e o tratamento de dados discrepantes e/ou faltantes.

In [81]:
target_variable = 'survived'
quantitative_variables = (
    df_dict
    .query("tipo == 'quantitativa' and variavel != @target_variable")
    .variavel
    .to_list()
)
qualitative_variables = (
    df_dict
    .query("tipo == 'qualitativa' and variavel != @target_variable")
    .variavel
    .to_list()
)

In [82]:
X = df.drop(columns = [target_variable])
y = df[[target_variable]]

In [83]:
# Tratamento de dados discrepantes
quantitative_preprocess = Pipeline([
   ('missing', SimpleImputer(strategy='median')), # Tratamento de dados ausentes
   ('normalization', StandardScaler()), # Normalização
])

qualitative_preprocess = Pipeline([
   ('missing', SimpleImputer(strategy='most_frequent')), # Tratamento de dados ausentes
   ('encoding', OneHotEncoder()), # Transformação de dados 
])

preprocess = ColumnTransformer([
    ('quantitative', quantitative_preprocess, quantitative_variables),
    ('qualitative', qualitative_preprocess, qualitative_variables)
])

In [84]:
preprocess.fit(X)

In [85]:
X_transformed = preprocess.transform(X)
X_transformed.shape

(891, 37)