In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# 1) Carregar dataset Titanic
df = pd.read_csv("data/raw/titanic.csv")
print(df.head())

# 2) Checar valores nulos
print(df.isnull().sum())
print(df.isnull().mean() * 100)
print(df[df.isnull().any(axis=1)].head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
Pa

In [48]:
# Remover colunas com mais de 50% de nulos
df_menos_colunas = df.dropna(axis=1, thresh=0.5*len(df))

# Preencher numéricos com mediana
df_preenchido = df.fillna(df.median(numeric_only=True))
df_preenchido['Age'] = df_preenchido['Age'].astype(int)

# Preencher categóricos com moda
for col in df.select_dtypes(include=["object"]).columns:
    df_preenchido[col] = df_preenchido[col].fillna(df[col].mode()[0])

df_preenchido.to_csv("data/processed/titanic_clean.csv", index=False)

In [46]:
def detectar_outliers(col):
    Q1 = col.quantile(0.25)
    Q3 = col.quantile(0.75)
    IQR = Q3 - Q1
    limite_inferior = Q1 - 1.5 * IQR
    limite_superior = Q3 + 1.5 * IQR
    return col[(col < limite_inferior) | (col > limite_superior)]

outliers = {}
for col in df_preenchido.select_dtypes(include=[np.number]).columns:
    outliers[col] = detectar_outliers(df_preenchido[col])

print("\nQuantidade de outliers detectados por coluna:")
for col, vals in outliers.items():
    print(f"{col}: {len(vals)}")

# Substituir outliers pela mediana
df_tratado = df_preenchido.copy()
for col in df_tratado.select_dtypes(include=[np.number]).columns:
    Q1 = df_tratado[col].quantile(0.25)
    Q3 = df_tratado[col].quantile(0.75)
    IQR = Q3 - Q1
    limite_inferior = Q1 - 1.5 * IQR
    limite_superior = Q3 + 1.5 * IQR
    mediana = df_tratado[col].median()
    df_tratado.loc[(df_tratado[col] < limite_inferior) | (df_tratado[col] > limite_superior), col] = mediana

df_tratado.to_csv("data/processed/titanic_final.csv", index=False)



Quantidade de outliers detectados por coluna:
PassengerId: 0
Survived: 0
Pclass: 0
Age: 66
SibSp: 46
Parch: 213
Fare: 116


In [41]:
df = pd.read_csv("data/processed/titanic_final.csv")

print("Tipos de dados antes da conversão:\n", df.dtypes)

cols_categoricas = df.select_dtypes(include=["object"]).columns.tolist()
print("Colunas categóricas detectadas:", cols_categoricas)

# Aplicar One-Hot Encoding
df_numerico = pd.get_dummies(df, columns=cols_categoricas, drop_first=True)

print("\nDimensões após encoding:", df_numerico.shape)
print("Tipos de dados depois da conversão:\n", df_numerico.dtypes)

df_numerico.to_csv("data/processed/titanic_ml_ready.csv", index=False)
print("\nDataset convertido salvo em data/processed/titanic_ml_ready.csv")


Tipos de dados antes da conversão:
 PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age              int64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
Colunas categóricas detectadas: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

Dimensões após encoding: (891, 1726)
Tipos de dados depois da conversão:
 PassengerId    int64
Survived       int64
Pclass         int64
Age            int64
SibSp          int64
               ...  
Cabin_F4        bool
Cabin_G6        bool
Cabin_T         bool
Embarked_Q      bool
Embarked_S      bool
Length: 1726, dtype: object

Dataset convertido salvo em data/processed/titanic_ml_ready.csv


In [47]:
df = pd.read_csv("data/processed/titanic_ml_ready.csv")

print("Antes da normalização:")
print(df.describe().T.head())

y = df["Survived"] if "Survived" in df.columns else None
X = df.drop(columns=["Survived"]) if "Survived" in df.columns else df

scaler_std = StandardScaler()
X_std = pd.DataFrame(scaler_std.fit_transform(X), columns=X.columns)

scaler_mm = MinMaxScaler()
X_mm = pd.DataFrame(scaler_mm.fit_transform(X), columns=X.columns)

if y is not None:
    df_std = pd.concat([X_std, y], axis=1)
    df_mm = pd.concat([X_mm, y], axis=1)
else:
    df_std, df_mm = X_std, X_mm

df_std.to_csv("data/processed/titanic_ml_standard.csv", index=False)
df_mm.to_csv("data/processed/titanic_ml_minmax.csv", index=False)

print("\nDatasets salvos:")
print("StandardScaler: data/processed/titanic_ml_standard.csv")
print("MinMaxScaler:   data/processed/titanic_ml_minmax.csv")


Antes da normalização:
             count        mean         std  min    25%    50%    75%    max
PassengerId  891.0  446.000000  257.353842  1.0  223.5  446.0  668.5  891.0
Survived     891.0    0.383838    0.486592  0.0    0.0    0.0    1.0    1.0
Pclass       891.0    2.308642    0.836071  1.0    2.0    3.0    3.0    3.0
Age          891.0   28.468013    9.790722  3.0   23.5   28.0   33.0   54.0
SibSp        891.0    0.297419    0.521648  0.0    0.0    0.0    1.0    2.0

Datasets salvos:
StandardScaler: data/processed/titanic_ml_standard.csv
MinMaxScaler:   data/processed/titanic_ml_minmax.csv
