In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler


# 1. Load the dataset

# Column labels for the header-less file, in file order (passed as `names=` below).
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

# skipinitialspace=True discards the blank that follows each delimiter, so text
# fields load without leading padding and the "?" placeholder is recognised as
# missing no matter how it is padded. The earlier na_values=' ?' only matched
# a placeholder preceded by exactly one space.
df = pd.read_csv(
    "adult.data",
    header=None,
    names=column_names,
    skipinitialspace=True,
    na_values='?',
)


In [9]:
# 2. Data standardisation

# Trim surrounding whitespace from every object-dtype (text) column of df.
text_columns = df.select_dtypes(include=['object']).columns
for text_column in text_columns:
    stripped = df[text_column].str.strip()
    df[text_column] = stripped

# Unify country spellings: both listed variants map to 'United States'
# (the second key already equals the target, so it leaves values unchanged).
country_mapping = dict.fromkeys(('United-States', 'United States'), 'United States')
df['native-country'] = df['native-country'].replace(country_mapping)

# Turn any cell that is exactly "?" into NaN, modifying df in place.
df.replace(to_replace="?", value=np.nan, inplace=True)


In [6]:
# 3. Numeric feature scaling

# Only columns stored as int64 / float64 are rescaled.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# One scaler per strategy: min-max, z-score and robust scaling.
minmax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()
robust_scaler = RobustScaler()

# Every result starts as its own copy of df, so df itself is never rescaled.
df_minmax = df.copy()
df_zscore = df.copy()
df_robust = df.copy()

# Fit each scaler on the unscaled numeric columns of df and store the
# transformed values in the matching copy.
for scaled_frame, scaler in (
    (df_minmax, minmax_scaler),
    (df_zscore, standard_scaler),
    (df_robust, robust_scaler),
):
    scaled_frame[numeric_cols] = scaler.fit_transform(df[numeric_cols])


In [10]:
# 4. Quick comparison

# Print the first rows of the numeric columns of each scaled frame, each
# preceded by its heading.
for heading, scaled_frame in (
    ("\n=== Min-Max (0 a 1) ===", df_minmax),
    ("\n=== Z-Score (média 0, desvio 1) ===", df_zscore),
    ("\n=== Robust Scaling (menos sensível a outliers) ===", df_robust),
):
    print(heading)
    print(scaled_frame[numeric_cols].head())



=== Min-Max (0 a 1) ===
        age    fnlwgt  education-num  capital-gain  capital-loss  \
0  0.301370  0.044302       0.800000       0.02174           0.0   
1  0.452055  0.048238       0.800000       0.00000           0.0   
2  0.287671  0.138113       0.533333       0.00000           0.0   
3  0.493151  0.151068       0.400000       0.00000           0.0   
4  0.150685  0.221488       0.800000       0.00000           0.0   

   hours-per-week  
0        0.397959  
1        0.122449  
2        0.397959  
3        0.397959  
4        0.397959  

=== Z-Score (média 0, desvio 1) ===
        age    fnlwgt  education-num  capital-gain  capital-loss  \
0  0.030671 -1.063611       1.134739      0.148453      -0.21666   
1  0.837109 -1.008707       1.134739     -0.145920      -0.21666   
2 -0.042642  0.245079      -0.420060     -0.145920      -0.21666   
3  1.057047  0.425801      -1.197459     -0.145920      -0.21666   
4 -0.775768  1.408176       1.134739     -0.145920      -0.21666   

