In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

df = pd.read_csv("data/raw/adult.csv")
print(df.head())

   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-per-week native-country incom

In [32]:
print(df.isnull().sum())

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64


In [33]:
print(df.isnull().mean() * 100)

age                0.0
workclass          0.0
fnlwgt             0.0
education          0.0
educational-num    0.0
marital-status     0.0
occupation         0.0
relationship       0.0
race               0.0
gender             0.0
capital-gain       0.0
capital-loss       0.0
hours-per-week     0.0
native-country     0.0
income             0.0
dtype: float64


In [34]:
print(df[df.isnull().any(axis=1)].head())

Empty DataFrame
Columns: [age, workclass, fnlwgt, education, educational-num, marital-status, occupation, relationship, race, gender, capital-gain, capital-loss, hours-per-week, native-country, income]
Index: []


In [35]:
df_sem_nulos = df.dropna()

df_menos_colunas = df.dropna(axis=1, thresh=0.5*len(df))

In [36]:
df_preenchido = df.fillna(df.median(numeric_only=True))

In [37]:
for col in df.select_dtypes(include=["object"]).columns:
    df_preenchido[col] = df_preenchido[col].fillna(df[col].mode()[0])


In [38]:
df_preenchido.to_csv("data/processed/adult_clean.csv", index=False)

In [39]:
def detectar_outliers(col):
    Q1 = col.quantile(0.25)
    Q3 = col.quantile(0.75)
    IQR = Q3 - Q1
    limite_inferior = Q1 - 1.5 * IQR
    limite_superior = Q3 + 1.5 * IQR
    return col[(col < limite_inferior) | (col > limite_superior)]

outliers = {}
for col in df_preenchido.select_dtypes(include=[np.number]).columns:
    outliers[col] = detectar_outliers(df_preenchido[col])

print("\nQuantidade de outliers detectados por coluna:")
for col, vals in outliers.items():
    print(f"{col}: {len(vals)}")

df_tratado = df_preenchido.copy()
for col in df_tratado.select_dtypes(include=[np.number]).columns:
    Q1 = df_tratado[col].quantile(0.25)
    Q3 = df_tratado[col].quantile(0.75)
    IQR = Q3 - Q1
    limite_inferior = Q1 - 1.5 * IQR
    limite_superior = Q3 + 1.5 * IQR
    mediana = df_tratado[col].median()
    df_tratado.loc[(df_tratado[col] < limite_inferior) | (df_tratado[col] > limite_superior), col] = mediana

df_tratado.to_csv("data/processed/adult_final.csv", index=False)


Quantidade de outliers detectados por coluna:
age: 216
fnlwgt: 1453
educational-num: 1794
capital-gain: 4035
capital-loss: 2282
hours-per-week: 13496


  df_tratado.loc[(df_tratado[col] < limite_inferior) | (df_tratado[col] > limite_superior), col] = mediana


In [40]:
df = pd.read_csv("data/processed/adult_final.csv")

print("Tipos de dados antes da conversão:\n", df.dtypes)



Tipos de dados antes da conversão:
 age                  int64
workclass           object
fnlwgt             float64
education           object
educational-num      int64
marital-status      object
occupation          object
relationship        object
race                object
gender              object
capital-gain         int64
capital-loss         int64
hours-per-week       int64
native-country      object
income              object
dtype: object


In [41]:
df = pd.read_csv("data/processed/adult_final.csv")

print("Tipos de dados antes da conversão:\n", df.dtypes)


if "education" in df.columns and "education-num" in df.columns:
    df = df.drop(columns=["education"])


cols_categoricas = df.select_dtypes(include=["object"]).columns.tolist()
print("Colunas categóricas detectadas:", cols_categoricas)

# Aplicar One-Hot Encoding
df_numerico = pd.get_dummies(df, columns=cols_categoricas, drop_first=True)

print("\nDimensões após encoding:", df_numerico.shape)
print("Tipos de dados depois da conversão:\n", df_numerico.dtypes)

df_numerico.to_csv("data/processed/adult_ml_ready.csv", index=False)
print("\nDataset convertido salvo em data/processed/adult_ml_ready.csv")

Tipos de dados antes da conversão:
 age                  int64
workclass           object
fnlwgt             float64
education           object
educational-num      int64
marital-status      object
occupation          object
relationship        object
race                object
gender              object
capital-gain         int64
capital-loss         int64
hours-per-week       int64
native-country      object
income              object
dtype: object
Colunas categóricas detectadas: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']

Dimensões após encoding: (48842, 101)
Tipos de dados depois da conversão:
 age                                 int64
fnlwgt                            float64
educational-num                     int64
capital-gain                        int64
capital-loss                        int64
                                   ...   
native-country_Trinadad&Tobago       bool
native-country_United-

In [42]:
df = pd.read_csv("data/processed/adult_ml_ready.csv")

print("Antes da normalização:")
print(df.describe().T.head())


y = df["income_>50K"] if "income_>50K" in df.columns else None
X = df.drop(columns=["income_>50K"]) if "income_>50K" in df.columns else df


scaler_std = StandardScaler()
X_std = pd.DataFrame(scaler_std.fit_transform(X), columns=X.columns)


scaler_mm = MinMaxScaler()
X_mm = pd.DataFrame(scaler_mm.fit_transform(X), columns=X.columns)


if y is not None:
    df_std = pd.concat([X_std, y], axis=1)
    df_mm = pd.concat([X_mm, y], axis=1)
else:
    df_std, df_mm = X_std, X_mm


df_std.to_csv("data/processed/adult_ml_standard.csv", index=False)
df_mm.to_csv("data/processed/adult_ml_minmax.csv", index=False)

print("\nDatasets salvos:")
print("StandardScaler: data/processed/adult_ml_standard.csv")
print("MinMaxScaler:   data/processed/adult_ml_minmax.csv")

Antes da normalização:
                   count           mean           std      min       25%  \
age              48842.0      38.437247     13.375595     17.0      28.0   
fnlwgt           48842.0  179641.816214  86321.206519  12285.0  117550.5   
educational-num  48842.0      10.324106      2.198878      5.0       9.0   
capital-gain     48842.0       0.000000      0.000000      0.0       0.0   
capital-loss     48842.0       0.000000      0.000000      0.0       0.0   

                       50%       75%       max  
age                  37.00      47.0      78.0  
fnlwgt           178143.25  226891.0  417668.0  
educational-num      10.00      12.0      16.0  
capital-gain          0.00       0.0       0.0  
capital-loss          0.00       0.0       0.0  

Datasets salvos:
StandardScaler: data/processed/adult_ml_standard.csv
MinMaxScaler:   data/processed/adult_ml_minmax.csv
