In [26]:
## Importando as bibliotecas necessárias

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [13]:
## Load the dataset into a DataFrame and preview its first rows

df = pd.read_csv('adult11.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,salary
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [None]:
## Inspect the dtype and non-null count of every column. A logistic regression model is trained
## later in this notebook, so we need to know which columns are not numeric yet.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       48842 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      48842 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   gender          48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48842 non-null  object
 14  salary          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [None]:
## Count the missing (NaN) values in each column

df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
gender            0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
salary            0
dtype: int64

Não há nulos, mas há campos preenchidos com "?", e devemos tratá-los.

In [12]:
## Treat the "?" placeholders as missing values

# The pattern is anchored (^...$) so that only cells consisting solely of "?"
# (optionally padded with whitespace) become NaN. With regex=True and a
# non-string replacement, pandas swaps out the whole cell whenever the pattern
# is found anywhere inside it, so an unanchored pattern would also wipe any
# legitimate value that merely contains a "?".
# `df` is rebound instead of mutated with inplace=True.
df = df.replace(r'^\s*\?\s*$', np.nan, regex=True)

## Drop every row that contains at least one missing value

df = df.dropna()
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,salary
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


Com os dados tratados, podemos seguir para a conversão da coluna de salário em valores numéricos.

In [19]:
## Encode the salary column as 0 / 1: '<=50K' -> 0 and '>50K' -> 1.

SALARY_TO_CLASS = {
    '<=50K': 0,
    '>50K': 1,
}

# Guard so the cell can be re-run: after a first run the 'salary' column has
# already been replaced by 'salary_numeric', and indexing it would raise KeyError.
if 'salary' in df.columns:
    salary_numeric = df['salary'].str.strip().map(SALARY_TO_CLASS)

    # .map() yields NaN for any label that is not a key of the dict; stop here
    # with an explicit message instead of letting NaN reach the target column.
    unmapped = df.loc[salary_numeric.isna(), 'salary'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Rótulos de salário inesperados: {list(unmapped)}")

    df['salary_numeric'] = salary_numeric
    df = df.drop('salary', axis = 1)

In [None]:
## One-hot encode the categorical features (get_dummies)

# Every column still stored with `object` dtype is treated as categorical.
col_categ = df.select_dtypes(include=['object']).columns
# drop_first=True leaves out the first dummy level of each encoded column.
df_tratado = pd.get_dummies(df, columns=col_categ, drop_first= True)

(48842, 101)

Dataset tratado, vamos verificar seu shape

In [None]:
## Check the (rows, columns) shape of the encoded DataFrame

df_tratado.shape

(48842, 101)

### Agora vamos para a parte de treino e teste do modelo

In [24]:
## Separate the feature matrix X from the target vector y (encoded salary)

coluna_alvo = 'salary_numeric'

y = df_tratado[coluna_alvo]
X = df_tratado.drop(columns=[coluna_alvo])

In [29]:
## Split into train/test sets: test_size=0.2 holds out 20% of the rows for testing,
## and random_state is fixed so the split is the same on every run.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)

In [30]:
## Feature scaling
## Standardize the features so that they all share the same scale.

# The scaler is fitted on the training split only; that same fitted scaler is
# then applied to both splits, so the test rows do not influence its statistics.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [32]:
## Build and train the model

# Logistic regression with the iteration cap set to 1000 and a pinned random_state;
# fit() returns the fitted estimator itself, so construction and training are chained.
logistic = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

print("Modelo treinado!")

Modelo treinado!


In [41]:
## Evaluate the model on the held-out test split

previsoes = logistic.predict(X_test_scaled)

## Classification metrics

# Overall fraction of test rows predicted correctly
accuracy = accuracy_score(y_test, previsoes)
print(f"Acurácia: {accuracy:.4f}")

# Confusion matrix: rows are the true classes, columns the predicted classes
matriz_confusao = confusion_matrix(y_test, previsoes)
print("\n--- Matriz de Confusão ---")
print(matriz_confusao)

# Full per-class report (precision, recall, f1-score, support)
nomes_classes = ['<=50K (0)', '>50K (1)']
relatorio = classification_report(y_test, previsoes, target_names=nomes_classes)
print(relatorio)

Acurácia: 0.8573

--- Matriz de Confusão ---
[[6983  496]
 [ 898 1392]]
              precision    recall  f1-score   support

   <=50K (0)       0.89      0.93      0.91      7479
    >50K (1)       0.74      0.61      0.67      2290

    accuracy                           0.86      9769
   macro avg       0.81      0.77      0.79      9769
weighted avg       0.85      0.86      0.85      9769



In [42]:
## Compare the predictions with the true labels, side by side

print("\n --- Exemplo de Previsões vs. Reais ---")

# y_test provides the row index; the predictions array is aligned to it by position.
colunas_resultado = {
    'Valor_Real': y_test,
    'Previsão_Final': previsoes,
}
df_resultados = pd.DataFrame(colunas_resultado)

# Show only the first ten rows
primeiras_linhas = df_resultados.head(10)
print(primeiras_linhas)


 --- Exemplo de Previsões vs. Reais ---
       Valor_Real  Previsão_Final
7762            0               0
23881           0               0
30507           1               1
28911           0               0
19484           0               1
43031           0               0
28188           0               0
12761           1               1
40834           0               0
27875           0               0
