# Feature Engineering
 - Vamos limpar e transformar os dados brutos para que fiquem prontos para uso em modelos de Machine Learning.
 

# Importando Bibliotecas

In [1]:

import pandas as pd
import numpy as np
import os
import warnings
import joblib  
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE

# Notebook-wide configuration.
# NOTE(review): this silences every warning category for the rest of the session,
# including warnings emitted by pandas and scikit-learn; consider narrowing the filter.
warnings.filterwarnings('ignore')
# Show every column when a DataFrame is displayed instead of truncating with "...".
pd.set_option('display.max_columns', None)
# Allow up to 1000 characters per line in text-mode DataFrame output before wrapping.
pd.set_option('display.width', 1000)

## 1.0 Limpeza e Correção de tipos

In [2]:
# Load the post-EDA dataset and fix the two columns whose dtypes are wrong.
df = pd.read_csv('../data/processed/churn_pos_eda.csv')

# errors='coerce' turns every entry of 'totalcharges' that cannot be parsed as a
# number into NaN; those NaN values are then replaced with 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Encode the target as 1 ('Yes') / 0 ('No'). Series.map yields NaN for any label that
# is not a key of the mapping, so validate the result before overwriting the column:
# a silently NaN-polluted target would only fail (or be mis-modelled) much later.
churn_encoded = df['churn'].map({'Yes': 1, 'No': 0})
unmapped_labels = df.loc[churn_encoded.isna(), 'churn'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'churn' (expected 'Yes'/'No'): {unmapped_labels.tolist()}"
    )
df['churn'] = churn_encoded.astype('int64')

print("Tipos de dados corrigidos:")
print(df.dtypes[['totalcharges', 'churn']])
print(f"\nQuantidade de Nulos restantes: {df['totalcharges'].isnull().sum()}")

Tipos de dados corrigidos:
totalcharges    float64
churn             int64
dtype: object

Quantidade de Nulos restantes: 0


## 2.0 Treino e Teste

In [3]:
# Separate the target column from the predictors.
y = df['churn']
X = df.drop(columns='churn')

# Hold out 20% of the rows for testing. stratify=y keeps the original class
# proportions in both partitions; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1997, stratify=y
)

print(f"Treino: {X_train.shape[0]} linhas")
print(f"Teste:  {X_test.shape[0]} linhas")

# Report the class balance of each partition to confirm the stratification.
for split_header, split_target in (
    ("\nProporção de Churn no Treino:", y_train),
    ("\nProporção de Churn no Teste:", y_test),
):
    print(split_header)
    print(split_target.value_counts(normalize=True))

Treino: 5634 linhas
Teste:  1409 linhas

Proporção de Churn no Treino:
churn
0    0.734647
1    0.265353
Name: proportion, dtype: float64

Proporção de Churn no Teste:
churn
0    0.734564
1    0.265436
Name: proportion, dtype: float64


## 3.0 Transformação de variáveis

In [4]:
# Numeric features: standardized to zero mean and unit variance.
cols_numericas = ['tenure', 'monthlycharges', 'totalcharges']

# Categorical features: one-hot encoded into 0/1 indicator columns.
cols_categoricas = ['internetservice', 'contract', 'paymentmethod', 'techsupport',
                    'gender', 'partner', 'dependents', 'phoneservice', 'multiplelines',
                    'onlinesecurity', 'onlinebackup', 'deviceprotection',
                    'streamingtv', 'streamingmovies', 'paperlessbilling']

# Notes on the transformer configuration:
# - ColumnTransformer defaults to remainder='drop', so any column of X that is not
#   listed in cols_numericas or cols_categoricas is discarded.
#   NOTE(review): confirm that every column left out of both lists is meant to be dropped.
# - drop='first' removes one indicator per feature to avoid redundant columns. Combined
#   with handle_unknown='ignore', a category unseen during fit is encoded as all zeros,
#   i.e. it becomes indistinguishable from the dropped reference category.
# - sparse_threshold=0 forces a dense ndarray. With the default (0.3) the output type
#   depends on data density and may be a SciPy sparse matrix, which the DataFrame
#   construction below cannot handle.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), cols_numericas),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cols_categoricas),
    ],
    sparse_threshold=0,
)

# Fit on the training partition only, then apply the fitted transformation to the
# test partition, so no test-set statistics leak into the scaler/encoder.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Output column order follows the transformer order: numeric columns first, then the
# one-hot columns named by the fitted encoder.
feature_names = (cols_numericas +
                 list(preprocessor.named_transformers_['cat'].get_feature_names_out(cols_categoricas)))

# These frames get a fresh 0..n-1 index, while y_train / y_test keep the index of df:
# features and target line up by row position, not by index label.
X_train_final = pd.DataFrame(X_train_processed, columns=feature_names)
X_test_final = pd.DataFrame(X_test_processed, columns=feature_names)

print(f"Treino Final: {X_train_final.shape}")
print(f"Teste Final:  {X_test_final.shape}")
X_train_final.head()

Treino Final: (5634, 29)
Teste Final:  (1409, 29)


Unnamed: 0,tenure,monthlycharges,totalcharges,internetservice_Fiber optic,internetservice_No,contract_One year,contract_Two year,paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check,techsupport_No internet service,techsupport_Yes,gender_Male,partner_Yes,dependents_Yes,phoneservice_Yes,multiplelines_No phone service,multiplelines_Yes,onlinesecurity_No internet service,onlinesecurity_Yes,onlinebackup_No internet service,onlinebackup_Yes,deviceprotection_No internet service,deviceprotection_Yes,streamingtv_No internet service,streamingtv_Yes,streamingmovies_No internet service,streamingmovies_Yes,paperlessbilling_Yes
0,1.611856,1.644784,2.557941,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
1,-1.033718,0.499439,-0.747452,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.67573,1.44002,1.326982,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
3,-1.277924,0.176478,-0.974875,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.530453,-1.331782,-0.26275,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [5]:
# Destination path -> object to export, listed in the order the files are written.
processed_exports = [
    ('../data/processed/X_train_processed.csv', X_train_final),
    ('../data/processed/y_train_processed.csv', y_train),
    ('../data/processed/X_test_processed.csv', X_test_final),
    ('../data/processed/y_test_processed.csv', y_test),
]

# index=False omits the index column, so features and target files match by row order.
for export_path, export_data in processed_exports:
    export_data.to_csv(export_path, index=False)

print("Arquivos salvos em data/processed:")
for export_path, _ in processed_exports:
    print(f"- {os.path.basename(export_path)}")

Arquivos salvos em data/processed:
- X_train_processed.csv
- y_train_processed.csv
- X_test_processed.csv
- y_test_processed.csv
