**Projeto de Feature Engineering**

**Importando Bibliotecas**

In [2]:
import pandas as pd

**Trazendo o arquivo**

In [5]:
df = pd.read_csv('clientes_churn.csv')

**Verificações inicias do arquivo**

In [10]:
# Dados iniciais
df.head()

Unnamed: 0,cliente_id,idade,genero,plano,tempo_cliente_meses,gasto_mensal,churn
0,1,25,M,basico,6,79.9,1
1,2,34,F,premium,24,129.9,0
2,3,45,M,standard,36,99.9,0
3,4,29,F,basico,12,69.9,1
4,5,52,M,premium,60,149.9,0


In [12]:
# Verificar tipos de dados das colunas
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   cliente_id           10 non-null     int64  
 1   idade                10 non-null     int64  
 2   genero               10 non-null     object 
 3   plano                10 non-null     object 
 4   tempo_cliente_meses  10 non-null     int64  
 5   gasto_mensal         10 non-null     float64
 6   churn                10 non-null     int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 692.0+ bytes


In [19]:
# Verificar a quantidade de colunas categóricas e numéricas
df.dtypes.value_counts()

int64      4
object     2
float64    1
Name: count, dtype: int64

In [17]:
df['churn'].value_counts()

churn
0    6
1    4
Name: count, dtype: int64

In [22]:
# Estatísticas iniciais

df.describe()

Unnamed: 0,cliente_id,idade,tempo_cliente_meses,gasto_mensal,churn
count,10.0,10.0,10.0,10.0,10.0
mean,5.5,36.4,28.8,107.4,0.4
std,3.02765,9.72054,24.174367,35.059473,0.516398
min,1.0,23.0,3.0,59.9,0.0
25%,3.25,29.5,9.75,82.4,0.0
50%,5.5,35.5,21.0,97.4,0.0
75%,7.75,44.0,45.0,137.4,1.0
max,10.0,52.0,72.0,159.9,1.0


**Verificação de dados nulos**

In [25]:
df.isnull().sum()

# Não consta valores ausentes.

cliente_id             0
idade                  0
genero                 0
plano                  0
tempo_cliente_meses    0
gasto_mensal           0
churn                  0
dtype: int64

**Separar target**

In [36]:
X = df.drop(columns=['churn'])   
y = df['churn']

**Identificar valores categóricos**

In [38]:
X.select_dtypes(include='object').columns

Index(['genero', 'plano'], dtype='object')

**Aplicando One-Hot Encoding**

In [51]:
X_encoded = pd.get_dummies(
    X,
    columns=['genero', 'plano'],
    drop_first=True
)

In [50]:
X_encoded.head()

Unnamed: 0,cliente_id,idade,tempo_cliente_meses,gasto_mensal,genero_F,genero_M,plano_basico,plano_premium,plano_standard
0,1,25,6,79.9,False,True,True,False,False
1,2,34,24,129.9,True,False,False,True,False
2,3,45,36,99.9,False,True,False,False,True
3,4,29,12,69.9,True,False,True,False,False
4,5,52,60,149.9,False,True,False,True,False


**Dataset final**

In [53]:
df_final = pd.concat([X_encoded, y], axis=1)

In [55]:
df_final.head()

Unnamed: 0,cliente_id,idade,tempo_cliente_meses,gasto_mensal,genero_M,plano_premium,plano_standard,churn
0,1,25,6,79.9,True,False,False,1
1,2,34,24,129.9,False,True,False,0
2,3,45,36,99.9,True,False,True,0
3,4,29,12,69.9,False,False,False,1
4,5,52,60,149.9,True,True,False,0


In [57]:
# Dependendo da versão do pandas, pode ficar ainda como Boolean. Logo, precisa ser convertido em int.
df_final = pd.get_dummies(
    df,
    columns=['genero', 'plano'],
    drop_first=True,
    dtype=int
)

In [60]:
df_final = df_final.astype(int)

In [64]:
df_final['genero_M'].value_counts()

genero_M
1    5
0    5
Name: count, dtype: int64