In [23]:
import pandas as pd
import numpy as np

from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import recall_score

###  Carga de Dados

In [2]:
# Carregar dados
df_churn = pd.read_csv('./datasets/churn_telecom.csv')

In [3]:
# Visualizar Estrutura
df_churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   IDCliente         7032 non-null   object 
 1   Genero            7032 non-null   object 
 2   Mais65anos        7032 non-null   int64  
 3   TemParceiro       7032 non-null   object 
 4   TemDependentes    7032 non-null   object 
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  tenure            7032 non-null   int64  
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


In [None]:
# Visualizar as primeiras linhas
df_churn.head(10)

Unnamed: 0,IDCliente,Genero,Mais65anos,TemParceiro,TemDependentes,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,1,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,34,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,2,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,45,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,2,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,8,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,22,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,6713-OKOMC,Female,0,No,No,No,No phone service,DSL,Yes,No,No,No,No,No,10,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,28,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,Yes,No,DSL,Yes,Yes,No,No,No,No,62,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [5]:
# Visualizar as últimos linhas
df_churn.tail(10)

Unnamed: 0,IDCliente,Genero,Mais65anos,TemParceiro,TemDependentes,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7022,9767-FFLEM,Male,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,38,Month-to-month,Yes,Credit card (automatic),69.5,2625.25,No
7023,0639-TSIQW,Female,0,No,No,Yes,Yes,Fiber optic,Yes,Yes,Yes,No,Yes,No,67,Month-to-month,Yes,Credit card (automatic),102.95,6886.25,Yes
7024,8456-QDAVC,Male,0,No,No,Yes,No,Fiber optic,No,No,No,No,Yes,No,19,Month-to-month,Yes,Bank transfer (automatic),78.7,1495.1,No
7025,7750-EYXWZ,Female,0,No,No,No,No phone service,DSL,No,Yes,Yes,Yes,Yes,Yes,12,One year,No,Electronic check,60.65,743.3,No
7026,2569-WGERO,Female,0,No,No,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,72,Two year,Yes,Bank transfer (automatic),21.15,1419.4,No
7027,6840-RESVB,Male,0,Yes,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,24,One year,Yes,Mailed check,84.8,1990.5,No
7028,2234-XADUH,Female,0,Yes,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,72,One year,Yes,Credit card (automatic),103.2,7362.9,No
7029,4801-JZAZL,Female,0,Yes,Yes,No,No phone service,DSL,Yes,No,No,No,No,No,11,Month-to-month,Yes,Electronic check,29.6,346.45,No
7030,8361-LTMKD,Male,1,Yes,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,4,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7031,3186-AJIEK,Male,0,No,No,Yes,No,Fiber optic,Yes,No,Yes,Yes,Yes,Yes,66,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [7]:
# Contar clientes usando a variável Churn como referência
df_churn.Churn.value_counts()

Churn
No     5163
Yes    1869
Name: count, dtype: int64

In [8]:
# Distribuição percentual de Clientes pela variável Churn
df_churn.Churn.value_counts(normalize=True)


Churn
No     0.734215
Yes    0.265785
Name: proportion, dtype: float64

### Preparação da Base para Algoritmo LOF

In [9]:
# Selecionando as colunas para o algoritmo
X = df_churn.drop(columns=['IDCliente', 'Churn'])
y = df_churn['Churn']

In [10]:
# Definir uma função para transformar "Yes" em 1 e "No" em 0
def binary_tranformer_function(X):
    return X.map(lambda x: 1 if x == 'Yes' else 0)

In [12]:
# Tranformações
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = ['Genero', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']
binary_features = ['TemParceiro', 'TemDependentes', 'TechSupport', 'PhoneService', 'PaperlessBilling']
no_transformation_features = ['Mais65anos']

# Criar Transformer
numeric_transformer = StandardScaler()
categorical_tranformer = OneHotEncoder()
binary_tranformer = FunctionTransformer(binary_tranformer_function)

# Criar o preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_tranformer, categorical_features),
        ('bin', binary_tranformer, binary_features),
        ('pass', 'passthrough', no_transformation_features)
    ]
)

# Tranformar os dados
X_transformed = preprocessor.fit_transform(X)

In [13]:
# Visualizar X_transformed
X_transformed

array([[-1.28024804, -1.16169394, -0.99419409, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.06430269, -0.26087792, -0.17373982, ...,  1.        ,
         0.        ,  0.        ],
       [-1.23950408, -0.36392329, -0.95964911, ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [-0.87280842, -1.17000405, -0.85451414, ...,  0.        ,
         1.        ,  0.        ],
       [-1.15801615,  0.31916782, -0.87209546, ...,  1.        ,
         1.        ,  1.        ],
       [ 1.36810945,  1.35793167,  2.01234407, ...,  1.        ,
         1.        ,  0.        ]], shape=(7032, 39))

In [14]:
X_transformed.shape

(7032, 39)

### Treinar o algoritmo LOF

In [15]:
# Instanciar um objeto LOF
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.26)

In [16]:
# Treinar o algoritmo e já gerar as classificação de anomalia para cada registro (ponto de dados)
y_pred = lof.fit_predict(X_transformed)

In [20]:
# Mostrar valores preditos (anomalia ou não anomalia)
## No sklearn, o predicr gera um valor = -1 (anomalia) e valor = 1 (pontos normais)
y_pred

array([ 1,  1,  1, ...,  1, -1,  1], shape=(7032,))

In [22]:
# Mostrar o LOF calculadado para cada ponto de dados
# No sklearn, o LOF calculado fica na propriedade negative_outlier_factor_
# negative_outlier_factor_ é o inversi di LOF. Quanto menor, mais anormal.
-lof.negative_outlier_factor_

array([1.0238333 , 1.03547225, 1.02610568, ..., 1.07053634, 1.19840027,
       1.08901757], shape=(7032,))

### Apresentar resultados

In [24]:
# Identificar anomalias
outliers = y_pred == -1
inliers = y_pred == 1

# Contar as anomalias e os pontos normais
num_outliers = np.sum(outliers)
num_inliers = np.sum(inliers)

# Apresentar estatísticas
print(f'Anomalias detectadas: {num_outliers}')
print(f'Pontos Normais: {num_inliers}')

Anomalias detectadas: 1829
Pontos Normais: 5203


In [25]:
# Converter y para a mesma base do y_pred
y_true = y.map(lambda x: -1 if x == 'Yes' else 1)

In [26]:
# Calcular Score com base no valor de y (Churn real da base)
# Usar Recall, pois o objetivo principal é maximizar o TPR (True Positive Rate)
recall_score(y_true, y_pred)

0.7515010652721286