In [59]:
import pandas as pd
import numpy as np
import requests
import re

In [149]:
# Buscando os Dados

# Url da base de dados
data_url = 'https://raw.githubusercontent.com/ingridcristh/challenge2-data-science/refs/heads/main/TelecomX_Data.json'

response = requests.get(data_url)

if response.status_code == 200:
    json_file = response.json()
else:
    print(f"Bad response: status code: {response.status_code}")
    


In [150]:
# Criando o Dataframe
raw_data = pd.DataFrame(json_file)

# Primeira visualização
raw_data.head()

Unnamed: 0,customerID,Churn,customer,phone,internet,account
0,0002-ORFBO,No,"{'gender': 'Female', 'SeniorCitizen': 0, 'Part...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'DSL', 'OnlineSecurity': '...","{'Contract': 'One year', 'PaperlessBilling': '..."
1,0003-MKNFE,No,"{'gender': 'Male', 'SeniorCitizen': 0, 'Partne...","{'PhoneService': 'Yes', 'MultipleLines': 'Yes'}","{'InternetService': 'DSL', 'OnlineSecurity': '...","{'Contract': 'Month-to-month', 'PaperlessBilli..."
2,0004-TLHLJ,Yes,"{'gender': 'Male', 'SeniorCitizen': 0, 'Partne...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'Fiber optic', 'OnlineSecu...","{'Contract': 'Month-to-month', 'PaperlessBilli..."
3,0011-IGKFF,Yes,"{'gender': 'Male', 'SeniorCitizen': 1, 'Partne...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'Fiber optic', 'OnlineSecu...","{'Contract': 'Month-to-month', 'PaperlessBilli..."
4,0013-EXCHZ,Yes,"{'gender': 'Female', 'SeniorCitizen': 1, 'Part...","{'PhoneService': 'Yes', 'MultipleLines': 'No'}","{'InternetService': 'Fiber optic', 'OnlineSecu...","{'Contract': 'Month-to-month', 'PaperlessBilli..."


In [151]:
# Criando uma lista com as colunas que precisam ser normalizadas
df = pd.json_normalize(json_file)
df.head()

Unnamed: 0,customerID,Churn,customer.gender,customer.SeniorCitizen,customer.Partner,customer.Dependents,customer.tenure,phone.PhoneService,phone.MultipleLines,internet.InternetService,...,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies,account.Contract,account.PaperlessBilling,account.PaymentMethod,account.Charges.Monthly,account.Charges.Total
0,0002-ORFBO,No,Female,0,Yes,Yes,9,Yes,No,DSL,...,Yes,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3
1,0003-MKNFE,No,Male,0,No,No,9,Yes,Yes,DSL,...,No,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4
2,0004-TLHLJ,Yes,Male,0,No,No,4,Yes,No,Fiber optic,...,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85
3,0011-IGKFF,Yes,Male,1,Yes,No,13,Yes,No,Fiber optic,...,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85
4,0013-EXCHZ,Yes,Female,1,Yes,No,3,Yes,No,Fiber optic,...,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4


In [152]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7267 entries, 0 to 7266
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customerID                 7267 non-null   object 
 1   Churn                      7267 non-null   object 
 2   customer.gender            7267 non-null   object 
 3   customer.SeniorCitizen     7267 non-null   int64  
 4   customer.Partner           7267 non-null   object 
 5   customer.Dependents        7267 non-null   object 
 6   customer.tenure            7267 non-null   int64  
 7   phone.PhoneService         7267 non-null   object 
 8   phone.MultipleLines        7267 non-null   object 
 9   internet.InternetService   7267 non-null   object 
 10  internet.OnlineSecurity    7267 non-null   object 
 11  internet.OnlineBackup      7267 non-null   object 
 12  internet.DeviceProtection  7267 non-null   object 
 13  internet.TechSupport       7267 non-null   objec

In [153]:
rows, columns = df.shape
print(f'linhas: {rows}, colunas: {columns}')

linhas: 7267, colunas: 21


In [154]:
from dict_data import dict_data

for key, value in dict_data.items():
    print(f"{key}: {value}")

customerID: número de identificação único de cada cliente
Churn: se o cliente deixou ou não a empresa
gender: gênero (masculino e feminino)
SeniorCitizen:  informação sobre um cliente ter ou não idade igual ou maior que 65 anos 
Partner: se o cliente possui ou não um parceiro ou parceira
Dependents: se o cliente possui ou não dependentes
tenure:  meses de contrato do cliente
PhoneService: assinatura de serviço telefônico
MultipleLines: assisnatura de mais de uma linha de telefone
InternetService: assinatura de um provedor internet
OnlineSecurity: assinatura adicional de segurança online
OnlineBackup: assinatura adicional de backup online 
DeviceProtection: assinatura adicional de proteção no dispositivo
TechSupport: assinatura adicional de suporte técnico, menos tempo de espera
StreamingTV: assinatura de TV a cabo 
StreamingMovies: assinatura de streaming de filmes 
Contract: tipo de contrato
PaperlessBilling: se o cliente prefere receber online a fatura
PaymentMethod: forma de pagamen

Limpando

In [155]:
#renomeando as colunas para facilitar

# Função para padronização dos nomes das colunas
def camel_to_snake(name):
    to_remove= ['']
    name = name.replace('.', '')  # remove pontos
    # insere _ antes de uma letra maiúscula que vem depois de uma letra minúscula ou número
    name = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', '_', name)
    return name.lower()


In [156]:
old_columns= df.columns.tolist()

new_columns= list(map(camel_to_snake, list(dict_data.keys())))
dict_columns = {key: value for key, value in zip(old_columns, new_columns)}

renomear colunas facilita manipulacao

In [157]:
df.rename(columns=dict_columns, inplace=True)

In [158]:
df.head()

Unnamed: 0,customer_id,churn,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,charges_monthly,charges_total
0,0002-ORFBO,No,Female,0,Yes,Yes,9,Yes,No,DSL,...,Yes,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3
1,0003-MKNFE,No,Male,0,No,No,9,Yes,Yes,DSL,...,No,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4
2,0004-TLHLJ,Yes,Male,0,No,No,4,Yes,No,Fiber optic,...,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85
3,0011-IGKFF,Yes,Male,1,Yes,No,13,Yes,No,Fiber optic,...,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85
4,0013-EXCHZ,Yes,Female,1,Yes,No,3,Yes,No,Fiber optic,...,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4


In [159]:
# Exibindo os valores unicos das colunas com menos de 50 valores unicos.
key_words = ['No', 'Yes', '', 'No phone service', 'No internet service', 'DSL', 'Fiber optic']
new_data = [0, 1, np.nan, 'not informed', 'not informed','dsl' ,'fiber optic']

data_to_change = []

for col in df.columns.tolist():
    if df[col].nunique() < 50:
        print(f'{col} => {df[col].unique()}')
        for  _ in df[col].unique().tolist():
            if _ in key_words:
                data_to_change.append(col)
print('++++++++++++++++++++++++++++++++' \
'')
print('Colunas para formatar os dados')
data_to_change = set(data_to_change)
data_to_change 


churn => ['No' 'Yes' '']
gender => ['Female' 'Male']
senior_citizen => [0 1]
partner => ['Yes' 'No']
dependents => ['Yes' 'No']
phone_service => ['Yes' 'No']
multiple_lines => ['No' 'Yes' 'No phone service']
internet_service => ['DSL' 'Fiber optic' 'No']
online_security => ['No' 'Yes' 'No internet service']
online_backup => ['Yes' 'No' 'No internet service']
device_protection => ['No' 'Yes' 'No internet service']
tech_support => ['Yes' 'No' 'No internet service']
streaming_tv => ['Yes' 'No' 'No internet service']
streaming_movies => ['No' 'Yes' 'No internet service']
contract => ['One year' 'Month-to-month' 'Two year']
paperless_billing => ['Yes' 'No']
payment_method => ['Mailed check' 'Electronic check' 'Credit card (automatic)'
 'Bank transfer (automatic)']
++++++++++++++++++++++++++++++++
Colunas para formatar os dados


{'churn',
 'dependents',
 'device_protection',
 'internet_service',
 'multiple_lines',
 'online_backup',
 'online_security',
 'paperless_billing',
 'partner',
 'phone_service',
 'streaming_movies',
 'streaming_tv',
 'tech_support'}

In [160]:
dict_data_to_change = dict(zip(key_words, new_data))

In [161]:
for _ in data_to_change:
    print(f'{_}: {df[_].value_counts()}')
    print('==========================')

online_security: online_security
No                     3608
Yes                    2078
No internet service    1581
Name: count, dtype: int64
online_backup: online_backup
No                     3182
Yes                    2504
No internet service    1581
Name: count, dtype: int64
streaming_tv: streaming_tv
No                     2896
Yes                    2790
No internet service    1581
Name: count, dtype: int64
internet_service: internet_service
Fiber optic    3198
DSL            2488
No             1581
Name: count, dtype: int64
partner: partner
No     3749
Yes    3518
Name: count, dtype: int64
device_protection: device_protection
No                     3195
Yes                    2491
No internet service    1581
Name: count, dtype: int64
streaming_movies: streaming_movies
No                     2870
Yes                    2816
No internet service    1581
Name: count, dtype: int64
tech_support: tech_support
No                     3582
Yes                    2104
No internet servic

In [162]:
for col in data_to_change:
    df[col] = df[col].map(dict_data_to_change)

In [163]:
for _ in data_to_change:
    print(f'{_}: {df[_].value_counts()}')
    print('==========================')

online_security: online_security
0               3608
1               2078
not informed    1581
Name: count, dtype: int64
online_backup: online_backup
0               3182
1               2504
not informed    1581
Name: count, dtype: int64
streaming_tv: streaming_tv
0               2896
1               2790
not informed    1581
Name: count, dtype: int64
internet_service: internet_service
fiber optic    3198
dsl            2488
0              1581
Name: count, dtype: int64
partner: partner
0    3749
1    3518
Name: count, dtype: int64
device_protection: device_protection
0               3195
1               2491
not informed    1581
Name: count, dtype: int64
streaming_movies: streaming_movies
0               2870
1               2816
not informed    1581
Name: count, dtype: int64
tech_support: tech_support
0               3582
1               2104
not informed    1581
Name: count, dtype: int64
multiple_lines: multiple_lines
0               3495
1               3065
not informed     707


In [164]:
for col in df.columns.tolist():
    if df[col].nunique() < 50:
        print(f'{col} => {df[col].unique()}')

churn => [0 1 nan]
gender => ['Female' 'Male']
senior_citizen => [0 1]
partner => [1 0]
dependents => [1 0]
phone_service => [1 0]
multiple_lines => [0 1 'not informed']
internet_service => ['dsl' 'fiber optic' 0]
online_security => [0 1 'not informed']
online_backup => [1 0 'not informed']
device_protection => [0 1 'not informed']
tech_support => [1 0 'not informed']
streaming_tv => [1 0 'not informed']
streaming_movies => [0 1 'not informed']
contract => ['One year' 'Month-to-month' 'Two year']
paperless_billing => [1 0]
payment_method => ['Mailed check' 'Electronic check' 'Credit card (automatic)'
 'Bank transfer (automatic)']


In [137]:
df.head()

Unnamed: 0,customer_id,churn,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,charges_monthly,charges_total
0,0002-ORFBO,0,Female,0,1,1,9,1,0,dsl,...,1,0,1,1,0,One year,1,Mailed check,65.6,593.3
1,0003-MKNFE,0,Male,0,0,0,9,1,1,dsl,...,0,0,0,0,1,Month-to-month,0,Mailed check,59.9,542.4
2,0004-TLHLJ,1,Male,0,0,0,4,1,0,fiber optic,...,0,1,0,0,0,Month-to-month,1,Electronic check,73.9,280.85
3,0011-IGKFF,1,Male,1,1,0,13,1,0,fiber optic,...,1,1,0,1,1,Month-to-month,1,Electronic check,98.0,1237.85
4,0013-EXCHZ,1,Female,1,1,0,3,1,0,fiber optic,...,0,0,1,1,0,Month-to-month,1,Mailed check,83.9,267.4


In [140]:
df.dtypes

customer_id           object
churn                 object
gender                object
senior_citizen         int64
partner               object
dependents            object
tenure                 int64
phone_service         object
multiple_lines        object
internet_service      object
online_security       object
online_backup         object
device_protection     object
tech_support          object
streaming_tv          object
streaming_movies      object
contract              object
paperless_billing     object
payment_method        object
charges_monthly      float64
charges_total         object
dtype: object

Mudar tipo colunas que ja tem dados tratados

In [180]:
# df[data_to_change] = df[data_to_change].astype('Int64')
for col in data_to_change:
    if len(df[col].unique()) == 2:
        df[col] = df[col].astype(np.int64)
    else:
        df[col] = df[col].astype('category')
    

In [177]:
df['phone_service'].unique().tolist()

[1, 0]

In [181]:
df.dtypes

customer_id            object
churn                category
gender                 object
senior_citizen          int64
partner                 int64
dependents              int64
tenure                  int64
phone_service           int64
multiple_lines       category
internet_service     category
online_security      category
online_backup        category
device_protection    category
tech_support         category
streaming_tv         category
streaming_movies     category
contract               object
paperless_billing       int64
payment_method         object
charges_monthly       float64
charges_total          object
dtype: object

In [189]:
last_type_objects = df.select_dtypes(include=['object']).columns.tolist()
last_type_objects = last_type_objects[1:4]

In [191]:
df[last_type_objects].head()

Unnamed: 0,gender,contract,payment_method
0,Female,One year,Mailed check
1,Male,Month-to-month,Mailed check
2,Male,Month-to-month,Electronic check
3,Male,Month-to-month,Electronic check
4,Female,Month-to-month,Mailed check


In [194]:
#df[colunas_para_lower].apply(lambda x: x.str.lower())
df[last_type_objects] = df[last_type_objects].apply(lambda x: x.str.lower()).astype('category')

In [195]:
df.dtypes

customer_id            object
churn                category
gender               category
senior_citizen          int64
partner                 int64
dependents              int64
tenure                  int64
phone_service           int64
multiple_lines       category
internet_service     category
online_security      category
online_backup        category
device_protection    category
tech_support         category
streaming_tv         category
streaming_movies     category
contract             category
paperless_billing       int64
payment_method       category
charges_monthly       float64
charges_total          object
dtype: object

In [196]:
df.head()

Unnamed: 0,customer_id,churn,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,charges_monthly,charges_total
0,0002-ORFBO,0,female,0,1,1,9,1,0,dsl,...,1,0,1,1,0,one year,1,mailed check,65.6,593.3
1,0003-MKNFE,0,male,0,0,0,9,1,1,dsl,...,0,0,0,0,1,month-to-month,0,mailed check,59.9,542.4
2,0004-TLHLJ,1,male,0,0,0,4,1,0,fiber optic,...,0,1,0,0,0,month-to-month,1,electronic check,73.9,280.85
3,0011-IGKFF,1,male,1,1,0,13,1,0,fiber optic,...,1,1,0,1,1,month-to-month,1,electronic check,98.0,1237.85
4,0013-EXCHZ,1,female,1,1,0,3,1,0,fiber optic,...,0,0,1,1,0,month-to-month,1,mailed check,83.9,267.4


In [138]:
for col in df.columns.tolist():
    if df[col].nunique() > 50:
        print(f'{col} => {df[col].unique()}')

customer_id => ['0002-ORFBO' '0003-MKNFE' '0004-TLHLJ' ... '9992-UJOEL' '9993-LHIEB'
 '9995-HOTOH']
tenure => [ 9  4 13  3 71 63  7 65 54 72  5 56 34  1 45 50 23 55 26 69 11 37 49 66
 67 20 43 59 12 27  2 25 29 14 35 64 39 40  6 30 70 57 58 16 32 33 10 21
 61 15 44 22 24 19 47 62 46 52  8 60 48 28 41 53 68 51 31 36 17 18 38 42
  0]
charges_monthly => [65.6  59.9  73.9  ... 91.75 68.8  67.85]
charges_total => ['593.3' '542.4' '280.85' ... '742.9' '4627.65' '3707.6']


In [95]:
empty_strings= df.apply(lambda x: x.astype(str).str.strip() == '').sum()
empty_strings[empty_strings > 0]

churn            224
charges_total     11
dtype: int64

ternure justifica string vazia
novos clientes

In [94]:
df[df['charges_total'] == ' '][['tenure','charges_total', 'charges_monthly']]

Unnamed: 0,tenure,charges_total,charges_monthly
975,0,,56.05
1775,0,,20.0
1955,0,,61.9
2075,0,,19.7
2232,0,,20.25
2308,0,,25.35
2930,0,,73.35
3134,0,,25.75
3203,0,,52.55
4169,0,,80.85


In [206]:
idx_empty_strings = df[df['charges_total'] == ' '].index.tolist()
#df.loc[empty_string, 'Charges.Total'] = 0

df.loc[idx_empty_strings, 'charges_total'] = 0
df['charges_total'] = df['charges_total'].astype(np.float64)

In [207]:
df.dtypes

customer_id            object
churn                category
gender               category
senior_citizen          int64
partner                 int64
dependents              int64
tenure                  int64
phone_service           int64
multiple_lines       category
internet_service     category
online_security      category
online_backup        category
device_protection    category
tech_support         category
streaming_tv         category
streaming_movies     category
contract             category
paperless_billing       int64
payment_method       category
charges_monthly       float64
charges_total         float64
dtype: object

In [208]:
df.sample(10)

Unnamed: 0,customer_id,churn,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,...,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,charges_monthly,charges_total
6250,8603-IJWDN,1,male,0,0,0,1,1,1,fiber optic,...,0,0,0,0,1,month-to-month,1,electronic check,86.6,86.6
5449,7465-ZZRVX,0,male,0,0,0,1,1,0,fiber optic,...,0,0,0,0,0,month-to-month,1,electronic check,70.35,70.35
6118,8375-DKEBR,1,female,1,0,0,1,1,0,fiber optic,...,0,0,0,0,0,month-to-month,1,electronic check,69.6,69.6
1719,2446-PLQVO,1,male,0,0,0,1,1,0,fiber optic,...,0,0,0,0,0,month-to-month,1,electronic check,70.3,70.3
4772,6538-POCHL,0,male,0,0,0,33,1,0,fiber optic,...,1,1,0,0,0,month-to-month,1,credit card (automatic),79.0,2576.8
4596,6298-QDFNH,1,male,0,0,0,22,1,1,fiber optic,...,1,0,0,0,0,month-to-month,1,electronic check,79.35,1730.35
2425,3389-YGYAI,1,female,1,0,0,8,1,1,fiber optic,...,1,1,0,1,1,month-to-month,1,electronic check,105.5,829.55
5272,7197-VOJMM,0,male,0,1,0,67,1,0,dsl,...,1,0,1,0,1,two year,1,credit card (automatic),69.2,4671.65
516,0730-KOAVE,0,male,0,0,0,30,1,1,fiber optic,...,1,0,0,0,1,month-to-month,1,credit card (automatic),94.3,2679.7
7140,9823-EALYC,0,male,0,1,1,72,1,1,dsl,...,1,1,1,1,0,two year,1,bank transfer (automatic),80.85,5727.45


transform

In [209]:
df['charge_daily'] = df['charges_monthly'] / 30

In [210]:
df.sample(5)

Unnamed: 0,customer_id,churn,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,charges_monthly,charges_total,charge_daily
626,0895-DQHEW,1,male,0,1,0,54,1,1,fiber optic,...,1,0,1,1,month-to-month,1,electronic check,104.3,5278.15,3.476667
5031,6877-TJMBR,1,male,0,1,0,1,1,0,fiber optic,...,0,1,1,0,month-to-month,1,electronic check,84.8,84.8,2.826667
390,0562-KBDVM,0,female,0,0,0,70,0,not informed,dsl,...,1,1,0,0,two year,1,bank transfer (automatic),44.6,3058.15,1.486667
765,1090-ESELR,0,male,0,1,1,72,1,0,fiber optic,...,0,1,1,1,two year,0,bank transfer (automatic),105.5,7611.55,3.516667
4008,5481-NTDOH,0,female,1,1,0,67,1,1,fiber optic,...,1,0,1,1,one year,1,credit card (automatic),107.05,7142.5,3.568333


dtype('float64')

salvar - Load

In [212]:
df.to_csv('clened_telecomX_db/teleconX_clean.csv', index=False)

Brazil

In [213]:
ptbr_columns = ['id_cliente', 'contrato_cancelado', 'genero', 'idoso', 'tem_parceiro','tem_dependente', 'meses_contrato', 'servicos_telefonicos', 'multiplas_linhas', 'servicos_internet', 'seguranca_online','backup_online','seguro_protecao','suporte_tecnico','tem_tv_cabo', 'tem_stream_filmes', 'tipo_contrato','fatura_online','forma_pagamento','valor_mensal','valor_total','valor_diario']

In [217]:
dict_ptbr = dict(zip(df.columns.tolist(), ptbr_columns))
dict_ptbr

{'customer_id': 'id_cliente',
 'churn': 'contrato_cancelado',
 'gender': 'genero',
 'senior_citizen': 'idoso',
 'partner': 'tem_parceiro',
 'dependents': 'tem_dependente',
 'tenure': 'meses_contrato',
 'phone_service': 'servicos_telefonicos',
 'multiple_lines': 'multiplas_linhas',
 'internet_service': 'servicos_internet',
 'online_security': 'seguranca_online',
 'online_backup': 'backup_online',
 'device_protection': 'seguro_protecao',
 'tech_support': 'suporte_tecnico',
 'streaming_tv': 'tem_tv_cabo',
 'streaming_movies': 'tem_stream_filmes',
 'contract': 'tipo_contrato',
 'paperless_billing': 'fatura_online',
 'payment_method': 'forma_pagamento',
 'charges_monthly': 'valor_mensal',
 'charges_total': 'valor_total',
 'charge_daily': 'valor_diario'}

In [218]:
df_br = df.copy()

In [221]:
df_br.rename(columns=dict_ptbr, inplace=True)
df_br

Unnamed: 0,id_cliente,contrato_cancelado,genero,idoso,tem_parceiro,tem_dependente,meses_contrato,servicos_telefonicos,multiplas_linhas,servicos_internet,...,seguro_protecao,suporte_tecnico,tem_tv_cabo,tem_stream_filmes,tipo_contrato,fatura_online,forma_pagamento,valor_mensal,valor_total,valor_diario
0,0002-ORFBO,0,female,0,1,1,9,1,0,dsl,...,0,1,1,0,one year,1,mailed check,65.60,593.30,2.186667
1,0003-MKNFE,0,male,0,0,0,9,1,1,dsl,...,0,0,0,1,month-to-month,0,mailed check,59.90,542.40,1.996667
2,0004-TLHLJ,1,male,0,0,0,4,1,0,fiber optic,...,1,0,0,0,month-to-month,1,electronic check,73.90,280.85,2.463333
3,0011-IGKFF,1,male,1,1,0,13,1,0,fiber optic,...,1,0,1,1,month-to-month,1,electronic check,98.00,1237.85,3.266667
4,0013-EXCHZ,1,female,1,1,0,3,1,0,fiber optic,...,0,1,1,0,month-to-month,1,mailed check,83.90,267.40,2.796667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7262,9987-LUTYD,0,female,0,0,0,13,1,0,dsl,...,0,1,0,0,one year,0,mailed check,55.15,742.90,1.838333
7263,9992-RRAMN,1,male,0,1,0,22,1,1,fiber optic,...,0,0,0,1,month-to-month,1,electronic check,85.10,1873.70,2.836667
7264,9992-UJOEL,0,male,0,0,0,2,1,0,dsl,...,0,0,0,0,month-to-month,1,mailed check,50.30,92.75,1.676667
7265,9993-LHIEB,0,male,0,1,1,67,1,0,dsl,...,1,1,0,1,two year,0,mailed check,67.85,4627.65,2.261667


traduzindo dados

In [222]:
df.select_dtypes(include=['category']).columns.tolist()

['churn',
 'gender',
 'multiple_lines',
 'internet_service',
 'online_security',
 'online_backup',
 'device_protection',
 'tech_support',
 'streaming_tv',
 'streaming_movies',
 'contract',
 'payment_method']