#📌 Extração

In [5]:
import requests
import pandas as pd

In [38]:
# Download the TelecomX dataset (JSON) from GitHub and load it into a DataFrame.
url = "https://raw.githubusercontent.com/ingridcristh/challenge2-data-science/refs/heads/main/TelecomX_Data.json"

# requests applies no timeout by default, so without one this cell could
# hang indefinitely if the server stops responding.
response = requests.get(url, timeout=30)
# Raise on HTTP 4xx/5xx instead of trying to parse an error response as JSON.
response.raise_for_status()

data_json = response.json()

# One row per top-level JSON record; nested objects stay as dict-valued
# columns at this point (see the printed preview).
df = pd.DataFrame(data_json)

print(df.head())

   customerID Churn                                           customer  \
0  0002-ORFBO    No  {'gender': 'Female', 'SeniorCitizen': 0, 'Part...   
1  0003-MKNFE    No  {'gender': 'Male', 'SeniorCitizen': 0, 'Partne...   
2  0004-TLHLJ   Yes  {'gender': 'Male', 'SeniorCitizen': 0, 'Partne...   
3  0011-IGKFF   Yes  {'gender': 'Male', 'SeniorCitizen': 1, 'Partne...   
4  0013-EXCHZ   Yes  {'gender': 'Female', 'SeniorCitizen': 1, 'Part...   

                                             phone  \
0   {'PhoneService': 'Yes', 'MultipleLines': 'No'}   
1  {'PhoneService': 'Yes', 'MultipleLines': 'Yes'}   
2   {'PhoneService': 'Yes', 'MultipleLines': 'No'}   
3   {'PhoneService': 'Yes', 'MultipleLines': 'No'}   
4   {'PhoneService': 'Yes', 'MultipleLines': 'No'}   

                                            internet  \
0  {'InternetService': 'DSL', 'OnlineSecurity': '...   
1  {'InternetService': 'DSL', 'OnlineSecurity': '...   
2  {'InternetService': 'Fiber optic', 'OnlineSecu...   
3  {'I

#🔧 Transformação

In [39]:
# Number of missing values per column.
# The count is shown with display() because only the last expression of a
# cell is rendered automatically; a bare expression on an earlier line is
# evaluated and then discarded.
missing_count = df.isnull().sum()
display(missing_count)

# Same information as a percentage of the total number of rows.
(missing_count / len(df)) * 100

Unnamed: 0,0
customerID,0.0
Churn,0.0
customer,0.0
phone,0.0
internet,0.0
account,0.0


In [28]:
# Step 1: find the columns that hold dict values in at least one row.
cols_dict = [
    col
    for col in df.columns
    if df[col].map(lambda cell: isinstance(cell, dict)).any()
]

print(f"Colunas com dict: {cols_dict}")

# Step 2: expand each dict column into flat columns named "<column>.<key>"
# and re-attach them (by row index) to the rest of the frame.
for col in cols_dict:
    expanded = pd.json_normalize(df[col]).add_prefix(f'{col}.')
    remaining = df.drop(columns=[col])
    df = expanded.merge(remaining, left_index=True, right_index=True)

# Step 3: report and remove fully duplicated rows of the flattened frame.
print("Duplicados:", df.duplicated().sum())
df = df.drop_duplicates()

Colunas com dict: []
Duplicados: 0


In [29]:
# Frequency table for each of the two categorical columns below.
for categorical_col in ('customer.gender', 'account.PaymentMethod'):
    display(df[categorical_col].value_counts())

Unnamed: 0_level_0,count
customer.gender,Unnamed: 1_level_1
Male,3675
Female,3592


Unnamed: 0_level_0,count
account.PaymentMethod,Unnamed: 1_level_1
Electronic check,2445
Mailed check,1665
Bank transfer (automatic),1589
Credit card (automatic),1568


In [30]:
# Summary statistics (count, mean, std, quartiles, min/max) for customer.tenure.
tenure_summary = df['customer.tenure'].describe()
tenure_summary

Unnamed: 0,customer.tenure
count,7267.0
mean,32.346498
std,24.571773
min,0.0
25%,9.0
50%,29.0
75%,55.0
max,72.0


In [34]:
# Daily charges column: the monthly charge divided by 30.
monthly_charges = df['account.Charges.Monthly']
df['Contas_Diarias'] = monthly_charges.div(30)
print(df['Contas_Diarias'].describe())

count    7267.000000
mean        2.157337
std         1.004319
min         0.608333
25%         1.180833
50%         2.343333
75%         2.995833
max         3.958333
Name: Contas_Diarias, dtype: float64


In [35]:
# Identify the columns that are candidates for Yes/No -> binary conversion.
# NOTE(review): this cell only collects the column names; it does not modify
# df. Confirm whether the actual conversion is applied elsewhere.
def _is_yes_no_column(values):
    """Return True when `values` has exactly two distinct entries that read 'yes' and 'no' in any letter case."""
    distinct = values.unique()
    if len(distinct) != 2:
        return False
    return {str(item).lower() for item in distinct} == {'yes', 'no'}


binary_cols = [col for col in df.columns if _is_yes_no_column(df[col])]

print("Columns identified for binary conversion:", binary_cols)

Columns identified for binary conversion: ['account.PaperlessBilling', 'phone.PhoneService', 'customer.Partner', 'customer.Dependents']


#📊 Carga e análise

In [37]:
# Descriptive statistics of the data, as produced by DataFrame.describe().
descriptive_stats = df.describe()
display(descriptive_stats)

Unnamed: 0,account.Charges.Monthly,customer.SeniorCitizen,customer.tenure,Contas_Diarias
count,7267.0,7267.0,7267.0,7267.0
mean,64.720098,0.162653,32.346498,2.157337
std,30.129572,0.369074,24.571773,1.004319
min,18.25,0.0,0.0,0.608333
25%,35.425,0.0,9.0,1.180833
50%,70.3,0.0,29.0,2.343333
75%,89.875,0.0,55.0,2.995833
max,118.75,1.0,72.0,3.958333


#📄 Relatório Final