<a href="https://colab.research.google.com/github/MAGNNATTA/telecom_x_etl_eda.ipynb/blob/main/telecom_x_etl_eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from urllib.parse import urlparse
import os

def extract_data(url, timeout=30):
    """Download a JSON document from ``url`` and load it into a DataFrame.

    Args:
        url: Address of the JSON resource to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block forever on a stalled
            connection.

    Returns:
        A ``pandas.DataFrame`` built from the decoded JSON payload, or
        ``None`` when the request fails, the body is not valid JSON, or the
        payload cannot be turned into a DataFrame. The failure reason is
        printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return pd.DataFrame(response.json())
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors raised by older versions of
        # requests as well as payloads pandas cannot arrange into a table.
        print(f"Erro ao extrair dados: {e}")
        return None

def transform_data(df):
    """Clean the raw DataFrame and encode the ``Churn`` column as 0/1.

    Prints the per-column null counts, then maps ``'Yes'`` to 1 and ``'No'``
    to 0 in ``Churn``. Any other value becomes NaN and the number of such
    values is reported. A ``Churn`` column that already holds only 0/1 (and
    NaN) is left untouched, so running the transformation twice does not
    wipe the column.

    Args:
        df: DataFrame to transform. It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    print("Valores nulos antes da limpeza:\n", df.isnull().sum())

    # NOTE(review): a previous revision filled 'tenure', 'MonthlyCharges' and
    # 'TotalCharges' here; that step was removed because those names were not
    # found among the DataFrame's columns.

    if 'Churn' not in df.columns:
        print("Column 'Churn' not found in the DataFrame.")
        return df

    churn = df['Churn']

    # Re-running the mapping on numeric 0/1 data would turn every value into
    # NaN, so detect the already-encoded case and skip it.
    already_encoded = (
        pd.api.types.is_numeric_dtype(churn)
        and churn.dropna().isin([0, 1]).all()
    )
    if already_encoded:
        return df

    encoded = churn.map({'Yes': 1, 'No': 0})

    # Values that were present but are neither 'Yes' nor 'No' end up as NaN;
    # surface how many so the loss is not silent.
    unmapped = int((encoded.isna() & churn.notna()).sum())
    if unmapped:
        print(f"Warning: {unmapped} 'Churn' value(s) were neither 'Yes' nor 'No' and were set to NaN.")

    df['Churn'] = encoded
    return df

def exploratory_analysis(df, output_dir='plots'):
    """Render the exploratory plots for ``df`` into ``output_dir``.

    Currently produces a single chart, ``churn_distribution.png``, a count
    plot of the ``Churn`` column. When that column is absent a message is
    printed and no file is written.

    Args:
        df: DataFrame to plot.
        output_dir: Directory that receives the PNG files. It is created if
            it does not exist.
    """
    # exist_ok avoids the check-then-create race of testing for the
    # directory first and creating it afterwards.
    os.makedirs(output_dir, exist_ok=True)

    sns.set_style("whitegrid")

    if 'Churn' not in df.columns:
        print("Cannot generate Churn distribution plot as 'Churn' column is missing.")
        return

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.countplot(x='Churn', data=df, ax=ax)
        ax.set_title('Distribuição de Churn')
        fig.savefig(os.path.join(output_dir, 'churn_distribution.png'))
    finally:
        # Release the figure even when plotting or saving fails, so repeated
        # runs do not accumulate open figures.
        plt.close(fig)

    # NOTE(review): earlier revisions also plotted tenure groups, a
    # correlation matrix and MonthlyCharges by Churn; those were removed
    # because 'tenure' and 'MonthlyCharges' were not found among the columns.


def generate_report(df, output_dir='plots'):
    """Build the Markdown churn report and write it to ``report.md``.

    The churn rate quoted in the report is the mean of the ``Churn`` column
    expressed as a percentage. ``"N/A"`` is used when the column is missing
    or holds no valid values.

    Args:
        df: DataFrame whose ``Churn`` column (if any) is numeric 0/1.
        output_dir: Currently unused; kept so existing callers keep working.

    Returns:
        The report text that was written to ``report.md`` in the current
        working directory.
    """
    churn_percentage = "N/A"
    if 'Churn' in df.columns:
        churn_rate = df['Churn'].mean()
        # An empty or all-NaN column has a NaN mean; keep "N/A" rather than
        # printing "nan%" in the report.
        if pd.notna(churn_rate):
            churn_percentage = f"{churn_rate * 100:.2f}%"

    report = f"""
# Relatório de Análise de Evasão de Clientes - Telecom X

## Introdução
O objetivo desta análise é realizar o processo de ETL (Extração, Transformação e Carga) e uma análise exploratória de dados (EDA) para identificar padrões relacionados à evasão de clientes (churn) na Telecom X. A empresa enfrenta um alto índice de churn, e este projeto visa fornecer insights que auxiliem na redução dessa taxa.

## Extração
Os dados foram extraídos de uma API hospedada no GitHub, disponível em: [TelecomX_Data.json](https://raw.githubusercontent.com/ingridcristh/challenge2-data-science/refs/heads/main/TelecomX_Data.json). A extração foi realizada utilizando a biblioteca `requests` do Python, e os dados foram carregados em um DataFrame do Pandas.

## Transformação
O processo de transformação incluiu:
- Conversão da coluna `Churn` para formato binário (0 para 'No', 1 para 'Yes') if the column exists.

## Análise Exploratória
The original exploratory analysis focused on columns that are not present in the dataset.
With the available columns ('customerID', 'Churn', 'customer', 'phone', 'internet', and 'account'),
a detailed exploratory analysis as initially planned is not possible without more information
or different data.

Based on the available 'Churn' column, we can see the distribution of churned customers.
1. **Distribuição de Churn**: Aproximadamente {churn_percentage} dos clientes churnaram.

Further analysis would require additional data or a clearer understanding of how the
'customer', 'phone', 'internet', and 'account' columns can be used to infer relevant
features for churn prediction.

## Conclusão
Based on the limited data available in the provided JSON file, we can only observe the overall churn rate. The initial insights regarding tenure and monthly charges could not be validated with this dataset. To provide meaningful insights and recommendations, a dataset with more relevant customer behavior and service usage information is needed.

## Sugestões
1. Obtain a more comprehensive dataset that includes features like contract tenure, monthly charges, total charges, service types, and customer demographics.
2. With a richer dataset, revisit the exploratory analysis to identify key drivers of churn.
3. Develop and evaluate predictive models for churn using the comprehensive dataset.
"""
    # The report contains accented characters, so fix the encoding instead of
    # relying on the platform default, which may be unable to encode them.
    with open('report.md', 'w', encoding='utf-8') as f:
        f.write(report)
    return report

def main():
    """Run the whole pipeline: extract, transform, plot, then write the report."""
    source_url = "https://raw.githubusercontent.com/ingridcristh/challenge2-data-science/refs/heads/main/TelecomX_Data.json"

    # Extraction: stop early when nothing could be downloaded.
    raw_frame = extract_data(source_url)
    if raw_frame is None:
        print("Falha na extração dos dados.")
        return

    # Transformation, exploratory plots and report, in that order.
    clean_frame = transform_data(raw_frame)
    exploratory_analysis(clean_frame)
    generate_report(clean_frame)

    print("Relatório gerado com sucesso: report.md")

# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

In [None]:
import requests

# Scratch cell: fetch the raw JSON and list the keys of its first record to
# see which top-level columns the dataset provides.
url = "https://raw.githubusercontent.com/ingridcristh/challenge2-data-science/refs/heads/main/TelecomX_Data.json"

try:
    # A timeout keeps the cell from hanging on a stalled connection, and
    # raise_for_status turns HTTP error pages into a handled failure.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()
except (requests.exceptions.RequestException, ValueError) as exc:
    print(f"Request failed: {exc}")
    data = None

# Print the keys of the first dictionary in the list to see the column names
if data and isinstance(data, list) and isinstance(data[0], dict):
    print("Available keys in the data:")
    print(data[0].keys())
else:
    print("Could not retrieve data or data is not in the expected format.")