# In [5]:
import pandas as pd
import json

# Load the dataset.
df = pd.read_csv('./data/Variant V.csv')

# Fail fast if the 'fraud_bool' column is missing. An explicit raise is used
# instead of `assert` because assertions are stripped when Python runs with
# -O, which would let the script continue without the check.
if 'fraud_bool' not in df.columns:
    raise ValueError("Expected a 'fraud_bool' column in './data/Variant V.csv'")

# === EDA analysis ===

# 1. Fraud distribution: number of rows per 'fraud_bool' value.
fraud_counts = df['fraud_bool'].value_counts().to_dict()
# A value that never occurs in the column is reported with a count of 0.
fraud_data = [
    {'label': label, 'value': fraud_counts.get(flag, 0)}
    for flag, label in ((0, 'Legit'), (1, 'Fraud'))
]

# 2. Mean income: fraud vs. non-fraud.
income_means = df.groupby('fraud_bool')['income'].mean().to_dict()
# A missing group falls back to 0; each mean is rounded to two decimals.
legit_income = round(income_means.get(0, 0), 2)
fraud_income = round(income_means.get(1, 0), 2)
income_data = [
    {'label': 'Legit', 'value': legit_income},
    {'label': 'Fraud', 'value': fraud_income},
]

# 3. Fraud rate by employment status (five most frequent statuses).
employment = df[df['employment_status'].notna()]
top_employment = employment['employment_status'].value_counts().nlargest(5).index.tolist()
employment_data = (
    employment[employment['employment_status'].isin(top_employment)]
    .groupby(['employment_status'])['fraud_bool']
    .mean()
    .reset_index()
)
# Walk the two columns directly instead of using DataFrame.iterrows():
# iterrows() builds one Series per row and coerces the row to a common dtype,
# whereas tolist() yields each column's own scalar values.
# Each value is the group's mean of 'fraud_bool' scaled by 100 and rounded to
# two decimals.
employment_chart = [
    {'label': status, 'value': round(rate * 100, 2)}
    for status, rate in zip(
        employment_data['employment_status'].tolist(),
        employment_data['fraud_bool'].tolist(),
    )
]

# 4. Mean 6-hour application velocity: fraud vs. non-fraud.
velocity_data = df.groupby('fraud_bool')['velocity_6h'].mean().to_dict()
# A missing group falls back to 0; each mean is rounded to two decimals.
velocity_chart = [
    {'label': label, 'value': round(velocity_data.get(flag, 0), 2)}
    for label, flag in (('Legit', 0), ('Fraud', 1))
]

# === Export to JSON ===
eda_output = {
    'fraud_distribution': fraud_data,
    'income_by_fraud': income_data,
    'fraud_rate_by_employment': employment_chart,
    'avg_velocity_6h': velocity_chart,
}

# Serialize before opening the target: open(..., 'w') truncates the file, so
# encoding first means a serialization error cannot leave a partial eda.json.
# allow_nan=False makes the encoder raise ValueError rather than emit the
# non-standard NaN/Infinity tokens, which strict JSON parsers reject.
payload = json.dumps(eda_output, indent=2, allow_nan=False)

# Save as JSON, with the encoding pinned so the output does not depend on the
# platform default.
with open('../web/public/eda.json', 'w', encoding='utf-8') as f:
    f.write(payload)
