## Dependencias

In [None]:
# Basic data manipulation
import numpy as np
import pandas as pd
from datetime import datetime
from glob import glob

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis
from scipy import stats

# Custom tools
from CreditScoringToolkit.DiscreteNormalizer import DiscreteNormalizer
from CreditScoringToolkit.frequency_table import frequency_table


## Listar archivos

In [None]:
# Glob pattern matching every CSV inside the ibm_card_txn folder.
ruta = 'ibm_card_txn/*.csv'
# Paths of the files matching the pattern (empty list when nothing matches).
archivos = glob(pathname=ruta)

## Limpieza de datos 

In [None]:
def validar_datos(archivo:str)->pd.DataFrame:
    """
    Load one transactions CSV and return it cleaned, with standardized names.

    Only the columns listed in ``cols`` are read, all of them as text. Then:

    * ``User``, ``Card`` and ``MCC`` are converted to integers; values that
      cannot be parsed are replaced by the sentinel 99999.
    * ``Year``, ``Month``, ``Day`` and ``Time`` are combined into a single
      datetime column. A row whose month or day is missing or not numeric,
      or whose year or time is missing, gets ``NaT`` instead of aborting
      the whole load.
    * ``Amount`` has its ``$`` sign removed and is parsed as a number
      (``NaN`` when missing or unparseable).
    * ``Is Fraud?`` becomes a boolean that is True only for the value 'Yes'.

    Parameters
    ----------
    archivo : str
        Path of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        One row per input row with the columns id_user, id_card, c_amt,
        d_use_chip, d_merchant_state, id_mcc, d_errors, b_fraud and
        dt_timestamp.

    Raises
    ------
    ValueError
        Raised by ``pd.to_datetime`` when a non-missing timestamp text does
        not match the format '%Y-%m-%d %H:%M'.
    """
    cols = ['User', 'Card', 'Year', 'Month', 'Day', 'Time', 'Amount',
            'Use Chip','Merchant State', 'MCC', 'Errors?', 'Is Fraud?']
    origin = ['User', 'Card', 'Amount',
              'Use Chip','Merchant State', 'MCC', 'Errors?', 'Is Fraud?','ts']
    names = ['id_user','id_card','c_amt','d_use_chip','d_merchant_state',
             'id_mcc','d_errors','b_fraud','dt_timestamp']

    df = pd.read_csv(archivo, dtype=str, usecols=cols)

    for c in ['User','Card','MCC']:
        df[c] = pd.to_numeric(df[c], errors='coerce').fillna(99999).astype(int)

    for c in ['Month','Day']:
        # A coerced NaN turns the column into floats, and the 'd' format code
        # rejects floats; convert through int and skip missing values so one
        # bad cell does not break every row.
        df[c] = pd.to_numeric(df[c], errors='coerce').map(
            lambda x: f'{int(x):02d}', na_action='ignore')

    # Column-wise concatenation; a missing piece propagates as NaN, which
    # to_datetime turns into NaT.
    ts_text = df['Year'] + '-' + df['Month'] + '-' + df['Day'] + ' ' + df['Time']
    df['ts'] = pd.to_datetime(ts_text, format='%Y-%m-%d %H:%M')

    # The .str accessor leaves missing amounts as NaN instead of raising.
    sin_simbolo = df['Amount'].str.replace('$', '', regex=False)
    df['Amount'] = pd.to_numeric(sin_simbolo, errors='coerce')
    df['Is Fraud?'] = df['Is Fraud?']=='Yes'

    df = df.drop(columns=['Year','Month','Day','Time'])
    df = df.rename(columns=dict(zip(origin,names)))

    return df

## Limpieza total

In [None]:
# Clean every file on its own, then stack the results into a single frame
# with a fresh consecutive index.
limpios = [validar_datos(a) for a in archivos]
df = pd.concat(limpios, ignore_index=True)
df.shape

## Control estadístico de la calidad

In [None]:
# Split by transaction date: rows on or after 2020-02-01 form the "new"
# sample, earlier rows the "old" one. Each part is an independent copy
# with its index restarted at 0.
corte = datetime(2020, 2, 1)
fechas = df['dt_timestamp']
df_new = df.loc[fechas >= corte].copy().reset_index(drop=True)
df_old = df.loc[fechas < corte].copy().reset_index(drop=True)

In [None]:
# (rows, columns) of the new sample followed by the old sample.
(df_new.shape, df_old.shape)

### Continuas

In [None]:
# Continuous variables are the columns whose name starts with 'c_'.
varc = [col for col in df_old.columns if col.startswith('c_')]
varc

In [None]:
# For each continuous variable, overlay the cumulative KDE of a 1000-row
# random sample of the old data and of the whole new data.
for v in varc:
    # plt.subplots registers the figure with pyplot and gives an explicit
    # axes, so both curves are guaranteed to land on the same plot.
    fig, ax = plt.subplots()
    sns.kdeplot(df_old[v].sample(1000), fill=True, cumulative=True,
                label='old', ax=ax)
    sns.kdeplot(df_new[v], fill=True, cumulative=True, label='new', ax=ax)
    ax.set_title(v)
    ax.legend()
    plt.show()

In [None]:
# Kruskal-Wallis test comparing a 10000-row random sample of the old
# amounts against all of the new amounts.
muestra_old = df_old['c_amt'].sample(10000)
stats.kruskal(muestra_old, df_new['c_amt'])

In [None]:
# Summary statistics of a 10000-row random sample of the old amounts,
# followed by those of all the new amounts.
(
    df_old['c_amt'].sample(10000).describe(),
    df_new['c_amt'].describe(),
)

### Discretas

In [None]:
# Discrete variables are the columns whose name starts with 'd_' or 'b_'.
vard = [col for col in df_old.columns if col.startswith(('d_', 'b_'))]
len(vard)


In [None]:
# Fit the category normalizer on the old data only.
# NOTE(review): presumably categories with a share below 0.05 are grouped
# under 'Otros' -- confirm against the CreditScoringToolkit documentation.
dn = DiscreteNormalizer(default_category='Otros', normalization_threshold=0.05)
dn.fit(df_old[vard])

In [None]:
# Apply the fitted normalizer to a 10000-row random sample of the old
# discrete variables.
muestra_discretas_old = df_old[vard].sample(10000)
old_d = dn.transform(muestra_discretas_old)

In [None]:
# Frequency report of the normalized old sample for every discrete variable.
# NOTE(review): the output format depends on CreditScoringToolkit's
# frequency_table helper, which is not visible here -- confirm.
frequency_table(old_d,vard)

In [None]:
# Apply the normalizer fitted on the old data to all of the new discrete
# variables, so both samples share the same category mapping.
discretas_new = df_new[vard]
new_d = dn.transform(discretas_new)

In [None]:
# For the first two discrete variables, compare the category distribution
# of the old sample against the new data with a chi-square test and a
# stacked bar chart of the category shares.
for v in vard[:2]:
    # The chi-square test needs absolute counts: with proportions the
    # statistic no longer reflects the sample sizes. Distinct column labels
    # keep the two samples addressable, and a category absent from one
    # sample counts as 0 instead of NaN.
    counts = pd.concat(
        [old_d[v].value_counts(), new_d[v].value_counts()],
        axis=1, keys=['old', 'new'],
    ).fillna(0)
    # Shares per sample, used only for display and plotting.
    x = counts / counts.sum()
    display(x)
    stat = stats.chi2_contingency(counts)
    print(v,f'Chi-square statistic: {stat[0]:.2f}',f'p-value: {stat[1]:.4f}')
    # DataFrame.plot returns the axes it drew on; title it directly.
    ax = x.T.plot(kind='bar', stacked=True, figsize=(10,6))
    ax.set_title(v)
    plt.show()