## Dependencias

In [1]:
# Basic data manipulation
import numpy as np
import pandas as pd
from datetime import datetime
from glob import glob
from uuid import uuid4
from sqlalchemy.engine import create_engine
from sqlalchemy.types import VARCHAR,FLOAT,INTEGER,UUID,DATETIME,CHAR,BOOLEAN
import json 
from itertools import chain

## Listar archivos

In [2]:
ruta = '../ibm_card_txn/*.csv'
archivos = np.random.choice(glob(ruta), 4)

## Limpieza de datos 

In [3]:
def validar_datos(archivo:str)->pd.DataFrame:
    """
    Esta función recibe la ruta de un archivo CSV, lee columnas definidas,
    convierte ciertos campos a valores numéricos y fechas en formato datetime,
    estandariza los nombres de columnas y devuelve un DataFrame resultante
    con la información validada.
    """
    cols = ['User', 'Card', 'Year', 'Month', 'Day', 'Time', 'Amount', 
            'Use Chip','Merchant State', 'MCC', 'Errors?', 'Is Fraud?']
    origin = ['User', 'Card', 'Amount', 
              'Use Chip','Merchant State', 'MCC', 'Errors?', 'Is Fraud?','ts']
    names = ['id_user','id_card','c_amt','d_use_chip','d_merchant_state',
             'id_mcc','d_errors','b_fraud','dt_timestamp']

    df = pd.read_csv(archivo, dtype=str, usecols=cols)

    for c in ['User','Card','MCC']:
        df[c] = pd.to_numeric(df[c], errors='coerce').fillna(99999).astype(int)
    
    for c in ['Month','Day']:
        df[c] = pd.to_numeric(df[c], errors='coerce').map(lambda x:f'{x:02d}')
    
    df['ts'] = df[['Year','Month','Day']].apply("-".join,axis=1)
    df['ts'] = df[['ts','Time']].apply(" ".join,axis=1)
    df['ts'] = pd.to_datetime(df['ts'],format='%Y-%m-%d %H:%M')

    df['Amount'] = pd.to_numeric(df['Amount'].map(lambda x:x.replace('$','')), errors='coerce')
    df['Is Fraud?'] = df['Is Fraud?']=='Yes'

    df.drop(['Year','Month','Day','Time'],axis=1,inplace=True)
    df.rename(columns=dict(zip(origin,names)), inplace=True)

    return df

OLTP vs OLAP 
OLTP : On-Line Transactional Processing --> Bases de datos relacionales (la que da soporte a las aplicaciones)--> CRUD 
OLAP : On-Line Analytical Processing --> Cubos de información (dimensiones/hechos, sirve para análisis) --> SELECT 

## Limpieza total

In [4]:
df = pd.concat(map(validar_datos,archivos),ignore_index=True)
df.insert(0,'uuid',df.index.map(lambda x:uuid4()))
df.shape

(400000, 10)

In [5]:
df.to_pickle('df_ibm.pkl')

In [6]:
df = pd.read_pickle('df_ibm.pkl')

## Conexión a la base de datos

In [7]:
creds = json.load(open('creds.json'))

In [8]:
cnx = create_engine(f"mysql+pymysql://{creds['user']}:{creds['password']}@{creds['host']}/{creds['database']}").connect()
cnx.closed

False

## Modelo Relacional (OLTP)

### Entidad Usuario 

In [9]:
tbl_user = df[['id_user']].drop_duplicates().reset_index(drop=True)
tbl_user['uuid'] = tbl_user['id_user'].map(lambda x:uuid4())
tbl_user.to_sql('tbl_user',
                cnx,
                if_exists='append',
                index=False,dtype={'id_user':INTEGER,'uuid':CHAR(36)})

37

### Entidad tarjeta 

In [10]:
tbl_card = df[['id_card','id_user']].drop_duplicates().reset_index(drop=True)
tbl_card['uuid'] = tbl_card['id_card'].map(lambda x:uuid4())
tbl_card = tbl_card.merge(tbl_user.rename(columns={'uuid':'uuid_user'}), on='id_user', how='inner').drop(['id_user','id_card'],axis=1)
tbl_card.to_sql('tbl_card',
                cnx,
                if_exists='append',
                index=False,dtype={'uuid_user':CHAR(36),'uuid':CHAR(36)})

121

### Entidad tipo txn

In [11]:
tbl_txn_type = df[['d_use_chip']].drop_duplicates().reset_index(drop=True)
tbl_txn_type['uuid'] = tbl_txn_type['d_use_chip'].map(lambda x:uuid4())
tbl_txn_type = tbl_txn_type.rename(columns={'d_use_chip':'txn_type'})
tbl_txn_type.to_sql('tbl_txn_type',
                    cnx,
                    if_exists='append',
                    index=False,dtype={'txn_type':VARCHAR(18),'uuid':CHAR(36)})

3

In [12]:
df.head(2)

Unnamed: 0,uuid,id_user,id_card,c_amt,d_use_chip,d_merchant_state,id_mcc,d_errors,b_fraud,dt_timestamp
0,0617645c-7821-47ab-879f-fc9d37997f89,1584,3,6.93,Chip Transaction,NY,5921,,False,2017-03-03 10:30:00
1,6d0e047d-d237-48bf-bedd-c187cce08088,1584,3,6.38,Chip Transaction,NY,5921,,False,2017-03-04 10:30:00


### Entidad Estado

In [13]:
tbl_state = df[['d_merchant_state']].fillna('OTHER').astype(str).drop_duplicates().reset_index(drop=True)
tbl_state['uuid'] = tbl_state['d_merchant_state'].map(lambda x:uuid4())
tbl_state = tbl_state.rename(columns={'d_merchant_state':'merchant_state'})
tbl_state.to_sql('tbl_state',
                 cnx,
                 if_exists='append',
                 index=False,dtype={'merchant_state':VARCHAR(24),'uuid':CHAR(36)})

95

### Entidad MCC

In [14]:
mcc = pd.read_excel('lista_mcc.xlsx',skiprows=1)
mcc.columns = ['id_mcc','mcc_description']
mcc['id_mcc'] = pd.to_numeric(mcc['id_mcc'], errors='coerce').fillna(9999).astype(int)
mcc['id_mcc'] = mcc['id_mcc'].map(int).map(lambda x:f'{x:04d}')
tbl_mcc = df[['id_mcc']].drop_duplicates().reset_index(drop=True)
tbl_mcc['id_mcc'] = tbl_mcc['id_mcc'].map(int).map(lambda x:f'{x:04d}')
tbl_mcc = tbl_mcc.merge(mcc, on='id_mcc', how='inner')
tbl_mcc['uuid'] = tbl_mcc['id_mcc'].map(lambda x:uuid4())


In [15]:
tbl_mcc.to_sql('tbl_mcc',
               cnx,
               if_exists='append',
               index=False,dtype={'id_mcc':CHAR(4),'mcc_description':VARCHAR(200),'uuid':CHAR(36)})

77

In [16]:
mcc['id_mcc'].map(len).value_counts()

id_mcc
4    335
Name: count, dtype: int64

### Entidad Errores 

In [17]:
tbl_error = pd.Series(sorted(set(chain(*[x.split(',') for x in set(df['d_errors'].dropna())])))).to_frame('error_desc')
tbl_error['uuid'] = tbl_error['error_desc'].map(lambda x:uuid4())
tbl_error.to_sql('tbl_error',
                 cnx,
                 if_exists='append',
                 index=False,dtype={'error_desc':VARCHAR(20),'uuid':CHAR(36)})

7

### Entidad Puente Error-Txn

In [25]:
tbl_error_txn = df[['uuid','d_errors']].dropna().reset_index(drop=True)
tbl_error_txn['d_errors'] = tbl_error_txn['d_errors'].map(lambda x:x.split(',')) 
tbl_error_txn[['error_1','error_2']] = tbl_error_txn['d_errors'].apply(pd.Series)
tbl_error_txn.drop('d_errors',axis=1,inplace=True)
tbl_error_txn = tbl_error_txn.melt(id_vars='uuid',value_vars=['error_1','error_2'],value_name='error_desc')
tbl_error_txn.drop('variable',axis=1,inplace=True)
tbl_error_txn = tbl_error_txn.dropna().reset_index(drop=True)
tbl_error_txn = tbl_error_txn.merge(tbl_error.rename(columns={'uuid':'uuid_error'}), on='error_desc', how='inner').drop('error_desc',axis=1)
tbl_error_txn.rename(columns={'uuid':'uuid_txn'}, inplace=True)
tbl_error_txn['uuid'] = tbl_error_txn.index.map(lambda x:uuid4())
tbl_error_txn.to_sql('tbl_error_txn',
                     cnx,
                     if_exists='replace',
                     index=False,dtype={'uuid_txn':CHAR(36),'uuid_error':CHAR(36),'uuid':CHAR(36)})

6915

### Entidad Txn

In [26]:
tbl_txn = df.copy()
tbl_txn['id_mcc'] = tbl_txn['id_mcc'].map(int).map(lambda x:f'{x:04d}')
tbl_txn = tbl_txn.merge(tbl_user.rename(columns={'uuid':'uuid_user'}), on='id_user', how='inner').drop('id_user',axis=1)
tbl_txn.drop('id_card',axis=1,inplace=True)
tbl_txn = tbl_txn.merge(tbl_txn_type.rename(columns={'uuid':'uuid_txn_type'}), 
                        left_on='d_use_chip',right_on='txn_type', how='inner').drop(['d_use_chip','txn_type'],axis=1)
tbl_txn = tbl_txn.merge(tbl_state.rename(columns={'uuid':'uuid_state'}),
                        left_on='d_merchant_state',right_on='merchant_state', how='inner').drop(['d_merchant_state','merchant_state'],axis=1)
tbl_txn = tbl_txn.merge(tbl_mcc.rename(columns={'uuid':'uuid_mcc'}), on='id_mcc', how='inner').drop(['id_mcc','mcc_description'],axis=1)
tbl_txn.drop('d_errors',axis=1,inplace=True)
tbl_txn.to_sql('tbl_txn',
               cnx,
               if_exists='append',
               index=False,dtype={'uuid':CHAR(36),'c_amt':FLOAT,'b_fraud':BOOLEAN,'dt_timestamp':DATETIME,
                                  'uuid_user':CHAR(36),'uuid_txn_type':CHAR(36),'uuid_state':CHAR(36),
                                  'uuid_mcc':CHAR(36)})

365608

In [None]:
df.head()