## Dependencias 

In [1]:
import json
import os

import numpy as np
import pandas as pd

from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR,CHAR,INTEGER,BIGINT,BOOLEAN,DATE,DATETIME,FLOAT
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

## Lectura de datos 

In [2]:
# Location of the raw credit-card transactions CSV.
# The CC_TXN_CSV environment variable overrides the location so the notebook
# can run on another machine; when it is unset, the original local path is
# used, which keeps the previous behaviour unchanged.
ruta = os.environ.get(
    'CC_TXN_CSV',
    '/media/jose/090f6b94-de30-4aaf-9f8a-4e18b120d7f6/bd/02.  Para Ingeniería/cc_txn/credit_card_transactions-ibm_v2.csv',
)

In [3]:
# Load the whole transactions file with every column parsed as text
# (dtype=str), so no value is numerically coerced while reading.
data = pd.read_csv(filepath_or_buffer=ruta, dtype=str)
# (rows, columns) of the loaded frame.
data.shape

(24386900, 15)

In [4]:
# Preview the first two rows to check the parsed columns.
data.head(2)

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No


## Conexión a BD

In [5]:
# Load the MySQL credentials from a JSON file kept outside the notebook.
# The connection cell reads the keys 'u', 'p', 'h' and 'd' from this dict.
# The context manager guarantees the file handle is closed once the JSON
# has been parsed (a bare open() left it open until garbage collection).
with open('../../creds/mysql.json','r') as archivo_creds:
    creds = json.load(archivo_creds)

In [6]:
# Build the SQLAlchemy URL (MySQL through the PyMySQL driver) from the loaded
# credentials: u = user, p = password, h = host, d = database.
# NOTE(review): user/password are interpolated without URL-escaping, so a
# value containing characters such as '@' or '/' would yield a malformed URL
# -- confirm the stored credentials are URL-safe.
url_bd = f"mysql+pymysql://{creds['u']}:{creds['p']}@{creds['h']}/{creds['d']}"
# Open one connection that the to_sql cells below reuse.
cnx = create_engine(url_bd).connect()

## Entidades

### Comercio

In [21]:
# Merchant entity: one row per distinct 'Merchant Name', in order of first
# appearance. map(str) forces every name to a Python str (a missing value
# would become the text 'nan').
nombres_comercio = (
    data['Merchant Name']
    .drop_duplicates()
    .reset_index(drop=True)
    .map(str)
)
# Surrogate key: 1-based position of each distinct name.
entComercio = pd.DataFrame({
    'id': nombres_comercio.index + 1,
    'merchant_name': nombres_comercio,
})
entComercio.head()

Unnamed: 0,id,merchant_name
0,1,3527213246127876953
1,2,-727612092139916043
2,3,3414527459579106770
3,4,5817218446178736267
4,5,-7146670748125200898


In [22]:
# Longest merchant name, in characters (used to size the VARCHAR column).
entComercio['merchant_name'].str.len().max()

20

In [23]:
# SQL column types, paired positionally with the frame's columns
# (id, merchant_name).
dtypes = [INTEGER, VARCHAR(20)]
tipos_sql = dict(zip(entComercio.columns, dtypes))
# Write the merchant entity to tbl_merchant in batches of 5000 rows;
# if_exists='replace' drops any existing table first. The DataFrame index
# is not written.
entComercio.to_sql(
    name='tbl_merchant',
    con=cnx,
    if_exists='replace',
    index=False,
    chunksize=5000,
    dtype=tipos_sql,
)

100343

### Territorio

In [45]:
# Load the territory catalogue from whatever tabular text is currently on the
# system clipboard.
# NOTE(review): this makes the cell non-reproducible on a fresh kernel -- the
# result depends on what was copied beforehand. The source of the table is not
# recorded in this notebook; consider saving it to a file and reading that.
entTerritorio = pd.read_clipboard()

In [59]:
# Preview the territory catalogue (columns: id, territory_code, territory_name).
entTerritorio.head()

Unnamed: 0,id,territory_code,territory_name
0,1,CA,California
1,2,TX,Texas
2,3,FL,Florida
3,4,NY,New York
4,5,OH,Ohio


In [46]:
# SQL column types, paired positionally with the frame's columns
# (id, territory_code, territory_name).
dtypes = [INTEGER, VARCHAR(3), VARCHAR(32)]
tipos_sql = dict(zip(entTerritorio.columns, dtypes))
# Write the territory catalogue to tbl_territory in batches of 5000 rows;
# if_exists='replace' drops any existing table first. The DataFrame index
# is not written.
entTerritorio.to_sql(
    name='tbl_territory',
    con=cnx,
    if_exists='replace',
    index=False,
    chunksize=5000,
    dtype=tipos_sql,
)

224

### Ciudad

In [52]:
# Rows whose city is the literal 'ONLINE' get the state label 'Online';
# every other row keeps its current 'Merchant State' value.
# Note: this overwrites the column on `data` in place.
data['Merchant State'] = data['Merchant State'].mask(
    data['Merchant City'] == 'ONLINE',
    'Online',
)

In [53]:
# City entity: distinct (city, state) pairs, keeping the first occurrence.
entCiudad = data.loc[:, ['Merchant City', 'Merchant State']].drop_duplicates()

In [54]:
# Rename to the catalogue's vocabulary so the frame can be merged with
# entTerritorio on 'territory_name'.
entCiudad = entCiudad.set_axis(['territory_city', 'territory_name'], axis=1)

In [102]:
# Pass 1: left-join the cities to the territory catalogue, matching the city's
# state/territory value against the catalogue's full name (territory_name).
aux = entCiudad.merge(entTerritorio,how='left',on=['territory_name'])
# Discard pass-1 matches whose catalogue id is below 52 (set to None).
# NOTE(review): presumably ids 1-51 are the US states, so this drops
# accidental name matches against a state row -- TODO confirm against the
# catalogue, and consider naming the threshold.
aux['id'] =np.where(aux['id']<52,None,aux['id']) 
# Pass 2: left-join again, this time matching the same value against the
# catalogue's code (territory_code). Overlapping column names receive the
# suffixes _x (from pass 1) and _y (from pass 2).
aux = aux.merge(entTerritorio,how='left',
                left_on=['territory_name'],right_on=['territory_code'])
# Inspect the last rows of the combined result.
aux.tail()


Unnamed: 0,territory_city,territory_name_x,id_x,territory_code_x,id_y,territory_code_y,territory_name_y
20324,Loysville,PA,,,7.0,PA,Pennsylvania
20325,Laurel Bloomery,TN,,,14.0,TN,Tennessee
20326,Alburgh,VT,,,48.0,VT,Vermont
20327,Buskirk,NY,,,4.0,NY,New York
20328,Mooers,NY,,,4.0,NY,New York


In [103]:
# Final territory id: take the code-based match (id_y) when it exists,
# otherwise fall back to the name-based match (id_x).
aux['id_territory'] = np.where(aux['id_y'].notnull(), aux['id_y'], aux['id_x'])

In [104]:
# Keep only the resolved territory id and the city name, dropping every row
# where either is missing, then renumber the rows from 0.
aux = (
    aux.loc[:, ['id_territory', 'territory_city']]
    .dropna()
    .reset_index(drop=True)
)
# Surrogate key: 1-based row position, placed as the first column.
aux.insert(loc=0, column='id', value=aux.index + 1)

In [105]:
# Longest city name, in characters (used to size the VARCHAR column).
aux['territory_city'].str.len().max()

26

In [106]:
# SQL column types, paired positionally with the frame's columns
# (id, id_territory, territory_city).
dtypes = [INTEGER, INTEGER, VARCHAR(26)]
tipos_sql = dict(zip(aux.columns, dtypes))
# Write the city entity to tbl_city in batches of 5000 rows;
# if_exists='replace' drops any existing table first. The DataFrame index
# is not written.
aux.to_sql(
    name='tbl_city',
    con=cnx,
    if_exists='replace',
    index=False,
    chunksize=5000,
    dtype=tipos_sql,
)

20328

### Tipo de Transacción


In [116]:
# Transaction-type entity: one row per distinct 'Use Chip' value, in order of
# first appearance.
tipos_txn = data['Use Chip'].drop_duplicates().reset_index(drop=True)
# Surrogate key: 1-based position of each distinct value.
entTipoTxn = pd.DataFrame({
    'id': tipos_txn.index + 1,
    'txn_type': tipos_txn,
})
# Longest label, in characters (used to size the VARCHAR column).
entTipoTxn['txn_type'].map(len).max()

18

In [118]:
# SQL column types, paired positionally with the frame's columns
# (id, txn_type).
dtypes = [INTEGER, VARCHAR(18)]
tipos_sql = dict(zip(entTipoTxn.columns, dtypes))
# Write the transaction-type entity to tbl_txn_type in batches of 5000 rows;
# if_exists='replace' drops any existing table first. The DataFrame index
# is not written.
entTipoTxn.to_sql(
    name='tbl_txn_type',
    con=cnx,
    if_exists='replace',
    index=False,
    chunksize=5000,
    dtype=tipos_sql,
)

3

### Código Postal

In [128]:
# Zip-code entity: distinct (state, zip) pairs with both values present,
# renumbered from 0.
entZip = (
    data.loc[:, ['Merchant State', 'Zip']]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

In [133]:
# Copy the sorted list of distinct state values to the system clipboard
# (one per line, without the index) for inspection outside the notebook.
estados_ordenados = sorted(entZip['Merchant State'].unique())
pd.Series(estados_ordenados).to_clipboard(index=False)

In [134]:
# Inspect the transactions whose state value is exactly 'AA'.
data.loc[data['Merchant State'].eq('AA')]

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
3646455,309,0,2012,3,2,09:53,$2.01,Swipe Transaction,6091778774361517457,Dpo,AA,34004.0,5411,,No
3651969,309,2,2009,2,22,09:35,$1.29,Swipe Transaction,6091778774361517457,Dpo,AA,34004.0,5411,,No
3651970,309,2,2009,2,23,13:49,$55.09,Swipe Transaction,6091778774361517457,Dpo,AA,34004.0,5411,,No
3666064,309,4,2009,2,24,09:44,$1.94,Swipe Transaction,6091778774361517457,Dpo,AA,34004.0,5411,,No
5762448,488,0,1998,11,2,12:54,$12.80,Swipe Transaction,6091778774361517457,Dpo,AA,34004.0,5411,,No
5762462,488,0,1998,11,5,12:26,$12.38,Swipe Transaction,6091778774361517457,Dpo,AA,34004.0,5411,,No
16887363,1373,0,2020,2,25,11:34,$10.34,Chip Transaction,6091778774361517457,Dpo,AA,34004.0,5411,,No
21086474,1722,1,2011,8,10,15:41,$90.27,Swipe Transaction,6091778774361517457,Dpo,AA,34004.0,5411,,No
21089949,1722,2,2011,8,4,09:57,$11.31,Swipe Transaction,6091778774361517457,Dpo,AA,34004.0,5411,,No
21090318,1722,2,2012,8,28,10:25,$4.68,Swipe Transaction,6091778774361517457,Dpo,AA,34004.0,5411,,No
