In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import cufflinks as cf
import missingno as msno
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import scipy.cluster.hierarchy as sch
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import DistanceMetric
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

%matplotlib inline
cf.go_offline()
pd.set_option('display.max_rows', 60)
# pd.set_option('precision', 5)
pd.set_option('display.float_format', lambda x: "{:,.4f}".format(x))
pd.set_option('display.max_columns', 100)


urllib3 (1.26.2) or chardet (3.0.4) doesn't match a supported version!



# Obtener datos

In [120]:
# Read datos/client.asc, a semicolon-delimited text file.
client = pd.read_csv("datos/client.asc", delimiter=";")

In [54]:
# Read datos/account.asc, a semicolon-delimited text file.
account = pd.read_csv("datos/account.asc", delimiter=";")

In [14]:
# Read datos/district.asc, a semicolon-delimited text file.
district = pd.read_csv("datos/district.asc", delimiter=";")

In [19]:
# Read datos/card.asc, a semicolon-delimited text file.
card = pd.read_csv("datos/card.asc", delimiter=";")

In [21]:
# Read datos/disp.asc, a semicolon-delimited text file.
disp = pd.read_csv("datos/disp.asc", delimiter=";")

In [23]:
# Read datos/loan.asc, a semicolon-delimited text file.
loan = pd.read_csv("datos/loan.asc", delimiter=";")

In [25]:
# Read datos/order.asc, a semicolon-delimited text file.
order = pd.read_csv("datos/order.asc", delimiter=";")

In [162]:
# Read datos/trans.asc, a semicolon-delimited text file.
# low_memory=False turns off pandas' chunked parsing of the file.
trans = pd.read_csv(
    "datos/trans.asc",
    delimiter=";",
    low_memory=False,
)

# Funciones

In [77]:
def genero(x):
    """Return the gender code encoded in a birth-number string.

    The two characters at positions 2-3 of ``x`` are read as an integer:
    a value above 12 yields ``'M'``, any other value yields ``'H'``.
    """
    return 'M' if int(x[2:4]) > 12 else 'H'

In [89]:
def date_genre(x):
    """Return ``x`` with the offset removed from its two-digit field at 2-3.

    The characters at positions 2-3 are read as an integer. When it is
    greater than 12, 50 is subtracted and the result is written back into
    those two positions (a value of 9 or less is preceded by ``'0'``).
    Otherwise the characters are returned unchanged, joined into a string.
    """
    chars = list(x)
    month = int(''.join(chars[2:4]))

    # Nothing to adjust: hand the characters back as a string.
    if month <= 12:
        return ''.join(chars)

    month -= 50
    if month <= 9:
        replacement = ['0', str(month)]
    else:
        replacement = list(str(month))
    chars[2:4] = replacement
    return ''.join(chars)

In [90]:
# Sample six-character input for trying out genero and date_genre.
x = "706213"

In [99]:
# Characters at positions 2-3 of the sample: the field that genero and
# date_genre read.
x[2:4]

'62'

In [91]:
# Try date_genre on the sample string.
date_genre(x)

'701213'

In [72]:
# Try genero on the sample string.
genero(x)

'M'

# Información de las tablas

## Client

In [135]:
# Preview the first two rows of client.
client.head(n=2)

Unnamed: 0,client_id,birth_number,district_id,genre,birth,diff
0,1,706213,18,M,1970-12-13,1999-01-01
1,2,450204,1,H,1945-02-04,1999-01-01


In [126]:
# Print column names, non-null counts and dtypes for client.
client.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5369 entries, 0 to 5368
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   client_id     5369 non-null   int64 
 1   birth_number  5369 non-null   object
 2   district_id   5369 non-null   int64 
 3   genre         5369 non-null   object
 4   birth         5369 non-null   object
dtypes: int64(2), object(3)
memory usage: 209.9+ KB


In [123]:
# Store birth_number as text so that genero and date_genre can slice it.
client['birth_number'] = client.birth_number.astype(str)

In [124]:
# Gender code ('M' or 'H') derived from each birth_number by genero.
client['genre'] = client['birth_number'].apply(genero)

In [125]:
# Birth-date digits with date_genre's adjustment applied to positions 2-3.
client['birth'] = client['birth_number'].apply(date_genre)

In [127]:
# Prefix the century to get an eight-character YYYYMMDD string.
# The value is rebuilt from 'birth_number' rather than from 'birth' itself,
# so re-running this cell cannot stack a second '19' prefix, nor fail once
# 'birth' has already been parsed to datetime.
client['birth'] = '19' + client['birth_number'].astype(str).map(date_genre)

In [128]:
# Parse the YYYYMMDD strings in 'birth' into datetimes.
client['birth'] = pd.to_datetime(
    client.birth,
    format="%Y%m%d",
)

In [134]:
# Fixed reference date (1999-01-01) repeated on every row; client ages are
# measured against it.
client['diff'] = pd.to_datetime(
    pd.Series('19990101', index=client.index),
    format="%Y%m%d",
)

In [138]:
# Age in completed years at the reference date in 'diff': the difference in
# calendar years, minus one when the birthday falls later in the year than
# the reference date.
# Dividing the elapsed days by 365.2425 and truncating instead under-counts
# by a year for some birthdays that fall on the reference day itself, e.g.
# born 1970-01-01 -> 10592 days / 365.2425 = 28.9999 -> 28 instead of 29.
client['age'] = (
    client['diff'].dt.year
    - client['birth'].dt.year
    - (
        (client['birth'].dt.month > client['diff'].dt.month)
        | (
            (client['birth'].dt.month == client['diff'].dt.month)
            & (client['birth'].dt.day > client['diff'].dt.day)
        )
    ).astype(int)
)

In [140]:
# Cast age to an integer dtype (any fractional part is dropped).
client['age'] = client.age.astype(int)

In [141]:
# Preview the first three rows of client.
client.head(n=3)

Unnamed: 0,client_id,birth_number,district_id,genre,birth,diff,age
0,1,706213,18,M,1970-12-13,1999-01-01,28
1,2,450204,1,H,1945-02-04,1999-01-01,53
2,3,406009,1,M,1940-10-09,1999-01-01,58


## District 

In [149]:
# Print column names, non-null counts and dtypes for district.
district.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      77 non-null     int64  
 1   A2      77 non-null     object 
 2   A3      77 non-null     object 
 3   A4      77 non-null     int64  
 4   A5      77 non-null     int64  
 5   A6      77 non-null     int64  
 6   A7      77 non-null     int64  
 7   A8      77 non-null     int64  
 8   A9      77 non-null     int64  
 9   A10     77 non-null     float64
 10  A11     77 non-null     int64  
 11  A12     77 non-null     object 
 12  A13     77 non-null     float64
 13  A14     77 non-null     int64  
 14  A15     77 non-null     object 
 15  A16     77 non-null     int64  
dtypes: float64(2), int64(10), object(4)
memory usage: 9.8+ KB


In [16]:
# Preview the first five rows of district.
district.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
1,2,Benesov,central Bohemia,88884,80,26,6,2,5,46.7,8507,1.67,1.85,132,2159,2674
2,3,Beroun,central Bohemia,75232,55,26,4,1,5,41.7,8980,1.95,2.21,111,2824,2813
3,4,Kladno,central Bohemia,149893,63,29,6,2,6,67.4,9753,4.64,5.05,109,5244,5892
4,5,Kolin,central Bohemia,95616,65,30,4,1,6,51.4,9307,3.85,4.43,118,2616,3040


## Card

In [150]:
# Print column names, non-null counts and dtypes for card.
card.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   card_id  892 non-null    int64 
 1   disp_id  892 non-null    int64 
 2   type     892 non-null    object
 3   issued   892 non-null    object
dtypes: int64(2), object(2)
memory usage: 28.0+ KB


In [179]:
# Preview the first two rows of card.
card.head(n=2)

Unnamed: 0,card_id,disp_id,type,issued,date
0,1005,9285,classic,931107 00:00:00,1993-11-07
1,104,588,classic,940119 00:00:00,1994-01-19


In [176]:
# Keep the first run of six digits found in 'issued'.
card['date'] = card['issued'].str.extract(
    r'([0-9]{6})',
    expand=False,
)

In [178]:
# Build the issue date from 'issued': take its first six-digit run, prefix
# the century and parse the result as YYYYMMDD.
# Deriving from 'issued' (not from 'date' itself) keeps this cell safe to
# re-run; prefixing '19' to an already-parsed datetime column would fail.
card['date'] = pd.to_datetime(
    '19' + card['issued'].str.extract(r'([0-9]{6})', expand=False),
    format="%Y%m%d",
)

## Disposition

In [22]:
# Preview the first three rows of disp.
disp.head(n=3)

Unnamed: 0,disp_id,client_id,account_id,type
0,1,1,1,OWNER
1,2,2,2,OWNER
2,3,3,2,DISPONENT


## Loan

In [152]:
# Print column names, non-null counts and dtypes for loan.
loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682 entries, 0 to 681
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   loan_id     682 non-null    int64  
 1   account_id  682 non-null    int64  
 2   date        682 non-null    int64  
 3   amount      682 non-null    int64  
 4   duration    682 non-null    int64  
 5   payments    682 non-null    float64
 6   status      682 non-null    object 
dtypes: float64(1), int64(5), object(1)
memory usage: 37.4+ KB


In [154]:
# Preview the first two rows of loan.
loan.head(n=2)

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
0,5314,1787,1993-07-05,96396,12,8033.0,c_terminado_prestamo_no_pagado
1,5316,1801,1993-07-11,165960,36,4610.0,c_terminado_sin_problema


In [153]:
# Replace the single-letter status codes with descriptive labels.
loan['status'] = loan['status'].replace({
    "A": 'c_terminado_sin_problema',
    "B": 'c_terminado_prestamo_no_pagado',
    "C": 'c_en_ejecucion_hasta_ahora',
    "D": 'c_en_ejecucion_client_endeudado',
})

# Prefix the century to 'date' and parse it as YYYYMMDD.
# Skipped when the column is already datetime: re-running the conversion
# would build strings such as '191993-07-05' and make to_datetime raise.
if not pd.api.types.is_datetime64_any_dtype(loan['date']):
    loan['date'] = pd.to_datetime('19' + loan['date'].astype(str), format="%Y%m%d")

## Order

In [26]:
# Preview the first two rows of order.
order.head(n=2)

Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
0,29401,1,YZ,87144583,2452.0,SIPO
1,29402,2,ST,89597016,3372.7,UVER


In [151]:
# Translate the k_symbol codes into descriptive labels; a single blank
# becomes 'no_aplica'.
order['k_symbol'] = order['k_symbol'].replace({
    "POJISTNE": 'pago_seguro',
    "SIPO": 'pago_domestico',
    "LEASING": 'pago_arrendamiento',
    "UVER": 'pago_prestamo',
    " ": 'no_aplica',
})

## Transactions

In [171]:
# Print column names, non-null counts and dtypes for trans.
trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056320 entries, 0 to 1056319
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype         
---  ------      --------------    -----         
 0   trans_id    1056320 non-null  int64         
 1   account_id  1056320 non-null  int64         
 2   date        1056320 non-null  datetime64[ns]
 3   type        1056320 non-null  object        
 4   operation   1056320 non-null  object        
 5   amount      1056320 non-null  float64       
 6   balance     1056320 non-null  float64       
 7   k_symbol    1056320 non-null  object        
 8   bank        1056320 non-null  object        
 9   account     1056320 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(2), object(5)
memory usage: 80.6+ MB


In [169]:
# Preview the first five rows of trans.
trans.head(n=5)

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,695247,2378,930101,credito,credito_en_efectivo,700.0,700.0,no_aplica,no_aplica,no_aplica
1,171812,576,930101,credito,credito_en_efectivo,900.0,900.0,no_aplica,no_aplica,no_aplica
2,207264,704,930101,credito,credito_en_efectivo,1000.0,1000.0,no_aplica,no_aplica,no_aplica
3,1117247,3818,930101,credito,credito_en_efectivo,600.0,600.0,no_aplica,no_aplica,no_aplica
4,579373,1972,930102,credito,credito_en_efectivo,400.0,400.0,no_aplica,no_aplica,no_aplica


In [165]:
# Translate the transaction type codes into descriptive labels.
trans['type'] = trans['type'].replace({
    "PRIJEM": 'credito',
    "VYDAJ": 'debito',
    "VYBER": 'retiro_en_efectivo',
})

In [166]:
# Translate the operation codes into descriptive labels; missing values
# become 'no_aplica'.
trans['operation'] = trans['operation'].replace({
    "VYBER KARTOU": 'retiro_tarjeta_cred',
    "VKLAD": 'credito_en_efectivo',
    "PREVOD Z UCTU": 'cobro_otro_banco',
    "VYBER": 'retiro_en_efectivo',
    "PREVOD NA UCET": 'remesa_otro_banco',
    np.nan: "no_aplica",
})

In [167]:
# Translate the k_symbol codes into descriptive labels; missing values and
# the single-blank code become 'no_aplica' (the blank is handled the same
# way as in the order table).
# The keys "SLUZBY'", "UROK'" and "SANKC.UROK" are kept as originally
# written, but the apostrophes and the missing space mean they would not
# match the plain codes, so the spellings from the dataset description
# ("SLUZBY", "UROK", "SANKC. UROK") are mapped as well.
# NOTE(review): confirm against trans['k_symbol'].unique() on the raw file.
trans['k_symbol'] = trans['k_symbol'].replace({
    "POJISTNE": 'pago_seguro',
    "SLUZBY": 'pago_de_declaracion',
    "SLUZBY'": 'pago_de_declaracion',
    "UROK": 'interes_acreditado',
    "UROK'": 'interes_acreditado',
    "SANKC. UROK": 'interes_de_sancion_si_saldo_negativo',
    "SANKC.UROK": 'interes_de_sancion_si_saldo_negativo',
    "SIPO": 'pago_domestico',
    "DUCHOD": 'pago_de_pension',
    "UVER": 'pago_de_prestamo',
    " ": 'no_aplica',
    np.nan: "no_aplica",
})

In [168]:
# Missing counterparty bank / account values become 'no_aplica'.
for col in ('bank', 'account'):
    trans[col] = trans[col].replace(np.nan, "no_aplica")

In [170]:
# Prefix the century to 'date' and parse it as YYYYMMDD.
# Skipped when the column is already datetime: re-running the conversion
# on parsed dates would build strings such as '191993-01-01' and make
# to_datetime raise.
if not pd.api.types.is_datetime64_any_dtype(trans['date']):
    trans['date'] = pd.to_datetime('19' + trans['date'].astype(str), format='%Y%m%d')

In [155]:
# List the distinct values present in the operation column.
trans.operation.unique()

array(['VKLAD', 'PREVOD Z UCTU', 'VYBER', nan, 'PREVOD NA UCET',
       'VYBER KARTOU'], dtype=object)

## Account

In [148]:
# Display the account frame (pandas truncates the rendering of long frames).
account

Unnamed: 0,account_id,district_id,frequency,date
0,576,55,emision_mensual,1993-01-01
1,3818,74,emision_mensual,1993-01-01
2,704,55,emision_mensual,1993-01-01
3,2378,16,emision_mensual,1993-01-01
4,2632,24,emision_mensual,1993-01-02
...,...,...,...,...
4495,124,55,emision_mensual,1997-12-28
4496,3958,59,emision_mensual,1997-12-28
4497,777,30,emision_mensual,1997-12-28
4498,1573,63,emision_mensual,1997-12-29


In [145]:
# Translate the statement-frequency codes into descriptive labels.
account['frequency'] = account['frequency'].replace({
    "POPLATEK MESICNE": 'emision_mensual',
    "POPLATEK TYDNE": 'emision_semanal',
    "POPLATEK PO OBRATU": 'emision_desp_tans',
})

In [147]:
# Prefix the century to 'date' and parse it as YYYYMMDD.
# Skipped when the column is already datetime: re-running the conversion
# on parsed dates would build strings such as '191993-01-01' and make
# to_datetime raise.
if not pd.api.types.is_datetime64_any_dtype(account['date']):
    account['date'] = pd.to_datetime('19' + account['date'].astype(str), format="%Y%m%d")