# Data Mining Project

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import math

# Load every source table; all of them are ';'-separated CSV files in ../data.
account = pd.read_csv('../data/account.csv', sep=';')
card_dev = pd.read_csv('../data/card_dev.csv', sep=';')
client = pd.read_csv('../data/client.csv', sep=';')
disp = pd.read_csv('../data/disp.csv', sep=';')
district = pd.read_csv('../data/district.csv', sep=';')
loan_dev = pd.read_csv('../data/loan_dev.csv', sep=';')
# 'bank' is forced to str so pandas does not infer a numeric/mixed dtype for it.
trans_dev = pd.read_csv(
  '../data/trans_dev.csv',
  sep=';',
  dtype={'bank': 'str'},
)

## Business Understanding

The bank wants to improve their services. For instance, the bank managers have only a vague idea of who is a good client (to whom additional services could be offered) and who is a bad client (who should be watched carefully to minimize the bank's losses). Fortunately, the bank stores data about its clients, their accounts (transactions over several months), the loans already granted, and the credit cards issued. The bank managers hope to improve their understanding of customers and seek specific actions to improve services. A mere application of a discovery tool will not be convincing for them.  

To test a data mining approach to help the bank managers, it was decided to address two problems, a descriptive and a predictive one. While the descriptive problem was left open, the predictive problem is the prediction of whether a loan will end successfully.

## Data Understanding

### Build Dataset

In [None]:
def parse_date(d):
  """Split a YYMMDD-encoded date into its components.

  Args:
    d: the encoded date as an int, a float or a numeric string
      (e.g. ``930705``, ``930705.0`` or ``'930705'``).

  Returns:
    A dict with integer ``'year'``, ``'month'`` and ``'day'`` entries. The
    values are returned exactly as encoded; no calendar validation is done.

  Raises:
    ValueError: if ``d`` is not numeric (including NaN).
  """
  # Work on the integer value instead of slicing str(d): a value stored as an
  # integer loses its leading zero (050101 -> 50101), which would shift every
  # slice by one character.
  value = int(float(d))
  year, month_and_day = divmod(value, 10000)
  month, day = divmod(month_and_day, 100)
  return { 'year': year, 'month': month, 'day': day }


def parse_gender(row, birth_date):
  """Set ``row['gender']`` from the month stored in ``birth_date``.

  A month of 50 or more marks a female client; in that case the offset of 50
  is also removed from ``birth_date['month']`` in place. Otherwise the client
  is male and ``birth_date`` is left untouched.

  Args:
    row: mapping-like record that receives the ``'gender'`` entry.
    birth_date: dict with a ``'month'`` entry, mutated in place for females.

  Returns:
    None; both arguments are modified in place.
  """
  if birth_date['month'] < 50:
    row['gender'] = 'male'
    return

  row['gender'] = 'female'
  birth_date['month'] = birth_date['month'] - 50


def calculate_age_loan(row):
  """Add the ``'gender'`` and ``'age_loan'`` entries to a record.

  Args:
    row: record (e.g. a row handed over by ``DataFrame.apply(..., axis=1)``)
      holding ``'birth_number'`` and ``'date_loan'``, both parsed with
      ``parse_date``.

  Returns:
    The same ``row``, with ``'gender'`` set by ``parse_gender`` and
    ``'age_loan'`` set to the client's age in whole years on the loan date
    (NaN when the loan date is missing).
  """
  birth_date = parse_date(row['birth_number'])

  # parse_gender also strips the +50 female offset from birth_date['month'],
  # so it has to run before the month is used in the comparison below.
  parse_gender(row, birth_date)

  date_loan = row['date_loan']
  if pd.isna(date_loan):
    # No loan date: the age cannot be computed.
    row['age_loan'] = float('nan')
    return row

  # The original guard was inverted (`date_loan is None`), so this branch
  # never ran and 'age_loan' ended up holding the raw loan date.
  loan_date = parse_date(date_loan)
  birthday_pending = (loan_date['month'], loan_date['day']) < (birth_date['month'], birth_date['day'])
  # Subtract one year when the birthday has not happened yet in the loan year.
  row['age_loan'] = loan_date['year'] - birth_date['year'] - birthday_pending

  return row

In [None]:
def nan_unemploymant_rate(year):
  """Return a boolean mask of the district rows whose unemployment rate is NaN.

  Args:
    year: two-digit year of the column to inspect, as an int or a str
      (e.g. ``95`` or ``'95'``).

  Returns:
    The result of ``.isna()`` on the ``"unemploymant rate '<year> "`` column
    of the module-level ``district`` frame.
  """
  # str() is required: the callers pass integers, and str + int raises TypeError.
  return district["unemploymant rate '" + str(year) + " "].isna()

# convert '?' to NaN (anything non-numeric is coerced to NaN)

for rate_column in ("unemploymant rate '95 ", "unemploymant rate '96 "):
  district[rate_column] = pd.to_numeric(district[rate_column], errors='coerce')

# NaN values will be equaled to the value of the other column

missing_rate_95 = nan_unemploymant_rate(95)
district.loc[missing_rate_95, "unemploymant rate '95 "] = district["unemploymant rate '96 "]
missing_rate_96 = nan_unemploymant_rate(96)
district.loc[missing_rate_96, "unemploymant rate '96 "] = district["unemploymant rate '95 "]

# create column with mean from both years and drop previous and now useless columns

yearly_rates = district[["unemploymant rate '95 ", "unemploymant rate '96 "]]
district['unemploymant_rate'] = yearly_rates.mean(axis=1)
# NOTE(review): reset_index() without drop=True keeps the previous index as an
# extra column of `district` -- confirm that this is intended.
district = district.drop(columns=["unemploymant rate '95 ", "unemploymant rate '96 "]).reset_index()

In [None]:
def nan_commited_crimes(year):
  """Return a boolean mask of the district rows whose crime count is NaN.

  Args:
    year: two-digit year of the column to inspect, as an int or a str
      (e.g. ``95`` or ``'95'``).

  Returns:
    The result of ``.isna()`` on the ``"no. of commited crimes '<year> "``
    column of the module-level ``district`` frame.
  """
  # str() is required: the callers pass integers, and str + int raises TypeError.
  return district["no. of commited crimes '" + str(year) + " "].isna()

# convert '?' to NaN (anything non-numeric is coerced to NaN)

for crimes_column in ("no. of commited crimes '95 ", "no. of commited crimes '96 "):
  district[crimes_column] = pd.to_numeric(district[crimes_column], errors='coerce')

# NaN values will be equaled to the value of the other column

missing_crimes_95 = nan_commited_crimes(95)
district.loc[missing_crimes_95, "no. of commited crimes '95 "] = district["no. of commited crimes '96 "]
missing_crimes_96 = nan_commited_crimes(96)
district.loc[missing_crimes_96, "no. of commited crimes '96 "] = district["no. of commited crimes '95 "]

# create column with mean from both years and drop previous and now useless columns

yearly_crimes = district[["no. of commited crimes '95 ", "no. of commited crimes '96 "]]
district['commited_crimes'] = yearly_crimes.mean(axis=1)
# NOTE(review): reset_index() without drop=True keeps the previous index as an
# extra column of `district` -- confirm that this is intended.
district = district.drop(columns=["no. of commited crimes '95 ", "no. of commited crimes '96 "]).reset_index()

In [None]:
# join account, loan, disposition and client; the right join keeps every loan,
# and the account's own district_id is dropped once the client's copy
# (district_id_client) is available
df = (
  account
  .merge(loan_dev, on='account_id', suffixes=('', '_loan'), how='right')
  .merge(disp, on='account_id', suffixes=('', '_disp'), how='left')
  .merge(client, on='client_id', suffixes=('', '_client'), how='left')
  .drop(columns='district_id')
)

# create age_loan and gender columns, one row at a time
df = df.apply(calculate_age_loan, axis=1)

# join demograph (the key column is named 'code ' with a trailing space)
district.rename(columns={'code ': 'code'}, inplace=True)
df = df.merge(district, how='left', left_on='district_id_client', right_on='code')

# join creditcard
df = df.merge(card_dev, how='left', on='disp_id', suffixes=('', '_card'))

# join transactions (kept in a separate frame: one row per transaction)
df_transactions = df.merge(trans_dev, how='left', on='account_id', suffixes=('', '_transaction'))

df

### Exploratory Analysis

- número de disponentes vs status 
    - quantas pessoas podem depositar naquela conta / contas quantas aquela pessoa tem ??
- amount da loan vs status 
    - isto tipo não sei se faz super sentido era mais para perceber se o risco aumenta quanto maior for a loan ou se não podemos concluir isso
    
- crime rate vs status
- taxa de desemprego vs status
- salário médio da região
- ... outros dados demográficos vs status ??
- diferença salário médio e dinheiro da loan pago a cada mês vs status
    - se o que a pessoa paga em cada mês é quase a totalidade do seu salário, é mais provável que não possa pagar a dada altura
- duration vs status
- género vs status
- frequência com que é utilizada a conta vs status
    - ver pelo número de transactions, no fundo é se é cliente ativo ou não
- percentagem de depósitos/débitos vs status 
    - my thought process foi que se eu só uso a conta para debitar dinheiro secalhar tou a gastar mais do que o ganho kk
- quantas loans tem 'ativas' 
    - ver interseção data de aceitação + duração
- ...


In [None]:
# Print the columns, dtypes and non-null counts of the merged dataset.
df.info()

In [None]:
# Print the columns, dtypes and non-null counts of the dataset joined with the transactions.
df_transactions.info()

In [None]:
# Bar chart with the number of loans per status value.
status_counts = loan_dev['status'].value_counts()
status_counts.plot(kind='bar')
plt.ylabel('Count')
plt.xlabel('Status')
plt.title('Status count for bank loans')

In [None]:
# Box plot of the loan amount, one box per loan status.
loan_dev.boxplot(column='amount', by='status', figsize=(5,5))

In [None]:
# Box plot of the loan payments, one box per loan status.
loan_dev.boxplot(column='payments', by='status', figsize=(5,5))

In [None]:
# Box plot of the loan duration, one box per loan status.
loan_dev.boxplot(column='duration', by='status', figsize=(5,5))


Since the boxes are not totally aligned, there might be a relation between the loan amount and the success of a loan: the higher the amount, the more likely the loan is to fail. The differences in the plots aren't, however, significant, and our conclusions are mainly supported by logic.

In [None]:
# Box plot of the 'age_loan' column (filled in by calculate_age_loan), one box per loan status.
# NOTE(review): df was left-joined with disp, so an account with several
# dispositions presumably contributes several rows here -- confirm.
df.boxplot(column='age_loan', by='status', figsize=(5,5))


In [None]:
# Number of distinct dispositions (disp_id) attached to each account.
dispositions_by_account = df.groupby('account_id')['disp_id']
disp_count = dispositions_by_account.nunique()
disp_count

In [None]:
# Attach the per-account disposition count to every row. disp_count is a Series
# named 'disp_id', which clashes with the existing column, so the '_count'
# suffix turns it into 'disp_id_count'.
df = df.merge(disp_count, on='account_id', suffixes=('', '_count'), how='left')


In [None]:
# Box plot of the number of dispositions per account, one box per loan status.
df.boxplot(column='disp_id_count', by='status', figsize=(5,5))

In [None]:
#remove duplicates
no_dups_df = df.drop_duplicates(subset=['account_id'])
no_dups_df.groupby('disp_id_count')['status'].value_counts().unstack(1).plot.bar()

plt.xlabel('disponents')
plt.ylabel('count by number of disponents')
plt.legend(['Status = -1', 'Status = 1'])
plt.title('Status count by number of account disponents')

In [None]:
# Box plot of the 'average salary ' column (the name has a trailing space),
# one box per loan status.
df.boxplot(column='average salary ', by='status', figsize=(5,5))

## Data Preparation

## Model