<a href="https://colab.research.google.com/github/Matheus-Homem/international_bank_marketing/blob/main/notebooks/c1_end_to_end_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Imports

## 0.1. Libraries

In [35]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from math import gcd
import decimal
from functools import reduce

## 0.2. Helper Functions

In [63]:
def mirroring(x):
  x += (.5 - x)*2
  return x

## 0.3. Load Dataset

In [2]:
df = pd.read_csv('https://github.com/Matheus-Homem/international_bank_marketing/raw/main/data/CC%20GENERAL.csv')

In [4]:
df.sample().T

Unnamed: 0,2745
CUST_ID,C12824
BALANCE,2316
BALANCE_FREQUENCY,1
PURCHASES,3643.93
ONEOFF_PURCHASES,2971.8
INSTALLMENTS_PURCHASES,672.13
CASH_ADVANCE,780.459
PURCHASES_FREQUENCY,0.916667
ONEOFF_PURCHASES_FREQUENCY,0.583333
PURCHASES_INSTALLMENTS_FREQUENCY,0.833333


# 1. Data Description

In [5]:
df1 = df.copy()

## 1.1. Rename Columns

In [8]:
df1.columns = list(map(lambda x: x.lower(), df1.columns))

## 1.2. Data Dimensions

In [10]:
print('Number of rows: {}'.format(df1.shape[0]))
print('Number of columns: {}'.format(df1.shape[1]))

Number of rows: 8950
Number of columns: 18


## 1.3. Data Types

In [11]:
df1.dtypes

cust_id                              object
balance                             float64
balance_frequency                   float64
purchases                           float64
oneoff_purchases                    float64
installments_purchases              float64
cash_advance                        float64
purchases_frequency                 float64
oneoff_purchases_frequency          float64
purchases_installments_frequency    float64
cash_advance_frequency              float64
cash_advance_trx                      int64
purchases_trx                         int64
credit_limit                        float64
payments                            float64
minimum_payments                    float64
prc_full_payment                    float64
tenure                                int64
dtype: object

## 1.4. Check NA

In [12]:
df1.isna().sum()

cust_id                               0
balance                               0
balance_frequency                     0
purchases                             0
oneoff_purchases                      0
installments_purchases                0
cash_advance                          0
purchases_frequency                   0
oneoff_purchases_frequency            0
purchases_installments_frequency      0
cash_advance_frequency                0
cash_advance_trx                      0
purchases_trx                         0
credit_limit                          1
payments                              0
minimum_payments                    313
prc_full_payment                      0
tenure                                0
dtype: int64

## 1.5. Replace NA

In [15]:
# remove NA
df1.dropna(inplace=True)
print('Number of rows: {}'.format(df1.shape[0]))
print('Removed data: {:.2f}%'.format(1-(df1.shape[0]/df.shape[0])))

Number of rows: 8636
Removed data: 0.04%


## 1.6. Change dtypes

## 1.7. Descriptive Statistics

# 2. Feature Engineering

In [16]:
df2 = df1.copy()

## 2.1. Feature Creation

In [18]:
df2.sample(5).T

Unnamed: 0,251,7611,4702,116,3025
cust_id,C10260,C17817,C14830,C10121,C13114
balance,996.482,1688.25,507.077,223.764,72.5181
balance_frequency,1,1,0.857143,0.636364,1
purchases,675.96,1937.64,0,2309.78,525
oneoff_purchases,0,928.18,0,0,0
installments_purchases,675.96,1009.46,0,2309.78,525
cash_advance,0,967.811,1391.14,0,0
purchases_frequency,1,0.818182,0,0.583333,1
oneoff_purchases_frequency,0,0.454545,0,0,0
purchases_installments_frequency,1,0.545455,0,0.5,1


In [62]:
# data reference
df_ref = pd.DataFrame(df2['cust_id'])

# recency
df_ref['recency'] = df2['purchases_frequency']
df_ref['recency'] = df_ref['recency'].map(mirroring)

# frequency
df_ref['frequency'] = df2['purchases_trx']

# monetary
df_ref['monetary'] = df2['cash_advance']

# balance
df_ref['balance'] = df2['balance']

In [64]:
df_ref.sample().T

Unnamed: 0,7673
cust_id,C17879
recency,0.833333
frequency,2
monetary,0
balance,0


# 3. Feature Filtering

In [65]:
df3 = df_ref.copy()

# 4. EDA (Exploratory Data Analysis)

In [66]:
df4 = df3.copy()

# 5. Data Preparation

In [67]:
df5 = df4.copy()

# 6. Feature Selection

In [68]:
df6 = df5.copy()

# 7. Hyperparameter Fine-Tunning

In [71]:
df7 = df6.drop('cust_id',axis=1)

# 8. Model Training

# 9. Cluster Analysis