# Credit card offer

## Import required libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score

from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

## Define a goal

**GOAL: Know if a customer will accept a credit card offer or not.**

## Get data

In [26]:
# Column names assigned to the CSV columns, in file order.
headers = [
    'customer_number',
    'offer_accepted',
    'reward',
    'mailer_type',
    'income_level',
    'bank_accounts_open',
    'overdraft_protection',
    'credit_rating',
    'credit_cards_held',
    'homes_owned',
    'household_size',
    'own_your_home',
    'avg_balance',
    'balance_q1',
    'balance_q2',
    'balance_q3',
    'balance_q4',
]

In [27]:
data = pd.read_csv('./creditcardmarketing.csv', header=None, names = headers)

In [28]:
data.head()

Unnamed: 0,customer_number,offer_accepted,reward,mailer_type,income_level,bank_accounts_open,overdraft_protection,credit_rating,credit_cards_held,homes_owned,household_size,own_your_home,avg_balance,balance_q1,balance_q2,balance_q3,balance_q4
0,1,No,Air Miles,Letter,High,1,No,High,2,1,4,No,1160.75,1669.0,877.0,1095.0,1002.0
1,2,No,Air Miles,Letter,Medium,1,No,Medium,2,2,5,Yes,147.25,39.0,106.0,78.0,366.0
2,3,No,Air Miles,Postcard,High,2,No,Medium,2,1,2,Yes,276.5,367.0,352.0,145.0,242.0
3,4,No,Air Miles,Letter,Medium,2,No,High,1,1,4,No,1219.0,1578.0,1760.0,1119.0,419.0
4,5,No,Air Miles,Letter,Medium,1,No,Medium,2,1,6,Yes,1211.0,2140.0,1357.0,982.0,365.0


In [5]:
data.shape

(18000, 17)

In [17]:
output_file = pd.read_csv('./creditcardmarketing.csv', header=None)

In [18]:
output_file

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,1,No,Air Miles,Letter,High,1,No,High,2,1,4,No,1160.75,1669.0,877.0,1095.0,1002.0
1,2,No,Air Miles,Letter,Medium,1,No,Medium,2,2,5,Yes,147.25,39.0,106.0,78.0,366.0
2,3,No,Air Miles,Postcard,High,2,No,Medium,2,1,2,Yes,276.50,367.0,352.0,145.0,242.0
3,4,No,Air Miles,Letter,Medium,2,No,High,1,1,4,No,1219.00,1578.0,1760.0,1119.0,419.0
4,5,No,Air Miles,Letter,Medium,1,No,Medium,2,1,6,Yes,1211.00,2140.0,1357.0,982.0,365.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,17996,No,Cash Back,Letter,High,1,No,Low,1,1,5,Yes,167.50,136.0,65.0,71.0,398.0
17996,17997,No,Cash Back,Letter,High,1,No,Low,3,1,3,Yes,850.50,984.0,940.0,943.0,535.0
17997,17998,No,Cash Back,Letter,High,1,No,Low,2,1,4,No,1087.25,918.0,767.0,1170.0,1494.0
17998,17999,No,Cash Back,Letter,Medium,1,No,Medium,4,2,2,Yes,1022.25,626.0,983.0,865.0,1615.0


In [19]:
output_file = output_file.drop(columns=[0])

In [20]:
output_file

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,No,Air Miles,Letter,High,1,No,High,2,1,4,No,1160.75,1669.0,877.0,1095.0,1002.0
1,No,Air Miles,Letter,Medium,1,No,Medium,2,2,5,Yes,147.25,39.0,106.0,78.0,366.0
2,No,Air Miles,Postcard,High,2,No,Medium,2,1,2,Yes,276.50,367.0,352.0,145.0,242.0
3,No,Air Miles,Letter,Medium,2,No,High,1,1,4,No,1219.00,1578.0,1760.0,1119.0,419.0
4,No,Air Miles,Letter,Medium,1,No,Medium,2,1,6,Yes,1211.00,2140.0,1357.0,982.0,365.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,No,Cash Back,Letter,High,1,No,Low,1,1,5,Yes,167.50,136.0,65.0,71.0,398.0
17996,No,Cash Back,Letter,High,1,No,Low,3,1,3,Yes,850.50,984.0,940.0,943.0,535.0
17997,No,Cash Back,Letter,High,1,No,Low,2,1,4,No,1087.25,918.0,767.0,1170.0,1494.0
17998,No,Cash Back,Letter,Medium,1,No,Medium,4,2,2,Yes,1022.25,626.0,983.0,865.0,1615.0


In [21]:
output_file.to_csv('./creditcardmarketing_db.csv', header = False, index=False)

## Explore data

In [None]:
data.info()

In [None]:
data.isna().sum()

In [None]:
data.isna().sum() / data.shape[0]

* The dataset contains null values, so we need to decide how to handle them.

## Deal with missing values

In [None]:
data.isna().sum()[data.isna().sum() > 0]

* I decided to remove those observations: the dataset has 18,000 observations, so losing 24 of them is negligible.

In [None]:
data = data.dropna()

In [None]:
data.shape

## Deal with outliers

In [None]:
summary_cols = ['balance_q1', 'balance_q2', 'balance_q3', 'balance_q4']

In [None]:
# One boxplot per balance column, used to eyeball outliers.
for column in ['avg_balance'] + summary_cols:
    # Pass the series explicitly as `x`: a bare positional argument is bound to
    # `x` by older seaborn releases and to `data` by newer ones, which changes
    # the orientation of the plot.
    sns.boxplot(x=data[column])
    plt.xlabel(column)
    plt.show()

In [None]:
summary = data[summary_cols].describe().T

In [None]:
summary

In [None]:
summary['IQR'] = summary['75%'] - summary['25%']

In [None]:
summary

In [None]:
summary['upper_limit'] = summary['75%'] + 1.5 * summary['IQR']

In [None]:
summary['lower_limit'] = summary['25%'] - 1.5 * summary['IQR']

In [None]:
summary

In [None]:
data.head()

In [None]:
# Report, for each quarterly balance column, how many rows fall outside the
# lower/upper limits stored in `summary`.
for column in summary_cols:
    lower = summary.loc[column, 'lower_limit']
    upper = summary.loc[column, 'upper_limit']
    is_outlier = (data[column] < lower) | (data[column] > upper)
    print(f'{column.upper()} total outliers: ')
    print(data[is_outlier].shape[0])
    print('----------' * 5)

In [None]:
data[(data['balance_q1'] < summary.loc['balance_q1', 'lower_limit']) | (data['balance_q1'] > summary.loc['balance_q1', 'upper_limit'])]

In [None]:
data[(data['balance_q2'] < summary.loc['balance_q2', 'lower_limit']) | (data['balance_q2'] > summary.loc['balance_q2', 'upper_limit'])]

In [None]:
data[(data['balance_q3'] < summary.loc['balance_q3', 'lower_limit']) | (data['balance_q3'] > summary.loc['balance_q3', 'upper_limit'])]

In [None]:
# Rows whose balance_q4 lies outside the limits computed for balance_q4
# (the lower bound previously came from balance_q1 by mistake).
data[(data['balance_q4'] < summary.loc['balance_q4', 'lower_limit']) | (data['balance_q4'] > summary.loc['balance_q4', 'upper_limit'])]

In [None]:
def remove_outliers(df, columns, limits=None):
    """Return ``df`` without the rows that are outliers in any of ``columns``.

    A row is kept for a column when its value lies inside
    ``[lower_limit, upper_limit]`` (bounds included), which matches the
    ``< lower_limit`` / ``> upper_limit`` definition used when counting
    outliers. Rows with a missing value in a checked column are dropped,
    because the range test is False for NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to filter. It is not modified; a filtered frame is returned.
    columns : iterable of str
        Columns to check, processed in order.
    limits : pandas.DataFrame, optional
        Table indexed by column name with ``'lower_limit'`` and
        ``'upper_limit'`` columns. Defaults to the module-level ``summary``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are within the limits for every column.
    """
    if limits is None:
        # Backward-compatible default: use the limits table built earlier.
        limits = summary

    for col in columns:
        print('Working on column: {}'.format(col))

        lower = limits.loc[col, 'lower_limit']
        upper = limits.loc[col, 'upper_limit']
        # `between` is inclusive on both ends by default.
        df = df[df[col].between(lower, upper)]

    return df

In [None]:
data = remove_outliers(data, summary_cols)

In [None]:
data.shape

In [None]:
data = data.reset_index(drop = True)

In [None]:
data

In [None]:
# Histogram with a KDE overlay for every numeric column from the sixth onward.
numeric_columns = data.select_dtypes(np.number).columns
for column in numeric_columns[5:]:
    plt.figure(figsize=(4, 4))
    sns.histplot(data[column], kde=True)
    plt.show()

## Split data into categorical and numerical

### Numerical data

In [None]:
def transform_target(row):
    """Encode a target label as an integer.

    Returns 0 when ``row`` equals ``'No'`` and 1 for any other value.
    """
    return 0 if row == 'No' else 1

In [None]:
# Map every target label through transform_target ('No' -> 0, anything else -> 1).
data['offer_accepted'] = data['offer_accepted'].apply(transform_target)

In [None]:
data['offer_accepted'] = data['offer_accepted'].astype(np.int64)

In [None]:
numerical_data = data.select_dtypes(np.number)

In [None]:
numerical_data.columns

In [None]:
# Histogram with a KDE overlay for the numerical columns, ignoring the
# customer id and skipping the first five remaining columns.
plot_columns = numerical_data.drop(columns=['customer_number']).columns[5:]
for column in plot_columns:
    plt.figure(figsize=(4, 4))
    sns.histplot(numerical_data[column], kde=True)
    plt.show()

In [None]:
numerical_data = numerical_data.reset_index(drop = True)

In [None]:
numerical_data

In [None]:
numerical_corr = numerical_data.drop(columns=['customer_number', 'offer_accepted']).corr()

In [None]:
numerical_corr

In [None]:
# Boolean mask hiding the upper triangle (diagonal included). Building it from
# an all-True array instead of the correlation values ensures that a
# correlation of exactly 0 in the upper triangle is still masked.
mask = np.triu(np.ones_like(numerical_corr, dtype=bool))

plt.figure(figsize=(10, 6))
sns.heatmap(numerical_corr, cmap="YlGnBu", annot=True, mask=mask)
plt.show()

In [None]:
numerical_data = numerical_data.drop(columns=['avg_balance', 'customer_number'])

In [None]:
numerical_data

In [None]:
# Correlation between the remaining features (target excluded), computed once
# and reused for both the mask and the plot.
feature_corr = numerical_data.drop(columns=['offer_accepted']).corr()

# Boolean mask hiding the upper triangle (diagonal included). It is built from
# an all-True array so that a correlation of exactly 0 is still masked.
mask = np.triu(np.ones_like(feature_corr, dtype=bool))

plt.figure(figsize=(10, 6))
sns.heatmap(feature_corr, cmap="mako", annot=True, mask=mask)
plt.show()

### Categorical data

In [None]:
categorical_data = data.select_dtypes(object)

In [None]:
categorical_data.columns

In [None]:
# Bar chart of category frequencies for every categorical column.
for column in categorical_data.columns:
    # Take labels and heights from the same value_counts() result:
    # unique() returns categories in order of appearance, while
    # value_counts() sorts by frequency, so pairing the two can mislabel bars.
    counts = categorical_data[column].value_counts(dropna=False)
    sns.barplot(x=counts.index, y=counts.values)
    plt.xlabel(column)
    plt.show()

In [None]:
# Encoding data
categories = [['Low', 'Medium', 'High']]

ordinal_encoder = OrdinalEncoder(categories=categories)

In [None]:
income_oe_data = ordinal_encoder.fit_transform(categorical_data[['income_level']])

In [None]:
rating_oe_data = ordinal_encoder.fit_transform(categorical_data[['credit_rating']])

In [None]:
categorical_data = categorical_data.drop(columns=['income_level', 'credit_rating'])

In [None]:
categorical_data.columns

In [None]:
categorical_data = pd.get_dummies(categorical_data, drop_first=True)

In [None]:
categorical_data = categorical_data.reset_index(drop=True)

In [None]:
categorical_data

In [None]:
income_data = pd.DataFrame(income_oe_data, columns=['income_level'])

In [None]:
income_data = income_data.reset_index(drop = True)

In [None]:
income_data

In [None]:
income_data.shape

In [None]:
rating_data = pd.DataFrame(rating_oe_data, columns=['credit_rating'])

In [None]:
rating_data = rating_data.reset_index(drop = True)

In [None]:
rating_data

In [None]:
rating_data.shape

In [None]:
categorical_data = pd.concat([categorical_data, income_data, rating_data], axis = 1)

In [None]:
categorical_data

### Build the dataframe to use in the analysis

In [None]:
full_data = pd.concat([numerical_data, categorical_data], axis = 1).reset_index(drop = True)

In [None]:
full_data.columns = full_data.columns.str.lower().str.replace(' ', '_')

In [None]:
full_data

## Check data balance (classes)

In [None]:
# value_counts() sorts by frequency, so unpacking it positionally would put
# the majority class into `accepted` regardless of its label. Look the counts
# up by encoded label instead (1 = accepted, 0 = not accepted).
class_counts = full_data['offer_accepted'].value_counts()
accepted = class_counts.get(1, 0)
not_accepted = class_counts.get(0, 0)

In [None]:
# Share of each outcome over all customers, printed with two decimals.
total_customers = accepted + not_accepted
print('Percentage of those clients that accepted the offer: {: .2f}%'.format(accepted / total_customers * 100))
print("Percentage of those clients that didn't accept the offer: {: .2f}%".format(not_accepted / total_customers * 100))

In [None]:
sns.countplot(x = full_data['offer_accepted'], data = full_data)
plt.show()

* We can see that the classes of our target are imbalanced.

## Split into train and test dataset

### Initial model

In [None]:
y = full_data['offer_accepted']

In [None]:
y.shape

In [None]:
X = full_data.drop(columns=['offer_accepted'])

In [None]:
X.columns

In [None]:
X.shape

In [None]:
# Hold out 20% of the rows for testing. The target is imbalanced, so stratify
# on it to keep the same class proportions in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
y_train.shape

In [None]:
y_test.shape

In [None]:
# Feature columns that get scaled.
numerical_cols = [
    'bank_accounts_open',
    'credit_cards_held',
    'homes_owned',
    'household_size',
    'balance_q1',
    'balance_q2',
    'balance_q3',
    'balance_q4',
]
# Encoded categorical feature columns, kept as they are.
categorical_cols = [
    'reward_cash_back',
    'reward_points',
    'mailer_type_postcard',
    'overdraft_protection_yes',
    'own_your_home_yes',
    'income_level',
    'credit_rating',
]

### Using standard scaler

In [None]:
scaler = StandardScaler()

In [None]:
X_train_scaler = scaler.fit_transform(X_train[numerical_cols])

In [None]:
# Wrap the scaled array in a DataFrame that reuses X_train's row labels.
# X_train comes out of train_test_split with its original, shuffled labels, and
# the column-wise concat with X_train[categorical_cols] aligns on the index.
# With a fresh 0..n-1 index the rows would be mismatched and padded with NaN.
X_train_scaler = pd.DataFrame(X_train_scaler, columns=numerical_cols, index=X_train.index)

In [None]:
X_train_scaler = pd.concat([X_train_scaler, X_train[categorical_cols]], axis= 1)

In [None]:
X_train_scaler

In [None]:
# Histogram with a KDE overlay for each column after the first four.
scaled_columns = X_train_scaler.columns[4:]
for column in scaled_columns:
    plt.figure(figsize=(4, 4))
    sns.histplot(X_train_scaler[column], kde=True)
    plt.show()

### Using MinMax scaler

In [None]:
min_max_scaler = MinMaxScaler()

In [None]:
X_train_min_max = min_max_scaler.fit_transform(X_train[numerical_data.drop(columns = ['offer_accepted']).columns])

In [None]:
X_train_min_max = pd.DataFrame(X_train_min_max, columns=numerical_data.drop(columns = ['offer_accepted']).columns)

In [None]:
X_train_min_max

In [None]:
# Histogram with a KDE overlay for each min-max scaled column after the first four.
min_max_columns = X_train_min_max.columns[4:]
for column in min_max_columns:
    plt.figure(figsize=(4, 4))
    sns.histplot(X_train_min_max[column], kde=True)
    plt.show()

### Using Power transform

In [None]:
power_transform = PowerTransformer()

In [None]:
X_train_power = power_transform.fit_transform(X_train[numerical_data.drop(columns = ['offer_accepted']).columns])

In [None]:
X_train_power = pd.DataFrame(X_train_power, columns=numerical_data.drop(columns = ['offer_accepted']).columns)

In [None]:
X_train_power

In [None]:
# Histogram with a KDE overlay for each power-transformed column after the
# first four. This section previously re-plotted the min-max scaled data.
for column in X_train_power.columns[4:]:
    plt.figure(figsize=(4, 4))
    sns.histplot(X_train_power[column], kde=True)
    plt.show()

## Train model

### Initial model

In [None]:
logistic_reg = LogisticRegression(random_state=0, solver='lbfgs', max_iter=700)

In [None]:
logistic_reg.fit(X_train, y_train)

In [None]:
logistic_reg.n_iter_

In [None]:
train_pred = logistic_reg.predict(X_train)

In [None]:
train_pred

In [None]:
test_pred = logistic_reg.predict(X_test)

In [None]:
test_pred

### Using balanced data

In [None]:
count_classes = full_data['offer_accepted'].value_counts()

In [None]:
count_classes

In [None]:
count_classes[0] / full_data.shape[0]

In [None]:
(count_classes[0]-count_classes[1])/(count_classes[0]+count_classes[1])

---------------------------------------

#### Oversampling

In [None]:
ros = RandomOverSampler(random_state=42)

In [None]:
X_train_over, y_train_over = ros.fit_resample(X_train,y_train)

In [None]:
y_train_over.value_counts()

In [None]:
print('Resampled dataset shape %s' % Counter(y_train_over))

#### Undersampling

In [None]:
rus = RandomUnderSampler(random_state=42)

In [None]:
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)

In [None]:
y_train_under.value_counts()

In [None]:
print('Resampled dataset shape %s' % Counter(y_train_under))

#### SMOTE

In [None]:
sm = SMOTE(random_state=100, k_neighbors=5)

X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)

In [None]:
y_train_SMOTE.value_counts()

In [None]:
print('Resampled dataset shape %s' % Counter(y_train_SMOTE))

#### TomekLinks

In [None]:
tl = TomekLinks()
X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)
y_train_tl.value_counts()

In [None]:
print('Resampled dataset shape %s' % Counter(y_train_tl))

## Test model

In [None]:
def get_scores(balance_method, test_sample, test_predictions):
    """Print classification metrics for one set of test predictions.

    Parameters
    ----------
    balance_method : str
        Name of the balancing technique, used only in the printed header.
    test_sample : array-like
        True labels that the predictions are compared against.
    test_predictions : array-like
        Labels predicted by the model for the same observations.

    Prints accuracy, precision, recall, F1 and Cohen's kappa, each with two
    decimals. Returns None.
    """
    # Score against the labels passed in rather than the global `y_test`, so
    # the function reports on whatever sample the caller supplies.
    print(f"Scores using {balance_method} to balance data")
    print("The accuracy in the TEST set using is: {:.2f}".format(accuracy_score(test_sample, test_predictions)))
    print("The precision in the TEST set is: {:.2f}".format(precision_score(test_sample, test_predictions)))
    print("The recall in the TEST set is: {:.2f}".format(recall_score(test_sample, test_predictions)))
    print("The F1 in the TEST set is: {:.2f}".format(f1_score(test_sample, test_predictions)))
    print("The Kappa in the TEST set is: {:.2f}".format(cohen_kappa_score(test_sample, test_predictions)))

### Initial model

In [None]:
print("Train accuracy initial model: {:.2f}".format(logistic_reg.score(X_train, y_train)))
print("Test accuracy initial model: {:.2f}".format(logistic_reg.score(X_test, y_test)))

In [None]:
cm_train = confusion_matrix(y_train, train_pred)
cm_test = confusion_matrix(y_test, test_pred)

In [None]:
disp = ConfusionMatrixDisplay(cm_train, display_labels=logistic_reg.classes_)
disp.plot()
plt.show()

In [None]:
disp = ConfusionMatrixDisplay(cm_test, display_labels=logistic_reg.classes_)
disp.plot()
plt.show()

### Using oversampling

In [None]:
# Fit a logistic regression on the randomly oversampled training set, then
# score its predictions for the test set.
lr = LogisticRegression(max_iter=500).fit(X_train_over, y_train_over)
y_pred_test_ros = lr.predict(X_test)

get_scores('OVERSAMPLING', y_test, y_pred_test_ros)

In [None]:
# Confusion matrix of the oversampling model on the test set.
cm_test = confusion_matrix(y_test, y_pred_test_ros)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm_test,
    display_labels=lr.classes_,
).plot()
plt.show()

### Using undersampling

In [None]:
# Fit a logistic regression on the randomly undersampled training set, then
# score its predictions for the test set.
lr = LogisticRegression(max_iter=1000).fit(X_train_under, y_train_under)
y_pred_test_rus = lr.predict(X_test)

get_scores('UNDERSAMPLING', y_test, y_pred_test_rus)

In [None]:
# Confusion matrix of the undersampling model on the test set.
cm_test = confusion_matrix(y_test, y_pred_test_rus)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm_test,
    display_labels=lr.classes_,
).plot()
plt.show()

### Using SMOTE

In [None]:
# Fit a logistic regression on the SMOTE-resampled training set, then score
# its predictions for the test set.
lr = LogisticRegression(max_iter=1200).fit(X_train_SMOTE, y_train_SMOTE)
y_pred_test_SMOTE = lr.predict(X_test)

get_scores('SMOTE', y_test, y_pred_test_SMOTE)

In [None]:
# Confusion matrix of the SMOTE model on the test set.
cm_test = confusion_matrix(y_test, y_pred_test_SMOTE)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm_test,
    display_labels=lr.classes_,
).plot()
plt.show()

### Using TomekLinks

In [None]:
# Fit a logistic regression on the training set cleaned with TomekLinks, then
# score its predictions for the test set.
lr = LogisticRegression(max_iter=1200)
lr.fit(X_train_tl, y_train_tl)
y_pred_test_tl = lr.predict(X_test)

# Label the report with the method actually used (it previously said 'SMOTE').
get_scores('TOMEKLINKS', y_test, y_pred_test_tl)

## Improve model

In [None]:
### Initial model 

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier(n_neighbors=10)

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)