In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
warnings.filterwarnings('ignore')


In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../artifacts/Data.csv')

In [None]:
# Display the loaded frame for a first look at the raw data.
df


In [None]:
# Give the target a short name, and rename PAY_0 to PAY_1 so the
# repayment-status columns are numbered from 1 like PAY_2 ... PAY_6.
column_fixes = {
    'default.payment.next.month': 'Defaulter',
    'PAY_0': 'PAY_1',
}
df.rename(columns=column_fixes, inplace=True)

In [None]:
# List the column labels to check the renames applied in the previous cell.
df.columns

In [None]:
# Number of rows in each class of the target column.
df['Defaulter'].value_counts()

In [None]:
# Bar chart of the target class counts, with 0/1 relabelled as No/Yes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Defaulter', palette='Set1', ax=ax)
plt.xticks([0, 1], ['No', 'Yes'])

### Categorical Variables

#### SEX

- 1 - Male
- 2 - Female


In [None]:
# Number of rows per SEX code.
df['SEX'].value_counts()

#### EDUCATION

- 1 = Graduate School
- 2 = University
- 3 = High School
- 4 = Others

In [None]:
# Number of rows per EDUCATION code (before any clean-up).
df['EDUCATION'].value_counts()

In the dataset, EDUCATION also contains the values 5, 6 and 0, for which we have no description, so we merge them into 4 (Others).

In [None]:
# Fold the undocumented EDUCATION codes (0, 5, 6) into 4 ("Others"),
# then show the new distribution.
undocumented = df['EDUCATION'].isin([0, 5, 6])
df.loc[undocumented, 'EDUCATION'] = 4
df['EDUCATION'].value_counts()

#### MARRIAGE

- 1 = Married
- 2 = Single
- 3 = Others 

In [None]:
# Number of rows per MARRIAGE code (before any clean-up).
df['MARRIAGE'].value_counts()

There are very few data points with MARRIAGE = 0 (an undocumented code), so we merge them into 3 (Others).

In [None]:
# Fold MARRIAGE code 0 into 3 ("Others"), then show the new distribution.
is_code_zero = df['MARRIAGE'].eq(0)
df.loc[is_code_zero, 'MARRIAGE'] = 3
df['MARRIAGE'].value_counts()

Plots of Categorical Features

In [None]:
# Columns treated as categorical in the plots below.
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']

In [None]:
# Work on an explicit copy: the cells below add a column to df_cat and relabel
# its values, and those writes must not target a selection of `df`
# (that is the pattern behind pandas' SettingWithCopyWarning).
df_cat = df[categorical_features].copy()
df_cat['Defaulter'] = df['Defaulter']

In [None]:
# Swap the numeric codes for readable labels, column by column, so the plots
# show category names instead of integers.
label_maps = {
    'SEX': {1: 'MALE', 2: 'FEMALE'},
    'EDUCATION': {1: 'Graduate School', 2: 'University', 3: 'High School', 4: 'Others'},
    'MARRIAGE': {1: 'Married', 2: 'Single', 3: 'Others'},
}
df_cat.replace(label_maps, inplace=True)

In [None]:
for col in categorical_features:
    # One two-panel figure per feature:
    #   left  - pie chart of the share of each category
    #   right - counts per category, split by Defaulter
    # No separate plt.figure() call: plt.subplots() already creates the figure,
    # so an extra one would only show up as an empty plot.
    fig, axes = plt.subplots(ncols=2, figsize=(13, 8))
    df_cat[col].value_counts().plot(kind="pie", ax=axes[0])
    # Target the right-hand panel explicitly rather than relying on whichever
    # axes matplotlib currently considers "current".
    sns.countplot(x=col, hue='Defaulter', data=df_cat, ax=axes[1])
    axes[1].legend(['No', 'Yes'])

#### Below are a few observations for the categorical features:

- There are more female credit card holders, so females also account for a larger share of the defaulters.
- A higher proportion of the defaulters are educated people (graduate school and university).
- A higher proportion of the defaulters are single.

#### LIMIT BALANCE

In [None]:
# Largest value in LIMIT_BAL.
df['LIMIT_BAL'].max()

In [None]:
# Smallest value in LIMIT_BAL.
df['LIMIT_BAL'].min()

In [None]:
# Summary statistics for LIMIT_BAL (count, mean, std, min, quartiles, max).
df['LIMIT_BAL'].describe()

In [None]:
# Mean LIMIT_BAL per target class, with each bar labelled by its height.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=df, x='Defaulter', y='LIMIT_BAL', palette='Set1', ci=None, ax=ax)
plt.xticks([0, 1], ['No', 'Yes'])
ax.set_ylim(0, 200000)
for bar in ax.patches:
    bar_height = bar.get_height()
    # Put the label slightly to the right of the bar's left edge, just above its top.
    ax.annotate(f"{bar_height:.2f}", (bar.get_x() + 0.20, bar_height + 100), fontsize=12)
plt.show()




In [None]:
# Distribution of LIMIT_BAL for each target class.
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(data=df, x="Defaulter", y="LIMIT_BAL", palette='Set1', ax=ax)
plt.xticks([0, 1], ['No', 'Yes'])


In [None]:

# Replace the numeric month suffixes (1..6) with month names for the three
# monthly column families: PAY_<n>, BILL_AMT<n> and PAY_AMT<n>.
# Index 1 maps to SEPT and index 6 to APR.
month_names = ['SEPT', 'AUG', 'JUL', 'JUN', 'MAY', 'APR']
month_columns = {}
for idx, month in enumerate(month_names, start=1):
    month_columns[f'PAY_{idx}'] = f'PAY_{month}'
    month_columns[f'BILL_AMT{idx}'] = f'BILL_AMT_{month}'
    month_columns[f'PAY_AMT{idx}'] = f'PAY_AMT_{month}'
df.rename(columns=month_columns, inplace=True)

In [None]:
# Preview the first rows to check the month-based column names.
df.head()

#### AGE

In [None]:
# Number of rows per distinct AGE value.
df['AGE'].value_counts()

In [None]:
# Store AGE with an integer dtype.
df['AGE'] = df['AGE'].astype(int)

In [None]:
# Distribution of AGE for each target class.
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(data=df, x='Defaulter', y='AGE', palette='Set1', ax=ax)
plt.xticks([0, 1], ['No', 'Yes'])

#### Bill Payment

In [None]:
# Monthly bill-amount columns, September back to April.
bill_columns = [
    'BILL_AMT_SEPT',
    'BILL_AMT_AUG',
    'BILL_AMT_JUL',
    'BILL_AMT_JUN',
    'BILL_AMT_MAY',
    'BILL_AMT_APR',
]
bill_amnt_df = df[bill_columns]

In [None]:
# Pairwise scatter plots of the monthly bill amounts.
# sns.pairplot builds its own figure, so a preceding plt.figure(figsize=...)
# does not size it and only leaves an empty figure in the output. Use
# pairplot's `height` / `aspect` arguments if the panels need resizing.
sns.pairplot(data=bill_amnt_df)

In [None]:
# Repayment-status columns, September back to April.
pay_df = df[['PAY_SEPT', 'PAY_AUG', 'PAY_JUL', 'PAY_JUN', 'PAY_MAY', 'PAY_APR']]

# One count plot per month, split by target class, with each bar labelled.
for status_col in pay_df.columns:
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.countplot(data=df, x=status_col, hue='Defaulter', palette='Set1', ax=ax)
    ax.set_ylim(0, 18000)
    ax.legend(['No', 'Yes'])
    for bar in ax.patches:
        bar_height = bar.get_height()
        ax.annotate(f"{bar_height:.0f}", (bar.get_x() + 0.001, bar_height + 100), fontsize=8)
plt.show()





#### Amount Paid 

In [None]:
# Monthly amount-paid columns (September back to April) plus the target,
# which is used to colour the pair plot.
pay_amount_columns = [
    'PAY_AMT_SEPT',
    'PAY_AMT_AUG',
    'PAY_AMT_JUL',
    'PAY_AMT_JUN',
    'PAY_AMT_MAY',
    'PAY_AMT_APR',
]
pay_amnt_df = df[pay_amount_columns + ['Defaulter']]



In [None]:

# Pairwise scatter plots of the monthly payment amounts, coloured by target class.
sns.pairplot(pay_amnt_df, hue='Defaulter')

#### Using SMOTE to balance the data

In [None]:
# Oversample with SMOTE so both target classes end up equally represented.
# SMOTE draws its synthetic samples at random; pin the seed so a
# Restart & Run All reproduces the same resampled data and saved CSV.
# NOTE(review): every remaining column of df is passed in as a feature. If the
#   frame still has a row-identifier column, drop it first - TODO confirm.
# NOTE(review): SMOTE interpolates between rows, so coded columns such as SEX,
#   EDUCATION and MARRIAGE can receive non-integer values; SMOTENC may fit
#   better - TODO confirm.
# NOTE(review): the whole dataset is resampled here; if a test split is taken
#   from processed_data later it will contain synthetic rows - TODO confirm.
smote = SMOTE(random_state=42)
x_smote, y_smote = smote.fit_resample(df.drop('Defaulter', axis=1), df['Defaulter'])


In [None]:
# Display the resampled features returned by fit_resample.
x_smote

In [None]:
# Display the resampled target returned by fit_resample.
y_smote

In [None]:
# Put the resampled features and target side by side in one frame.
resampled_parts = [x_smote, y_smote]
processed_data = pd.concat(resampled_parts, axis='columns')

In [None]:
# Display the combined, resampled dataset.
processed_data

In [None]:
# (rows, columns) of the resampled dataset.
processed_data.shape

In [None]:
# Write the resampled dataset to the artifacts directory, without the index column.
processed_data.to_csv('../artifacts/processed_data.csv', index=False)