## Importing Data

In [1]:
import pandas as pd
import os
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from functions import *

[Data Being Used](https://www.kaggle.com/zaurbegiev/my-dataset#credit_train.csv)

In [2]:
# Build the CSV location relative to the notebook's working directory
data = os.path.join('Data', 'bank.csv')
# Read the raw file into the DataFrame used throughout the notebook
bank_df = pd.read_csv(filepath_or_buffer=data)

## Looking at the DataFrame

In [None]:
# Preview the first five rows
bank_df.head(n=5)

In [None]:
# Preview the last five rows
bank_df.tail(n=5)

## Missing values

In [None]:
# Number of null entries in each column
bank_df.isnull().sum()

## Visualizing Missing Values

In [None]:
# Count the nulls once and reuse the result for both axes
missing_counts = bank_df.isna().sum()

fig, ax = plt.subplots(figsize=(15, 5))

# One bar per column; bar height is the number of missing values in that column
x = missing_counts.index
y = missing_counts
ax.bar(x=x, height=y)

# Rotate the tick labels the axis already has instead of calling
# set_xticklabels(), which replaces labels without fixing the tick positions
ax.tick_params(axis='x', labelrotation=45)

# Label the figure so it can be read on its own
ax.set_xlabel('Column')
ax.set_ylabel('Missing values')
ax.set_title('Missing values per column')
plt.tight_layout();

In [None]:
# Removing sparsely populated features: a column is kept only when at least
# 30% of its rows are non-null (columns that are more than 70% NaN are dropped).
min_non_null_share = 0.3

# notna().mean() is the share of non-null rows per column
# (the same ratio as df.count() / len(df), since count() ignores NaN values)
non_null_share = bank_df.notna().mean()

# .copy() makes df2 an independent frame, so modifying it later does not
# trigger SettingWithCopyWarning
df2 = bank_df.loc[:, non_null_share >= min_non_null_share].copy()

# Columns in the original df that are not found in the new df were dropped
dropped_columns = [str(c) for c in bank_df.columns if c not in df2.columns]

# Report the dropped columns, or say so explicitly when there are none.
# (The previous for/else printed "No Columns Dropped" on every run because
# the loop never hit a break.)
print("List of dropped columns:", end=" ")
if dropped_columns:
    print(", ".join(dropped_columns))
else:
    print("No Columns Dropped")
print('\n')

bank_df = df2

In [None]:
# Dropping every row that still contains at least one missing value.
# The result is rebound instead of using inplace=True: bank_df was produced
# by selecting columns from another frame, and mutating such a frame in place
# can raise SettingWithCopyWarning. Rebinding also keeps the cell safe to re-run.
bank_df = bank_df.dropna()

In [None]:
# (rows, columns) left after dropping missing values
bank_df.shape

## Dependent Variable

**Observation:**
The dependent variable has two values, and the classes will need to be balanced to get a more accurate model.

In [None]:
# Checking the relative frequency of each class of the dependent variable.
# The column names are normalised on a temporary view because the cell that
# replaces spaces with underscores runs further down the notebook; without
# this, `Loan_Status` would not exist yet on a fresh Restart & Run All.
# NOTE(review): assumes the raw CSV header is "Loan Status" -- confirm against the file.
(
    bank_df
    .rename(columns=lambda name: name.replace(' ', '_'))
    ['Loan_Status']
    .value_counts(normalize=True)
)

In [None]:
# Summary statistics, transposed so each feature is one row
bank_df.describe().transpose()

## Changing Column Names

In [None]:
# Swap spaces for underscores so every column can be reached as an attribute
bank_df.columns = [name.replace(' ', '_') for name in bank_df.columns]

## Data Distribution

In [None]:
# Pairwise relationship grid over the DataFrame
sns.pairplot(data=bank_df)

## Observation

The data seems to be mainly categorical, meaning that logistic regression would best fit this data.

The data would also have to be transformed; a log transformation may help normalize it.

## Checking Correlation

In [None]:
# Pairwise correlation between the numeric features. It is computed here so
# the cell does not rely on a `corr` variable left over from another cell.
# NOTE(review): no definition of `corr` is visible in this notebook -- confirm
# this is the matrix that was originally plotted.
corr = bank_df.select_dtypes(include='number').corr()

# Keep only the strong relationships: every coefficient that is neither
# >= 0.5 nor <= -0.4 becomes NaN and is left blank in the heatmap
strong_corr = corr[(corr >= 0.5) | (corr <= -0.4)]

plt.figure(figsize=(12, 10))

sns.heatmap(strong_corr,
            cmap='viridis', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

## Observation 
- There are a few instances of multicollinearity in the data

## Numeric Data Distribution

In [None]:
# Histogram grid for the DataFrame: 50 bins per panel, small tick labels
hist_options = dict(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)
bank_df.hist(**hist_options)
plt.show()

## Looking Into Credit Score Distribution
Credit score shows high values

In [None]:
# Open a 10x9 inch figure for the credit score distribution
plt.figure(figsize=(10, 9))

# Draw the distribution of Credit_Score.
# NOTE(review): distplot is deprecated since seaborn 0.11; histplot/displot
# are the documented replacements once the environment allows it.
sns.distplot(bank_df['Credit_Score'])
plt.show()

In [None]:
# Number of rows whose credit score exceeds 850
(bank_df['Credit_Score'] > 850).sum()

## Observation
- Data will need to be normalized/scaled
- Many of the distributions are skewed
- Credit score has values over 850, and the rows containing them will need to be dropped. The cap will be 850, so 1,955 rows will be dropped

# Checking Categorical Values

## Purpose Column
There are many distinct values in the Purpose column, and some of them will need to be removed
- Values to get rid of:
    - other and Other
    - moving
    - wedding
    - vacation
    - Educational Expenses
    - renewable_energy
    
This means that values with fewer than 100 occurrences will be dropped

In [None]:
# Occurrences of each distinct value in the Purpose column
bank_df['Purpose'].value_counts()

In [None]:
# Share of rows held by each Purpose value
bank_df['Purpose'].value_counts(normalize=True)

## Home Ownership Column

In [None]:
# Occurrences of each Home_Ownership value
bank_df['Home_Ownership'].value_counts()

## Years in Current Job Column

In [None]:
# Occurrences of each Years_in_current_job value
bank_df['Years_in_current_job'].value_counts()

## Term Column

In [None]:
# Share of rows held by each Term value
bank_df['Term'].value_counts(normalize=True)