# Bureaus Files Analysis

* Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

* Read data from files

In [None]:
# Load both credit-bureau tables in one pass; the order of the paths matches
# the order of the names on the left-hand side.
bureau, bureau_balance = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/bureau.csv', '../data/bureau_balance.csv')
)

* Print the shape of the data

In [None]:
# Report the (rows, columns) tuple of each loaded table.
print("Shape of the following data:")
for caption, table in (("Bureau: ", bureau), ("Bureau balance: ", bureau_balance)):
    print(caption, table.shape)

* Print the information about the data and datatypes for each file

In [None]:
# Summarise the columns and dtypes of the bureau table.
print("Bureau data:")
bureau.info()

# Empty line as a visual separator, then the same summary for bureau_balance.
print("")
print("Bureau balance data:")
bureau_balance.info()

* Print the head of both files, so that we can see what kind of data we are actually working with

In [None]:
bureau.head()

In [None]:
bureau_balance.head()

* Let's look at the object values, since we would probably like to change them into integer type of data

* First, bureau file:

In [None]:
bureau['CREDIT_ACTIVE'].value_counts()

In [None]:
bureau['CREDIT_CURRENCY'].value_counts()

In [None]:
bureau['CREDIT_TYPE'].value_counts()

* For the bureau file, the plan is to drop the CREDIT_CURRENCY and CREDIT_TYPE columns, since they probably don't carry much information for the analysis, and to encode CREDIT_ACTIVE as 1 for Active credits and 0 for all other values. Both steps are currently commented out in the cells below, so the columns are left unchanged for now

In [None]:
#bureau = bureau.drop(['CREDIT_TYPE'], axis=1)
#bureau = bureau.drop(['CREDIT_CURRENCY'], axis=1)

In [None]:
#bureau['CREDIT_ACTIVE'] = bureau['CREDIT_ACTIVE'].replace(to_replace='Active', value=1)
#bureau['CREDIT_ACTIVE'] = bureau['CREDIT_ACTIVE'].replace(to_replace='Closed', value=0)
#bureau['CREDIT_ACTIVE'] = bureau['CREDIT_ACTIVE'].replace(to_replace='Sold', value=0)
#bureau['CREDIT_ACTIVE'] = bureau['CREDIT_ACTIVE'].replace(to_replace='Bad debt', value=0)

* Let's look at the changes

In [None]:
bureau.info()

In [None]:
bureau['CREDIT_ACTIVE'].value_counts()

* Now, it's time to clean up the bureau_balance file

In [None]:
bureau_balance.info()

* Let's look at the STATUS field, since it's an object

In [None]:
bureau_balance['STATUS'].value_counts()

* I don't know yet what these status codes mean, but it should probably be fine to handle them with One Hot Encoding - we will do that later

* Let's handle the missing values!

In [None]:
bureau.isnull().sum()

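* My first approach was to simply replace the NaN values with zeros (kept below, commented out); the cell now calls the project helper `fill_missing_values` with `mean = True` instead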

In [None]:
# Earlier approach, kept for reference only: overwrite the NaNs of each
# affected column with 0. Disabled in favour of the helper call below.
#bureau['DAYS_CREDIT_ENDDATE'].replace(np.nan, 0, inplace=True)
#bureau['DAYS_ENDDATE_FACT'].replace(np.nan, 0, inplace=True)
#bureau['AMT_CREDIT_MAX_OVERDUE'].replace(np.nan, 0, inplace=True)
#bureau['AMT_CREDIT_SUM'].replace(np.nan, 0, inplace=True)
#bureau['AMT_CREDIT_SUM_DEBT'].replace(np.nan, 0, inplace=True)
#bureau['AMT_CREDIT_SUM_LIMIT'].replace(np.nan, 0, inplace=True)
#bureau['AMT_ANNUITY'].replace(np.nan, 0, inplace=True)

# Project-local helper (module name is spelled "DataPreperation" in the repo).
from Functions.DataPreperation import fill_missing_values

# NOTE(review): the return value is discarded, so this presumably fills the
# NaNs of `bureau` in place, using column means when mean=True - confirm
# against the helper's implementation.
fill_missing_values(bureau, mean = True)

In [None]:
bureau.isnull().sum()

* Very nice

* Now, it's time to check bureau_balance!

In [None]:
bureau_balance.isnull().sum()

* No work to be done here!

* Let's check, if there are any duplicates!

In [None]:
# Boolean mask marking rows of bureau that repeat an earlier row in every
# column; summing the mask gives the number of such duplicates.
duplicates = bureau.duplicated()
print(duplicates.sum())

In [None]:
# Same duplicate-row count for bureau_balance (rebinds `duplicates`).
duplicates = bureau_balance.duplicated()
print(duplicates.sum())

* There are no duplicates!

* Now, let's look at the possible correlations between these two files and the application_train.csv

In [None]:
app = pd.read_csv('../data/application_train.csv')

In [None]:
app.shape

* First, let's analyse the bureau file

In [None]:
# Attach the application-level TARGET to every bureau record by looking it up
# through the applicant id. Assigning `app['TARGET']` directly aligns the two
# frames on their row index, which pairs each bureau row with whatever
# application happens to share its row number instead of the matching one.
# NOTE(review): assumes both tables carry the applicant id as SK_ID_CURR and
# that it is unique in application_train - confirm against the data dictionary.
target_by_applicant = app.set_index('SK_ID_CURR')['TARGET']
bureau['TARGET'] = bureau['SK_ID_CURR'].map(target_by_applicant)

In [None]:
bureau.isnull().sum()

In [None]:
# Keep only the bureau rows that actually received a TARGET value.
bureau = bureau.dropna(subset=['TARGET'])

In [None]:
bureau.isnull().sum()

In [None]:
# Pearson correlation of every numeric bureau column with TARGET, sorted
# ascending. Object columns are excluded explicitly: since pandas 2.0
# `DataFrame.corr()` no longer drops them silently and raises on string data.
bur_correlations = (
    bureau.select_dtypes(include='number')
    .corr()['TARGET']
    .sort_values()
)

# Ascending order: the tail holds the largest coefficients (including TARGET
# itself at 1.0), the head the most negative ones.
print('Max correlations: \n', bur_correlations.tail(10))
print('Min correlations: \n', bur_correlations.head(10))

* The correlations are bad

* Now, let's look at the bureau_balance

In [None]:
# Attach TARGET to every monthly balance row by id lookup rather than by row
# position: `bureau_balance['TARGET'] = app['TARGET']` aligns on the row
# index and therefore labels balance rows with unrelated applications.
# The lookup goes credit id -> applicant id (via bureau) -> TARGET (via app).
# Only credits still present in `bureau` at this point can be labelled; the
# remaining rows get NaN.
# NOTE(review): assumes the key columns are SK_ID_BUREAU (unique in bureau)
# and SK_ID_CURR (unique in application_train) - confirm against the data
# dictionary.
target_by_applicant = app.set_index('SK_ID_CURR')['TARGET']
target_by_bureau = (
    bureau.set_index('SK_ID_BUREAU')['SK_ID_CURR'].map(target_by_applicant)
)
bureau_balance['TARGET'] = bureau_balance['SK_ID_BUREAU'].map(target_by_bureau)

In [None]:
bureau_balance.isnull().sum()

In [None]:
# Keep only the balance rows that actually received a TARGET value.
bureau_balance = bureau_balance.dropna(subset=['TARGET'])

In [None]:
bureau_balance.isnull().sum()

In [None]:
# Pearson correlation of every numeric bureau_balance column with TARGET,
# sorted ascending. The object-typed STATUS column is excluded explicitly:
# since pandas 2.0 `DataFrame.corr()` raises on string data instead of
# silently skipping it.
bur_bal_correlations = (
    bureau_balance.select_dtypes(include='number')
    .corr()['TARGET']
    .sort_values()
)

# Ascending order: the tail holds the largest coefficients (including TARGET
# itself at 1.0), the head the most negative ones.
print('Max correlations: \n', bur_bal_correlations.tail(10))
print('Min correlations: \n', bur_bal_correlations.head(10))

* Here, the correlations are really bad as well!

# Conclusion

* For both files the correlations with TARGET are very low; in their current state the files are of little use for further analysis and could do more harm than good to our model

* It may be worth applying One Hot Encoding to bureau_balance, joining the two files, trying different kinds of feature engineering, dropping columns that seem useless, and filling the NaN values with the median instead

In [None]:
# Import only the two helpers this cell uses instead of a wildcard import,
# so it is clear where each name comes from and nothing else in the notebook
# namespace gets shadowed.
from Functions.FeatureEngineering import group_numeric_values, group_object_values

# NOTE(review): judging by their names, the helpers aggregate the numeric and
# the object columns of `bureau` respectively, with 'bureau' passed as a label
# - confirm against Functions/FeatureEngineering.
bureau_numeric = group_numeric_values(bureau, 'bureau')
bureau_object = group_object_values(bureau, 'bureau')