# EDA steps:
•	Import the data set
•	Slice the data
•	Check data characteristics
•	Calculate statistics: mean, median, standard deviation
•	Basic plotting of the data
•	Correlation computation

Data pre-processing steps:
•	Check for and fill missing values
•	Import the libraries
•	Read the bureau data
•	Handle / create dummy variables
•	Group-by function
•	Logistic regression
•	Dimensionality reduction
•	Clustering of similar observations

In [2]:
# Import the Data-set & Import the Libraries

In [3]:
# numpy and pandas for data manipulation
import pandas as pd
import numpy as np
# matplotlib and seaborn for plotting
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns 

Read Data

In [5]:
# Load the two application tables and the two bureau tables.
# header='infer' is the pandas default, so it is not spelled out.
app_train = pd.read_csv("/dbfs/FileStore/tables/application_train.csv")
app_test = pd.read_csv("/dbfs/FileStore/tables/application_test.csv")
bureau = pd.read_csv("/dbfs/FileStore/tables/bureau.csv")
bureau_balance = pd.read_csv("/dbfs/FileStore/tables/bureau_balance.csv")


Slicing data

Check data characteristics & calculate statistics: mean, median, standard deviation

In [8]:
# How many rows carry each distinct TARGET value
app_train.loc[:, 'TARGET'].value_counts()

In [9]:
# Number of non-null AMT_INCOME_TOTAL entries in the training table
income_non_null = app_train['AMT_INCOME_TOTAL'].count()
print('AMT_INCOME_TOTAL Train', income_non_null)

Basic plotting of the data

In [11]:
# Histogram of the TARGET column after casting it to int
target_as_int = app_train['TARGET'].astype(int)
target_as_int.plot(kind='hist')

Correlation computation

In [13]:
# Find correlations with the target and sort (ascending).
# Only numeric/boolean columns are correlated: app_train still contains raw
# object (string) columns here, and recent pandas releases raise on them
# inside DataFrame.corr() instead of silently dropping them.
numeric_columns = app_train.select_dtypes(include=['number', 'bool'])
correlations = numeric_columns.corr()['TARGET'].sort_values()

# Display correlations: the tail holds the largest values, the head the smallest
print('Most Positive Correlations:\n', correlations.tail(20))
print('\nMost Negative Correlations:\n', correlations.head(20))

Correlation Heat Map

In [15]:
# Pairwise correlations between TARGET, the three EXT_SOURCE columns and DAYS_BIRTH
ext_columns = ['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
ext_data = app_train[ext_columns]
ext_data_corrs = ext_data.corr()

# Render the correlation matrix as an annotated heatmap on a fixed colour range
plt.figure(figsize=(8, 6))
sns.heatmap(ext_data_corrs, annot=True, vmin=-0.25, vmax=0.6, cmap=plt.cm.RdYlBu_r)
plt.title('Correlation Heatmap');

Check for and fill the missing values

In [17]:
# Function to calculate missing values by column
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        indexed by column name, with the columns ``'Missing Values'`` (count)
        and ``'% of Total Values'`` (share of rows, rounded to one decimal),
        sorted by percentage in descending order.

    Also prints a two-line summary: the number of columns in ``df`` and how
    many of them contain missing values.
    """
    n_rows = len(df)

    # Count the missing entries once and derive the percentage from the count
    missing_counts = df.isnull().sum()
    if n_rows:
        missing_percent = 100 * missing_counts / n_rows
    else:
        # An empty frame has nothing missing; dividing by zero would give NaN,
        # which would then be reported as "missing" for every column.
        missing_percent = missing_counts.astype(float)

    # Combine both series into one table with descriptive column names
    summary = pd.concat(
        [missing_counts, missing_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Keep only columns that actually have gaps, worst first
    summary = (
        summary[summary['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(summary.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return summary

In [18]:
 # Missing values statistics
# Build the missing-value summary for the training table and show its first 20 rows
missing_values = missing_values_table(app_train)
missing_values.iloc[:20]

Exploration of Bureau Data

Import the Libraries

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22.
# SimpleImputer accepts the same `strategy=` argument used in this notebook,
# so it is bound to the old name and later cells keep working on both versions.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn releases that predate sklearn.impute
    from sklearn.preprocessing import Imputer
from sklearn.preprocessing import scale
# Suppress all warnings (this filter is global, not limited to pandas)
import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')

Group By Function

In [23]:
# Count the bureau records (SK_ID_BUREAU) per client id (SK_ID_CURR),
# then give the count column a descriptive name.
loans_per_client = bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count()
previous_loan_counts = loans_per_client.rename(
    columns={'SK_ID_BUREAU': 'previous_loan_counts'}
)
previous_loan_counts.head()

In [24]:
# app_train left join previous loan data.
# merge() is used because both frames carry SK_ID_CURR as a regular column.
# DataFrame.join(on=...) compares the caller's column with the *index* of the
# other frame; previous_loan_counts was built with as_index=False, so its index
# is a plain row number and the counts would be attached to the wrong clients.
train = app_train.merge(previous_loan_counts, on='SK_ID_CURR', how='left')

# Clients without a match in the bureau table get a count of 0
train['previous_loan_counts'] = train['previous_loan_counts'].fillna(0)
train.head()

In [25]:
# one-hot encoding of categorical variables
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

# Report the frames that were just encoded (previously this printed the shape
# of `train`, the un-encoded joined frame)
print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)


In [26]:
# Save the labels first: the inner column alignment below keeps only columns
# present in both frames, and they are re-attached afterwards.
train_labels = app_train['TARGET']

app_train, app_test = app_train.align(app_test, axis=1, join='inner')
app_train['TARGET'] = train_labels

for shape_label, frame in (('Training Features shape: ', app_train),
                           ('Testing Features shape: ', app_test)):
    print(shape_label, frame.shape)

In [27]:
# Logistic Regression
from sklearn.preprocessing import MinMaxScaler
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;
# SimpleImputer takes the same `strategy=` argument, so alias it to the old name.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn releases that predate sklearn.impute
    from sklearn.preprocessing import Imputer

# one-hot encoding of categorical variables
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

# Match the columns in the dataframes (inner join on columns: anything that
# exists in only one of the two frames is dropped)
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)
print('Training shape: ', app_train.shape)
print('Testing shape: ', app_test.shape)


In [28]:
# Restrict both frames to their shared columns and report the resulting shapes
aligned_frames = app_train.align(app_test, axis=1, join='inner')
app_train, app_test = aligned_frames

print('Training set full shape: ', app_train.shape)
print('Testing set full shape: ', app_test.shape)

In [29]:
# Work on a copy of the testing data
app_test = app_test.copy()

# Preprocessing steps: median imputation, then scaling of each feature to 0-1
imputer = Imputer(strategy='median')
scaler = MinMaxScaler(feature_range=(0, 1))

# Both steps are fitted on the training data only and then applied to the
# testing data; from here on app_train / app_test hold the transformed output.
app_train = imputer.fit_transform(app_train)
app_test = imputer.transform(app_test)

app_train = scaler.fit_transform(app_train)
app_test = scaler.transform(app_test)

print('Training data shape: ', app_train.shape)
print('Testing data shape: ', app_test.shape)

In [30]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C = 0.0001 (C is the inverse regularization strength)
log_reg = LogisticRegression(C=1e-4)

# Fit on the preprocessed training data and the saved labels
log_reg.fit(X=app_train, y=train_labels)

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
# Reload the raw application tables (header='infer' is the pandas default)
app_train, app_test = (
    pd.read_csv(csv_path)
    for csv_path in ("/dbfs/FileStore/tables/application_train.csv",
                     "/dbfs/FileStore/tables/application_test.csv")
)

In [32]:
# Naive Bayes
# Save the labels: TARGET exists only in the training table, so the inner
# alignment below removes it from app_train.
train_labels = app_train['TARGET']

# One-hot encode BOTH frames. Encoding only app_train left the two frames with
# no categorical column in common, so the inner alignment dropped all of them.
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

# Match the columns in the dataframes
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)
print('Training shape: ', app_train.shape)
print('Testing shape: ', app_test.shape)

In [33]:
# `metrics` is used below; import it here so the cell does not depend on a later cell
from sklearn import metrics

# Features must not contain the label. errors='ignore' makes the drop a no-op
# when TARGET was already removed by the column alignment.
nb_features = app_train.drop(columns=['TARGET'], errors='ignore')

# Split features AND labels into training and test parts.
# train_test_split returns (X_train, X_test, y_train, y_test) for two inputs.
# Assumes train_labels is row-aligned with app_train (same file, same order).
X_train, X_test, y_train, y_test = train_test_split(
    nb_features, train_labels, test_size=0.3, random_state=0)

# GaussianNB does not accept NaN: fill gaps with medians computed on the
# training part only, then reuse those medians for the test part.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# fit a Naive Bayes model to the data (features, labels)
model = GaussianNB()
model.fit(X_train, y_train)
print(model)

# make predictions on the held-out part and compare them with its true labels
expected = y_test
predicted = model.predict(X_test)

# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

In [34]:
# Decision Tree Classifier
from sklearn import datasets
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

# load the datasets
app_train = pd.read_csv("/dbfs/FileStore/tables/application_train.csv", header='infer')
app_train = pd.get_dummies(app_train)

# Separate the label from the features so TARGET cannot leak into the model
tree_labels = app_train['TARGET']
tree_features = app_train.drop(columns=['TARGET'])

# Split features AND labels into training and test parts.
# train_test_split returns (X_train, X_test, y_train, y_test) for two inputs.
X_train, X_test, y_train, y_test = train_test_split(
    tree_features, tree_labels, test_size=0.3, random_state=0)

# The original `app_train.dropna()` discarded its result, so NaNs stayed in the
# data. Gaps are filled with training-split medians instead of dropping rows,
# in line with the median imputation used for the logistic regression.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# fit a CART model to the data (features, labels)
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
print(model)

# make predictions on the held-out part and compare them with its true labels
expected = y_test
predicted = model.predict(X_test)

# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

In [35]:
# Support Vector Machine
from sklearn import datasets
from sklearn import metrics
from sklearn.svm import SVC

# load the datasets
app_train = pd.read_csv("/dbfs/FileStore/tables/application_train.csv", header='infer')
app_train = pd.get_dummies(app_train)

# Separate the label from the features so TARGET cannot leak into the model
svm_labels = app_train['TARGET']
svm_features = app_train.drop(columns=['TARGET'])

# Split features AND labels into training and test parts.
# train_test_split returns (X_train, X_test, y_train, y_test) for two inputs.
X_train, X_test, y_train, y_test = train_test_split(
    svm_features, svm_labels, test_size=0.3, random_state=0)

# The original `app_train.dropna()` discarded its result, so NaNs stayed in the
# data. Gaps are filled with training-split medians instead of dropping rows.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# fit a SVM model to the data (features, labels)
# NOTE(review): SVC fit time grows at least quadratically with the number of
# samples and the features are unscaled here; consider subsampling, scaling
# or LinearSVC if this cell is too slow. Confirm before relying on it.
model = SVC()
model.fit(X_train, y_train)
print(model)

# make predictions on the held-out part and compare them with its true labels
expected = y_test
predicted = model.predict(X_test)

# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

In [36]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

# Classifier settings collected in one place, then passed to the constructor
rf_settings = dict(n_estimators=100, random_state=50, verbose=1, n_jobs=-1)
random_forest = RandomForestClassifier(**rf_settings)


In [37]:
# Labels: taken from app_train when it still carries TARGET, otherwise fall
# back to the labels saved earlier in the notebook.
rf_labels = app_train['TARGET'] if 'TARGET' in app_train else train_labels

# Build train/test feature frames with identical columns and without the label
rf_train = app_train.drop(columns=['TARGET'], errors='ignore')
rf_train, rf_test = rf_train.align(pd.get_dummies(app_test), join='inner', axis=1)

# Fill gaps with training medians so the forest receives NaN-free input
rf_medians = rf_train.median(numeric_only=True)
rf_train = rf_train.fillna(rf_medians)
rf_test = rf_test.fillna(rf_medians)

# Column names, in the order the model sees them (previously `features` was undefined)
features = list(rf_train.columns)

random_forest.fit(rf_train, rf_labels)

# Extract feature importances
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': features, 'importance': feature_importance_values})

# Make predictions on the test data (second column of predict_proba)
predictions = random_forest.predict_proba(rf_test)[:, 1]

In [38]:
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier #For Classification
from sklearn.ensemble import AdaBoostRegressor #For Regression
from sklearn.tree import DecisionTreeClassifier

# load the datasets
app_train = pd.read_csv("/dbfs/FileStore/tables/application_train.csv", header='infer')
app_train = pd.get_dummies(app_train)
app_train = app_train.dropna(how="all")  # removes only rows that are entirely empty

# Separate the label from the features (the original referenced an undefined
# name `predict` and trained on a frame that still contained TARGET)
ada_labels = app_train['TARGET']
ada_features = app_train.drop(columns=['TARGET'])

# Split dataset in training and test datasets.
# train_test_split returns (X_train, X_test, y_train, y_test) for two inputs.
X_train, X_test, y_train, y_test = train_test_split(
    ada_features, ada_labels, test_size = 0.25, random_state = 42)

# Fill remaining gaps with medians computed on the training part only
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Decision tree as the base estimator; any learner that accepts sample weights
# can be used instead. It is passed positionally because the keyword was
# renamed from `base_estimator` to `estimator` in newer scikit-learn releases.
dt = DecisionTreeClassifier()
clf = AdaBoostClassifier(dt, n_estimators=100, learning_rate=1)

# Train on the training part only, so X_test / y_test stay available for evaluation
clf.fit(X_train, y_train)

In [39]:
#PCA

In [40]:
#PCA
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# matplotlib and seaborn for visualizations
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 22
import seaborn as sns

# Suppress all warnings (this filter is global, not limited to pandas)
import warnings
warnings.filterwarnings('ignore')

from sklearn.decomposition import PCA
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;
# SimpleImputer takes the same `strategy=` argument, so alias it to the old name.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn releases that predate sklearn.impute
    from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline

app_train = pd.read_csv("/dbfs/FileStore/tables/application_train.csv", header='infer')
app_test = pd.read_csv("/dbfs/FileStore/tables/application_test.csv", header='infer')

In [41]:
# Share of missing entries per training column (a 0-1 fraction), largest first
train_null_share = app_train.isnull().mean()
app_train_missing = train_null_share.sort_values(ascending=False)
app_train_missing.head()

In [42]:
# Share of missing entries per testing column (a 0-1 fraction), largest first
test_null_share = app_test.isnull().mean()
app_test_missing = test_null_share.sort_values(ascending=False)
app_test_missing.head()

In [43]:
 # One hot encoding
# Encode the categorical columns of both frames as indicator columns
app_train, app_test = (pd.get_dummies(frame) for frame in (app_train, app_test))

# Keep only the columns the two frames have in common
app_train, app_test = app_train.align(app_test, axis=1, join='inner')
print('Training shape: ', app_train.shape)
print('Testing shape: ', app_test.shape)


In [44]:
# Restrict both frames to their shared columns and report the resulting shapes
aligned_frames = app_train.align(app_test, axis=1, join='inner')
app_train, app_test = aligned_frames

print('Training set full shape: ', app_train.shape)
print('Testing set full shape: ', app_test.shape)

In [45]:
# Two-step pipeline: median imputation followed by PCA with default settings
impute_then_pca = [
    ('imputer', Imputer(strategy='median')),
    ('pca', PCA()),
]
pipeline = Pipeline(steps=impute_then_pca)

# Fit on the training data and project it; the testing data is only projected
app_train_pca = pipeline.fit_transform(app_train)
app_test_pca = pipeline.transform(app_test)

In [46]:
# Extract the pca object
pca = pipeline.named_steps['pca']

# Cumulative share of variance explained by the first 1, 2, ..., k components
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# The x axis counts components starting at 1, so the value at x is the variance
# explained by the first x components (it previously started at 0). Its length
# comes from the PCA output itself rather than from app_train.shape[1], so the
# x and y arrays always have matching sizes.
component_counts = np.arange(1, len(cumulative_variance) + 1)

# Plot the cumulative variance explained
plt.figure(figsize = (10, 8))
plt.plot(component_counts, cumulative_variance, 'r-')
plt.xlabel('Number of PC'); plt.ylabel('Cumulative Variance Explained');
plt.title('Cumulative Variance Explained with PCA');