In [1]:
# EDA steps:
•	Import the Data-set
•	Read data
•	Check data characters
•	Calculating statistics: mean, median, standard deviation
•	Basic plotting for data
•	Correlation computation
•   Correlation Heat Map

# Data pre-processing steps:
•	Import the Libraries
•	Check for and fill the missing values
•   Group by function
•   Logistic Regression & Handle/Create Dummy Variables
•   Decision Tree Classifier
•   Support Vector Machine
•   Random forest
•   AdaBoost
•	PCA

EDA Steps

In [3]:
# Import the Data-set & Import the Libraries

In [4]:
# numpy and pandas for data manipulation
import pandas as pd
import numpy as np
# matplotlib and seaborn for plotting
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import Imputer 
from sklearn.preprocessing import scale 

In [5]:
# Read Data

In [6]:
# Load the four raw tables. The generator reads the files in the order listed
# and each resulting frame is unpacked onto its own name.
app_train, app_test, bureau, bureau_balance = (
    pd.read_csv(csv_path, header='infer')
    for csv_path in (
        "/dbfs/FileStore/tables/application_train.csv",
        "/dbfs/FileStore/tables/application_test.csv",
        "/dbfs/FileStore/tables/bureau.csv",
        "/dbfs/FileStore/tables/bureau_balance.csv",
    )
)

In [7]:
# Check data characters & calculating statistics: mean, median, standard deviation

In [8]:
# Number of rows for each distinct value of the TARGET column
app_train.loc[:, 'TARGET'].value_counts()

In [9]:
# Print how many non-null AMT_INCOME_TOTAL entries the training table holds
print('AMT_INCOME_TOTAL Train', app_train['AMT_INCOME_TOTAL'].notnull().sum())

In [10]:
# Basic plotting for data

In [11]:
# Histogram of the TARGET column after casting it to integers
app_train['TARGET'].astype(int).plot(kind='hist')

In [12]:
# Correlation computation

In [13]:
# Find correlations with the target and sort (ascending).
# Correlation is only defined for numeric columns, so select them explicitly
# instead of relying on DataFrame.corr() to discard string columns on its own:
# from pandas 2.0 the default is numeric_only=False and string columns raise.
numeric_features = app_train.select_dtypes(include='number')
correlations = numeric_features.corr()['TARGET'].sort_values()

# Display correlations: the tail holds the largest values, the head the smallest
print('Most Positive Correlations:\n', correlations.tail(20))
print('\nMost Negative Correlations:\n', correlations.head(20))

In [14]:
# Correlation Heat Map

In [15]:
# Correlation matrix of TARGET, the three EXT_SOURCE columns and DAYS_BIRTH
ext_data = app_train[['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]
ext_data_corrs = ext_data.corr()

# Draw the annotated heatmap on an explicitly created 8x6 inch axes,
# with the colour scale clipped to [-0.25, 0.6]
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(ext_data_corrs, cmap=plt.cm.RdYlBu_r, vmin=-0.25, vmax=0.6, annot=True, ax=ax)
ax.set_title('Correlation Heatmap');

Data pre-processing steps

In [17]:
# Import the Libraries

In [18]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time
# Decision Tree Classifier
from sklearn import datasets
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# Support Vector Machine
from sklearn import datasets
from sklearn import metrics
from sklearn.svm import SVC
# Random forest
from sklearn.ensemble import RandomForestClassifier
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier #For Classification
from sklearn.ensemble import AdaBoostRegressor #For Regression
from sklearn.tree import DecisionTreeClassifier
# Re-read the application tables so both names hold the raw CSV contents again
app_train, app_test = [
    pd.read_csv(csv_path, header='infer')
    for csv_path in (
        "/dbfs/FileStore/tables/application_train.csv",
        "/dbfs/FileStore/tables/application_test.csv",
    )
]

In [19]:
# Silence every warning from here on (the filter is not limited to pandas)
import warnings
warnings.simplefilter('ignore')

# Draw all subsequent figures with the 'fivethirtyeight' style sheet
plt.style.use('fivethirtyeight')

In [20]:
# Check out & filled the missing values

In [21]:
def missing_values_table(df):
    """Summarise the missing values in each column of ``df``.

    Prints the number of columns in ``df`` and how many of them contain at
    least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        indexed by column name, with the columns ``'Missing Values'``
        (null count) and ``'% of Total Values'`` (null count as a percentage
        of the row count, rounded to one decimal). Rows are sorted by
        percentage, largest first. A frame with zero rows yields an empty
        table.
    """
    # Count the nulls once; the percentage is derived from this count
    mis_val = df.isnull().sum()

    n_rows = len(df)
    if n_rows:
        mis_val_percent = 100 * mis_val / n_rows
    else:
        # With zero rows, 0 / 0 would be NaN and every column would then pass
        # the "has missing values" filter below; report 0 % instead.
        mis_val_percent = mis_val.astype(float)

    # Two-column table keyed by the final column names
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Keep only columns with missing entries, sorted by percentage descending
    has_missing = mis_val_table['Missing Values'] != 0
    mis_val_table_ren_columns = (
        mis_val_table[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

In [22]:
# Build the missing-value summary for the training table and show its
# first 20 rows
missing_values = missing_values_table(app_train)
missing_values.iloc[:20]

In [23]:
# Group By Function

In [24]:
# For every SK_ID_CURR, count the non-null SK_ID_BUREAU entries in `bureau`.
# as_index=False keeps SK_ID_CURR as a regular column; the count column is
# renamed to 'previous_loan_counts'.
previous_loan_counts = (
    bureau
    .groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU']
    .count()
    .rename(columns={'SK_ID_BUREAU': 'previous_loan_counts'})
)
previous_loan_counts.head(5)

In [25]:
# Left-join the per-client loan count onto the application data.
# DataFrame.join(other, on=...) matches the caller's column against the INDEX
# of `other`. previous_loan_counts carries SK_ID_CURR as an ordinary column and
# has a positional index, so join() paired client ids with row numbers.
# merge() matches the SK_ID_CURR columns of both frames, which is what is meant.
train = app_train.merge(previous_loan_counts, on='SK_ID_CURR', how='left')

# Rows with no match in previous_loan_counts get NaN from the left merge; use 0
train['previous_loan_counts'] = train['previous_loan_counts'].fillna(0)

# previous_loan_counts has one row per SK_ID_CURR, so the merge must neither
# add nor drop applications
assert len(train) == len(app_train)
train.head()

In [26]:
## Model & Evaluation

# Next, I use five different models and evaluate them using the F1 score. Logistic regression turned out to have the highest F1 score (0.98), and the decision tree gave the lowest (0.89). The second-highest F1 scores came from SVM and random forest (0.91). AdaBoost and PCA gave an average F1 score of 0.90.

# The F scores differ for several reasons. Logistic regression can handle both continuous and categorical variables, which can increase the accuracy of the model. Decision trees have a tendency to overfit and are prone to sampling errors; tree splitting is also locally greedy, which may further decrease the accuracy. To increase the accuracy of the model, I used a random forest. Random forest (and all other variations of the decision tree method) will only tell you which predictors are more important for building the trees, without any information on the direction of association. Thus, this model gave a higher F score. The SVM model is an approximation to a bound on the test error rate, which can help increase the accuracy, but choosing a “good” kernel function is not easy, so it may lower the score. Meanwhile, AdaBoost is a powerful classification algorithm, but it can be sensitive to noisy data and outliers, which therefore lowers the F score. PCA is an efficient tool for reducing dimensionality, which may generate a higher F score.

In [27]:
#Logistic Regression & Handle /Create Dummy variable

In [28]:
# one-hot encoding of categorical variables
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

# Report the frames that were just encoded. The original printed the shape of
# `train`, a different frame that this cell does not encode.
print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

In [29]:
from sklearn.metrics import roc_auc_score

# load the datasets: re-read the training table, one-hot encode it and keep
# only the rows without any missing value
app_train = pd.read_csv("/dbfs/FileStore/tables/application_train.csv", header='infer')
app_train = pd.get_dummies(app_train)
app_train = app_train.dropna()

# Features are every column except the label
X = app_train.drop(['TARGET'], axis=1)
y = app_train['TARGET']

# 60/40 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Fit a logistic regression with default settings
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Unfitted second estimator; not used in the visible notebook, kept only in
# case a cell outside this view refers to it -- TODO confirm and remove
log_reg = LogisticRegression()

## make predictions
# Predict with the model fitted in THIS cell. The original called
# `model.predict`, but no `model` exists yet at this point of a top-to-bottom
# run, so the report either failed or described a different classifier.
y_pred = logreg.predict(X_test)
X_test_predicted = y_pred

# summarize the fit of the model
print(metrics.classification_report(y_test, X_test_predicted))
print(metrics.confusion_matrix(y_test, X_test_predicted))

In [30]:
# ROC_AUC score
from sklearn.metrics import roc_auc_score

# 60/40 hold-out split with a fixed seed, then a default logistic regression
# fitted on the training part
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
logreg = LogisticRegression().fit(X_train, y_train)

# Score the held-out rows using the second column of the predict_proba output
y_pred_prob = logreg.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=y_pred_prob)

In [31]:
# Decision Tree Classifier

In [32]:
# load the datasets: re-read the training table, one-hot encode it and keep
# only the rows without any missing value
app_train = pd.read_csv("/dbfs/FileStore/tables/application_train.csv", header='infer')
app_train = pd.get_dummies(app_train)
app_train = app_train.dropna()

# Features are every column except the label
X = app_train.drop(['TARGET'], axis=1)
y = app_train['TARGET']
# Split dataset in training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# fit a CART model to the data.
# The tree permutes the candidate features at every split, so without a seed
# equally good splits can be chosen differently from run to run and the
# reported metrics are not reproducible. Pin the seed like the split above.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
print(model)

# make predictions
X_test_predicted = model.predict(X_test)
# summarize the fit of the model
print(metrics.classification_report(y_test, X_test_predicted))
print(metrics.confusion_matrix(y_test, X_test_predicted))

In [33]:
# Support Vector Machine

In [34]:
# Drop incomplete rows from app_train. X and y are not rebuilt in this cell,
# so the split below uses whatever X / y are already in the namespace.
app_train = app_train.dropna()

# 70/30 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Train a support vector classifier with default settings and echo it
model = SVC().fit(X_train, y_train)
print(model)

# Predict the held-out rows, then print the per-class report and the
# confusion matrix
X_test_predicted = model.predict(X_test)
svm_report = metrics.classification_report(y_test, X_test_predicted)
svm_confusion = metrics.confusion_matrix(y_test, X_test_predicted)
print(svm_report)
print(svm_confusion)

In [35]:
# Random forest

In [36]:
# Make the random forest classifier: 100 trees, fixed seed, all CPU cores
random_forest = RandomForestClassifier(n_estimators=100, random_state=50, verbose=1, n_jobs=-1)
random_forest.fit(X_train, y_train)

# Scores for the test rows, taken from the second column of predict_proba
predictions = random_forest.predict_proba(X_test)[:, 1]

# Class labels predicted by the forest itself. The original printed the report
# from `X_test_predicted` without reassigning it, so the numbers belonged to
# whichever model had last set that variable, not to the random forest.
X_test_predicted = random_forest.predict(X_test)

# summarize the fit of the model
print(metrics.classification_report(y_test, X_test_predicted))
print(metrics.confusion_matrix(y_test, X_test_predicted))

In [37]:
# AdaBoost

In [38]:
# Split dataset in training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Decision tree as the base estimator; any learner that accepts sample weights
# can be used instead. Both estimators are seeded so the run is reproducible.
dt = DecisionTreeClassifier(random_state=42)
clf = AdaBoostClassifier(n_estimators=100, base_estimator=dt, learning_rate=1, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test data with the AdaBoost model fitted above.
# The original queried `random_forest` here and printed the report from a
# stale `X_test_predicted`, which had been computed on a different split.
predictions = clf.predict_proba(X_test)[:, 1]
X_test_predicted = clf.predict(X_test)

# summarize the fit of the model
print(metrics.classification_report(y_test, X_test_predicted))
print(metrics.confusion_matrix(y_test, X_test_predicted))

In [39]:
#PCA

In [40]:
# PCA: plotting setup, additional imports and a fresh read of the raw tables
# matplotlib and seaborn for visualizations
import matplotlib.pyplot as plt
# Set the default font size for figures to 22
plt.rcParams['font.size'] = 22
import seaborn as sns

# Suppress all warnings (the 'ignore' filter is not restricted to pandas)
import warnings
warnings.filterwarnings('ignore')

from sklearn.decomposition import PCA
# NOTE(review): Imputer was removed from sklearn.preprocessing in
# scikit-learn 0.22 (sklearn.impute.SimpleImputer replaces it) -- confirm the
# scikit-learn version this notebook is pinned to.
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline

# Re-read both CSVs, replacing whatever app_train / app_test currently hold
app_train = pd.read_csv("/dbfs/FileStore/tables/application_train.csv", header='infer')
app_test = pd.read_csv("/dbfs/FileStore/tables/application_test.csv", header='infer')

In [41]:
# Share of missing values per training column, as a fraction of the row count
# (0-1, not a percentage), largest first
app_train_missing = app_train.isnull().sum().div(len(app_train)).sort_values(ascending=False)
app_train_missing.head(5)

In [42]:
# Share of missing values per test column, as a fraction of the row count
# (0-1, not a percentage), largest first
app_test_missing = app_test.isnull().sum().div(len(app_test)).sort_values(ascending=False)
app_test_missing.head(5)

In [43]:
 # One hot encoding
# Expand the categorical columns of both frames into indicator columns
app_train, app_test = pd.get_dummies(app_train), pd.get_dummies(app_test)

# Keep only the columns present in both frames (inner join on the column axis)
app_train, app_test = app_train.align(app_test, join='inner', axis=1)

# Report the resulting shapes
print('Training shape: ', app_train.shape)
print('Testing shape: ', app_test.shape)


In [44]:
# Two-step pipeline: fill missing entries with the column median, then PCA
# with default settings
pipeline = Pipeline([
    ('imputer', Imputer(strategy='median')),
    ('pca', PCA()),
])

# Fit both steps on the training data and transform it in one call
app_train_pca = pipeline.fit_transform(app_train)

# Apply the already fitted steps to the testing data
app_test_pca = pipeline.transform(app_test)

In [45]:
# Extract the pca object
pca = pipeline.named_steps['pca']

# Cumulative share of variance explained by the first k components
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Label the x axis with component counts 1..k. The original used
# range(app_train.shape[1]), which starts at 0 (so the first component was
# drawn at "0 PCs") and only has the right length when PCA keeps one
# component per input column.
component_counts = np.arange(1, len(cumulative_variance) + 1)

# Plot the cumulative variance explained
plt.figure(figsize=(10, 8))
plt.plot(component_counts, cumulative_variance, 'r-')
plt.xlabel('Number of PC'); plt.ylabel('Cumulative Variance Explained');
plt.title('Cumulative Variance Explained with PCA');

# No classification report is printed here: this cell makes no class
# predictions, and the y_test / X_test_predicted values the original printed
# were leftovers from an earlier classifier cell, not a result of the PCA.