# TIN200 - Data Visualization and supervised learning for automatic processing of loan applications

In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from eli5 import explain_weights
from eli5 import explain_prediction
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score


ModuleNotFoundError: No module named 'eli5'

#### Creating dataframes

In [None]:
# Load the training data from the local DATA directory.
train_df = pd.read_csv("DATA/train_TIN200.csv")

# Show the frame as the cell output.
train_df

In [None]:
# Load the test data from the local DATA directory.
test_df = pd.read_csv("DATA/test_TIN200.csv")

# Preview the first five rows.
test_df.head(5)

#### Check for NaN Values

In [None]:
# True if at least one cell of the training frame is missing.
train_df.isnull().values.any()

In [None]:
# True if at least one cell of the test frame is missing.
test_df.isnull().values.any()

In [None]:
# Number of missing values per column in the training frame.
train_df.isnull().sum()

In [None]:
# Number of missing values per column in the test frame.
test_df.isnull().sum()

#### Visualize NaN values

In [None]:
import missingno as msno

In [None]:
# missingno matrix plot of the training frame.
msno.matrix(train_df)

In [None]:
# missingno matrix plot of the test frame.
msno.matrix(test_df)

In [None]:
# missingno bar plot of the training frame.
msno.bar(train_df)

In [None]:
# missingno bar plot of the test frame.
msno.bar(test_df)

In [None]:
# missingno heatmap of the training frame.
msno.heatmap(train_df)


In [None]:
# missingno heatmap of the test frame.
msno.heatmap(test_df)

In [None]:
# Remove the Loan_ID column from both frames (presumably a row identifier
# with no predictive value -- confirm against the data description).
train_df = train_df.drop('Loan_ID', axis=1)
test_df = test_df.drop('Loan_ID', axis=1)

#### Imputing missing values (https://medium.com/analytics-vidhya/ways-to-handle-categorical-column-missing-data-its-implementations-15dc4a56893)

In [None]:
# Column dtypes of the training frame.
train_df.dtypes

In [None]:
# Report how many distinct values (NaN counted as one) each categorical
# column of the training frame holds.
print("Number of Categories in: ")

for ColName in train_df[['Gender', 'Married', 'Dependents', 'Education',
                         'Self_Employed', 'Property_Area']]:
    n_categories = train_df[ColName].nunique(dropna=False)
    print("{} = {}".format(ColName, n_categories))

In [None]:
# Column dtypes of the test frame.
test_df.dtypes

In [None]:
# Report how many distinct values (NaN counted as one) each categorical
# column of the test frame holds.
print("Number of Categories in: ")

for ColName in test_df[['Gender', 'Married', 'Dependents', 'Education',
                        'Self_Employed', 'Property_Area']]:
    n_categories = test_df[ColName].nunique(dropna=False)
    print("{} = {}".format(ColName, n_categories))

In [None]:
#grabbed from https://medium.com/analytics-vidhya/ways-to-handle-categorical-column-missing-data-its-implementations-15dc4a56893

def impute_nan_most_frequent_category(DataFrame, ColName):
    """Add a ``<ColName>_Imputed`` column with NaNs replaced by the mode.

    The original column is left untouched. The new column is written into
    ``DataFrame`` in place and nothing is returned.

    Args:
        DataFrame: pandas DataFrame that is modified in place.
        ColName: name of the column whose missing values are imputed.
    """
    modes = DataFrame[ColName].mode()

    # An all-NaN column has no mode; copy it unchanged instead of raising.
    if modes.empty:
        DataFrame[ColName + "_Imputed"] = DataFrame[ColName]
        return

    # .mode() can return several values on ties; the first one is used.
    most_frequent_category = modes.iloc[0]

    # Assign the filled Series directly. Calling fillna(inplace=True) on a
    # column selection is chained assignment: pandas warns about it and it
    # does not update the frame when copy-on-write is active.
    DataFrame[ColName + "_Imputed"] = DataFrame[ColName].fillna(most_frequent_category)
    
# Create a *_Imputed counterpart for every categorical training column.
for Columns in ('Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'):
    impute_nan_most_frequent_category(train_df, Columns)

# Preview the original columns next to their imputed versions (first 10 rows).
train_df[[
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area',
    'Gender_Imputed', 'Married_Imputed', 'Dependents_Imputed', 'Education_Imputed',
    'Self_Employed_Imputed', 'Property_Area_Imputed',
]].iloc[:10]

In [None]:
# Drop the un-imputed categorical columns; only the *_Imputed versions remain.
train_df = train_df.drop(
    columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'],
)

In [None]:
# Missing values per column after the categorical imputation.
train_df.isnull().sum()

In [None]:
# Fill the remaining gaps in the numeric columns with each column's mean.
# NOTE(review): Credit_History looks like a flag-style column -- confirm that
# mean imputation (which yields fractional values) is intended for it.
cols = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']

train_df[cols] = train_df[cols].fillna(train_df[cols].mean())

In [None]:
# Missing values per column after the numeric imputation.
train_df.isnull().sum()

#### Doing the same for the test set

In [None]:
# Create a *_Imputed counterpart for every categorical test column.
for Columns in ('Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'):
    impute_nan_most_frequent_category(test_df, Columns)

# Preview the original columns next to their imputed versions (first 10 rows).
test_df[[
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area',
    'Gender_Imputed', 'Married_Imputed', 'Dependents_Imputed', 'Education_Imputed',
    'Self_Employed_Imputed', 'Property_Area_Imputed',
]].iloc[:10]

In [None]:
# Drop the un-imputed categorical columns; only the *_Imputed versions remain.
test_df = test_df.drop(
    columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'],
)

In [None]:
# Fill gaps in the numeric test columns with each column's own mean.
# NOTE(review): the fill values are computed from the test frame itself;
# consider reusing the training-set statistics so both frames are imputed
# with the same values -- confirm which is intended.
cols = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']

test_df[cols] = test_df[cols].fillna(test_df[cols].mean())

#### All the NaN-values are dealt with and we can now visualize the sets

In [None]:
# Seaborn pair plot of the training frame, coloured by Loan_Status.
sns.pairplot(data=train_df, hue='Loan_Status')

#### preprocessing

In [None]:
def label_encode(column_name, DataFrame):
    """Replace a column with integer codes from a freshly fitted LabelEncoder.

    The column is overwritten in ``DataFrame`` in place.

    Args:
        column_name: name of the column to encode.
        DataFrame: pandas DataFrame that is modified in place.

    Returns:
        The fitted ``LabelEncoder``, so the class-to-code mapping
        (``classes_``) can be inspected, inverted with ``inverse_transform``
        or reused on another frame. Callers that ignore the return value
        behave exactly as before.
    """
    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder()
    DataFrame[column_name] = le.fit_transform(DataFrame[column_name])
    return le
    
# Categorical feature columns plus the target; each one is overwritten in
# the training frame with its integer codes.
cols = [
    'Gender_Imputed',
    'Married_Imputed',
    'Dependents_Imputed',
    'Education_Imputed',
    'Self_Employed_Imputed',
    'Property_Area_Imputed',
    'Loan_Status',
]

for col in cols:
    label_encode(col, train_df)
    

In [None]:
# Show the training frame after label encoding.
train_df

In [None]:
# Categorical feature columns of the test frame (it has no Loan_Status here);
# each one is overwritten with its integer codes.
# NOTE(review): label_encode fits a new encoder per call, so the test codes
# are derived from the test values alone. If a category is absent from one
# frame the integer codes will not line up with the training frame -- verify.
cols = [
    'Gender_Imputed',
    'Married_Imputed',
    'Dependents_Imputed',
    'Education_Imputed',
    'Self_Employed_Imputed',
    'Property_Area_Imputed',
]

for col in cols:
    label_encode(col, test_df)

In [None]:
# Show the test frame after label encoding. The preceding cell encodes
# test_df; the training frame was already displayed after its own encoding.
test_df

#### Now that the data is prepared we can create splits

In [None]:
# Reorder the columns so that the target (Loan_Status) comes last.
train_df = train_df.loc[:, [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Gender_Imputed',
    'Married_Imputed',
    'Dependents_Imputed',
    'Education_Imputed',
    'Self_Employed_Imputed',
    'Property_Area_Imputed',
    'Loan_Status',
]]

In [None]:
#importing train_test_split
from sklearn.model_selection import train_test_split

### Train and test

In [None]:
# Feature matrix: every column except the target.
X = train_df.drop(columns=['Loan_Status'])
# Target vector.
y = train_df['Loan_Status']

In [None]:
# Hold out 33% of the labelled rows for evaluation; random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 50 trees and a fixed seed, fitted on the training split
# (fit returns the estimator itself).
rfc = RandomForestClassifier(n_estimators=50, random_state=70).fit(X_train, y_train)

# Predictions for both splits.
rfc_train, rfc_test = rfc.predict(X_train), rfc.predict(X_test)

In [None]:
# Accuracy of the stored hold-out predictions.
print(accuracy_score(y_test, rfc_test))

# Score reported by the estimator itself on the same hold-out split
# (presumably the same number -- shown as the cell output).
rfc.score(X_test,y_test)

In [None]:
# Refit on all labelled rows and predict those same rows.
# NOTE: this is accuracy on the data the model was fitted on, so it says
# nothing about generalisation despite the "_test" variable name.
rfc.fit(X,y)
rfc_full_test = rfc.predict(X)
print(accuracy_score(y, rfc_full_test))


### Cross validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the random forest on all labelled rows.
cross_score = cross_val_score(rfc,X, y, cv=10)
print('CV accuracy scores: %s' % cross_score)


In [None]:
# eli5 explanation of the fitted random forest (requires the eli5 package).
explain_weights(rfc)

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV


In [None]:
# XGBoost classifier with library-default hyper-parameters and a fixed seed,
# fitted on the training split.
xgb = XGBClassifier(random_state=70)
xgb.fit(X_train, y_train)

# Predictions on the training split only; the hold-out prediction is disabled.
xg_train = xgb.predict(X_train)
#xg_test = xgb.predict(X_test)


In [None]:
# Hyper-parameter grid for the XGBoost classifier.
grid = [{'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
         'max_depth': [1, 2, 3],
         'n_estimators': [100, 1000, 3000, 5000, 8000]}]

# Loan_Status is a class label, so candidates are ranked by accuracy, the
# metric used everywhere else in this notebook. The previous 'r2' setting is
# a regression metric and is not a meaningful selection criterion here.
gs = GridSearchCV(estimator=xgb,
                  param_grid=grid,
                  scoring='accuracy',
                  n_jobs=-1)


In [None]:
# Run the grid search on the training split.
gs = gs.fit(X_train, y_train)

In [None]:
# Best cross-validated score and the parameter combination that produced it.
print(gs.best_score_)
print(gs.best_params_)

In [None]:
# XGBoost classifier with hand-picked hyper-parameters (presumably copied
# from the grid-search output -- verify), fitted on all labelled rows.
# The keyword was previously misspelled as `learing_rate`, so the intended
# learning rate was never applied.
op_xgb = XGBClassifier(learning_rate=0.0001, max_depth=1, n_estimators=100)
op_xgb.fit(X, y)


In [None]:
# Accuracy of op_xgb on X, the same rows it was fitted on (training accuracy).
op_exg_pred = op_xgb.predict(X)
print(accuracy_score(y, op_exg_pred))


In [None]:
# Write the target vector to y.csv, with a header and without the index.
y.to_csv('y.csv', index = False, header=True)

In [None]:
# Write the prepared feature matrix to full_fix_data.csv, with a header and
# without the index.
X.to_csv('full_fix_data.csv', index = False, header=True)

### SVC

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler


In [None]:
# Pipeline: standardise the features, then fit a support vector classifier.
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=70))
# List the pipeline's parameter names (the grid below uses the svc__ ones).
pipe_svc.get_params()


In [None]:
param_range  = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0] # For regularization parameter C.
param_range2 = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]         # For scaling parameter gamma of the rbf kernel.

# Two sub-grids: a linear kernel tuned over C only, and an rbf kernel tuned
# over both C and gamma.
param_grid   = [{'svc__C': param_range, 'svc__kernel': ['linear']},
                {'svc__C': param_range, 'svc__gamma': param_range2, 'svc__kernel': ['rbf']}]

# 10-fold grid search ranked by accuracy. NOTE: this rebinds `gs`, replacing
# the XGBoost grid search created earlier in the notebook.
gs = GridSearchCV(estimator=pipe_svc, 
                  param_grid=param_grid, 
                  scoring='accuracy', 
                  cv=10,
                  n_jobs=-1)


In [None]:
# Run the SVC grid search on the training split.
gs = gs.fit(X_train, y_train)


In [None]:
# Best cross-validated accuracy and the parameters that produced it.
print(gs.best_score_)
print(gs.best_params_)
# Keep the best pipeline found by the search and display it.
clf = gs.best_estimator_
clf

In [None]:
# 10-fold cross-validation on all labelled rows.
# NOTE(review): this scores `xgb` (the XGBoost model created earlier),
# although the cell sits in the SVC section -- confirm that is intended.
cross_score = cross_val_score(xgb, X, y,  cv=10)
print('CV accuracy scores: %s' % cross_score)

In [None]:
# Hold-out accuracy, once through the best pipeline and once through the
# search object (presumably the same model, so the two lines should agree).
print('Test accuracy: %.3f' % clf.score(X_test, y_test))
print('Test accuracy: %.3f' % gs.score(X_test, y_test))
# Mean cross-validated score of every parameter combination tried.
gs.cv_results_['mean_test_score']


In [None]:
# Refit the best SVC pipeline on all labelled rows and report its accuracy
# on those same rows (training accuracy, not a generalisation estimate).
# The score previously referenced the undefined name `clf_pred`, which raised
# NameError; it now uses the predictions computed on the line above. A stray
# `gs.cv_results_[...]` expression that produced no output was dropped.
clf.fit(X, y)
clf_pred_x = clf.predict(X)
accuracy_score(y, clf_pred_x)


In [None]:
# 10-fold cross-validation of the selected SVC pipeline on all labelled rows.
cross_score = cross_val_score(clf, X, y,  cv=10)
print('CV accuracy scores: %s' % cross_score)


In [None]:
# eli5 explanation of the selected SVC pipeline (requires the eli5 package).
# A second, argument-less explain_weights() call was removed: the function
# needs an estimator, so that call raised TypeError and kept this cell from
# showing the explanation.
explain_weights(clf)

In [None]:
from eli5.xgboost import explain_weights_xgboost

# eli5 explanation of the fitted XGBoost model `xgb`.
explain_weights_xgboost(xgb)


### SFS

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs


In [None]:
# Prepare SFS: sequential feature selection around the random forest,
# evaluated with 10-fold cross-validation.
# The upper bound of k_features is taken from X instead of the hard-coded 12:
# X holds 11 feature columns (Loan_Status is dropped), and mlxtend rejects a
# maximum larger than the number of features at fit time (assuming the
# selector is fitted on X).
sfs = SFS(rfc,
          k_features=(1, X.shape[1]),
          cv=10)


In [None]:
from eli5.xgboost import explain_weights_xgboost
from eli5.sklearn.explain_prediction import explain_prediction_tree_regressor
from eli5.sklearn.explain_prediction import explain_prediction_tree_classifier

In [None]:
#explain_prediction_tree_classifier(rfc, doc=None)