In [1]:
import numpy as np
# Import the necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.svm import SVC

In [2]:
# Read the labelled training set and the unlabelled test set.
# Both files are indexed by their 'id' column.
csv_options = {'index_col': 'id'}
train = pd.read_csv('data/loan/train.csv', **csv_options)
test = pd.read_csv('data/loan/test.csv', **csv_options)

In [3]:
# Separate the target column from the features.
y = train['loan_status']
X = train.drop(columns='loan_status')

# Hold out 20% of the labelled rows for evaluation; the seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y - consider `stratify=y` if
# loan_status turns out to be imbalanced (this would change the reported scores).
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = holdout_split

In [4]:
# Build the preprocessing step of the pipeline:
#   - numeric columns are passed through Normalizer
#   - object (string) columns are passed through OrdinalEncoder
# Column lists are derived from the full feature frame X.

num_columns = X.select_dtypes(include=[np.number]).columns
cat_columns = X.select_dtypes(include=[object]).columns

# With the default handle_unknown='error', a category that was absent from the
# rows used for fitting (possible in a cross-validation fold or in the
# unlabelled test file) makes transform() raise. Map such categories to -1
# instead; -1 can never collide with a learned code, which starts at 0.
cat_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

col_transformer = ColumnTransformer(
    transformers=[
        # NOTE(review): Normalizer rescales each *row* to unit norm, it does not
        # scale features column-wise - confirm this is intended rather than a
        # per-feature scaler.
        ('num_normalizer', Normalizer(), num_columns),
        ('cat_encoder', cat_encoder, cat_columns),
    ],
    remainder='drop',  # columns that are neither numeric nor object are discarded
    n_jobs=-1,
)

In [5]:
from sklearn.model_selection import cross_val_score

# Hyper-parameters for the SVC estimator.
best_params = dict(
    C=1,
    coef0=0,
    degree=1,
    gamma=2,
    probability=True,
    kernel='rbf',
    random_state=666,
)

# Chain preprocessing and classifier so both are fitted on the same rows.
estimator = SVC(**best_params)
pipeline = make_pipeline(col_transformer, estimator)

# Fit on the training split, then evaluate on the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
model_score = pipeline.score(X_test, y_test)

# 5-fold cross-validation over the full labelled data.
cross_validation = cross_val_score(pipeline, X, y, cv=5)
mean_cross_val = cross_validation.mean()

# Show Report
report = [
    ('Model Score: ', model_score),
    ('Accuracy Score:', accuracy_score(y_test, y_pred)),  # best score 0.93 by using SVC
    ('Cross Validation Score:', mean_cross_val),
    ('Cross Validation Scores:', cross_validation),
]
for label, value in report:
    print(label, value)

Model Score:  0.9265069485889675
Accuracy Score: 0.9265069485889675
Cross Validation Score: 0.9234717367209481
Cross Validation Scores: [0.92002728 0.9226703  0.92386393 0.92642169 0.92437548]


In [6]:
# test_pred = pipeline.predict(test)

In [7]:
# Save results (prediction + id )
# submission = pd.DataFrame(test_pred, index=test.index, columns=['loan_status'])
# submission.to_csv('data/loan/submission.csv', index=True)
# print('Prediction saved to submission.csv')