## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error,accuracy_score

from sklearn.model_selection import cross_val_predict, cross_val_score,KFold, RepeatedStratifiedKFold
import warnings

#### Import the ASC Data as Pandas DataFrame

In [2]:
import pandas as pd
df = pd.read_csv("SouthGermanCredit.asc", sep=' ')
df

Unnamed: 0,laufkont,laufzeit,moral,verw,hoehe,sparkont,beszeit,rate,famges,buerge,...,verm,alter,weitkred,wohn,bishkred,beruf,pers,telef,gastarb,kredit
0,1,18,4,2,1049,1,2,4,2,1,...,2,21,3,1,1,3,2,1,2,1
1,1,9,4,0,2799,1,3,2,3,1,...,1,36,3,1,2,3,1,1,2,1
2,2,12,2,9,841,2,4,2,2,1,...,1,23,3,1,1,2,2,1,2,1
3,1,12,4,0,2122,1,3,3,3,1,...,1,39,3,1,2,2,1,1,1,1
4,1,12,4,0,2171,1,3,4,3,1,...,2,38,1,2,2,2,2,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,24,2,3,1987,1,3,2,3,1,...,1,21,3,1,1,2,1,1,2,0
996,1,24,2,0,2303,1,5,4,3,2,...,1,45,3,2,1,3,2,1,2,0
997,4,21,4,0,12680,5,5,4,3,1,...,4,30,3,3,1,4,2,2,2,0
998,2,12,2,3,6468,5,1,2,3,1,...,4,52,3,2,1,4,2,2,2,0


In [3]:
columns_names = ["status", "duration", "credit_history", "purpose", "amount", "savings", "employment_duration","installment_rate", "personal_status_sex", "other_debtors", "present_residence", "property","age", "other_installment_plans", "housing", "number_credits", "job", "people_liable", "telephone", "foreign_worker", "credit_risk" ]
df.columns = columns_names
df.head()

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,1,18,4,2,1049,1,2,4,2,1,...,2,21,3,1,1,3,2,1,2,1
1,1,9,4,0,2799,1,3,2,3,1,...,1,36,3,1,2,3,1,1,2,1
2,2,12,2,9,841,2,4,2,2,1,...,1,23,3,1,1,2,2,1,2,1
3,1,12,4,0,2122,1,3,3,3,1,...,1,39,3,1,2,2,1,1,1,1
4,1,12,4,0,2171,1,3,4,3,1,...,2,38,1,2,2,2,2,1,1,1


In [4]:
#Feature Porocessing
df['log_amount'] = round(np.log(df['amount']),2)
df.drop('amount',axis=1,inplace=True)
df['log_age'] =  round(np.log(df['age']),2)
df['log_duration'] = round(np.log(df['duration']),2)
df.drop(['age','duration'],axis=1,inplace=True)
df.head()

Unnamed: 0,status,credit_history,purpose,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,present_residence,property,...,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk,log_amount,log_age,log_duration
0,1,4,2,1,2,4,2,1,4,2,...,1,1,3,2,1,2,1,6.96,3.04,2.89
1,1,4,0,1,3,2,3,1,2,1,...,1,2,3,1,1,2,1,7.94,3.58,2.2
2,2,2,9,2,4,2,2,1,4,1,...,1,1,2,2,1,2,1,6.73,3.14,2.48
3,1,4,0,1,3,3,3,1,2,1,...,1,2,2,1,1,1,1,7.66,3.66,2.48
4,1,4,0,1,3,4,3,1,4,2,...,2,2,2,2,1,1,1,7.68,3.64,2.48


In [5]:
# Random shuffle of records 
df = df.sample(frac=1).reset_index(drop=True)

In [6]:
X = df.drop('credit_risk',axis=1)
y = df['credit_risk']

In [7]:
from imblearn.over_sampling import SMOTE
smk = SMOTE(sampling_strategy='all',random_state=42)
X,y=smk.fit_resample(X,y)

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### Create an Evaluate Function to give all metrics after model Training

In [10]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    accuracy = accuracy_score(true,predicted)
    return mae, rmse, r2_square,accuracy

In [11]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(), 
    "AdaBoost Classifier": AdaBoostClassifier(),
    "DecisionTree Classifier": DecisionTreeClassifier(),
    "Bagging Classifier": BaggingClassifier(),
    "SVC Classifier": SVC()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2,model_train_accuracy = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2,model_test_accuracy = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))
    print("- Accuracy: {:.4f}".format(model_train_accuracy))


    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    print("- Accuracy: {:.4f}".format(model_test_accuracy))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- Root Mean Squared Error: 0.4640
- Mean Absolute Error: 0.2153
- R2 Score: 0.1387
- Accuracy: 0.7847
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.4551
- Mean Absolute Error: 0.2071
- R2 Score: 0.1711
- Accuracy: 0.7929


Random Forest Classifier
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
- Accuracy: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.3994
- Mean Absolute Error: 0.1595
- R2 Score: 0.3617
- Accuracy: 0.8405


XGBClassifier
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
- Accuracy: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.4082
- Mean Absolute Error: 0.1667
- R2 Score: 0.3331
- Accuracy: 0.8333


AdaBoost Classifier


In [12]:
lin_model = LogisticRegression(fit_intercept=True)
lin_model = lin_model.fit(X_train, y_train)
y_pred = lin_model.predict(X_test)
score = accuracy_score(y_test, y_pred)*100
print(" Accuracy of the model is %.2f" %score)

 Accuracy of the model is 79.29


In [13]:
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred,'Difference':y_test-y_pred})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
665,1,1,0
624,1,1,0
115,1,1,0
478,1,1,0
233,1,1,0
...,...,...,...
287,1,0,1
1349,0,0,0
1163,0,0,0
294,1,1,0
