# Classification Template
Supervised learning classification technique. It is used to predict a qualitative outcome.  
(i.e. *yes* or *no*, *convert* or *not convert*, *positive* or *negative*, in this case  `good` or `bad` etc.)

## Import Libraries and Data
-  Import Libraries  
-  Import dataset(s)

In [None]:
# Import Libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix

from pycaret.classification import *

# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)
# Uncomment to lift the row-display limit as well
# pd.set_option('display.max_rows', None)

In [None]:
# Import the dataset: read the German credit CSV from the local data folder
data = pd.read_csv('./data/GermanCredit.csv')
# Preview the first rows to check that the file parsed as expected
data.head()

## Exploratory Data Analysis (EDA)
-  dataframe shape  
-  identify null / na values  
-  Tukey Five Number - describe() 
-  Seaborn Visualization

In [None]:
# Data shape: unpack the (rows, columns) tuple once and report both counts
n_records, n_columns = data.shape
print(f'The data has {n_records} records and {n_columns} columns')

In [None]:
# Nulls in the data: number of missing values in each column
msk = data.isnull().sum()
# Display only the columns that have at least one missing value
msk.loc[msk.gt(0)]
# To list every column with its count instead, display `msk` on its own

In [None]:
# Information about numeric features: summary statistics from describe(),
# transposed so that each feature is a row
data.describe().T

In [None]:
# Information about the categorical (object-dtype) features,
# transposed so that each feature is a row
data.describe(include='object').T

### Data visualization with Seaborn

In [None]:
# Number of records in each `credit` class (the target used for modelling below)
sns.countplot(x= data['credit']);

In [None]:
# Number of records in each `purpose` category, drawn as horizontal bars
sns.countplot(y= data['purpose']);

In [None]:
# Distribution of `credit_amt` with a kernel density estimate overlaid
sns.histplot(x= data['credit_amt'], kde=True, color='brown');

In [None]:
# Box plot of `age` to show its spread and any outliers
sns.boxplot(x=data['age'], color='pink');

In [None]:
# Box plot of `duration_mnth` to show its spread and any outliers
sns.boxplot(x=data['duration_mnth'], color='orange');

## Model Building
- Create unseen data
- Set up data 
- Create model
- tune model (if required)
- Finalize model
- plot model
- Prediction

In [None]:
# Holding out Data as unseen from the model
# A fixed random_state makes the hold-out sample identical on every run, so the
# unseen-data metrics are reproducible (same seed as the PyCaret session_id).
unseen = data.sample(n=100, random_state=2930)
# Keep only the rows that were not drawn into the hold-out sample
data = data[~data.index.isin(unseen.index)]
print(f'Data for model: {data.shape},\nData for unseen predictions: {unseen.shape}')
# Save the hold-out rows to disk alongside the source data
unseen.to_csv('./data/Germancredit_unseen.csv', index=False)

#### Setting up with Pycaret 3.0

In [None]:
# Initialise the PyCaret experiment on the modelling data:
#   target='credit'     - column to predict
#   train_size=0.9      - fraction of rows used for training; the rest form PyCaret's own hold-out set
#   fix_imbalance=True  - ask PyCaret to rebalance the target classes in the training data
#   session_id=2930     - fixed seed for the experiment
#   ordinal_features    - columns to encode ordinally, categories listed from lowest to highest
s = setup(data = data, target = 'credit', train_size=0.9, fix_imbalance=True, session_id=2930,
          ordinal_features = {'checking_acc': [ 'none','Less than 0 DM','Btw 0 to 199 DM','Equal or Greater than 200 DM'],
                            'savings_acc':['none','Less than 100 DM','Btw 100 to 499 DM','Btw 500 to 999 DM','Equal or Greater than 1000 DM'],
                             'emp_status':['unemployed','Less than a year','Btw 1 to 4 years','Btw 4 to 7 years', 'Greater than 7 years']
                             })

In [None]:
# To view the transformed data: first rows of the dataset after PyCaret's preprocessing
s.dataset_transformed.head()

#### Creating model
> The models that the dataset owners applied to this data can be found on the [UCI website](https://archive-beta.ics.uci.edu/dataset/144/statlog+german+credit+data)

In [None]:
# List the estimators available in pycaret.classification together with their IDs
models()

In [None]:
%%time
## Comparing the five candidate models listed below - I always add lightgbm
compare_models(include=['xgboost', 'rf', 'lr', 'catboost', 'lightgbm'])

In [None]:
%%time
## Creating light GBM model
# Fit the baseline LightGBM classifier
lgbm = create_model('lightgbm')
# Hyper-parameter search starting from the baseline model
# NOTE(review): the prediction cells further down use the untuned `lgbm`,
# not `tuned_lgbm` - confirm that is intended
tuned_lgbm = tune_model(lgbm)

In [None]:
%%time
## Creating Cat Boost Classifier model
# Fit the baseline CatBoost classifier
cb = create_model('catboost')
# Hyper-parameter search starting from the baseline model
# NOTE(review): the AUC and PR plots below use the untuned `cb`; `tuned_cb`
# is not referenced again in this notebook - confirm that is intended
tuned_cb = tune_model(cb)

#### Plotting Model - catboost 

In [None]:
# Confusion matrix for the CatBoost model, matching the other plots in this section
# (other plot options: 'auc', 'error', 'pr')
plot_model(cb, plot = 'confusion_matrix')

In [None]:
# ROC / AUC curve for the CatBoost model
plot_model(cb, plot = 'auc') #'auc' , 'error', 'pr'

In [None]:
# Precision-recall curve for the CatBoost model
plot_model(cb, plot = 'pr') #'auc' , 'error', 'pr'

#### Plotting Model - Lightgbm

In [None]:
# Confusion matrix for the LightGBM model
plot_model(lgbm, plot = 'confusion_matrix') #'auc' , 'error', 'pr'

In [None]:
# ROC / AUC curve for the LightGBM model
plot_model(lgbm, plot = 'auc') #'auc' , 'error', 'pr'

In [None]:
# Feature-importance plot for the LightGBM model
plot_model(lgbm, plot = 'feature') #'auc' , 'error', 'pr'

### Predicting on Test Data - Using LGBM

In [None]:
# Predicting the test data - LGBM
# No `data` argument is passed, so PyCaret scores its own hold-out split from setup();
# raw_score=True requests the per-class scores alongside the predicted label
predict_model(lgbm, raw_score=True)

In [None]:
# To predict the whole modelling data - lgbm
# NOTE(review): `data` is the frame passed to setup(), so it includes the rows the
# model was trained on - the scores shown here will be optimistic
predict_model(lgbm, data = data)

### Testing Model on the Unseen data
- The real test of a model is how well it performs on unseen data. It should be judged not only on accuracy but also on precision, recall and F1 score, as these are highly important for the business model.

#### Predicting on Unseen data using Light GBM 

In [None]:
# To predict the unseen data
lgbm_pred = predict_model(lgbm, data = unseen)

# Uncomment and run to view the rows where the predicted credit label is wrong
# lgbm_pred[lgbm_pred['credit'] != lgbm_pred['prediction_label']]

In [None]:
## Confusion matrix of actual vs. predicted credit labels on the unseen data
y_true, y_pred = lgbm_pred['credit'], lgbm_pred['prediction_label']
lgbm_cm = confusion_matrix(y_true=y_true, y_pred=y_pred)
# Display the raw count matrix
lgbm_cm

In [None]:
# Label the confusion matrix: rows and columns share the same class order
class_labels = ['bad', 'good']
lgbm_cm_df = pd.DataFrame(data=lgbm_cm, index=class_labels, columns=class_labels)
lgbm_cm_df

In [None]:
# Heat map of the confusion matrix; the cells are integer counts, so annotate
# them with the integer format 'd'
sns.heatmap(data = lgbm_cm_df, annot=True, fmt = 'd');

In [None]:
# Confusion-matrix cells. lgbm_cm_df has the actual labels on its rows and the
# predicted labels on its columns (the layout returned by sklearn's
# confusion_matrix), so .loc[actual, predicted] selects one cell.
# 'good' is treated as the positive class.
tp = lgbm_cm_df.loc['good', 'good']  # actual good, predicted good
tn = lgbm_cm_df.loc['bad', 'bad']    # actual bad,  predicted bad
fp = lgbm_cm_df.loc['bad', 'good']   # actual bad,  predicted good
fn = lgbm_cm_df.loc['good', 'bad']   # actual good, predicted bad

# Calculate accuracy: correct predictions over all predictions
accuracy = (tp + tn) / lgbm_cm_df.values.sum()
print(f'LGBM model Accuracy: {accuracy * 100:.1f}%')

# Calculate precision: share of predicted-good records that are actually good
precision = tp / (tp + fp)
print(f'LGBM model Precision: {precision * 100:.1f}%')

# Calculate recall: share of actually-good records that were predicted good
recall = tp / (tp + fn)
print(f'LGBM model Recall: {recall * 100:.1f}%')

# Calculate F1 score: harmonic mean of precision and recall
f1_score = 2 * (precision * recall) / (precision + recall)
print(f'LGBM model F1 Score: {f1_score * 100:.1f}%')
