<a href="https://www.kaggle.com/code/ibrahimawad02/ai502-credit-fraud?scriptVersionId=145024966" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#FFBB55;
           font-size:20px;
           font-family:Nexa;
           letter-spacing:0.5px">
        <p style="padding: 10px;
              color:black;">
            <b>Using different classification models and comparing them</b>
        </p>
</div>
<div class="alert alert-block alert-info" style="font-size:22px; font-family:arial;">
    Presented by Ibrahim Hossam
</div>

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

### Reading data and analyzing it

In [None]:
# Load the training split from the Kaggle input directory and preview 5 random rows
df = pd.read_csv(
    "/kaggle/input/credit/credit_train.csv",
    sep=',',
    encoding='utf-8',
)
df.sample(5)

In [None]:
# Load the test split ("dft" = df test) from the Kaggle input directory and preview 5 random rows
dft = pd.read_csv(
    "/kaggle/input/credit/credit_test.csv",
    sep=',',
    encoding='utf-8',
)
dft.sample(5)

In [None]:
# Column names, dtypes and non-null counts of the training data
df.info()

In [None]:
# Summary restricted to the object-dtype columns of the training data
df.describe(include = "O")

In [None]:
# Column names, dtypes and non-null counts of the test data
dft.info()

In [None]:
# Summary restricted to the object-dtype columns of the test data
dft.describe(include = 'O')

In [None]:
# Default describe(): summary statistics of the test data
dft.describe()

## Data cleaning for modelling

In [None]:
# Remove the two identifier columns from the training frame (modifies df in place)
df.drop(columns=['Loan ID', 'Customer ID'], inplace=True)

In [None]:
# Number of missing values in each training column
df.isna().sum()

In [None]:
# Share of missing values per training column, in percent of the row count
null_counts = df.isnull().sum()
null_counts / df.shape[0] * 100

In [None]:
# Keep only the rows that have at least 14 non-null values, then recount the nulls per column
df_cleaned = df.dropna(thresh = 14)
df_cleaned.isnull().sum()

In [None]:
# Remaining nulls per column, as a percentage of the row count of df (the frame before dropna)
df_cleaned.isnull().sum()/df.shape[0]*100

In [None]:
# Percentage of the rows of df that are not present in df_cleaned
(1-df_cleaned.shape[0]/df.shape[0])*100

In [None]:
# Since the percentage of null values in Bankruptcies and Tax Liens is less than 1%,
# drop the rows where either of them is missing.
# dropna() returns a new frame, so the result has to be assigned back to df_cleaned;
# without the assignment no row is actually removed.
df_cleaned = df_cleaned.dropna(subset=['Bankruptcies', 'Tax Liens'])
df_cleaned[['Bankruptcies', 'Tax Liens']]

In [None]:
# Re-check the percentage of the rows of df that are not present in df_cleaned
(1-df_cleaned.shape[0]/df.shape[0])*100

In [None]:
# Since more than 50% of the values in the 'Months since last delinquent' column are null, drop the column.
# Rebind instead of using inplace=True: df_cleaned was derived from df via dropna, and mutating
# such a derived frame in place can trigger pandas' SettingWithCopyWarning.
df_cleaned = df_cleaned.drop(columns='Months since last delinquent')

In [None]:
# Continue with the cleaned training data under the shorter name df.
# Note: from this cell on, df no longer refers to the frame as originally loaded.
df = df_cleaned

***
### Cleaning test data

In [None]:
# Remove the two identifier columns from the test frame (modifies dft in place)
dft.drop(columns=['Loan ID', 'Customer ID'], inplace=True)

In [None]:
# Number of missing values in each test column
dft.isna().sum()

In [None]:
# Calculating the percentage of null values per test column.
# The denominator is the number of rows of the test frame itself (dft), not of the training frame.
dft.isnull().sum()/dft.shape[0]*100

In [None]:
# thresh = 1 keeps every row with at least one non-null value, so only completely empty rows are dropped
dft_cleaned = dft.dropna(thresh = 1)
dft_cleaned.isnull().sum()

In [None]:
# Remaining nulls per test column, as a percentage of the row count of dft (the test frame before dropna).
# The training frame's row count must not be used here: the two sets have different sizes.
dft_cleaned.isnull().sum()/dft.shape[0]*100

In [None]:
# Drop the 'Months since last delinquent' column, since it was already dropped from the train data.
# Rebind instead of using inplace=True: dft_cleaned was derived from dft via dropna, and mutating
# such a derived frame in place can trigger pandas' SettingWithCopyWarning.
dft_cleaned = dft_cleaned.drop(columns='Months since last delinquent')

In [None]:
# Percentage of the rows of dft that are not present in dft_cleaned
(1-dft_cleaned.shape[0]/dft.shape[0])*100

In [None]:
# Assign the cleaned test frame back to dft because the shorter name is easier to work with.
# Note: from this cell on, dft no longer refers to the frame as originally loaded.
dft = dft_cleaned

Although the dropped rows make up about 3.4% of the original test data, they are rows that contain no values at all (every column is missing), which would be bad for the model.

## Since we dropped only around 1% of the rows of df, we are going to fill the remaining missing data using MICE

In [None]:
# First, plot the distribution of Credit Score so it can be compared with the distribution after imputation
sns.kdeplot(data = df, x = 'Credit Score', color = 'yellow')

In [None]:
# Count of training rows per 'Years in current job' value, before imputation
plt.figure(figsize=(12, 5))
sns.countplot(data=df, x='Years in current job', palette='hls')
plt.show()

In [None]:
# Distribution of Annual Income in the training data, before imputation
sns.kdeplot(data = df, x = 'Annual Income', color = 'cyan')

## Using the Multiple Imputation by Chained Equations (MICE) method

**Explanation:**<br>
This is how the algorithm operates:

1. Replace each variable's missing value with a straightforward imputation technique, such as mean imputation, which is also referred to as "placeholders."<br>
2. In a regression model where X1 is the dependent variable and the other variables are the independent variables, the "placeholders" for the first variable, X1, are regressed. <br>
3. The others are then employed as independent variables, with X2 acting as the dependent variable. The process keeps going until every variable has been taken into account at least once as the dependent variable.
4. These initial "placeholders" are subsequently swapped out for the regression model's predictions.
5. According to Raghunathan et al. 2002, the replacement procedure is performed for a certain number of cycles, which is typically 10, and the imputation is updated at each cycle.
- The prediction values that best capture the relationships found in the data are preferably used to replace the missing values at the end of the cycle.

In [None]:
pip install miceforest

In [None]:
# Convert every object-dtype column of the training frame to the category dtype
object_cols = df.select_dtypes(['object']).columns
df[object_cols] = df[object_cols].astype('category')

In [None]:
from miceforest import ImputationKernel

# Imputation kernel over the training data; random_state is fixed so the imputation is repeatable
mice_kernel = ImputationKernel(
    data=df,
    save_all_iterations=True,
    random_state=2024,
)

In [None]:
# Run MICE with an argument of 2 (iterations), then replace df with the completed (imputed) data
mice_kernel.mice(2)
df = mice_kernel.complete_data()
df.sample(10)

In [None]:
# Null counts per column after imputation
df.isnull().sum()

In [None]:
# Plot the distribution of Credit Score again to check that filling the missing data did not change it
sns.kdeplot(data = df, x = 'Credit Score', color = 'green')

In [None]:
# Count of training rows per 'Years in current job' value, after imputation
sns.set()
plt.figure(figsize=(12, 5))
sns.countplot(data = df, x = 'Years in current job')

In [None]:
# Distribution of Annual Income in the training data, after imputation
sns.kdeplot(data = df, x = 'Annual Income', color = 'magenta')

### Filling the missing values in test data

In [None]:
# First, plot the distribution of Credit Score in the test data so it can be compared after filling the missing data
sns.kdeplot(data = dft, x = 'Credit Score', color = 'orange')

In [None]:
# Count of test rows per 'Years in current job' value, before imputation
plt.figure(figsize=(12, 5))
sns.countplot(data=dft, x='Years in current job', palette='pastel')
plt.show()

In [None]:
# Distribution of Annual Income in the test data, before imputation.
# This section inspects the test set, so the plot must read from dft rather than the training frame df.
sns.kdeplot(data = dft, x = 'Annual Income', color = 'purple')

## Using the Multiple Imputation by Chained Equations (MICE) method

In [None]:
# Convert every object-dtype column of the test frame to the category dtype
object_cols = dft.select_dtypes(['object']).columns
dft[object_cols] = dft[object_cols].astype('category')

In [None]:
from miceforest import ImputationKernel

# Imputation kernel over the test data (replaces the kernel object built for the training data)
mice_kernel = ImputationKernel(
    data=dft,
    save_all_iterations=True,
    random_state=2024,
)

In [None]:
# Run MICE with an argument of 2 (iterations), then replace dft with the completed (imputed) data
mice_kernel.mice(2)
dft = mice_kernel.complete_data()
dft.sample(10)

In [None]:
# Null counts per test column after imputation
dft.isnull().sum()

In [None]:
# Plot the distribution of Credit Score in the test data again to check that imputation did not change it
sns.kdeplot(data = dft, x = 'Credit Score', color = 'brown')

In [None]:
# Count of test rows per 'Years in current job' value, after imputation.
# The plot reads from dft: this is the after-imputation check for the test set, not the training set.
sns.set()
plt.figure(figsize=(12, 5))
sns.countplot(data = dft, x = 'Years in current job', palette = 'bright')

In [None]:
# Distribution of Annual Income in the test data, after imputation (reads from dft, not the training frame)
sns.kdeplot(data = dft, x = 'Annual Income', color = 'magenta')

***
# Label encoding and scaling

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [None]:
# Scaler instance used further down to standardise the feature matrices
SC=StandardScaler()

In [None]:
# Summary of the category-dtype columns of the training data (the columns to be label-encoded)
df.describe(include = 'category')

In [None]:
label_encoder = LabelEncoder()
category_columns = ['Loan Status', 'Term', 'Years in current job', 'Home Ownership', 'Purpose']

for col in category_columns:
    df[col] = label_encoder.fit_transform(df[col])

# After Encoding
df.info()

In [None]:
dft.describe(include = 'category')

In [None]:
label_encoder = LabelEncoder()
category_columns = ['Term', 'Years in current job', 'Home Ownership', 'Purpose']

for col in category_columns:
    dft[col] = label_encoder.fit_transform(dft[col])

# After Encoding
dft.info()

***
# Splitting data

In [None]:
# info() prints its report itself and returns None, so wrapping it in print() only adds a stray "None" line
df.info()
dft.info()

In [None]:
# Separate the target ('Loan Status') from the features and convert both to NumPy arrays
target_col = 'Loan Status'
y_train = df[target_col].values
x_train = df.drop(columns=target_col).values

In [None]:
# Display the training feature array
x_train

In [None]:
# Display the training target array
y_train

In [None]:
# The test frame has no target column here, so all of its columns are used as features
x_test = dft.values

In [None]:
# Display the test feature array
x_test

In [None]:
# Fit the scaler on the training features and standardise them in one step
X_train_Scaled=SC.fit_transform(x_train)

In [None]:
# Standardise the test features with the statistics the scaler learned from the training data.
# fit_transform() would re-fit the scaler on the test set, so train and test features
# would end up scaled with different means and standard deviations.
X_test_Scaled=SC.transform(x_test)

***
# Modelling
# Model 1

In [None]:
# Baseline: logistic regression with the library's default parameters
model1=LogisticRegression()

In [None]:
# Fit on the standardised features: the predictions and the score in the following cells are
# computed on X_train_Scaled, so the model has to be trained on the same representation.
model1.fit(X_train_Scaled,y_train)

In [None]:
# Show the parameters model1 was created with
model1.get_params()

In [None]:
# Learned coefficients of model1
model1.coef_

In [None]:
# Predictions of model1 for the standardised training features
y_pred=model1.predict(X_train_Scaled)

In [None]:
# Side-by-side view of predicted and actual training labels
pd.DataFrame({'y_pred': y_pred,'y_train' : y_train})

In [None]:
# Score of model1 on the training data (no held-out set is used here)
model1.score(X_train_Scaled,y_train)

***
# Model 2

In [None]:
# Logistic regression with an L1 penalty, C = 0.1, and the saga solver
model2=LogisticRegression(C=0.1,penalty='l1',solver='saga')

In [None]:
# Train model2 on the standardised training features
model2.fit(X_train_Scaled,y_train)

In [None]:
# Learned coefficients of model2
model2.coef_

In [None]:
# Score of model2 on the training data
model2.score(X_train_Scaled,y_train)

***
# Model 3

In [None]:
# Logistic regression with an L2 penalty, C = 0.3, and the newton-cg solver
model3=LogisticRegression(C=0.3,penalty='l2',solver='newton-cg')

In [None]:
# Train model3 on the standardised training features
model3.fit(X_train_Scaled,y_train)

In [None]:
# Learned coefficients of model3
model3.coef_

In [None]:
# Score of model3 on the training data
model3.score(X_train_Scaled,y_train)

***
# Model 4

In [None]:
# Logistic regression with an L2 penalty, C = 0.7, and the lbfgs solver
model4=LogisticRegression(C=0.7,penalty='l2',solver='lbfgs')

In [None]:
# Train model4 on the standardised training features
model4.fit(X_train_Scaled,y_train)

In [None]:
# Learned coefficients of model4
model4.coef_

In [None]:
# Score of model4 on the training data
model4.score(X_train_Scaled,y_train)

***
# Model 5

In [None]:
# Logistic regression with an L2 penalty, C = 0.65, and the liblinear solver
model5=LogisticRegression(C=0.65,penalty='l2',solver='liblinear' )

In [None]:
# Train model5 on the standardised training features
model5.fit(X_train_Scaled,y_train)

In [None]:
# Learned coefficients of model5
model5.coef_

In [None]:
# Score of model5 on the training data
model5.score(X_train_Scaled,y_train)

***
# Model 6

In [None]:
# Logistic regression with an L1 penalty, C = 0.55, the liblinear solver, and max_iter raised to 125
model6=LogisticRegression(C=0.55,penalty='l1',solver='liblinear', max_iter = 125)

In [None]:
# Train model6 on the standardised training features
model6.fit(X_train_Scaled,y_train)

In [None]:
# Learned coefficients of model6
model6.coef_

In [None]:
# Score of model6 on the training data
model6.score(X_train_Scaled,y_train)

### From the previous models even when changing the parameters, the accuracy doesn't change.

*****
# Using KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# K-nearest-neighbours classifier using 5 neighbours
kneighbour=KNeighborsClassifier(n_neighbors=5)

In [None]:
# Train the KNN classifier on the standardised training features
kneighbour.fit(X_train_Scaled,y_train)

In [None]:
# Score of the KNN classifier on the training data it was fitted on
kneighbour.score(X_train_Scaled,y_train)

In [None]:
# Fit one KNN classifier per neighbour count k = 1..13 and record its score on the training data
neighbour_range = range(1, 14)
train_accuracy=[]
for i in neighbour_range:
    knn=KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train_Scaled,y_train)
    train_accuracy.append(knn.score(X_train_Scaled,y_train))

# Plot training accuracy against the number of neighbours
x=list(neighbour_range)
plt.plot(x,train_accuracy,label='Training_accuracy')
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.legend()
# Render the figure (an argument-less plt.plot() call draws nothing)
plt.show()

In [None]:
# Training scores collected in the sweep, ordered by neighbour count (first entry is k = 1)
train_accuracy

### From the previous cell we can see that the best KNN model is the one with two neighbours, with a training accuracy of 91.2%

***
# Support Vector Machine (SVM)

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with an RBF kernel and C = 10
SVM1 = SVC(C = 10, kernel= 'rbf')

In [None]:
SVM1.fit(x_train, y_train)

In [None]:
SVM1.score(x_train,y_train)

In [None]:
# Fit one RBF-kernel SVM per value of C (1, 26, 51, 76) and record its score on the training data.
# The standardised features are used because SVMs are not scale invariant.
train_acc = []
for i in range(1,101,25):
    classifier = SVC(C = i, kernel= 'rbf')
    classifier.fit(X_train_Scaled, y_train)
    train_acc.append(classifier.score(X_train_Scaled,y_train))
print(train_acc)

***
# Thanks for reading
# Yours, Ibrahim Hossam