In [2]:
#Importing dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pickle

In [3]:
#Loading the dataset
data=pd.read_csv('smartcredit.csv')

Doing some data cleaning

In [4]:
#Fixing missing/empty values
data.isnull().sum()

Loan_ID                 0
Gender                  0
Married                 0
Dependent_No            0
Education               0
Self_Employed           0
Applicant_Income        0
CoApplicant_Income    585
Loan_Amount             0
Loan_Amount_Term        0
Credit_History          0
Loan_Status             0
dtype: int64

In [5]:
#Replacing the null values in a particular column of my dataset with the median value
x = data['CoApplicant_Income'].median()
data['CoApplicant_Income'].fillna(x, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['CoApplicant_Income'].fillna(x, inplace=True)


In [6]:
#Checking for duplicates
data.duplicated().sum()

0

In [7]:
data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependent_No', 'Education',
       'Self_Employed', 'Applicant_Income', 'CoApplicant_Income',
       'Loan_Amount', 'Loan_Amount_Term', 'Credit_History', 'Loan_Status'],
      dtype='object')

In [None]:
# Using visual method to check for outliers (Scatter plot)
plt.scatter(range(len(data['Loan_ID'])), data['Applicant_Income'])
plt.title("Scatter Plot")
plt.show()


Exploratory Data Analysis (EDA)

In [7]:
data.describe()

Unnamed: 0,Loan_ID,Gender,Married,Dependent_No,Education,Self_Employed,Applicant_Income,CoApplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Loan_Status
count,5584.0,5584.0,5584.0,5584.0,5584.0,5584.0,5584.0,5584.0,5584.0,5584.0,5584.0,5584.0
mean,25793.5,0.654549,0.474749,1.995881,0.382521,0.597779,50658.55283,25345.349391,35333.094556,212.643266,0.664219,0.667264
std,1612.106283,0.475558,0.499407,1.414904,0.486046,0.49039,28580.613968,13417.892959,17147.360206,102.963595,0.472305,0.471235
min,23002.0,0.0,0.0,0.0,0.0,0.0,1002.0,1002.0,10000.0,60.0,0.0,0.0
25%,24397.75,0.0,0.0,1.0,0.0,0.0,26090.25,14752.0,20000.0,120.0,0.0,0.0
50%,25793.5,1.0,0.0,2.0,0.0,1.0,50909.5,25165.0,40000.0,240.0,1.0,1.0
75%,27189.25,1.0,1.0,3.0,1.0,1.0,75769.75,36205.5,50000.0,300.0,1.0,1.0
max,28585.0,1.0,1.0,4.0,1.0,1.0,99995.0,49997.0,60000.0,360.0,1.0,1.0


Doing some data transformation

In [9]:
#Deleting a column that is not useful for my model
data.drop(columns=['Loan_ID'], inplace=True)

Seperating my dataset into features and label

In [10]:
features = data.drop(columns=['Loan_Status'])
label = data['Loan_Status']

Checking for unbalanced dataset

Key Outputs

Class Counts: The exact number of instances per class.

Bar Plot: A visual representation of class distribution.

Imbalance Ratio: Indicates the severity of imbalance:
Balanced Dataset: Ratio close to 1.
Moderately Imbalanced Dataset: Ratio between 1.5 and 4.
Highly Imbalanced Dataset: Ratio greater than 4.

In [None]:
# Specify the label column (target variable)
#label_column = 'target'  # Replace 'target' with your label column name

# Count the number of instances in each class
class_counts = data['Loan_Status'].value_counts()

# Display the counts
print("Class Distribution:")
print(class_counts)

# Visualize the class distribution
plt.figure(figsize=(8, 6))
class_counts.plot(kind='bar', color=['skyblue', 'orange'])
plt.title('Class Distribution')
plt.xlabel('Classes')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()

# Calculate class imbalance ratio
majority_class = class_counts.max()
minority_class = class_counts.min()
imbalance_ratio = majority_class / minority_class
print(f"Imbalance Ratio (Majority:Minority): {imbalance_ratio:.2f}")


Splitting the datasets into training and testing sets

In [12]:
# Split data into training and testing sets (usually 80/20 split)
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)

Training the model on different algorithms

In [13]:
# Create and train the Logistic Regression model
model1 = LogisticRegression(max_iter=1000)
model2 = KNeighborsClassifier(n_neighbors=5)
model3 = SVC(kernel='linear')  # Choose kernel (e.g., 'linear', 'rbf')
model4 = DecisionTreeClassifier(max_depth=3)
model5 = RandomForestClassifier(n_estimators=100)
model6 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100)
model7 = XGBClassifier()
model8 = GaussianNB()

# Reshape y_train to a 1D array (ensure compatibility with LogisticRegression)
y_train_reshaped = y_train.values.ravel()  # Extract NumPy array and flatten

# Train the model
model1.fit(X_train, y_train_reshaped)
model2.fit(X_train, y_train_reshaped)
model3.fit(X_train, y_train_reshaped)
model4.fit(X_train, y_train_reshaped)
model5.fit(X_train, y_train_reshaped)
model6.fit(X_train, y_train_reshaped)
model7.fit(X_train, y_train_reshaped)
model8.fit(X_train, y_train_reshaped)

In [17]:
# Make predictions on the testing data
y_pred1 = model1.predict(X_test)
y_pred2 = model2.predict(X_test)
y_pred3 = model3.predict(X_test)
y_pred4 = model4.predict(X_test)
y_pred5 = model5.predict(X_test)
y_pred6 = model6.predict(X_test)
y_pred7 = model5.predict(X_test)
y_pred8 = model6.predict(X_test)

# Evaluate model performance using various metrics

# Accuracy
accuracy = accuracy_score(y_test, y_pred1)
print(f"Accuracy: {accuracy:.4f}")
accuracy = accuracy_score(y_test, y_pred2)
print(f"Accuracy: {accuracy:.4f}")
accuracy = accuracy_score(y_test, y_pred3)
print(f"Accuracy: {accuracy:.4f}")
accuracy = accuracy_score(y_test, y_pred4)
print(f"Accuracy: {accuracy:.4f}")
accuracy = accuracy_score(y_test, y_pred5)
print(f"Accuracy: {accuracy:.4f}")
accuracy = accuracy_score(y_test, y_pred6)
print(f"Accuracy: {accuracy:.4f}")
accuracy = accuracy_score(y_test, y_pred7)
print(f"Accuracy: {accuracy:.4f}")
accuracy = accuracy_score(y_test, y_pred8)
print(f"Accuracy: {accuracy:.4f}")



# Precision
precision = precision_score(y_test, y_pred1)
print(f"Precision: {precision:.4f}")
precision = precision_score(y_test, y_pred2)
print(f"Precision: {precision:.4f}")
precision = precision_score(y_test, y_pred3)
print(f"Precision: {precision:.4f}")
precision = precision_score(y_test, y_pred4)
print(f"Precision: {precision:.4f}")
precision = precision_score(y_test, y_pred5)
print(f"Precision: {precision:.4f}")
precision = precision_score(y_test, y_pred6)
print(f"Precision: {precision:.4f}")
precision = precision_score(y_test, y_pred7)
print(f"Precision: {precision:.4f}")
precision = precision_score(y_test, y_pred8)
print(f"Precision: {precision:.4f}")


# Recall
recall = recall_score(y_test, y_pred1)
print(f"Recall: {recall:.4f}")
recall = recall_score(y_test, y_pred2)
print(f"Recall: {recall:.4f}")
recall = recall_score(y_test, y_pred3)
print(f"Recall: {recall:.4f}")
recall = recall_score(y_test, y_pred4)
print(f"Recall: {recall:.4f}")
recall = recall_score(y_test, y_pred5)
print(f"Recall: {recall:.4f}")
recall = recall_score(y_test, y_pred6)
print(f"Recall: {recall:.4f}")
recall = recall_score(y_test, y_pred7)
print(f"Recall: {recall:.4f}")
recall = recall_score(y_test, y_pred8)
print(f"Recall: {recall:.4f}")


# F1-score (harmonic mean of precision and recall)
f1 = f1_score(y_test, y_pred1)
print(f"F1-score: {f1:.4f}")
f1 = f1_score(y_test, y_pred2)
print(f"F1-score: {f1:.4f}")
f1 = f1_score(y_test, y_pred3)
print(f"F1-score: {f1:.4f}")
f1 = f1_score(y_test, y_pred4)
print(f"F1-score: {f1:.4f}")
f1 = f1_score(y_test, y_pred5)
print(f"F1-score: {f1:.4f}")
f1 = f1_score(y_test, y_pred6)
print(f"F1-score: {f1:.4f}")
recall = recall_score(y_test, y_pred7)
print(f"Recall: {recall:.4f}")
recall = recall_score(y_test, y_pred8)
print(f"Recall: {recall:.4f}")

# Confusion Matrix
confusion_matrix_result = confusion_matrix(y_test, y_pred1)
print("\nConfusion Matrix:\n", confusion_matrix_result)
confusion_matrix_result = confusion_matrix(y_test, y_pred2)
print("\nConfusion Matrix:\n", confusion_matrix_result)
confusion_matrix_result = confusion_matrix(y_test, y_pred3)
print("\nConfusion Matrix:\n", confusion_matrix_result)
confusion_matrix_result = confusion_matrix(y_test, y_pred4)
print("\nConfusion Matrix:\n", confusion_matrix_result)
confusion_matrix_result = confusion_matrix(y_test, y_pred5)
print("\nConfusion Matrix:\n", confusion_matrix_result)
confusion_matrix_result = confusion_matrix(y_test, y_pred6)
print("\nConfusion Matrix:\n", confusion_matrix_result)
confusion_matrix_result = confusion_matrix(y_test, y_pred6)
print("\nConfusion Matrix:\n", confusion_matrix_result)
confusion_matrix_result = confusion_matrix(y_test, y_pred7)
print("\nConfusion Matrix:\n", confusion_matrix_result)



# You can explore other metrics like ROC AUC curve or classification report based on your needs

Accuracy: 0.6723
Accuracy: 0.6097
Accuracy: 0.6723
Accuracy: 0.6723
Accuracy: 0.6365
Accuracy: 0.6661
Accuracy: 0.6365
Accuracy: 0.6661
Precision: 0.6723
Precision: 0.6834
Precision: 0.6723
Precision: 0.6723
Precision: 0.6710
Precision: 0.6712
Precision: 0.6710
Precision: 0.6712
Recall: 1.0000
Recall: 0.7816
Recall: 1.0000
Recall: 1.0000
Recall: 0.9015
Recall: 0.9867
Recall: 0.9015
Recall: 0.9867
F1-score: 0.8041
F1-score: 0.7292
F1-score: 0.8041
F1-score: 0.8041
F1-score: 0.7693
F1-score: 0.7989
Recall: 0.9015
Recall: 0.9867

Confusion Matrix:
 [[  0 366]
 [  0 751]]

Confusion Matrix:
 [[ 94 272]
 [164 587]]

Confusion Matrix:
 [[  0 366]
 [  0 751]]

Confusion Matrix:
 [[  0 366]
 [  0 751]]

Confusion Matrix:
 [[ 34 332]
 [ 74 677]]

Confusion Matrix:
 [[  3 363]
 [ 10 741]]

Confusion Matrix:
 [[  3 363]
 [ 10 741]]

Confusion Matrix:
 [[ 34 332]
 [ 74 677]]


In [18]:
# Save the model to a file
with open('smartcreditmodelv1.pkl', 'wb') as f:
  pickle.dump(model1, f)