In [3]:
#Importing dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pickle

In [4]:
#Loading the data
data = pd.read_csv("smartcredit.csv")

In [5]:
#Converting the loaded data to a Pandas DataFrame
df = pd.DataFrame(data)

In [6]:
#Exploratory Data Analysis
# Method 1: Using isnull().sum()
#total_missing = df.isnull().sum().sum()  # Count total missing values

# Method 2: Using missing values count per column
missing_per_col = data.isnull().sum()


#total_missing

missing_per_col


Loan_ID                 0
Gender                  0
Married                 0
Dependent_No            0
Education               0
Self_Employed           0
Applicant_Income        0
CoApplicant_Income    585
Loan_Amount             0
Loan_Amount_Term        0
Credit_History          0
Loan_Status             0
dtype: int64

In [7]:
#Filling the missing values of the CoApplicant_Income column with the median value
column_name = "CoApplicant_Income"
data[column_name] = data[column_name].fillna(data[column_name].median())
data

Unnamed: 0,Loan_ID,Gender,Married,Dependent_No,Education,Self_Employed,Applicant_Income,CoApplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Loan_Status
0,23002,1,1,3,1,1,4315,19748.0,60000,300,1,1
1,23003,0,1,3,0,1,3331,29657.0,20000,60,0,1
2,23004,1,1,3,0,1,22494,29539.0,30000,240,1,1
3,23005,1,1,2,0,0,78117,12909.0,20000,60,0,1
4,23006,1,0,2,0,0,41976,45598.0,60000,300,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5579,28581,1,1,0,0,1,36765,43345.0,20000,180,0,1
5580,28582,0,0,1,0,0,89468,12119.0,60000,240,1,0
5581,28583,0,0,0,0,1,89976,22311.0,30000,60,1,1
5582,28584,1,1,1,0,0,78179,38722.0,40000,360,0,1


In [8]:
missing_columns = data.isnull().sum().sum
missing_columns

<bound method Series.sum of Loan_ID               0
Gender                0
Married               0
Dependent_No          0
Education             0
Self_Employed         0
Applicant_Income      0
CoApplicant_Income    0
Loan_Amount           0
Loan_Amount_Term      0
Credit_History        0
Loan_Status           0
dtype: int64>

In [9]:
data.drop("Loan_ID", axis=1, inplace=True)
print(data)

      Gender  Married  Dependent_No  Education  Self_Employed  \
0          1        1             3          1              1   
1          0        1             3          0              1   
2          1        1             3          0              1   
3          1        1             2          0              0   
4          1        0             2          0              0   
...      ...      ...           ...        ...            ...   
5579       1        1             0          0              1   
5580       0        0             1          0              0   
5581       0        0             0          0              1   
5582       1        1             1          0              0   
5583       0        1             4          0              0   

      Applicant_Income  CoApplicant_Income  Loan_Amount  Loan_Amount_Term  \
0                 4315             19748.0        60000               300   
1                 3331             29657.0        20000          

In [24]:
#Assigning the columns in my dataset into features and label
target_column = "Loan_Status" 

features = data.drop( target_column, axis=1)
target = data[[target_column]]

# Print the first few rows of the features DataFrame to see what it contains
print("Features:")
print(features.head())

# Print the first few rows of the target DataFrame to see what it contains
print("\nTarget:")
print(target.head())

Features:
   Gender  Married  Dependent_No  Education  Self_Employed  Applicant_Income  \
0       1        1             3          1              1              4315   
1       0        1             3          0              1              3331   
2       1        1             3          0              1             22494   
3       1        1             2          0              0             78117   
4       1        0             2          0              0             41976   

   CoApplicant_Income  Loan_Amount  Loan_Amount_Term  Credit_History  
0             19748.0        60000               300               1  
1             29657.0        20000                60               0  
2             29539.0        30000               240               1  
3             12909.0        20000                60               0  
4             45598.0        60000               300               1  

Target:
   Loan_Status
0            1
1            1
2            1
3            1

In [11]:
data

Unnamed: 0,Gender,Married,Dependent_No,Education,Self_Employed,Applicant_Income,CoApplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Loan_Status
0,1,1,3,1,1,4315,19748.0,60000,300,1,1
1,0,1,3,0,1,3331,29657.0,20000,60,0,1
2,1,1,3,0,1,22494,29539.0,30000,240,1,1
3,1,1,2,0,0,78117,12909.0,20000,60,0,1
4,1,0,2,0,0,41976,45598.0,60000,300,1,1
...,...,...,...,...,...,...,...,...,...,...,...
5579,1,1,0,0,1,36765,43345.0,20000,180,0,1
5580,0,0,1,0,0,89468,12119.0,60000,240,1,0
5581,0,0,0,0,1,89976,22311.0,30000,60,1,1
5582,1,1,1,0,0,78179,38722.0,40000,360,0,1


In [12]:
# Split data into training and testing sets (usually 80/20 split)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [13]:
#Creating the model
model1 = LogisticRegression(max_iter=1000)
model2 = KNeighborsClassifier(n_neighbors=5)
model3 = SVC(kernel='linear')  # Choose kernel (e.g., 'linear', 'rbf')
model3 = DecisionTreeClassifier(max_depth=3)
model5 = RandomForestClassifier(n_estimators=100)
model6 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100)
model7 = XGBClassifier()


In [14]:
# Create and train the Logistic Regression model
model1 = LogisticRegression(max_iter=1000)
model2 = KNeighborsClassifier(n_neighbors=5)
model3 = SVC(kernel='linear')  # Choose kernel (e.g., 'linear', 'rbf')
model3 = DecisionTreeClassifier(max_depth=3)
model4 = RandomForestClassifier(n_estimators=100)
model5 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100)
model6 = XGBClassifier()

# Reshape y_train to a 1D array (ensure compatibility with LogisticRegression)
y_train_reshaped = y_train.values.ravel()  # Extract NumPy array and flatten

# Train the model
model1.fit(X_train, y_train_reshaped)
model2.fit(X_train, y_train_reshaped)
model3.fit(X_train, y_train_reshaped)
model4.fit(X_train, y_train_reshaped)
model5.fit(X_train, y_train_reshaped)
model6.fit(X_train, y_train_reshaped)


In [15]:
# Make predictions on the testing data
y_pred1 = model1.predict(X_test)
y_pred2 = model2.predict(X_test)
y_pred3 = model3.predict(X_test)
y_pred4 = model4.predict(X_test)
y_pred5 = model5.predict(X_test)
y_pred6 = model6.predict(X_test)

# Evaluate model performance using various metrics

# Accuracy
accuracy = accuracy_score(y_test, y_pred1)
print(f"Accuracy: {accuracy:.4f}")
accuracy = accuracy_score(y_test, y_pred2)
print(f"Accuracy: {accuracy:.4f}")
accuracy = accuracy_score(y_test, y_pred3)
print(f"Accuracy: {accuracy:.4f}")
accuracy = accuracy_score(y_test, y_pred4)
print(f"Accuracy: {accuracy:.4f}")
accuracy = accuracy_score(y_test, y_pred5)
print(f"Accuracy: {accuracy:.4f}")
accuracy = accuracy_score(y_test, y_pred6)
print(f"Accuracy: {accuracy:.4f}")


# Precision
precision = precision_score(y_test, y_pred1)
print(f"Precision: {precision:.4f}")
precision = precision_score(y_test, y_pred2)
print(f"Precision: {precision:.4f}")
precision = precision_score(y_test, y_pred3)
print(f"Precision: {precision:.4f}")
precision = precision_score(y_test, y_pred4)
print(f"Precision: {precision:.4f}")
precision = precision_score(y_test, y_pred5)
print(f"Precision: {precision:.4f}")
precision = precision_score(y_test, y_pred6)
print(f"Precision: {precision:.4f}")


# Recall
recall = recall_score(y_test, y_pred1)
print(f"Recall: {recall:.4f}")
recall = recall_score(y_test, y_pred2)
print(f"Recall: {recall:.4f}")
recall = recall_score(y_test, y_pred3)
print(f"Recall: {recall:.4f}")
recall = recall_score(y_test, y_pred4)
print(f"Recall: {recall:.4f}")
recall = recall_score(y_test, y_pred5)
print(f"Recall: {recall:.4f}")
recall = recall_score(y_test, y_pred6)
print(f"Recall: {recall:.4f}")


# F1-score (harmonic mean of precision and recall)
f1 = f1_score(y_test, y_pred1)
print(f"F1-score: {f1:.4f}")
f1 = f1_score(y_test, y_pred2)
print(f"F1-score: {f1:.4f}")
f1 = f1_score(y_test, y_pred3)
print(f"F1-score: {f1:.4f}")
f1 = f1_score(y_test, y_pred4)
print(f"F1-score: {f1:.4f}")
f1 = f1_score(y_test, y_pred5)
print(f"F1-score: {f1:.4f}")
f1 = f1_score(y_test, y_pred6)
print(f"F1-score: {f1:.4f}")

# Confusion Matrix
confusion_matrix_result = confusion_matrix(y_test, y_pred1)
print("\nConfusion Matrix:\n", confusion_matrix_result)
confusion_matrix_result = confusion_matrix(y_test, y_pred2)
print("\nConfusion Matrix:\n", confusion_matrix_result)
confusion_matrix_result = confusion_matrix(y_test, y_pred3)
print("\nConfusion Matrix:\n", confusion_matrix_result)
confusion_matrix_result = confusion_matrix(y_test, y_pred4)
print("\nConfusion Matrix:\n", confusion_matrix_result)
confusion_matrix_result = confusion_matrix(y_test, y_pred5)
print("\nConfusion Matrix:\n", confusion_matrix_result)
confusion_matrix_result = confusion_matrix(y_test, y_pred6)
print("\nConfusion Matrix:\n", confusion_matrix_result)



# You can explore other metrics like ROC AUC curve or classification report based on your needs

Accuracy: 0.6723
Accuracy: 0.6097
Accuracy: 0.6723
Accuracy: 0.6437
Accuracy: 0.6661
Accuracy: 0.6132
Precision: 0.6723
Precision: 0.6834
Precision: 0.6723
Precision: 0.6732
Precision: 0.6712
Precision: 0.6751
Recall: 1.0000
Recall: 0.7816
Recall: 1.0000
Recall: 0.9134
Recall: 0.9867
Recall: 0.8189
F1-score: 0.8041
F1-score: 0.7292
F1-score: 0.8041
F1-score: 0.7751
F1-score: 0.7989
F1-score: 0.7401

Confusion Matrix:
 [[  0 366]
 [  0 751]]

Confusion Matrix:
 [[ 94 272]
 [164 587]]

Confusion Matrix:
 [[  0 366]
 [  0 751]]

Confusion Matrix:
 [[ 33 333]
 [ 65 686]]

Confusion Matrix:
 [[  3 363]
 [ 10 741]]

Confusion Matrix:
 [[ 70 296]
 [136 615]]


In [16]:
# Save the model to a file
with open('smartcreditmodel.pkl', 'wb') as f:
  pickle.dump(model1, f)

In [17]:
data

Unnamed: 0,Gender,Married,Dependent_No,Education,Self_Employed,Applicant_Income,CoApplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Loan_Status
0,1,1,3,1,1,4315,19748.0,60000,300,1,1
1,0,1,3,0,1,3331,29657.0,20000,60,0,1
2,1,1,3,0,1,22494,29539.0,30000,240,1,1
3,1,1,2,0,0,78117,12909.0,20000,60,0,1
4,1,0,2,0,0,41976,45598.0,60000,300,1,1
...,...,...,...,...,...,...,...,...,...,...,...
5579,1,1,0,0,1,36765,43345.0,20000,180,0,1
5580,0,0,1,0,0,89468,12119.0,60000,240,1,0
5581,0,0,0,0,1,89976,22311.0,30000,60,1,1
5582,1,1,1,0,0,78179,38722.0,40000,360,0,1
