In [1]:
# Importing necessary libraries
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn import svm 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import accuracy_score
import joblib

In [2]:
# Load the dataset
df = pd.read_csv(r"C:\Users\kusha\OneDrive\Desktop\Projects\Python\Loan approval\loan_data.csv")

In [3]:
# Display first few rows and basic info
df.head()
df.shape
df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
376,LP002953,Male,Yes,3+,Graduate,No,5703,0.0,128.0,360.0,1.0,Urban,Y
377,LP002974,Male,Yes,0,Graduate,No,3232,1950.0,108.0,360.0,1.0,Rural,Y
378,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
379,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
380,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            381 non-null    object 
 1   Gender             376 non-null    object 
 2   Married            381 non-null    object 
 3   Dependents         373 non-null    object 
 4   Education          381 non-null    object 
 5   Self_Employed      360 non-null    object 
 6   ApplicantIncome    381 non-null    int64  
 7   CoapplicantIncome  381 non-null    float64
 8   LoanAmount         381 non-null    float64
 9   Loan_Amount_Term   370 non-null    float64
 10  Credit_History     351 non-null    float64
 11  Property_Area      381 non-null    object 
 12  Loan_Status        381 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 38.8+ KB


In [None]:
# Check for missing values
df.isnull().sum()

Loan_ID               0
Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [None]:
# Drop 'Loan_ID' as it's not useful for prediction
df = df.drop('Loan_ID', axis = 1)

In [None]:
# Handling missing values from 'Gender', 'Dependents', and 'Loan_Amount_Term'
df = df.dropna(subset = ['Gender','Dependents','Loan_Amount_Term'])

In [None]:
# Fill missing values with their mode
df['Self_Employed'].unique()

array(['No', 'Yes', nan], dtype=object)

In [9]:
df['Credit_History'].unique()

array([ 1., nan,  0.])

In [10]:
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace = True)
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace= True)


In [11]:
df.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [12]:
df['Dependents'].replace('3+', '4', inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Dependents'].replace('3+', '4', inplace= True)


In [13]:
df['Gender'].unique()
df['Dependents'].unique()
df['Married'].unique()
df['Dependents'].unique()
df['Education'].unique()
df['Self_Employed'].unique()
df['Property_Area'].unique()
df['Loan_Status'].unique()

array(['N', 'Y'], dtype=object)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 358 entries, 0 to 380
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             358 non-null    object 
 1   Married            358 non-null    object 
 2   Dependents         358 non-null    object 
 3   Education          358 non-null    object 
 4   Self_Employed      358 non-null    object 
 5   ApplicantIncome    358 non-null    int64  
 6   CoapplicantIncome  358 non-null    float64
 7   LoanAmount         358 non-null    float64
 8   Loan_Amount_Term   358 non-null    float64
 9   Credit_History     358 non-null    float64
 10  Property_Area      358 non-null    object 
 11  Loan_Status        358 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 36.4+ KB


In [None]:
# Encode categorical variables to numerical values
encoding = {
    'Gender' : {'Male':1 , 'Female':0},
    'Married': {'Yes' : 1, 'No': 0},
    'Dependents' : {'1':1, '0': 0, '2':2, '4':4},
    'Education' : {'Graduate': 1, 'Not Graduate':0},
    'Self_Employed' : {'Yes' : 1, 'No': 0},
    'Property_Area' : {'Rural':0, 'Urban':1, 'Semiurban':2},
    'Loan_Status' : {'N':0, 'Y':1}
}

In [16]:
df.replace(encoding, inplace=True)

  df.replace(encoding, inplace=True)


In [None]:
# Define features and target
X = df.drop('Loan_Status', axis = 1)
Y = df['Loan_Status']

In [18]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
1,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,1
2,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
3,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,1
4,1,1,0,0,0,2333,1516.0,95.0,360.0,1.0,1,1


In [19]:
#This scaler standardizes features by removing the mean and scaling to unit variance.
num_cols = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

In [20]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,1,1,0,0.71163,0.092069,0.80598,0.285826,1.0,0
1,1,1,0,1,1,-0.398856,-0.539332,-1.350425,0.285826,1.0,1
2,1,1,0,0,0,-0.691384,0.447965,0.527735,0.285826,1.0,1
3,1,0,0,1,0,1.705666,-0.539332,1.25813,0.285826,1.0,1
4,1,1,0,0,0,-0.866761,0.095418,-0.341784,0.285826,1.0,1


In [None]:
# Function to train and evaluate a model using train-test split and cross-validation
def evaluate_model(model):
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    # Accuracy on test set
    accuracy = accuracy_score(Y_test, Y_pred)
    # Average cross-validation score
    cross_val = cross_val_score(model, X, Y, cv=5)
    avg_cross_val = np.mean(cross_val)
    print(f"{model.__class__.__name__}- accuracy : {accuracy : 2f}, Cross-Val-Score : {avg_cross_val: .2f}")
    return avg_cross_val

In [None]:
# Initialize models
models = {
    LogisticRegression(),
    svm.SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
}

In [23]:
model_score = {model.__class__.__name__: evaluate_model(model) for model in models}

DecisionTreeClassifier- accuracy :  0.875000, Cross-Val-Score :  0.79
GradientBoostingClassifier- accuracy :  0.847222, Cross-Val-Score :  0.83
LogisticRegression- accuracy :  0.847222, Cross-Val-Score :  0.84
RandomForestClassifier- accuracy :  0.833333, Cross-Val-Score :  0.84
SVC- accuracy :  0.847222, Cross-Val-Score :  0.83


In [24]:
def tune_model(model, param_grid):
    tuner = RandomizedSearchCV(model, param_grid, cv = 5, n_iter =20, verbose = True, random_state = 42)
    tuner.fit(X, Y)
    print(f"Best Score for {model.__class__.__name__}: {tuner.best_score_:.2f}")
    print(f"Best Parameter for {model.__class__.__name__}: {tuner.best_params_}")
    return tuner.best_estimator_

In [25]:
log_reg_grid = {'C': np.logspace(-4, 4, 20), "solver": ["liblinear"]}
svc_grid = {'C': [0.25, 0.50, 0.75, 1], "kernel": ['linear']}

rf_grid = {
    'n_estimators': np.arange(10, 1000, 10),
    'max_features': ['log2', 'sqrt'], 
    'max_depth': [None, 3, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 20, 50, 100],
    'min_samples_leaf': [1, 2, 5, 10]
}

In [26]:
best_log_reg = tune_model(LogisticRegression(), log_reg_grid)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score for LogisticRegression: 0.84
Best Parameter for LogisticRegression: {'solver': 'liblinear', 'C': 1.623776739188721}


In [27]:
best_svc_reg = tune_model(svm.SVC(), svc_grid)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Score for SVC: 0.84
Best Parameter for SVC: {'kernel': 'linear', 'C': 0.25}




In [28]:
best_rf = tune_model(RandomForestClassifier(), rf_grid)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score for RandomForestClassifier: 0.84
Best Parameter for RandomForestClassifier: {'n_estimators': 70, 'min_samples_split': 100, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 5}


In [29]:
final_model = best_rf

In [30]:

joblib.dump(final_model, 'Loan_Status_Predictor.pkl')

['Loan_Status_Predictor.pkl']

In [None]:
# Prediction System Sample 

sample_data = pd.DataFrame({
    'Gender': [1],
    'Married': [1],
    'Dependents': [2],
    'Education': [0],
    'Self_Employed': [0],
    'ApplicantIncome': [1000],
    'CoapplicantIncome': [0.0],
    'LoanAmount': [150],
    'Loan_Amount_Term': [180],
    'Credit_History': [0],
    'Property_Area': [1]
})

sample_data[num_cols] = scaler.transform(sample_data[num_cols])
loaded_model = joblib.load('loan_status_predictor.pkl')
prediction = loaded_model.predict(sample_data)

result = "Loan Approved" if prediction[0] == 1 else "Loan Not Approved"
print(f"\nPrediction Result: {result}")


Prediction Result: Loan Not Approved


In [32]:
sample_data = pd.DataFrame({
    'Gender': [1],                # Male
    'Married': [1],               # Married
    'Dependents': [0],            # No dependents
    'Education': [1],             # Graduate
    'Self_Employed': [0],         # Not self-employed
    'ApplicantIncome': [6000],    # Higher income
    'CoapplicantIncome': [1500],  # Co-applicant support
    'LoanAmount': [120],          # Moderate loan
    'Loan_Amount_Term': [360],    # Standard long-term
    'Credit_History': [1],        # Good credit history
    'Property_Area': [2]          # Semiurban
})
sample_data[num_cols] = scaler.transform(sample_data[num_cols])
prediction = loaded_model.predict(sample_data)
result = "Loan Approved" if prediction[0] == 1 else "Loan Not Approved"
print(f"\nPrediction Result: {result}")


Prediction Result: Loan Approved


In [33]:
joblib.dump(scaler, 'vector.pkl')

['vector.pkl']