In [5]:
# Features (all columns except target)
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression 

# Load the data
df = pd.read_csv("Loan dataset_classification.csv")

print(df["Married"].unique())
df["Married"]=df["Married"].fillna(df["Married"].mode()[0])
df["Married"] = df["Married"].map({"No": 0, "Yes": 1})
print(df["Married"].unique())


print(df["Self_Employed"].unique())
df["Self_Employed"]=df["Self_Employed"].fillna(df["Self_Employed"].mode()[0])
df["Self_Employed"] = df["Self_Employed"].map({"No": 0, "Yes": 1})
print(df["Self_Employed"].unique())

print(df["Education"].unique())
df["Education"]=df["Education"].fillna(df["Education"].mode()[0])
df["Education"] = df["Education"].map({"Not Graduate": 0, "Graduate": 1})
print(df["Education"].unique())

print(df["Property_Area"].unique())
df["Property_Area"]=df["Property_Area"].fillna(df["Property_Area"].mode()[0])
df["Property_Area"] = df["Property_Area"].map({"Urban": 0, "Rural": 1,"Semiurban":2 })
print(df["Property_Area"].unique())

df["ApplicantIncome"] = df["ApplicantIncome"].fillna(df["ApplicantIncome"].median())
df["CoapplicantIncome"] = df["CoapplicantIncome"].fillna(df["CoapplicantIncome"].median())
df["LoanAmount"] = df["LoanAmount"].fillna(df["LoanAmount"].median())
df["Loan_Amount_Term"] = df["Loan_Amount_Term"].fillna(df["Loan_Amount_Term"].median())
df["Credit_History"] = df["Credit_History"].fillna(0)

# Add small value to avoid log(0) errors
df["ApplicantIncome_log"]= np.log(df["ApplicantIncome"] + 1)
df["CoApplicantIncome_log"]= np.log(df["CoapplicantIncome"] + 1)

df.isnull().sum()

['No' 'Yes' nan]
[0 1]
['No' 'Yes' nan]
[0 1]
['Graduate' 'Not Graduate']
[1 0]
['Urban' 'Rural' 'Semiurban']
[0 1 2]


Loan_ID                    0
Gender                    24
Married                    0
Dependents                25
Education                  0
Self_Employed              0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Property_Area              0
Loan_Status              367
ApplicantIncome_log        0
CoApplicantIncome_log      0
dtype: int64

In [11]:
# Show dependent variable (Loan_Status) count by categories
print("Overall Loan Status Distribution:")
print(df["Loan_Status"].value_counts())
print("\n" + "="*50 + "\n")

# Loan Status by Gender
print("Loan Status by Gender:")
print(pd.crosstab(df["Gender"], df["Loan_Status"], margins=True))
print("\n" + "="*50 + "\n")

# Loan Status by Married
print("Loan Status by Married Status:")
print(pd.crosstab(df["Married"], df["Loan_Status"], margins=True))
print("\n" + "="*50 + "\n")

# Loan Status by Education
print("Loan Status by Education:")
print(pd.crosstab(df["Education"], df["Loan_Status"], margins=True))
print("\n" + "="*50 + "\n")

# Loan Status by Self_Employed
print("Loan Status by Self Employment:")
print(pd.crosstab(df["Self_Employed"], df["Loan_Status"], margins=True))
print("\n" + "="*50 + "\n")

# Loan Status by Property_Area
print("Loan Status by Property Area:")
print(pd.crosstab(df["Property_Area"], df["Loan_Status"], margins=True))

Overall Loan Status Distribution:
Loan_Status
Y    422
N    192
Name: count, dtype: int64


Loan Status by Gender:
Loan_Status    N    Y  All
Gender                    
Female        37   75  112
Male         150  339  489
All          187  414  601


Loan Status by Married Status:
Loan_Status    N    Y  All
Married                   
0             79  134  213
1            113  288  401
All          192  422  614


Loan Status by Education:
Loan_Status    N    Y  All
Education                 
0             52   82  134
1            140  340  480
All          192  422  614


Loan Status by Self Employment:
Loan_Status      N    Y  All
Self_Employed               
0              166  366  532
1               26   56   82
All            192  422  614


Loan Status by Property Area:
Loan_Status      N    Y  All
Property_Area               
0               69  133  202
1               69  110  179
2               54  179  233
All            192  422  614


In [12]:
# Handle missing values in Loan_Status (drop rows with NaN in target)
df = df.dropna(subset=["Loan_Status"])

# Encode target variable
df["Loan_Status"] = df["Loan_Status"].map({"Y": 1, "N": 0})

print(f"Dataset shape after cleaning: {df.shape}")
print(f"Loan Status distribution:\n{df['Loan_Status'].value_counts()}")

Dataset shape after cleaning: (614, 15)
Loan Status distribution:
Loan_Status
1    422
0    192
Name: count, dtype: int64


In [13]:
# Prepare features (X) and target (y)
from sklearn.model_selection import train_test_split

# Select features for training
feature_columns = ["Gender", "Married", "Education", "Self_Employed", 
                   "ApplicantIncome", "CoapplicantIncome", "LoanAmount", 
                   "Loan_Amount_Term", "Credit_History", "Property_Area"]

# Encode Gender if not already done
if df["Gender"].dtype == 'object':
    df["Gender"] = df["Gender"].map({"Male": 1, "Female": 0})
    
# Drop rows with any remaining missing values in features
df_clean = df[feature_columns + ["Loan_Status"]].dropna()

X = df_clean[feature_columns]
y = df_clean["Loan_Status"]

print(f"Final dataset shape: {df_clean.shape}")
print(f"Features: {list(X.columns)}")
print(f"Target distribution:\n{y.value_counts()}")

Final dataset shape: (601, 11)
Features: ['Gender', 'Married', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area']
Target distribution:
Loan_Status
1    414
0    187
Name: count, dtype: int64


In [14]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")
print(f"\nTraining set target distribution:\n{y_train.value_counts()}")
print(f"\nTesting set target distribution:\n{y_test.value_counts()}")

Training set size: (480, 10)
Testing set size: (121, 10)

Training set target distribution:
Loan_Status
1    331
0    149
Name: count, dtype: int64

Testing set target distribution:
Loan_Status
1    83
0    38
Name: count, dtype: int64


In [15]:
# Train Logistic Regression Model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

print("Model trained successfully!")
print(f"Model coefficients: {model.coef_[0]}")
print(f"Model intercept: {model.intercept_[0]}")

Model trained successfully!
Model coefficients: [-1.07940451e-01  5.11454135e-01  3.65336072e-01  7.70996340e-02
 -3.16343849e-06 -5.69132299e-06 -1.48647532e-03 -1.89705464e-04
  2.11324226e+00  1.71016215e-01]
Model intercept: -1.1488000209771758


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
# Make predictions
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Testing Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print("\n" + "="*50)
print("\nConfusion Matrix (Test Set):")
print(confusion_matrix(y_test, y_test_pred))
print("\n" + "="*50)
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred, target_names=["Not Approved (N)", "Approved (Y)"]))

Training Accuracy: 0.7812 (78.12%)
Testing Accuracy: 0.7273 (72.73%)


Confusion Matrix (Test Set):
[[17 21]
 [12 71]]


Classification Report (Test Set):
                  precision    recall  f1-score   support

Not Approved (N)       0.59      0.45      0.51        38
    Approved (Y)       0.77      0.86      0.81        83

        accuracy                           0.73       121
       macro avg       0.68      0.65      0.66       121
    weighted avg       0.71      0.73      0.72       121



In [17]:
# Load original dataset to find missing Loan_Status values
df_original = pd.read_csv("Loan dataset_classification.csv")

print(f"Original dataset shape: {df_original.shape}")
print(f"Missing Loan_Status values: {df_original['Loan_Status'].isnull().sum()}")
print(f"\nRows with missing Loan_Status:")
missing_loan_status = df_original[df_original['Loan_Status'].isnull()]
print(missing_loan_status)

Original dataset shape: (981, 13)
Missing Loan_Status values: 367

Rows with missing Loan_Status:
      Loan_ID Gender Married  ... Credit_History Property_Area Loan_Status
614  LP001015   Male     Yes  ...            1.0         Urban         NaN
615  LP001022   Male     Yes  ...            1.0         Urban         NaN
616  LP001031   Male     Yes  ...            1.0         Urban         NaN
617  LP001035   Male     Yes  ...            NaN         Urban         NaN
618  LP001051   Male      No  ...            1.0         Urban         NaN
..        ...    ...     ...  ...            ...           ...         ...
976  LP002971   Male     Yes  ...            1.0         Urban         NaN
977  LP002975   Male     Yes  ...            1.0         Urban         NaN
978  LP002980   Male      No  ...            NaN     Semiurban         NaN
979  LP002986   Male     Yes  ...            1.0         Rural         NaN
980  LP002989   Male      No  ...            1.0         Rural         NaN

[

In [18]:
# Prepare the data with missing Loan_Status for prediction
df_predict = df_original.copy()

# Apply same preprocessing as training data
print("Preprocessing data for prediction...")

# Handle categorical variables
df_predict["Gender"] = df_predict["Gender"].map({"Male": 1, "Female": 0})
df_predict["Married"] = df_predict["Married"].fillna(df_predict["Married"].mode()[0])
df_predict["Married"] = df_predict["Married"].map({"No": 0, "Yes": 1})
df_predict["Self_Employed"] = df_predict["Self_Employed"].fillna(df_predict["Self_Employed"].mode()[0])
df_predict["Self_Employed"] = df_predict["Self_Employed"].map({"No": 0, "Yes": 1})
df_predict["Education"] = df_predict["Education"].fillna(df_predict["Education"].mode()[0])
df_predict["Education"] = df_predict["Education"].map({"Not Graduate": 0, "Graduate": 1})
df_predict["Property_Area"] = df_predict["Property_Area"].fillna(df_predict["Property_Area"].mode()[0])
df_predict["Property_Area"] = df_predict["Property_Area"].map({"Urban": 0, "Rural": 1, "Semiurban": 2})

# Handle numerical variables
df_predict["ApplicantIncome"] = df_predict["ApplicantIncome"].fillna(df_predict["ApplicantIncome"].median())
df_predict["CoapplicantIncome"] = df_predict["CoapplicantIncome"].fillna(df_predict["CoapplicantIncome"].median())
df_predict["LoanAmount"] = df_predict["LoanAmount"].fillna(df_predict["LoanAmount"].median())
df_predict["Loan_Amount_Term"] = df_predict["Loan_Amount_Term"].fillna(df_predict["Loan_Amount_Term"].median())
df_predict["Credit_History"] = df_predict["Credit_History"].fillna(0)

print("Preprocessing complete!")
print(f"Missing values after preprocessing:\n{df_predict[feature_columns].isnull().sum()}")

Preprocessing data for prediction...
Preprocessing complete!
Missing values after preprocessing:
Gender               24
Married               0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
dtype: int64


In [19]:
# Handle remaining missing Gender values by filling with mode
df_predict["Gender"] = df_predict["Gender"].fillna(df_predict["Gender"].mode()[0])

# Get indices where Loan_Status is missing
missing_indices = df_predict[df_predict['Loan_Status'].isnull()].index

# Prepare features for prediction
X_missing = df_predict.loc[missing_indices, feature_columns]

print(f"Number of records to predict: {len(X_missing)}")
print(f"Missing values in features: {X_missing.isnull().sum().sum()}")

Number of records to predict: 367
Missing values in features: 0


In [20]:
# Predict missing Loan_Status values
predictions = model.predict(X_missing)

# Map predictions back to Y/N
predictions_mapped = ['Y' if pred == 1 else 'N' for pred in predictions]

print(f"Predictions made for {len(predictions)} records")
print(f"\nPrediction distribution:")
print(f"Approved (Y): {predictions.sum()}")
print(f"Not Approved (N): {len(predictions) - predictions.sum()}")
print(f"\nFirst 10 predictions: {predictions_mapped[:10]}")

Predictions made for 367 records

Prediction distribution:
Approved (Y): 279
Not Approved (N): 88

First 10 predictions: ['Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y']


In [21]:
# Fill missing Loan_Status values with predictions
df_complete = df_original.copy()
df_complete.loc[missing_indices, 'Loan_Status'] = predictions_mapped

print(f"Original missing Loan_Status: {df_original['Loan_Status'].isnull().sum()}")
print(f"After prediction missing Loan_Status: {df_complete['Loan_Status'].isnull().sum()}")
print(f"\nComplete dataset shape: {df_complete.shape}")
print(f"\nLoan Status distribution in complete dataset:")
print(df_complete['Loan_Status'].value_counts())

# Display sample of predicted records
print("\n" + "="*80)
print("Sample of records with predicted Loan_Status:")
print("="*80)
print(df_complete.loc[missing_indices[:10], ['Loan_ID', 'Gender', 'Married', 'Education', 
                                               'ApplicantIncome', 'LoanAmount', 'Credit_History', 
                                               'Loan_Status']])

Original missing Loan_Status: 367
After prediction missing Loan_Status: 0

Complete dataset shape: (981, 13)

Loan Status distribution in complete dataset:
Loan_Status
Y    701
N    280
Name: count, dtype: int64

Sample of records with predicted Loan_Status:
      Loan_ID  Gender Married  ... LoanAmount  Credit_History  Loan_Status
614  LP001015    Male     Yes  ...      110.0             1.0            Y
615  LP001022    Male     Yes  ...      126.0             1.0            Y
616  LP001031    Male     Yes  ...      208.0             1.0            Y
617  LP001035    Male     Yes  ...      100.0             NaN            N
618  LP001051    Male      No  ...       78.0             1.0            Y
619  LP001054    Male     Yes  ...      152.0             1.0            Y
620  LP001055  Female      No  ...       59.0             1.0            Y
621  LP001056    Male     Yes  ...      147.0             0.0            N
622  LP001059    Male     Yes  ...      280.0             1.0     

In [22]:
# Save the complete dataset with predictions
df_complete.to_csv("Loan_dataset_complete_with_predictions.csv", index=False)
print("✓ Complete dataset saved as 'Loan_dataset_complete_with_predictions.csv'")
print(f"\nSummary:")
print(f"- Total records: {len(df_complete)}")
print(f"- Records with original Loan_Status: {len(df_original) - df_original['Loan_Status'].isnull().sum()}")
print(f"- Records with predicted Loan_Status: {len(predictions)}")
print(f"- Predicted Approvals (Y): {predictions.sum()} ({predictions.sum()/len(predictions)*100:.1f}%)")
print(f"- Predicted Rejections (N): {len(predictions) - predictions.sum()} ({(len(predictions) - predictions.sum())/len(predictions)*100:.1f}%)")

✓ Complete dataset saved as 'Loan_dataset_complete_with_predictions.csv'

Summary:
- Total records: 981
- Records with original Loan_Status: 614
- Records with predicted Loan_Status: 367
- Predicted Approvals (Y): 279 (76.0%)
- Predicted Rejections (N): 88 (24.0%)
