In [1]:
from google.colab import files
uploaded = files.upload()


Saving loan-train.csv to loan-train.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
df = pd.read_csv("loan-train.csv")   # replace with your actual file name
print(df.head())
print(df.info())


    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [4]:
# -------------------------
# Step 4: Data Preprocessing
# -------------------------

# 1. Drop Loan_ID (it's just an identifier)
if 'Loan_ID' in df.columns:
    df.drop('Loan_ID', axis=1, inplace=True)

# 2. Handle missing values
if 'Gender' in df.columns:
    df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)

if 'Married' in df.columns:
    df['Married'].fillna(df['Married'].mode()[0], inplace=True)

if 'Dependents' in df.columns:
    df['Dependents'].fillna(df['Dependents'].mode()[0], inplace=True)
    df['Dependents'] = df['Dependents'].replace('3+', 3).astype(int)   # Fix "3+"

if 'Self_Employed' in df.columns:
    df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)

if 'LoanAmount' in df.columns:
    df['LoanAmount'].fillna(df['LoanAmount'].median(), inplace=True)

if 'Loan_Amount_Term' in df.columns:
    df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0], inplace=True)

if 'Credit_History' in df.columns:
    df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)

# 3. Encode categorical variables
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

categorical_cols = ['Gender','Married','Education','Self_Employed','Property_Area']

for col in categorical_cols:
    if col in df.columns:
        df[col] = le.fit_transform(df[col])

# 4. Encode target column (Loan_Status)
target_col = None
for col in df.columns:
    if 'Loan_Status' in col:   # handles Loan_Status with spaces
        df.rename(columns={col: 'Loan_Status'}, inplace=True)
        df['Loan_Status'] = le.fit_transform(df['Loan_Status'])
        target_col = 'Loan_Status'
        break

# 5. Check processed dataset
print("\n✅ After preprocessing:")
print(df.head())
print("\nData types:\n", df.dtypes)



✅ After preprocessing:
   Gender  Married  Dependents  Education  Self_Employed  ApplicantIncome  \
0       1        0           0          0              0             5849   
1       1        1           1          0              0             4583   
2       1        1           0          0              1             3000   
3       1        1           0          1              0             2583   
4       1        0           0          0              0             6000   

   CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
0                0.0       128.0             360.0             1.0   
1             1508.0       128.0             360.0             1.0   
2                0.0        66.0             360.0             1.0   
3             2358.0       120.0             360.0             1.0   
4                0.0       141.0             360.0             1.0   

   Property_Area  Loan_Status  
0              2            1  
1              0            

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Married'].fillna(df['Married'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi

In [5]:
from sklearn.model_selection import train_test_split

# Make sure target column exists
if 'Loan_Status' in df.columns:
    # Features and target
    X = df.drop('Loan_Status', axis=1)
    y = df['Loan_Status']

    # Split dataset into training and test sets (80/20 split)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print("✅ Train-Test Split successful")
    print("Training set size:", X_train.shape)
    print("Test set size:", X_test.shape)

else:
    print("⚠️ Target column 'Loan_Status' not found. Please check dataset.")


✅ Train-Test Split successful
Training set size: (491, 11)
Test set size: (123, 11)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# --- Train Logistic Regression ---
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# --- Predictions ---
y_pred = log_reg.predict(X_test)

# --- Evaluation ---
print("✅ Logistic Regression Model Trained")
print("🔹 Accuracy:", accuracy_score(y_test, y_pred))

print("\n🔹 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\n🔹 Classification Report:")
print(classification_report(y_test, y_pred))


✅ Logistic Regression Model Trained
🔹 Accuracy: 0.8617886178861789

🔹 Confusion Matrix:
[[22 16]
 [ 1 84]]

🔹 Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.58      0.72        38
           1       0.84      0.99      0.91        85

    accuracy                           0.86       123
   macro avg       0.90      0.78      0.81       123
weighted avg       0.88      0.86      0.85       123



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Dictionary to store models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Support Vector Machine": SVC()
}

# Train each model and evaluate
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n🔹 {name} Accuracy: {acc:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



🔹 Logistic Regression Accuracy: 0.8618
Confusion Matrix:
 [[22 16]
 [ 1 84]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.58      0.72        38
           1       0.84      0.99      0.91        85

    accuracy                           0.86       123
   macro avg       0.90      0.78      0.81       123
weighted avg       0.88      0.86      0.85       123


🔹 Random Forest Accuracy: 0.8293
Confusion Matrix:
 [[24 14]
 [ 7 78]]
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.63      0.70        38
           1       0.85      0.92      0.88        85

    accuracy                           0.83       123
   macro avg       0.81      0.77      0.79       123
weighted avg       0.83      0.83      0.82       123


🔹 Decision Tree Accuracy: 0.7642
Confusion Matrix:
 [[23 15]
 [14 71]]
Classification Report:
               precision    recall  f1-score   suppor

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Smaller parameter grid for faster tuning
param_grid_quick = {
    'n_estimators': [50, 100],          # fewer options
    'max_depth': [None, 10],            # fewer options
    'min_samples_split': [2, 5],        # fewer options
    'min_samples_leaf': [1, 2],         # fewer options
    'bootstrap': [True]                  # just one option
}

# Initialize Random Forest
rf_quick = RandomForestClassifier(random_state=42)

# GridSearchCV with smaller grid
grid_search_quick = GridSearchCV(
    estimator=rf_quick,
    param_grid=param_grid_quick,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    verbose=1
)

# Fit to training data
grid_search_quick.fit(X_train, y_train)

# Best parameters
print("✅ Best Parameters (Quick):", grid_search_quick.best_params_)

# Evaluate best model
best_rf_quick = grid_search_quick.best_estimator_
y_pred = best_rf_quick.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("🔹 Tuned Random Forest Accuracy (Quick):", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 16 candidates, totalling 80 fits
✅ Best Parameters (Quick): {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
🔹 Tuned Random Forest Accuracy (Quick): 0.8536585365853658

Confusion Matrix:
 [[23 15]
 [ 3 82]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.61      0.72        38
           1       0.85      0.96      0.90        85

    accuracy                           0.85       123
   macro avg       0.86      0.78      0.81       123
weighted avg       0.86      0.85      0.84       123



In [10]:
import joblib

# 1️⃣ Save the trained model
joblib.dump(best_rf_quick, "loan_eligibility_model.pkl")
print("✅ Model saved as 'loan_eligibility_model.pkl'")

# 2️⃣ Load the model (for future use)
model = joblib.load("loan_eligibility_model.pkl")
print("✅ Model loaded successfully")

# 3️⃣ Predict a new applicant
# Example new applicant data (must match feature order in X_train)
# Features: Gender, Married, Dependents, Education, Self_Employed,
# ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term,
# Credit_History, Property_Area

new_applicant = [[1, 0, 0, 1, 0, 5000, 0, 128, 360, 1, 2]]  # sample values

prediction = model.predict(new_applicant)

if prediction[0] == 1:
    print("🔹 Loan Eligible ✅")
else:
    print("🔹 Loan Not Eligible ❌")


✅ Model saved as 'loan_eligibility_model.pkl'
✅ Model loaded successfully
🔹 Loan Eligible ✅




In [11]:
from google.colab import files
import pandas as pd

# Upload test dataset from local storage
uploaded = files.upload()  # Choose your test CSV file

# Load it into a DataFrame
test_df = pd.read_csv("loan-test.csv")  # replace with your file name if different
print(test_df.head())
print(test_df.columns.tolist())


Saving loan-test.csv to loan-test.csv
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001015   Male     Yes          0      Graduate            No   
1  LP001022   Male     Yes          1      Graduate            No   
2  LP001031   Male     Yes          2      Graduate            No   
3  LP001035   Male     Yes          2      Graduate            No   
4  LP001051   Male      No          0  Not Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5720                  0       110.0             360.0   
1             3076               1500       126.0             360.0   
2             5000               1800       208.0             360.0   
3             2340               2546       100.0             360.0   
4             3276                  0        78.0             360.0   

   Credit_History Property_Area  
0             1.0         Urban  
1             1.0         Urban  
2             1.0 

In [12]:
import pandas as pd

# -------------------------
# Load train data
# -------------------------
train_df = pd.read_csv("loan-train.csv")

# Fix Dependents
train_df['Dependents'] = train_df['Dependents'].replace('3+', 3).astype(float)

# Fix Credit_History
train_df['Credit_History'] = pd.to_numeric(train_df['Credit_History'], errors='coerce')

# Fill missing values
train_df['Gender'].fillna(train_df['Gender'].mode()[0], inplace=True)
train_df['Married'].fillna(train_df['Married'].mode()[0], inplace=True)
train_df['Dependents'].fillna(train_df['Dependents'].mode()[0], inplace=True)
train_df['Self_Employed'].fillna(train_df['Self_Employed'].mode()[0], inplace=True)
train_df['LoanAmount'].fillna(train_df['LoanAmount'].median(), inplace=True)
train_df['Loan_Amount_Term'].fillna(train_df['Loan_Amount_Term'].mode()[0], inplace=True)
train_df['Credit_History'].fillna(train_df['Credit_History'].mode()[0], inplace=True)

# -------------------------
# Encode categorical columns with explicit mapping
# -------------------------
def map_categories(train_col):
    """
    Returns a mapping dict for a categorical column in train data
    """
    unique_values = train_col.unique()
    mapping = {val:i for i, val in enumerate(unique_values)}
    return mapping

categorical_cols = ['Gender','Married','Education','Self_Employed','Property_Area']
mappings = {}
for col in categorical_cols:
    mappings[col] = map_categories(train_df[col])
    train_df[col] = train_df[col].map(mappings[col])

# Encode target
train_df['Loan_Status'] = train_df['Loan_Status'].map({'Y':1, 'N':0})

# -------------------------
# Load test data
# -------------------------
test_df = pd.read_csv("loan-test.csv")

# Drop Loan_ID
if 'Loan_ID' in test_df.columns:
    test_df.drop('Loan_ID', axis=1, inplace=True)

# Fix Dependents and Credit_History
test_df['Dependents'] = test_df['Dependents'].replace('3+', 3).astype(float)
test_df['Credit_History'] = pd.to_numeric(test_df['Credit_History'], errors='coerce')

# Fill missing values using train statistics
test_df['Gender'].fillna(train_df['Gender'].mode()[0], inplace=True)
test_df['Married'].fillna(train_df['Married'].mode()[0], inplace=True)
test_df['Dependents'].fillna(train_df['Dependents'].mode()[0], inplace=True)
test_df['Self_Employed'].fillna(train_df['Self_Employed'].mode()[0], inplace=True)
test_df['LoanAmount'].fillna(train_df['LoanAmount'].median(), inplace=True)
test_df['Loan_Amount_Term'].fillna(train_df['Loan_Amount_Term'].mode()[0], inplace=True)
test_df['Credit_History'].fillna(train_df['Credit_History'].mode()[0], inplace=True)

# -------------------------
# Encode test categorical columns using train mapping
# -------------------------
for col in categorical_cols:
    test_df[col] = test_df[col].map(mappings[col])

# If there are new/unseen values, fill them with a default (like 0)
for col in categorical_cols:
    test_df[col].fillna(0, inplace=True)

print("✅ Preprocessing completed successfully")
print(train_df.head())
print(test_df.head())


✅ Preprocessing completed successfully
    Loan_ID  Gender  Married  Dependents  Education  Self_Employed  \
0  LP001002       0        0         0.0          0              0   
1  LP001003       0        1         1.0          0              0   
2  LP001005       0        1         0.0          0              1   
3  LP001006       0        1         0.0          1              0   
4  LP001008       0        0         0.0          0              0   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0       128.0             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History  Property_Area  Loan_Status  
0             1.0              0            1  
1             1.0

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Gender'].fillna(train_df['Gender'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Married'].fillna(train_df['Married'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the in

In [13]:
# Features and target
X_train = train_df.drop('Loan_Status', axis=1)
y_train = train_df['Loan_Status']

# Test features
X_test = test_df.copy()


In [14]:
# Check all columns in X_train
print(X_train.dtypes)

# Drop any non-numeric columns just in case
X_train = X_train.select_dtypes(include=['int64','float64'])
X_test = X_test.select_dtypes(include=['int64','float64'])

print("✅ Columns used for training:")
print(X_train.columns)


Loan_ID               object
Gender                 int64
Married                int64
Dependents           float64
Education              int64
Self_Employed          int64
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
dtype: object
✅ Columns used for training:
Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')


In [15]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
print("✅ Model trained successfully")


✅ Model trained successfully


In [16]:
# Predict loan eligibility
test_predictions = rf_model.predict(X_test)

# Convert predictions to 'Y' and 'N'
test_predictions_labels = ['Y' if x==1 else 'N' for x in test_predictions]

# If you want to keep Loan_ID for submission
submission = pd.read_csv("loan-test.csv")  # original file with Loan_ID
submission['Loan_Status'] = test_predictions_labels


In [17]:
submission.to_csv("loan_predictions.csv", index=False)
print("✅ Predictions saved as 'loan_predictions.csv'")


✅ Predictions saved as 'loan_predictions.csv'
