<a href="https://colab.research.google.com/github/LahariSivalasetty/newone/blob/Loan_prediction/Loan_Prediciton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# prompt: need code for loan prediction that builds its data set from libraries (no explicit dataset file), using predictive analysis for house loan prediction

# Install necessary libraries if not already installed
!pip install scikit-learn pandas numpy

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --- Generate Synthetic Data ---
# In a real scenario, you would load a dataset here.
# This synthetic data simulates a loan application dataset, so the notebook
# needs no external file.

np.random.seed(42) # for reproducibility of every np.random draw below

n_samples = 1000

# Features that might influence loan approval.
# The np.random calls run in dict order, and that order determines the values
# generated under the fixed seed -- reordering entries changes the dataset.
data = {
    'Gender': np.random.choice(['Male', 'Female'], n_samples),
    'Married': np.random.choice(['Yes', 'No'], n_samples),
    'Dependents': np.random.choice(['0', '1', '2', '3+'], n_samples),
    'Education': np.random.choice(['Graduate', 'Not Graduate'], n_samples),
    'Self_Employed': np.random.choice(['Yes', 'No'], n_samples),
    'ApplicantIncome': np.random.randint(1500, 25000, n_samples),
    'CoapplicantIncome': np.random.randint(0, 10000, n_samples),
    # Cast to float up front: NaNs are written into this column below, and
    # putting NaN into an int64 column relies on silent upcasting, which
    # pandas has deprecated. The drawn values themselves are unchanged.
    'LoanAmount': np.random.randint(50, 700, n_samples).astype(float),
    'Loan_Amount_Term': np.random.choice([12.0, 36.0, 60.0, 120.0, 180.0, 240.0, 300.0, 360.0, 480.0], n_samples),
    'Credit_History': np.random.choice([0.0, 1.0], n_samples),
    'Property_Area': np.random.choice(['Urban', 'Semiurban', 'Rural'], n_samples),
}

df = pd.DataFrame(data)

# Introduce some missing values (optional, but common in real datasets):
# blank out 5% of the rows, chosen independently for each column.
n_missing = int(n_samples * 0.05)
for col in ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']:
    missing_indices = np.random.choice(df.index, size=n_missing, replace=False)
    df.loc[missing_indices, col] = np.nan

# Create the target variable (Loan_Status) based on some simple rules.
# This is a simplified rule; a real model would learn these patterns.
# Comparisons against NaN evaluate to False, so a row missing any value a rule
# reads is never approved by that rule.
df['Loan_Status'] = 'N' # Default to Not Approved

total_income = df['ApplicantIncome'] + df['CoapplicantIncome']
long_enough_term = df['Loan_Amount_Term'] >= 120

# Rule 1: Good credit history and sufficient income
df.loc[(df['Credit_History'] == 1) &
       (total_income > df['LoanAmount'] * 10) &
       long_enough_term, 'Loan_Status'] = 'Y'

# Rule 2: Even without perfect credit, high income and low loan amount might get approved
df.loc[(df['Credit_History'] == 0) &
       (total_income > df['LoanAmount'] * 20) &
       long_enough_term, 'Loan_Status'] = 'Y'

# Make some of the "Y" cases "N" randomly to introduce noise.
# NOTE(review): sample() gets no random_state here; it appears to draw from
# NumPy's global RNG (seeded above) -- pass random_state explicitly if strict
# reproducibility matters.
not_approved_indices = df[df['Loan_Status'] == 'Y'].sample(frac=0.1).index
df.loc[not_approved_indices, 'Loan_Status'] = 'N'


print("Synthetic Dataset created:")
print(df.head())
print("\nDataset Info:")
df.info()
print("\nMissing values:")
print(df.isnull().sum())

# --- Data Preprocessing ---

# Handle missing values (simple imputation for demonstration).
# The filled Series is assigned back to the frame rather than calling
# `df[col].fillna(..., inplace=True)`: that form acts on an intermediate
# object, raises a pandas FutureWarning, and no longer updates `df` once
# pandas 3.0 semantics apply.
# NOTE(review): Credit_History is a 0/1 flag, so mean imputation leaves a
# fractional value in the imputed rows -- confirm that is acceptable.
for col in ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']:
    if not df[col].isnull().any():
        continue  # nothing to impute in this column
    if df[col].dtype == 'object':
        fill_value = df[col].mode()[0] # Mode for categorical
    else:
        fill_value = df[col].mean() # Mean for numerical
    df[col] = df[col].fillna(fill_value)

# Convert categorical features to numerical using one-hot encoding.
# drop_first=True drops the first level of each feature, so that level is
# represented by all of the feature's remaining dummy columns being 0.
df = pd.get_dummies(df, columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'], drop_first=True)

# Map target variable to numerical
df['Loan_Status'] = df['Loan_Status'].map({'Y': 1, 'N': 0})

print("\nDataset after Preprocessing:")
print(df.head())
print("\nMissing values after preprocessing:")
print(df.isnull().sum()) # Should be 0

# --- Model Training ---

# Separate the predictors (X) from the label column (y)
X = df.drop(columns='Loan_Status')
y = df['Loan_Status']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a Logistic Regression classifier and fit it on the training rows
model = LogisticRegression(solver='liblinear') # liblinear is often good for small datasets
model.fit(X_train, y_train)

# --- Model Evaluation ---

# Score the held-out rows against their true labels
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report accuracy first, then each remaining metric under its own heading
print("\nModel Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
for heading, metric in (
    ("\nConfusion Matrix:", conf_matrix),
    ("\nClassification Report:", class_report),
):
    print(heading)
    print(metric)

# --- Example Prediction on New Data ---
# One hand-built applicant, already expressed in the one-hot layout used for
# training. For each categorical feature, setting all of its dummy columns to 0
# selects the level that has no dummy column of its own.
sample_applicant_data = {
    'ApplicantIncome': [6000],
    'CoapplicantIncome': [2000],
    'LoanAmount': [150],
    'Loan_Amount_Term': [360],
    'Credit_History': [1.0],
    'Gender_Male': [1],              # Male
    'Married_Yes': [1],              # Married
    'Dependents_1': [0],             # all Dependents_* at 0 -> the '0' dependents level
    'Dependents_2': [0],
    'Dependents_3+': [0],
    'Education_Not Graduate': [0],   # Graduate
    'Self_Employed_Yes': [0],        # Not self-employed
    'Property_Area_Semiurban': [1],  # Semiurban
    'Property_Area_Urban': [0],
}

# Align the sample with the training feature matrix in one step: columns come
# out in X_train's order, and any training column absent from the sample is
# created and filled with 0.
sample_df = pd.DataFrame(sample_applicant_data).reindex(
    columns=X_train.columns, fill_value=0
)

# Predict loan status for the sample applicant and report it
prediction = model.predict(sample_df)

print(
    "\nPrediction for sample applicant: Loan Approved (1)"
    if prediction[0] == 1
    else "\nPrediction for sample applicant: Loan Not Approved (0)"
)


Synthetic Dataset created:
   Gender Married Dependents     Education Self_Employed  ApplicantIncome  \
0    Male      No          2  Not Graduate            No            19854   
1  Female     Yes         3+  Not Graduate           Yes            23690   
2    Male     Yes          1  Not Graduate           Yes             7642   
3    Male     Yes         3+  Not Graduate           Yes            20703   
4    Male     Yes         3+      Graduate            No            10594   

   CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
0               1700       698.0             300.0             0.0   
1               2700       667.0              60.0             1.0   
2               6815       656.0             480.0             NaN   
3               1647       184.0              36.0             1.0   
4               6038       629.0              60.0             0.0   

  Property_Area Loan_Status  
0         Urban           Y  
1     Semiurban           N  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True) # Mean for numerical
