# Task 2: Credit Risk Prediction

## Introduction and Problem Statement
The goal of this project is to predict whether a loan application will be approved, based on applicant details such as income, credit history, education, and other features. This type of model can be useful for financial institutions in assessing credit risk.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split  # Split data into training and testing sets
from sklearn.preprocessing import LabelEncoder, StandardScaler  # Encode categorical text values to numbers
from sklearn.linear_model import LogisticRegression  # Initialize logistic regression model
from sklearn.metrics import (accuracy_score, confusion_matrix, ConfusionMatrixDisplay,  # Calculate model accuracy
                             classification_report)  # Detailed classification performance metrics

# Read the loan application records from the CSV file in the working
# directory, report the table dimensions, and preview the first rows.
df = pd.read_csv(filepath_or_buffer='loan_prediction.csv')
print("Dataset shape:", df.shape)
df.head(n=5)

In [None]:
# Structural overview of the raw data: column dtypes and non-null counts,
# summary statistics, and the number of missing values per column.
df.info()  # info() writes its report to stdout itself

# describe() only *returns* a DataFrame. It is not the last expression of the
# cell, so it must be printed explicitly or the statistics are never shown.
print(df.describe())

print("\nMissing values:\n", df.isnull().sum())

## Data Cleaning and Preparation

In [None]:
# Impute missing feature values, then drop rows that have no target label.
#
# The filled column is assigned back to df rather than calling
# fillna(inplace=True) on df[col]. The chained in-place form operates on an
# intermediate object: pandas 2.x emits a FutureWarning for it, and with
# copy-on-write enabled it leaves df unchanged.

# Columns treated as categorical: fill gaps with the most frequent value.
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History', 'Loan_Amount_Term']:
    df[col] = df[col].fillna(df[col].mode()[0])

# Continuous column: fill gaps with the median.
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

# Rows without a target value cannot be used for supervised training.
df.dropna(subset=['Loan_Status'], inplace=True)
print("\nMissing values after cleaning:\n", df.isnull().sum())

## Exploratory Data Analysis (EDA)

In [None]:
# Exploratory plots: two numeric distributions, one categorical comparison
# against the target, and a correlation heatmap.

# Loan amount distribution (histogram with a KDE overlay)
sns.histplot(df['LoanAmount'], kde=True)
plt.title('Loan Amount Distribution')
plt.show()

# Education vs Loan Status (counts per education level, split by outcome)
sns.countplot(x='Education', hue='Loan_Status', data=df)
plt.title('Education vs Loan Status')
plt.show()

# Applicant income distribution (histogram with a KDE overlay)
sns.histplot(df['ApplicantIncome'], kde=True)
plt.title('Applicant Income Distribution')
plt.show()

# Correlation heatmap.
# At this point df still contains string columns (they are label-encoded only
# in a later step). Calling corr() on such a frame raises on pandas >= 2.0,
# so the correlation is computed on the numeric columns only.
numeric_corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Feature Correlation')
plt.show()

## Data Encoding and Feature Scaling

In [None]:
# Turn every string column into integer codes. The single encoder is re-fitted
# for each column, so after the loop `le` holds the mapping of the last
# object column only.
le = LabelEncoder()
for col in df.select_dtypes(include='object').columns:
    df[col] = le.fit_transform(df[col])

# Features: everything except the row identifier and the target.
X = df.drop(['Loan_ID', 'Loan_Status'], axis=1)
y = df['Loan_Status']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/variance used to transform the training data
# (data leakage) and make the test score optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix scaled with the training statistics; retained so the
# name `X_scaled` stays available to any code that refers to it.
X_scaled = scaler.transform(X)

## Model Training

In [None]:
# Fit a logistic-regression classifier on the training split. max_iter=2000
# gives the solver a generous iteration budget to converge.
model = LogisticRegression(
    max_iter=2000,
)
model.fit(X=X_train, y=y_train)

## Model Evaluation

In [None]:
# Score the fitted model on the held-out test split.
y_pred = model.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Model Accuracy:", accuracy)

# Confusion matrix: counts of true labels against predicted labels,
# rendered as a blue heatmap.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

# Per-class precision, recall, F1 score and support.
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

## Conclusion
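- The logistic regression model achieved reasonable accuracy on the held-out 20% test set (see the accuracy score, confusion matrix, and classification report above).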
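- Education, Applicant Income, and Credit History appeared to be related to loan approval in the exploratory analysis; because the correlation heatmap is drawn before the loan status is encoded, this should be confirmed against the fitted model (for example, by inspecting its coefficients).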
- Further improvements could include trying other classification models, feature engineering, and hyperparameter tuning.