In [6]:
# 📦 Import Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 📥 Load Dataset
df = pd.read_csv("train.csv")
print("Original Shape:", df.shape)

# 🔍 Handle Missing Values
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

# 🔠 Label Encoding
cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status', 'Dependents']
le = LabelEncoder()
for col in cols:
    df[col] = le.fit_transform(df[col].astype(str))

# ✅ Define Features and Target
X = df.drop(['Loan_ID', 'Loan_Status'], axis=1)
y = df['Loan_Status']

# ⚖️ Stratified Split to Preserve Class Balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# 🧠 Train Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)

# 🌳 Train Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)

# 🏁 Select Best Model
if acc_dt > acc_lr:
    print("✅ Using Decision Tree Classifier")
    best_model = dt
    y_pred = y_pred_dt
    acc = acc_dt
else:
    print("✅ Using Logistic Regression")
    best_model = lr
    y_pred = y_pred_lr
    acc = acc_lr

# 📈 Evaluation
print(f"\n🔢 Accuracy: {acc:.2f}")
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=[0,1]))
print("\n📄 Classification Report:\n", classification_report(y_test, y_pred))


Original Shape: (5, 13)


ValueError: The test_size = 1 should be greater or equal to the number of classes = 2

# 🧠 Task 2: Credit Risk Prediction

## 🎯 Objective
Predict whether a loan applicant is likely to default on a loan using classification techniques on the Loan Prediction dataset.

---

## 📥 Step 1: Load and Inspect the Dataset

We will now load the `train.csv` file and check the structure and first few records of the dataset.

---

## 🧹 Step 2: Data Cleaning

Let's check and fill in missing values using appropriate strategies:
- Mode for categorical features
- Median for numeric features

---

## 🔤 Step 3: Label Encoding

We convert categorical variables like `Gender`, `Education`, etc. into numeric values using LabelEncoder.

---

## 📊 Step 4: Exploratory Data Analysis (EDA)

We visualize key features to understand trends in the data.

---

## 🤖 Step 5: Model Training

We split the dataset and train a Logistic Regression model.

---

## 📈 Step 6: Model Evaluation

We evaluate using accuracy, confusion matrix, and classification report.

---

## ✅ Conclusion

- Logistic Regression is able to classify loan approvals fairly well.
- Feature trends such as income and education play an important role.
- Model can be further improved with tuning and additional feature engineering.
