**Milestone 1: Feature Engineering & Selection**

This script covers:
1. Data Exploration & Analysis (EDA)
2. Feature Engineering
3. Feature Selection

In [None]:
# Import the libraries
import pandas as pd #Used for data manipulation and analysis, especially for working with tabular data (DataFrames and Series).
import numpy as np #Provides support for numerical operations, including arrays, matrices, and mathematical functions.
import matplotlib.pyplot as plt #Used for creating visualizations, such as line plots, bar charts, scatter plots, etc.
import seaborn as sns #It provides attractive statistical graphics like heatmaps, violin plots, and boxplots.
from sklearn.preprocessing import LabelEncoder, StandardScaler #mports two tools from Scikit-learn's preprocessing module
#LabelEncoder: Converts categorical string labels into numeric form.
#StandardScaler: Standardizes features by removing the mean and scaling to unit variance.
from sklearn.feature_selection import SelectKBest, f_classif #Imports tools for feature selection
from sklearn.decomposition import PCA #Imports Principal Component Analysis (PCA) from Scikit-learn.
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the training and test CSV files into DataFrames.
# NOTE(review): these are absolute paths on one machine; the notebook will not
# run elsewhere until they are pointed at a local copy of the data.
TRAIN_CSV = "C:/Users/mamat/Downloads/train (1).csv"
TEST_CSV = "C:/Users/mamat/Downloads/test.csv"
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [None]:
# Exploratory Data Analysis
# DataFrame.info() writes its summary straight to stdout and returns None, so it
# is called bare; wrapping it in print() would emit an extra "None" line.
train_df.info()  # column dtypes and non-null counts
print(train_df.describe(include='all'))  # summary statistics for every column
print(train_df.isnull().sum())  # number of missing values per column
# The model is trained on train.csv and then tested or validated on test.csv.
# If the test data has missing values or is not cleaned the same way as the
# training data, the model may give incorrect or poor results.
sns.countplot(x='Loan_Status', data=train_df)  # how many rows fall in each Loan_Status class
plt.show()

In [None]:
# Draw one count plot per categorical feature, with bars split by Loan_Status,
# to see how the target is distributed within each category.
categorical_features = ['Gender', 'Married', 'Education', 'Self_Employed']
for feature in categorical_features:
    plt.figure(figsize=(10, 6))  # a fresh, fixed-size figure for every feature
    sns.countplot(data=train_df, x=feature, hue='Loan_Status')
    plt.title(f'Loan Status by {feature}')
    plt.show()  # render this figure before moving on to the next feature

In [None]:
# Handle missing values
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on train_df[col]. That form is chained assignment through
# an inplace method: it operates on an intermediate object, triggers a
# FutureWarning on recent pandas 2.x, and leaves train_df unchanged once
# Copy-on-Write is active (the default from pandas 3.0).
mode_filled_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
median_filled_cols = ['LoanAmount', 'Loan_Amount_Term']

# Category-like columns: fill gaps with the most frequent value
for column in mode_filled_cols:
    train_df[column] = train_df[column].fillna(train_df[column].mode()[0])

# Numeric columns: fill gaps with the median
for column in median_filled_cols:
    train_df[column] = train_df[column].fillna(train_df[column].median())

In [None]:
# Feature Engineering
# Shared inputs for the derived columns below
total_income = train_df['ApplicantIncome'] + train_df['CoapplicantIncome']
loan_amount = train_df['LoanAmount']

# Applicant plus co-applicant income
train_df['TotalIncome'] = total_income
# Income per unit of loan amount (intended as an affordability indicator)
train_df['IncomeToLoanRatio'] = total_income / loan_amount
# log(LoanAmount + 1), intended to reduce skew in the loan amount
train_df['LoanAmount_log'] = np.log(loan_amount + 1)
# Loan amount per unit of loan term (intended as a repayment-intensity indicator)
train_df['LoanAmountTermRatio'] = loan_amount / train_df['Loan_Amount_Term']

In [None]:
# Encode categorical features
# LabelEncoder replaces each distinct value of a column with an integer code
# (codes follow the sorted order of the values seen during fit).
label_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Loan_Status']

# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard every mapping except the last one, making it impossible to decode the
# other columns or to apply the identical encoding to test_df later.
label_encoders = {}
for col in label_cols:
    encoder = LabelEncoder()
    train_df[col] = encoder.fit_transform(train_df[col])
    label_encoders[col] = encoder

# Backward compatibility: `le` remains available and, as before, ends up fitted
# on the last column of label_cols (Loan_Status).
le = label_encoders[label_cols[-1]]

# Replace '3+' in Dependents with 3 and convert the column to integer
train_df['Dependents'] = train_df['Dependents'].replace('3+', 3).astype(int)


In [None]:
# Feature Selection using SelectKBest
#
# Select the most important features using statistical methods.
#
# Why we do this:
# - Reduce dimensionality to improve model performance
# - Remove irrelevant or redundant features
# - Speed up model training
# - Potentially improve model interpretability

# Prepare features (X) and target (y) for modeling.
# Loan_ID is dropped as an identifier, Loan_Status is the target, and the raw
# income / loan-amount columns are dropped in favour of the engineered features.
dropped_columns = ['Loan_ID', 'Loan_Status', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
X = train_df.drop(columns=dropped_columns)

# The F-test below needs purely numeric input. One-hot encode any column that is
# still text/categorical at this point; an all-numeric frame passes through
# get_dummies unchanged.
# NOTE(review): the CSV schema is not visible here. If a text column that is not
# listed in label_cols (for example a property/area field) is present, fitting
# on it without this step would raise a "could not convert string to float"
# error - confirm against the actual columns of train.csv.
X = pd.get_dummies(X, dtype=int)
y = train_df['Loan_Status']

# Score every feature with the ANOVA F-test; k='all' keeps all features so the
# selector is used here only to compute scores, not to discard columns.
selector = SelectKBest(score_func=f_classif, k='all')
selector.fit(X, y)

# Show the F-scores from highest to lowest
feature_scores = pd.Series(selector.scores_, index=X.columns).sort_values(ascending=False)
print(feature_scores)

In [None]:
# Feature Importance using Random Forest
# Fix the seed: a random forest is stochastic, so without random_state the
# importances (and possibly their ranking) change on every run of the notebook.
RANDOM_STATE = 42
model = RandomForestClassifier(random_state=RANDOM_STATE)  # other parameters left at defaults
model.fit(X, y)  # train on all features

# One importance value per feature, plotted smallest to largest as horizontal bars
importances = pd.Series(model.feature_importances_, index=X.columns)
importances.sort_values().plot(kind='barh')
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.show()