## PREPROCESSING AND DATA ANALYSIS

**Import the Required Libraries**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

ModuleNotFoundError: No module named 'plotly'

**Load the Train and Test Datasets and store them as dataframes**

In [None]:
# Read the raw CSV files: `df` holds the training set, `dt` the testing set.
df = pd.read_csv("training_set.csv")
dt = pd.read_csv("testing_set.csv")

**Analysing the Train Dataset**

In [None]:
df.head()

In [None]:
df.info()

**Analysing the Test Dataset**

In [None]:
dt.head()

In [None]:
dt.info()

## Data Analysis

**Checking the null values in train dataset**

In [None]:
df.isna().sum().sort_values(ascending = False)

**Checking the null values in test dataset**

In [None]:
dt.isna().sum().sort_values(ascending = False)

**Visualising the missing values in train dataset**

In [None]:
# Heatmap of the null mask of the train frame: one row per record, one column per feature.
fig = plt.figure(figsize=(18, 6))
sns.heatmap(df.isna(), cbar=True)

**Visualising the missing values in test dataset**

In [None]:
# Heatmap of the null mask of the test frame: one row per record, one column per feature.
fig = plt.figure(figsize=(18, 6))
sns.heatmap(dt.isna(), cbar=True)

**Dropping the column 'Loan ID' in both the datasets**

In [None]:
# 'Loan_ID' is removed from both frames.
df = df.drop('Loan_ID', axis=1)
dt = dt.drop('Loan_ID', axis=1)

**Dividing the columns into categorical and numeric**

In [None]:
# Train columns with at most 4 distinct non-null values are treated as categorical
# (the resulting list includes the column Loan_Status).
unique_counts = df.nunique()
categorical_data = [name for name, n_unique in unique_counts.items() if n_unique <= 4]
categorical_data

In [None]:
# Categorical data for test data (excludes the column Loan_Status)
# This list is written out by hand rather than derived from dt.
# 'Credit_History' is not part of it; the nulls of that column in the test frame
# are filled separately in the preprocessing section.
categorical_data_dt=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed','property_Area']
categorical_data_dt

In [None]:
# Numerical data: every train column that was not classified as categorical,
# kept in the original column order.
numerical_data = list(filter(lambda name: name not in categorical_data, df.columns))
numerical_data

**Understanding average values for all the numerical columns for each loan status category in the train dataset**

In [None]:
# Mean ApplicantIncome per Loan_Status value, returned as a two-column frame.
app_inc = df.groupby(['Loan_Status'], as_index=False)['ApplicantIncome'].mean()
app_inc

In [None]:
# Mean CoapplicantIncome per Loan_Status value, returned as a two-column frame.
coapp_inc = df.groupby(['Loan_Status'], as_index=False)['CoapplicantIncome'].mean()
coapp_inc

In [None]:
# Mean LoanAmount per Loan_Status value, returned as a two-column frame.
loan_amt = df.groupby(['Loan_Status'], as_index=False)['LoanAmount'].mean()
loan_amt

In [None]:
# Mean Loan_Amount_Term per Loan_Status value, returned as a two-column frame.
loan_amt_term = df.groupby(['Loan_Status'], as_index=False)['Loan_Amount_Term'].mean()
loan_amt_term

**Distribution of Numerical Features in the Train Dataset**

In [None]:
# One 20-bin histogram per numerical column, drawn on a shared figure.
hist_options = dict(figsize=(15, 12), bins=20, edgecolor='black')
df[numerical_data].hist(**hist_options)
plt.suptitle('Distribution of Numerical Features in the Dataset', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Density histograms with a KDE overlay for every numerical column.
# `sns.distplot` is deprecated in seaborn; `histplot(..., kde=True, stat="density")`
# is the documented replacement for the same histogram + KDE view.
n_panel_cols = 2
# Ceiling division so every column gets a panel; at least one row keeps subplots() valid.
n_panel_rows = max(1, -(-len(numerical_data) // n_panel_cols))
fig, axes = plt.subplots(n_panel_rows, n_panel_cols, figsize=(15, 10), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, numerical_data):
    sns.histplot(df[col], kde=True, stat="density", ax=ax)
    ax.set_title(col.title())

# Hide any panel that has no column to show (odd number of columns).
for ax in axes[len(numerical_data):]:
    ax.set_visible(False)

# Lay the figure out once, after all panels are drawn.
fig.tight_layout()
plt.show()


**Interpretation**

* ApplicantIncome - Highly right-skewed, most applicants have income below ₹10,000. A few applicants have extremely high incomes, which may be outliers.
* CoapplicantIncome - Right-skewed, many co-applicants have zero or very low income. This may imply they are not earning or not included in income consideration.
* LoanAmount - Slight right skew, but more balanced. Majority of loans are between ₹100,000–₹200,000. There are some large loan amounts that stretch the distribution.
* Loan_Amount_Term - Discrete distributions showing limited choices. Most loans are for 360 months (30 years). There are fewer shorter terms like 120, 180, or 240 months.

**Distribution of Categorical Features in the Train Dataset**

In [None]:
# Bar charts of the value counts of every categorical column in the train dataset.
# Defining the colours to use
colors = ['blue', 'orange', 'green', 'red']

# Columns to plot; panels are filled row by row in this order.
count_plot_columns = [
    'Loan_Status', 'Gender',
    'Married', 'Self_Employed',
    'Credit_History', 'Education',
    'Dependents', 'property_Area',
]

# Creating a figure with 8 subplots (4 rows, 2 columns)
fig, axs = plt.subplots(4, 2, figsize=(14, 20))

for ax, col in zip(axs.flat, count_plot_columns):
    df[col].value_counts().plot.bar(ax=ax, color=colors)
    ax.set_title(col)
    # Bar heights are record counts, so they are labelled as whole numbers
    # (the same format on every panel).
    for p in ax.patches:
        ax.annotate(f"{int(p.get_height())}",
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='bottom')

# Displaying the plots
plt.tight_layout()
plt.show()


**Interpretation**

* Loan_Status - Has 422 entries with loan approved as 'Y' and 192 entries as 'N'
* Gender - Has 487 entries with gender as 'Male' and 112 entries as 'Female'
* Married - Has 398 entries with married as 'Yes' and 213 entries as 'No'
* Self_Employed - Has 500 entries with self employed as 'No' and 82 entries as 'Yes'
* Credit_History - Has 475 entries with credit history as '1.0' and 89 entries as '0.0'
* Education - Has 479 entries with education as 'Graduate' and 134 entries as 'Not Graduate'
* Dependents - Has 345 entries with dependents as '0', 102 entries as '1', 101 entries as '2' and 51 entries as '3+'
* property_Area - Has 233 entries with property area as 'Semiurban', 202 entries as 'Urban' and 179 entries as 'Rural'


**Bivariate analysis between different categorical features and Loan_Status in the train dataset**

Each graph shows the breakdown of Loan_Status for each of the categorical variables, helping to identify trends or patterns based on the categorical features. The bars represent the count of applicants in each category, and the numbers on top of the bars show the exact counts.

In [None]:
# One grouped (side-by-side) bar chart of Loan_Status counts per categorical feature.
# The target column itself is left out of the iteration.
for col in (name for name in categorical_data if name != 'Loan_Status'):
    cross_tab = pd.crosstab(df[col], df['Loan_Status'])

    ax = cross_tab.plot(kind='bar', stacked=False, figsize=(7, 5), color=['orange', 'blue'])

    plt.title(f'Loan Status by {col}')
    plt.ylabel('Count')
    # Leave headroom above the tallest bar for its value label.
    plt.ylim(0, cross_tab.values.max() + 20)

    # Write the count just above every non-empty bar.
    for bar in ax.patches:
        bar_height = bar.get_height()
        if not bar_height > 0:
            continue
        label_x = bar.get_x() + bar.get_width() / 2.
        ax.text(label_x, bar_height + 1, f'{int(bar_height)}',
                ha='center', va='bottom', fontsize=9)

    plt.legend(title='Loan Status')
    plt.tight_layout()
    plt.show()

**Converting Categorical Data to Numeric Codes**

In [None]:
# Encoding categorical data in train dataset
for col in categorical_data:
    df[col] = df[col].astype('category').cat.codes

# `cat.codes` marks missing values with -1; turn those markers back into NaN.
# The replacement is limited to the columns that were just encoded, so a genuine
# -1 in any other column is left untouched.
df[categorical_data] = df[categorical_data].replace(-1, np.nan)

In [None]:
# Encoding categorical data in test dataset
for col in categorical_data_dt:
    dt[col] = dt[col].astype('category').cat.codes

# `cat.codes` marks missing values with -1; turn those markers back into NaN.
# The replacement is limited to the columns that were just encoded, so a genuine
# -1 in any other column is left untouched.
dt[categorical_data_dt] = dt[categorical_data_dt].replace(-1, np.nan)

**Bivariate analysis between different numerical features and Loan_Status in the train dataset**

The regression plots display how each numerical feature correlates with Loan_Status. Since Loan_Status is binary (1 for loan approval and 0 for loan denial), the graphs reveal loan approval probability trends (increase or decrease) and the presence of outliers.

In [None]:
# Regression plot of Loan_Status against each numerical feature,
# placed left to right on a 2 x 4 subplot grid.
plt.figure(figsize=(12, 5))
plt.suptitle('Numerical Features Data Analysis with Loan_Status',
             fontsize=21, fontweight='bold', y=1.03)

for position, col in enumerate(numerical_data, start=1):
    panel = plt.subplot(2, 4, position)
    sns.regplot(x=df[col], y='Loan_Status', data=df, ax=panel)
    panel.set_xlabel(col)
    plt.tight_layout()

**Pairplots between all the Numerical Features in the train dataset**

In [None]:
# PairPlot
# Pairwise scatter plots of the numerical columns; Loan_Status is added to the
# selection only so it can be used as the colour (hue) of the points.
sns.pairplot(df[numerical_data + ['Loan_Status']], hue='Loan_Status', palette='husl')
plt.show()

**Correlation between all the features**

In [None]:
# Pairwise correlation of all train columns; the heatmap shows absolute values,
# so only the strength of each relationship (not its sign) is visible.
corr = df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr.abs(), annot=True, cmap='flare', ax=ax)

**Correlation of all the features with Loan_Status**

In [None]:
# Correlation of all features with 'Loan_Status' and sort them in descending order
# (signed values taken from `corr`, so negative correlations end up at the bottom)
loan_status_corr = corr['Loan_Status'].sort_values(ascending=False)
loan_status_corr

## Data Preprocessing

**Filling null values in categorical columns with most frequent observation**

In [None]:
# Filling the null values of categorical columns in train dataset with the most
# frequent value of each column.
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)`
# works on an intermediate object, which pandas warns about and which leaves the
# frame unchanged under Copy-on-Write.
for col in categorical_data:
    df[col] = df[col].fillna(df[col].mode()[0])

# Filling the null values in test dataset (each column's own most frequent value)
for col in categorical_data_dt:
    dt[col] = dt[col].fillna(dt[col].mode()[0])

**Filling null values in numerical columns with the median value**

In [None]:
# Filling the null values of numerical columns in train dataset with the column median.
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)`
# works on an intermediate object, which pandas warns about and which leaves the
# frame unchanged under Copy-on-Write.
for col in numerical_data:
    df[col] = df[col].fillna(df[col].median())

# Filling the null values of numerical columns in test dataset (test column median)
for col in numerical_data:
    dt[col] = dt[col].fillna(dt[col].median())

# Credit_History is neither in categorical_data_dt nor in numerical_data, so the
# test frame needs this extra step; it is filled with the most frequent
# observation (a median is not meaningful for this flag column).
dt['Credit_History'] = dt['Credit_History'].fillna(dt['Credit_History'].mode()[0])

**Checking for null values in train data**

In [None]:
df.isna().sum()

**Checking for null values in test data**

In [None]:
dt.isna().sum()

**Checking for outliers in train data using boxplots**

In [None]:
# Count, per float64/int64 column of the train frame, the values lying more than
# 1.5 * IQR outside the quartiles, then show a combined boxplot of the frame.
outlier_counts = {}

for col in df.select_dtypes(include=['float64', 'int64']).columns:
    column_values = df[col]
    Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    below = column_values < lower_bound
    above = column_values > upper_bound
    outlier_counts[col] = (below | above).sum()

# Plot the boxplot
plt.figure(figsize=(18, 10))
sns.boxplot(data=df)
plt.title('Boxplot of Numerical Features')
plt.show()

# Print the number of outliers for each column
print("Number of outliers for each column:")
for col, count in outlier_counts.items():
    print(f"{col}: {count} outliers")


**Individual Boxplots for numerical data**

In [None]:
# One standalone boxplot figure per numerical column of the train frame.
for col in numerical_data:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(y=df[col], color='skyblue', ax=ax)
    ax.set_title(f'Boxplot for {col}', fontsize=14, fontweight='bold')
    ax.set_ylabel(col)
    ax.grid(True, linestyle='--', alpha=0.6)
    fig.tight_layout()
    plt.show()


**Dealing with outliers in numerical data by capping**:  Replacing values lower than the 10th percentile with the 10th percentile value and higher than the 90th percentile with the 90th percentile value, effectively removing extreme values.

In [None]:
def cap_outliers(df, col, lower_quantile=0.1, upper_quantile=0.9):
    """Clamp the values of one column to the range between two of its quantiles.

    Values below the `lower_quantile` quantile are raised to that quantile and
    values above the `upper_quantile` quantile are lowered to it. The column is
    overwritten on the frame that was passed in, and that same frame is returned.

    Parameters:
        df: DataFrame holding the column.
        col: name of the column to clamp.
        lower_quantile: quantile (0..1) used as the lower limit.
        upper_quantile: quantile (0..1) used as the upper limit.

    Returns:
        The input DataFrame with `col` clamped.
    """
    column = df[col]
    floor_value = column.quantile(lower_quantile)
    ceiling_value = column.quantile(upper_quantile)

    # Raise to the floor first, then lower to the ceiling (NaN stays NaN).
    raised = np.maximum(column, floor_value)
    df[col] = np.minimum(raised, ceiling_value)
    return df
# Cap every numerical column of the train frame in turn, using the default
# 10th / 90th percentile limits of cap_outliers.
# NOTE(review): only df is passed through cap_outliers here; dt is not capped in
# this cell — confirm that leaving the test frame uncapped is intended.
for col in numerical_data:
     df = cap_outliers(df, col)

**Outliers after capping for numerical data in train data**

In [None]:
# Function to calculate the number of outliers for each numerical column
def count_outliers(df, col):
    """Return how many values of `df[col]` fall outside the 1.5 * IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, where Q1 and Q3 are the
    25th and 75th percentiles of the column. Values equal to a fence are not
    counted.
    """
    values = df[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile  # interquartile range

    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    is_outlier = (values < low_fence) | (values > high_fence)
    return int(is_outlier.sum())

# Report the IQR-based outlier count of every numerical column
print("Number of outliers for each numerical column:")
for col in numerical_data:
    print(f"{col}: {count_outliers(df, col)} outliers")


Outliers are still present in Loan_Amount_Term because its values are not evenly distributed: most entries take one of a few distinct values, chiefly 360, followed by 240, 180, etc.

**Boxplots after capping the outliers of numerical data in train dataset**

In [None]:
# One standalone boxplot figure per numerical column of the train frame.
for col in numerical_data:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(y=df[col], color='skyblue', ax=ax)
    ax.set_title(f'Boxplot for {col}', fontsize=14, fontweight='bold')
    ax.set_ylabel(col)
    ax.grid(True, linestyle='--', alpha=0.6)
    fig.tight_layout()
    plt.show()

**Handling rare categories so as to avoid issues caused by too many unique values with low frequencies**

In [None]:
# Define a threshold for category frequency (e.g., 5% of the data)
threshold = 0.05
for col in categorical_data:
    # The target must keep its classes: merging a rare Loan_Status value into
    # 'Other' would change the labels the models are trained on.
    if col == 'Loan_Status':
        continue

    category_counts = df[col].value_counts(normalize=True)  # Relative frequency of each value
    rare_categories = category_counts[category_counts < threshold].index  # Values below the threshold

    # Replace rare categories with 'Other'
    # NOTE(review): these columns already hold numeric codes at this point, so a
    # replacement here would mix the string 'Other' into a numeric column —
    # confirm whether a numeric marker should be used instead.
    df[col] = df[col].replace(rare_categories, 'Other')

**Analysing the dataframes**

In [None]:
df.head()

Here, all the categorical columns have been encoded. The column 'property_Area' has values 0,1 and 2. One hot encoding is performed on this to form columns - Urban, Semiurban and Rural

## Feature Engineering

**One hot encoding on train dataset**

In [None]:
df['property_Area_str'] = df['property_Area'].map({0: 'Urban', 1: 'Rural', 2: 'Semiurban'})

# One-hot encode the string versions, keep original, and convert True/False to int
df_ohe = pd.get_dummies(df[['property_Area_str']], prefix=['property_Area'])
df_ohe = df_ohe.astype(int)  # Convert True/False to 1/0

# Join the one-hot encoded columns back
df = pd.concat([df, df_ohe], axis=1)

In [None]:
df.head()

**One hot encoding on test dataset**

In [None]:
dt['property_Area_str'] = dt['property_Area'].map({0: 'Urban', 1: 'Rural', 2: 'Semiurban'})

dt_ohe = pd.get_dummies(dt[['property_Area_str']], prefix=['property_Area'])
dt_ohe = dt_ohe.astype(int)

dt = pd.concat([dt, dt_ohe], axis=1)

In [None]:
dt.head()

**Removing the str columns from the train and test dataframes**

In [None]:
# The helper string column is no longer needed once the indicator columns exist.
df = df.drop("property_Area_str", axis=1)
dt = dt.drop("property_Area_str", axis=1)

**Computing correlation between all features**

In [None]:
# Signed pairwise correlations of all train columns, annotated to two decimals.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='rocket', fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix of All Numeric Features")
plt.show()

**Computing correlation between the target Loan_Status and all columns**

In [None]:
# Correlation of every column with Loan_Status, strongest positive first
loan_status_corr = corr_matrix['Loan_Status'].sort_values(ascending=False)
loan_status_corr

**Removing 'property_Area' col from both train and test dataset as it has negligible correlation(lesser than its one hot encoded columns)**

In [None]:
# The integer-coded 'property_Area' column is removed; its indicator columns stay.
df = df.drop("property_Area", axis=1)
dt = dt.drop("property_Area", axis=1)

**Adding new features like Total Income, Loan to Income Ratio, EMI and Balance Income to both the train and test dataframes**

In [None]:
# Feature Engineering on train dataframe: four derived columns are appended.
# Each later column may use the ones defined before it.
# NOTE(review): Balance_Income subtracts EMI from Total_Income as-is — confirm
# that LoanAmount and the income columns are expressed in the same unit.
df = df.assign(
    Total_Income=lambda d: d['ApplicantIncome'] + d['CoapplicantIncome'],
    Loan_Income_Ratio=lambda d: d['LoanAmount'] / d['Total_Income'],
    EMI=lambda d: d['LoanAmount'] / d['Loan_Amount_Term'],
    Balance_Income=lambda d: d['Total_Income'] - d['EMI'],
)

In [None]:
# Feature Engineering on test dataframe: four derived columns are appended.
# Each later column may use the ones defined before it.
# NOTE(review): Balance_Income subtracts EMI from Total_Income as-is — confirm
# that LoanAmount and the income columns are expressed in the same unit.
dt = dt.assign(
    Total_Income=lambda d: d['ApplicantIncome'] + d['CoapplicantIncome'],
    Loan_Income_Ratio=lambda d: d['LoanAmount'] / d['Total_Income'],
    EMI=lambda d: d['LoanAmount'] / d['Loan_Amount_Term'],
    Balance_Income=lambda d: d['Total_Income'] - d['EMI'],
)

**Correlation calculation after adding new columns**

In [None]:
# Signed pairwise correlations of all train columns, annotated to two decimals.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='rocket', fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix of All Numeric Features")
plt.show()

**Correlation of all columns with Loan_Status after adding new columns**

In [None]:
# Correlation of every column with Loan_Status, strongest positive first
loan_status_corr = corr_matrix['Loan_Status'].sort_values(ascending=False)
loan_status_corr

**Saving the preprocessed train and test dataframes to csv files**

In [None]:
# Train dataframe saved as 'training_preprocessed.csv' (row index not written)
df.to_csv('training_preprocessed.csv', index=False)

# Test dataframe saved as 'testing_preprocessed.csv' (row index not written)
dt.to_csv('testing_preprocessed.csv', index=False)

## *CLASSIFICATION*

**Loading the preprocessed train and test datasets**

In [None]:
# Read back the preprocessed files written above: train -> df_cl, test -> dt_cl.
df_cl = pd.read_csv('training_preprocessed.csv')
dt_cl = pd.read_csv('testing_preprocessed.csv')

In [None]:
df_cl.head()

In [None]:
dt_cl.head()

**Splitting the train data into training and validation sets**

In [None]:
# Hold out 25% of the preprocessed train rows for validation; the fixed
# random_state makes the split repeatable.
df_cl_train,df_cl_val=train_test_split(df_cl,test_size=0.25,random_state=42)

In [None]:
# Training features are every column except the target; the target is Loan_Status.
x_cl_train = df_cl_train.drop("Loan_Status", axis=1)
y_cl_train = df_cl_train["Loan_Status"]

In [None]:
# Validation features are every column except the target; the target is Loan_Status.
x_cl_val = df_cl_val.drop("Loan_Status", axis=1)
y_cl_val = df_cl_val["Loan_Status"]

## Model - Random Forest

**Import the necessary modules**

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

**Training the data**

In [None]:
# Random forest of 80 trees with a fixed seed, fitted on the training split.
rf = RandomForestClassifier(n_estimators=80, random_state=42)
rf.fit(x_cl_train, y_cl_train)

**Making predictions on the validation data**

In [None]:
y_cl_val_pred = rf.predict(x_cl_val)

**Metrics for the evaluation of the model performance on validation data**

In [None]:
# Validation scores of the classifier: true labels first, predictions second.
acc = accuracy_score(y_true=y_cl_val, y_pred=y_cl_val_pred)
precision = precision_score(y_true=y_cl_val, y_pred=y_cl_val_pred)
recall = recall_score(y_true=y_cl_val, y_pred=y_cl_val_pred)
f1 = f1_score(y_true=y_cl_val, y_pred=y_cl_val_pred)
cm = confusion_matrix(y_true=y_cl_val, y_pred=y_cl_val_pred)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Draw the validation confusion matrix as an annotated heatmap, then print the scores.
print("Confusion Matrix:")
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()

print(f"Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}\n")

**Making predictions on test dataset**

In [None]:
# Predict with exactly the feature columns the model was trained on. Selecting
# them by name keeps the column order aligned with training and lets this cell
# be re-run after a 'Loan_Status' column has been added to dt_cl.
y_cl_test_pred = rf.predict(dt_cl[x_cl_train.columns])

**Printing the Predictions**

In [None]:
y_cl_test_pred

**Saving the classification predictions to a new submissions csv file**

In [None]:
#Adding predictions as a new column (dt_cl itself is modified)
dt_cl['Loan_Status'] = y_cl_test_pred

#Saving to a new CSV file
# NOTE(review): 'Loan_ID' was dropped from the test frame during preprocessing,
# so this file carries no applicant identifier — confirm that is acceptable.
dt_cl.to_csv('final_submission_classification.csv', index=False)


## *REGRESSION TASK 1*

**Loading the datasets**

In [None]:
# Columns computed from LoanAmount during feature engineering. LoanAmount is the
# target of this task, and it can be rebuilt exactly from them
# (LoanAmount = EMI * Loan_Amount_Term), so they must not be used as features.
loan_amount_derived = ['Loan_Income_Ratio', 'EMI', 'Balance_Income']

# Both frames lose the same columns so the features stay aligned between
# training and prediction.
df_rg1 = pd.read_csv("training_preprocessed.csv").drop(columns=loan_amount_derived)
dt_rg1 = pd.read_csv("final_submission_classification.csv").drop(columns=loan_amount_derived)

In [None]:
df_rg1.head()

In [None]:
dt_rg1.head()

**Splitting the train data into training and validation sets**

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
df_rg1_train,df_rg1_val=train_test_split(df_rg1,test_size=0.2,random_state=42)

In [None]:
# Only rows with Loan_Status == 1 are used; LoanAmount is the target and is
# removed from the features together with Loan_Status.
rg1_non_features = ['LoanAmount', 'Loan_Status']

rg1_train_rows = df_rg1_train.loc[df_rg1_train['Loan_Status'] == 1]
x_rg1_train = rg1_train_rows.drop(columns=rg1_non_features)
y_rg1_train = rg1_train_rows['LoanAmount']

rg1_val_rows = df_rg1_val.loc[df_rg1_val['Loan_Status'] == 1]
x_rg1_val = rg1_val_rows.drop(columns=rg1_non_features)
y_rg1_val = rg1_val_rows['LoanAmount']

## Model - Gradient Boosting

**Import the necessary modules**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from math import sqrt

**Train the model on the training data**

In [None]:
# Gradient boosting regressor for LoanAmount. The seed is fixed (same value as
# the splits and the random forest) so repeated runs give the same model.
model = GradientBoostingRegressor(random_state=42)
model.fit(x_rg1_train, y_rg1_train)

**Predictions on validation data**

In [None]:
y_val_rg1_predict = model.predict(x_rg1_val)

**Metrics for evaluation of predictions made on validation data**

In [None]:
# Validation errors of the LoanAmount model; RMSE is also shown relative to the
# mean of the true validation values.
mae = mean_absolute_error(y_rg1_val, y_val_rg1_predict)
r2 = r2_score(y_rg1_val, y_val_rg1_predict)
rmse = sqrt(mean_squared_error(y_rg1_val, y_val_rg1_predict))
mean_value = y_rg1_val.mean()
rmse_percentage = (rmse / mean_value) * 100

for report_line in (
    f'RMSE: {rmse:.3f}',
    f'MAE: {mae:.3f}',
    f'R2: {r2:.3f}',
    f'RMSE (percentage): {rmse_percentage:.2f}%',
):
    print(report_line)

**Predictions on the test dataset**

In [None]:
# Test rows whose predicted Loan_Status is 0; LoanAmount and Loan_Status are
# removed from the features.
# NOTE(review): the model is trained on Loan_Status == 1 rows but applied to
# Loan_Status == 0 rows here — confirm this matches the task definition.
rg1_test_rows = dt_rg1.loc[dt_rg1['Loan_Status'] == 0]
dt_y_rg1 = rg1_test_rows['LoanAmount']
dt_x_rg1 = rg1_test_rows.drop(columns=['LoanAmount', 'Loan_Status'])

In [None]:
y_test_rg1_predict = model.predict(dt_x_rg1)

**Printing the Predictions**

In [None]:
y_test_rg1_predict

**Saving the predictions to a new submission file**

In [None]:
#Adding predictions as a new column (the predicted LoanAmount is appended to the feature frame)
dt_x_rg1['LoanAmount'] = y_test_rg1_predict

#Saving to a new CSV file (row index not written)
dt_x_rg1.to_csv('final_submission_regression1.csv', index=False)

## *REGRESSION TASK 2*

**Loading the datasets**

In [None]:
# Columns computed from Loan_Amount_Term during feature engineering (EMI divides
# by it, Balance_Income subtracts EMI). Loan_Amount_Term is the target of this
# task, so features derived from it must not be used as inputs.
loan_term_derived = ['EMI', 'Balance_Income']

# Both frames lose the same columns so the features stay aligned between
# training and prediction.
df_rg2 = pd.read_csv("training_preprocessed.csv").drop(columns=loan_term_derived)
dt_rg2 = pd.read_csv("final_submission_classification.csv").drop(columns=loan_term_derived)

In [None]:
df_rg2.head()

In [None]:
dt_rg2.head()

**Splitting the data into training and validation sets**

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
df_rg2_train,df_rg2_val=train_test_split(df_rg2,test_size=0.2,random_state=42)

In [None]:
# Loan_Amount_Term is the target; it is removed from the features together with
# Loan_Status. Training uses all rows with Loan_Status == 1.
rg2_non_features = ['Loan_Amount_Term', 'Loan_Status']

rg2_train_rows = df_rg2_train.loc[df_rg2_train['Loan_Status'] == 1]
x_rg2_train = rg2_train_rows.drop(columns=rg2_non_features)
y_rg2_train = rg2_train_rows['Loan_Amount_Term']

# NOTE(review): validation additionally keeps only terms above 240, a filter the
# training rows do not have — confirm this is required by the task.
rg2_val_mask = (df_rg2_val['Loan_Status'] == 1) & (df_rg2_val['Loan_Amount_Term'] > 240)
rg2_val_rows = df_rg2_val.loc[rg2_val_mask]
x_rg2_val = rg2_val_rows.drop(columns=rg2_non_features)
y_rg2_val = rg2_val_rows['Loan_Amount_Term']


## Model - Gradient Boosting

**Import the necessary module**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

**Train the model on the training data**

In [None]:
# Gradient boosting regressor for Loan_Amount_Term. The seed is fixed (same value
# as the splits and the random forest) so repeated runs give the same model.
gb = GradientBoostingRegressor(random_state=42)
gb.fit(x_rg2_train, y_rg2_train)

**Make predictions on validation data**

In [None]:
y_val_rg2_predict = gb.predict(x_rg2_val)

**Metrics for evaluation of the model performance on the validation dataset**

In [None]:
# Validation errors of the Loan_Amount_Term model; RMSE is also shown relative
# to the mean of the true validation values.
mae = mean_absolute_error(y_rg2_val, y_val_rg2_predict)
r2 = r2_score(y_rg2_val, y_val_rg2_predict)
rmse = sqrt(mean_squared_error(y_rg2_val, y_val_rg2_predict))
mean_value = y_rg2_val.mean()
rmse_percentage = (rmse / mean_value) * 100

for report_line in (
    f'RMSE: {rmse:.3f}',
    f'MAE: {mae:.3f}',
    f'R2: {r2:.3f}',
    f'RMSE (percentage): {rmse_percentage:.2f}%',
):
    print(report_line)

**Predictions on the test dataset**

In [None]:
# Test rows whose predicted Loan_Status is 0 and whose Loan_Amount_Term is below
# 240; Loan_Amount_Term and Loan_Status are removed from the features.
# NOTE(review): validation used terms above 240 while this selection uses terms
# below 240 — confirm this matches the task definition.
rg2_test_mask = (dt_rg2['Loan_Status'] == 0) & (dt_rg2['Loan_Amount_Term'] < 240)
rg2_test_rows = dt_rg2.loc[rg2_test_mask]
y_rg2_test = rg2_test_rows['Loan_Amount_Term']
x_rg2_test = rg2_test_rows.drop(columns=['Loan_Amount_Term', 'Loan_Status'])

In [None]:
y_test_rg2_predict = gb.predict(x_rg2_test)

**Printing the predictions**

In [None]:
y_test_rg2_predict

**Saving the predictions to a new submission file**

In [None]:
#Adding predictions as a new column (the predicted Loan_Amount_Term is appended to the feature frame)
x_rg2_test['Loan_Amount_Term'] = y_test_rg2_predict

#Saving to a new CSV file (row index not written)
x_rg2_test.to_csv('final_submission_regression2.csv', index=False)