##  Importing necessary libraries

In [None]:
# Importing all the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.decomposition import PCA
from tqdm import tqdm
from sklearn.preprocessing import normalize
import scipy.stats
from scipy.stats import norm
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix



##  Data Understanding

In [None]:
# Read both CSV files from the working directory: `df` is the training data
# explored below, `test` is the separate test file.
train_path, test_path = "loan-train.csv", "loan-test.csv"
df = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [None]:
df.head()


In [None]:
df.shape


In [None]:
# @title Gender

from matplotlib import pyplot as plt
import seaborn as sns

# Horizontal bar chart with the number of rows per Gender value.
gender_counts = df.groupby('Gender').size()
gender_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
# Hide the top and right frame lines of the active axes.
active_axes = plt.gca()
active_axes.spines[['top', 'right']].set_visible(False)

In [None]:
# @title LoanAmount vs Loan_Amount_Term

from matplotlib import pyplot as plt

# One semi-transparent marker per row: loan amount on x, loan term on y.
scatter_options = dict(kind='scatter', x='LoanAmount', y='Loan_Amount_Term', s=32, alpha=.8)
df.plot(**scatter_options)
# Hide the top and right frame lines of the active axes.
active_axes = plt.gca()
active_axes.spines[['top', 'right']].set_visible(False)

In [None]:
# @title Loan_Amount_Term

from matplotlib import pyplot as plt

# 20-bin histogram of the loan term column.
term_values = df['Loan_Amount_Term']
term_values.plot(kind='hist', bins=20, title='Loan_Amount_Term')
# Hide the top and right frame lines of the active axes.
frame_lines = plt.gca().spines
frame_lines[['top', 'right']].set_visible(False)

In [None]:
# @title LoanAmount

from matplotlib import pyplot as plt

# 20-bin histogram of the requested loan amount.
amount_values = df['LoanAmount']
amount_values.plot(kind='hist', bins=20, title='LoanAmount')
# Hide the top and right frame lines of the active axes.
frame_lines = plt.gca().spines
frame_lines[['top', 'right']].set_visible(False)

In [None]:
# @title CoapplicantIncome

from matplotlib import pyplot as plt

# 20-bin histogram of the co-applicant income column.
coapplicant_values = df['CoapplicantIncome']
coapplicant_values.plot(kind='hist', bins=20, title='CoapplicantIncome')
# Hide the top and right frame lines of the active axes.
frame_lines = plt.gca().spines
frame_lines[['top', 'right']].set_visible(False)

In [None]:
# @title ApplicantIncome

from matplotlib import pyplot as plt

# 20-bin histogram of the applicant income column.
applicant_values = df['ApplicantIncome']
applicant_values.plot(kind='hist', bins=20, title='ApplicantIncome')
# Hide the top and right frame lines of the active axes.
frame_lines = plt.gca().spines
frame_lines[['top', 'right']].set_visible(False)

In [None]:
df.info()

In [None]:
# Visualization 1: Loan Approval Status Distribution
# Bar per Loan_Status value showing how many rows carry it.
plt.figure(figsize=(8, 5))
status_axes = sns.countplot(data=df, x='Loan_Status')
status_axes.set_title('Loan Approval Status Distribution')
plt.show()

In [None]:
# Histogram (30 bins) of applicant income with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
income_axes = sns.histplot(df['ApplicantIncome'], bins=30, kde=True)
income_axes.set_title('Applicant Income Distribution')
plt.show()

In [None]:
# Visualization 3: Loan Amount Distribution
# Missing loan amounts are removed before the histogram/KDE is drawn.
plt.figure(figsize=(10, 6))
known_loan_amounts = df['LoanAmount'].dropna()
amount_axes = sns.histplot(known_loan_amounts, bins=30, kde=True)
amount_axes.set_title('Loan Amount Distribution')
plt.show()

In [None]:
# Visualization 4: Credit History Distribution
# Bar per Credit_History value showing how many rows carry it.
plt.figure(figsize=(8, 5))
history_axes = sns.countplot(data=df, x='Credit_History')
history_axes.set_title('Credit History Distribution')
plt.show()

In [None]:
# Visualization 5: Loan Approval Status based on Gender
# Row counts per Gender, split into one bar per Loan_Status.
plt.figure(figsize=(8, 5))
gender_axes = sns.countplot(data=df, x='Gender', hue='Loan_Status')
gender_axes.set_title('Loan Approval Status based on Gender')
plt.show()

In [None]:
# Visualization 6: Loan Approval Status based on Marital Status
# Row counts per Married value, split into one bar per Loan_Status.
plt.figure(figsize=(8, 5))
married_axes = sns.countplot(data=df, x='Married', hue='Loan_Status')
married_axes.set_title('Loan Approval Status based on Marital Status')
plt.show()

In [None]:
# Visualization 7: Loan Approval Status based on Education
# Row counts per Education value, split into one bar per Loan_Status.
plt.figure(figsize=(8, 5))
education_axes = sns.countplot(data=df, x='Education', hue='Loan_Status')
education_axes.set_title('Loan Approval Status based on Education')
plt.show()

In [None]:
# Visualization 8: Applicant Income vs. Loan Amount
# Plain scatter: applicant income on x, requested loan amount on y.
plt.figure(figsize=(10, 6))
income_loan_axes = sns.scatterplot(data=df, x='ApplicantIncome', y='LoanAmount')
income_loan_axes.set_title('Applicant Income vs. Loan Amount')
plt.show()

In [None]:
# Improved Applicant Income vs. Loan Amount
plt.figure(figsize=(12, 8))

# Translate the raw 'Y'/'N' codes into readable labels BEFORE plotting. The
# legend text is then derived from the data itself instead of being a
# hard-coded list applied positionally, which would silently swap the two
# labels whenever the first status seen in the data is 'N'.
status_plot_df = df.assign(
    **{'Loan Status': df['Loan_Status'].map({'Y': 'Approved', 'N': 'Not Approved'})}
)

# Scatter plot with points colored by Loan Approval Status; hue_order pins
# which palette color each status receives.
scatter = sns.scatterplot(
    x='ApplicantIncome',
    y='LoanAmount',
    hue='Loan Status',
    hue_order=['Approved', 'Not Approved'],
    palette='Set1',
    data=status_plot_df,
    alpha=0.7,
)

# Re-draw the legend in the upper-right corner; the entries keep the labels
# seaborn attached to each hue level.
scatter.legend(title='Loan Status', loc='upper right')

plt.title('Applicant Income vs. Loan Amount')
plt.xlabel('Applicant Income')
plt.ylabel('Loan Amount')

plt.show()


In [None]:
# Visualization 9: Loan Approval Status based on Credit History
# Row counts per Credit_History value, split into one bar per Loan_Status.
plt.figure(figsize=(8, 5))
credit_axes = sns.countplot(data=df, x='Credit_History', hue='Loan_Status')
credit_axes.set_title('Loan Approval Status based on Credit History')
plt.show()

In [None]:
# Visualization: Applicant Income vs. Loan Approval Status
# One violin per Loan_Status value showing the applicant income distribution.
plt.figure(figsize=(12, 8))
violin_axes = sns.violinplot(data=df, x='Loan_Status', y='ApplicantIncome', palette='Set1')
violin_axes.set_title('Applicant Income vs. Loan Approval Status')
violin_axes.set_xlabel('Loan Approval Status')
violin_axes.set_ylabel('Applicant Income')
plt.show()

In [None]:
# Visualization: Education vs. Loan Approval Status
# Same grouping as Visualization 7, drawn larger with the 'Set2' palette and
# explicit axis labels.
plt.figure(figsize=(10, 6))
education_axes = sns.countplot(data=df, x='Education', hue='Loan_Status', palette='Set2')
education_axes.set_title('Education vs. Loan Approval Status')
education_axes.set_xlabel('Education')
education_axes.set_ylabel('Count')
plt.show()

In [None]:

# Visualization: Loan Approval Status based on Property Area
# Row counts per Property_Area value, split into one bar per Loan_Status.
plt.figure(figsize=(10, 6))
area_axes = sns.countplot(data=df, x='Property_Area', hue='Loan_Status', palette='Set3')
area_axes.set_title('Loan Approval Status based on Property Area')
area_axes.set_xlabel('Property Area')
area_axes.set_ylabel('Count')
plt.show()


In [None]:
# prompt: display a graph showing the percentage of Loan Approval Status = Y And =N for each Property_Area

import matplotlib.pyplot as plt
import seaborn as sns

# Create a crosstab with one ROW per Property_Area and one column per
# Loan_Status. Property_Area must be the row index: the chart's x axis is the
# index, and each row has to be normalised on its own so the Y/N shares of a
# single area add up to 100%.
crosstab = pd.crosstab(df['Property_Area'], df['Loan_Status'])

# Share of each Loan_Status within each Property_Area, expressed in percent
# so the values match the 'Percentage' axis label.
crosstab_pct = crosstab.div(crosstab.sum(axis=1), axis=0) * 100

# Plot the results as a stacked bar chart (one bar per Property_Area)
crosstab_pct.plot(kind='bar', stacked=True)
plt.title('Percentage of Loan Approval Status for each Property Area')
plt.xlabel('Property Area')
plt.ylabel('Percentage')
plt.show()


In [None]:

# Create a crosstab with one row per Property_Area and one column per Loan_Status
crosstab = pd.crosstab(df['Property_Area'], df['Loan_Status'])

# Share of each Loan_Status within each Property_Area (each row sums to 1)
crosstab_pct = crosstab.div(crosstab.sum(axis=1), axis=0)

# Create a bar chart
fig, ax = plt.subplots(figsize=(10, 6))

# Plot the bars with light blue and dark blue colors; the 'N' share is
# stacked on top of the 'Y' share.
ax.bar(crosstab_pct.index, crosstab_pct['Y'], label='Approved', color='lightblue')
ax.bar(crosstab_pct.index, crosstab_pct['N'], bottom=crosstab_pct['Y'], label='Not Approved', color='darkblue')

# Style the x tick labels only AFTER the bars exist: before plotting, the
# axes has no category ticks yet, so labels set at that point are attached to
# the default numeric ticks instead of the Property_Area categories.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Add a legend and title
ax.legend()
ax.set_title('Percentage of Loan Approval Status for each Property Area')

# Show the plot
plt.show()


Checking missing values

In [None]:
df.isnull().sum()/len(df)

In [None]:
# Keep only rows without any missing value. The explicit .copy() makes
# df_cleaned an independent frame, so the column assignments in the following
# cells modify it directly without pandas' SettingWithCopyWarning.
df_cleaned = df.dropna().copy()

In [None]:
df_cleaned.isnull().sum()

## Data Transformation

In [None]:
df_cleaned.info()

In [None]:
df_cleaned

In [None]:
# Target encoding: replace the Loan_Status labels with integer codes.
# The same `label_encoder` object is re-fitted on other columns in a later cell.
label_encoder = LabelEncoder()
encoded_loan_status = label_encoder.fit_transform(df_cleaned['Loan_Status'])
df_cleaned['Loan_Status'] = encoded_loan_status


In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Horizontal bar chart with the number of rows per encoded Loan_Status value.
status_counts = df_cleaned.groupby('Loan_Status').size()
status_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
# Hide the top and right frame lines of the active axes.
active_axes = plt.gca()
active_axes.spines[['top', 'right']].set_visible(False)

In [None]:
df_cleaned

In [None]:

# Label-encode the remaining two-valued text columns, re-fitting the shared
# encoder on each column in turn (so it ends up fitted on 'Married').
for binary_column in ('Self_Employed', 'Education', 'Married'):
    df_cleaned[binary_column] = label_encoder.fit_transform(df_cleaned[binary_column])

In [None]:
df_cleaned

In [None]:
df_cleaned.info()

In [None]:
# prompt: show unique values of Dependents

# List the distinct values present in the Dependents column.
dependents_values = df_cleaned['Dependents'].unique()
print(dependents_values)


In [None]:
# Convert the textual Dependents categories to integers ('3+' is stored as 3).
# Series.map turns any value that is not a key of the dictionary into NaN.
dependents_map = dict(zip(('0', '1', '2', '3+'), (0, 1, 2, 3)))

# Overwrite the column in df_cleaned with the mapped integers.
mapped_dependents = df_cleaned['Dependents'].map(dependents_map)
df_cleaned['Dependents'] = mapped_dependents


In [None]:
df_cleaned

Drop the Loan_ID column

In [None]:
# Remove the Loan_ID column before encoding and modelling.
df_cleaned = df_cleaned.drop(columns=['Loan_ID'])


In [None]:
df_cleaned.info()

In [None]:
# prompt: select categorical columns from df_cleaned and apply one hot encoding on them

# Every column still stored with dtype `object` is treated as categorical and
# expanded into one indicator column per category.
categorical_columns = df_cleaned.select_dtypes(include='object').columns
df_cleaned = pd.get_dummies(data=df_cleaned, columns=categorical_columns)


In [None]:
df_cleaned.head()

In [None]:
df_cleaned.shape

In [None]:
# Run get_dummies a second time, on the three Property_Area_* indicator
# columns created by the previous encoding step. With drop_first=True each
# input column contributes a single derived column; later cells refer to them
# as 'Property_Area_Rural_True', 'Property_Area_Semiurban_True' and
# 'Property_Area_Urban_True'.
# NOTE(review): this second pass looks redundant (the inputs are already
# indicators), but the downstream cells depend on the resulting '*_True'
# column names, so it cannot be removed in isolation.
df_cleaned = pd.get_dummies(df_cleaned, columns=['Property_Area_Rural', 'Property_Area_Semiurban', 'Property_Area_Urban'], drop_first=True)

# Display the DataFrame after encoding
print(df_cleaned)

In [None]:
df_cleaned

## Feature Selection

In [None]:
# Cast every indicator column produced by get_dummies to int.
indicator_columns = (
    'Gender_Female',
    'Gender_Male',
    'Property_Area_Rural_True',
    'Property_Area_Semiurban_True',
    'Property_Area_Urban_True',
)
for indicator_column in indicator_columns:
    df_cleaned[indicator_column] = df_cleaned[indicator_column].astype(int)

# Show the frame after the cast.
print(df_cleaned)


In [None]:
df_cleaned

In [None]:
def iv_woe(data, target, bins=10, show_woe=False):
    """Compute Weight of Evidence (WoE) and Information Value (IV) per feature.

    Every column of ``data`` except ``target`` is analysed on its own.
    Numeric columns (dtype kind in ``'bifc'``) with more than 10 distinct
    values are first cut into ``bins`` quantile bins; every other column is
    grouped by its raw values. Values are converted to strings before
    grouping, so missing values form their own ``'nan'`` group.

    For each group, ``Events`` is the sum of ``target`` and ``Non-Events`` is
    ``N - Events`` (this assumes ``target`` is coded 0/1). Group counts below
    0.5 are raised to 0.5 before the shares are computed so the logarithm
    stays finite. ``WoE = ln(% of Non-Events / % of Events)`` and
    ``IV = WoE * (% of Non-Events - % of Events)``.

    The function prints one "Information value of ..." line per feature and,
    when ``show_woe`` is true, the per-group table of every feature.

    Args:
        data: DataFrame holding the features and the target column.
        target: Name of the target column in ``data``.
        bins: Number of quantile bins requested for high-cardinality numeric
            columns (duplicate bin edges are dropped, so fewer may result).
        show_woe: Print the detailed WoE table of each feature when true.

    Returns:
        A tuple ``(newDF, woeDF)``: ``newDF`` has one row per feature with the
        columns ``Variable`` and ``IV``; ``woeDF`` stacks the per-group tables
        of all features. Both are empty DataFrames when ``data`` contains no
        column other than ``target``.
    """
    # Per-feature results are collected in lists and concatenated once at the
    # end. Growing a DataFrame with pd.concat inside the loop re-copies all
    # earlier rows on every iteration and starts from an empty frame, which
    # recent pandas versions warn about.
    iv_rows = []
    woe_tables = []

    cols = data.columns

    # Run WOE and IV on all the independent variables
    for ivars in cols[~cols.isin([target])]:
        feature = data[ivars]
        if (feature.dtype.kind in 'bifc') and (len(np.unique(feature)) > 10):
            grouping_values = pd.qcut(feature, bins, duplicates='drop')
        else:
            grouping_values = feature

        d0 = pd.DataFrame({'x': grouping_values, 'y': data[target]})
        d0 = d0.astype({"x": str})
        d = (
            d0.groupby("x", as_index=False, dropna=False)
            .agg(N=("y", "count"), Events=("y", "sum"))
            .rename(columns={"x": "Cutoff"})
        )

        # Floor counts at 0.5 so an empty class in a group cannot produce log(0).
        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
        d['WoE'] = np.log(d['% of Non-Events'] / d['% of Events'])
        d['IV'] = d['WoE'] * (d['% of Non-Events'] - d['% of Events'])
        d.insert(loc=0, column='Variable', value=ivars)

        # The table holds a single variable, so its total IV is the plain sum
        # of the group IVs; it is repeated on every row of the table.
        total_iv = d['IV'].sum()
        d['Total IV'] = total_iv

        print("Information value of " + ivars + " is " + str(round(total_iv, 6)))
        iv_rows.append(
            pd.DataFrame({"Variable": [ivars], "IV": [total_iv]}, columns=["Variable", "IV"])
        )
        woe_tables.append(d)

        # Show WOE Table
        if show_woe:
            print(d)

    if not iv_rows:
        # No independent variables: pd.concat would raise on an empty list.
        return pd.DataFrame(), pd.DataFrame()

    newDF = pd.concat(iv_rows, axis=0)
    woeDF = pd.concat(woe_tables, axis=0)
    return newDF, woeDF


# Compute WoE/IV for every column of df_cleaned against 'Loan_Status' and
# print each per-variable WoE table (show_woe=True). No column is dropped by
# this call; the features to remove are listed by hand in a later cell.
iv, woe = iv_woe(data=df_cleaned, target='Loan_Status', bins=10, show_woe=True)





In [None]:
# Distinct per-variable IV totals stored in the WoE table.
unique_total_iv_values = woe['Total IV'].unique()

# Print the unique values (once; the array was previously printed twice).
print("Unique values of Total IV column:")
print(unique_total_iv_values)

# Column names of the encoded frame, kept as a plain list.
col = df_cleaned.columns.tolist()

In [None]:
df_cleaned.head()

In [None]:
# Dropping features
# Kept by manual judgement: Married, ApplicantIncome.
# Kept by the IV threshold: LoanAmount, Loan_Amount_Term, Credit_History,
# Property_Area_Semiurban_True.
# Dropped by the IV threshold: everything listed in `columns` below.
columns = [
    'Gender_Female',
    'Gender_Male',
    'Self_Employed',
    'Dependents',
    'CoapplicantIncome',
    'Education',
    'Property_Area_Rural_True',
    'Property_Area_Urban_True',
]

df_cleaned_drop1 = df_cleaned.drop(columns=columns)
# The only surviving area indicator is re-added under the shorter name
# 'Property_Area' (as the last column) and removed under its old name.
df_cleaned_drop1['Property_Area'] = df_cleaned_drop1.pop('Property_Area_Semiurban_True')






In [None]:
df_cleaned_drop1.head()

In [None]:
# Second name for the same DataFrame object (plain assignment, no copy is made).
df_cleaned_dropped=df_cleaned_drop1


## Scaling

In [None]:
# prompt: scale the dataset df_cleaned_dropped using minmaxscaler

from sklearn.preprocessing import MinMaxScaler

# Fit a MinMaxScaler on every column of df_cleaned_dropped (this includes
# Loan_Status) and rebuild a DataFrame with the original column names; the
# new frame gets a fresh default index.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_cleaned_dropped)
df_cleaned_scaled = pd.DataFrame(scaled_values, columns=df_cleaned_dropped.columns)
df_cleaned_scaled


## Score Prediction

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Separate features and target variable.
# The probability produced below is documented as "probability of
# Loan_Status = 1", so Loan_Status must be the target and must not appear
# among the features. 'Probability' is excluded too so that re-running this
# cell does not feed an earlier prediction back in as a feature.
X = df_cleaned_scaled.drop(columns=['Loan_Status', 'Probability'], errors='ignore')  # Features
y = df_cleaned_scaled['Loan_Status']  # Target

# Initialize logistic regression model
log_reg = LogisticRegression()

# Fit the model
log_reg.fit(X, y)

# Predict probabilities. predict_proba orders its columns like
# log_reg.classes_ (sorted: 0.0, 1.0), so column 1 is the positive class.
probabilities = log_reg.predict_proba(X)[:, 1]  # Probability of class 1 (Loan_Status = 1)

# Add probabilities as a new column to the DataFrame
df_cleaned_scaled['Probability'] = probabilities

# Display the updated DataFrame
print("Updated DataFrame with Probability column:")
df_cleaned_scaled



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Model Evaluation: score log_reg on the same X / y it was fitted on.
predictions = log_reg.predict(X)
accuracy, precision, recall, f1 = (
    metric(y, predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC-AUC is computed from the stored probabilities rather than the hard labels.
roc_auc = roc_auc_score(y, probabilities)

print("Model Evaluation Metrics:")
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
    ("ROC-AUC Score:", roc_auc),
):
    print(metric_label, metric_value)

In [None]:
# Label column used for the stratified hold-out split and the cross-validation below.
target = df_cleaned_scaled['Loan_Status']

In [None]:
# Hold out 10% of the rows, stratified on the label.
# The feature matrix must not contain the label itself, nor the 'Probability'
# column (a model output computed on these same rows); leaving them in lets
# the classifier read the answer and makes the accuracy meaningless.
split_features = df_cleaned_scaled.drop(columns=['Loan_Status', 'Probability'], errors='ignore')
X_raw, X_test, y_raw, y_test = train_test_split(split_features,
                                                target,
                                                test_size=0.1,
                                                stratify=target,
                                                random_state=42)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Five shuffled, stratified folds over the non-held-out rows.
sss = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One accuracy value per fold is collected here.
accuracy_scores = []

fold = 0
for train_index, test_index in sss.split(X_raw, y_raw):
    fold += 1
    print(f"Fold {fold}:")

    # Positional row selection for this fold's training and validation parts.
    X_train = X_raw.iloc[train_index]
    X_val = X_raw.iloc[test_index]
    y_train = y_raw.iloc[train_index]
    y_val = y_raw.iloc[test_index]

    # A fresh logistic regression is fitted for every fold.
    log_reg_cv = LogisticRegression(random_state=0)
    log_reg_cv.fit(X_train, y_train)

    # Accuracy of the fold model on its validation rows.
    y_pred = log_reg_cv.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    accuracy_scores.append(accuracy)

    print(f"Accuracy: {accuracy:.4f}")
    print("-" * 30)

# Average of the per-fold accuracies.
mean_accuracy = np.mean(accuracy_scores)
print(f"Mean Accuracy: {mean_accuracy:.4f}")


# Random Forest Regressor

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt


# Step 1: the regression target is the 'Probability' column; every other
# column of df_cleaned_scaled is used as a feature.
y = df_cleaned_scaled['Probability']
X = df_cleaned_scaled.drop(columns=['Probability'])

# Step 2: 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: fit a random forest regressor on the training part.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Step 4: impurity-based importances, paired with the feature names and
# ordered from most to least important.
feature_importance = model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# Step 5: horizontal bar chart of the importances.
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.gca().set(xlabel='Importance Score', ylabel='Feature', title='Feature Importance')
plt.show()


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

# Step 1: target is 'Probability'; all remaining columns are features.
y = df_cleaned_scaled['Probability']
X = df_cleaned_scaled.drop(columns=['Probability'])

# Step 2: 80/20 split, kept so that a separate test set exists for later cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: 5-fold cross-validation of a random forest regressor on ALL rows.
model = RandomForestRegressor(random_state=42)
cv_scores = cross_val_score(model, X, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", np.mean(cv_scores))

# Step 4: refit on the entire dataset (not just the training part) and read
# the importances from that fit.
model.fit(X, y)
feature_importance = model.feature_importances_

# Pair importances with feature names, most important first.
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# Step 5: horizontal bar chart of the importances.
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.gca().set(xlabel='Importance Score', ylabel='Feature', title='Feature Importance')
plt.show()


In [None]:
from sklearn.base import clone
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 6: Evaluate the model on the test set.
# `model` was last fitted on the ENTIRE dataset, so it has already seen every
# row of X_test; scoring it there would report training error. An unfitted
# clone with the same hyper-parameters is therefore trained on the training
# split only and used for the hold-out metrics. `model` itself is untouched.
holdout_model = clone(model)
holdout_model.fit(X_train, y_train)
y_pred = holdout_model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2):", r2)



# Gradient Boosting Regressor

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt

# Step 1: the regression target is the 'Probability' column; every other
# column of df_cleaned_scaled is used as a feature.
y = df_cleaned_scaled['Probability']
X = df_cleaned_scaled.drop(columns=['Probability'])

# Step 2: 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: fit a gradient boosting regressor on the training part.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Step 4: importances paired with the feature names, most important first.
feature_importance = model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# Step 5: horizontal bar chart of the importances.
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.gca().set(xlabel='Importance Score', ylabel='Feature', title='Feature Importance')
plt.show()


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt

# Step 1: target is 'Probability'; all remaining columns are features.
y = df_cleaned_scaled['Probability']
X = df_cleaned_scaled.drop(columns=['Probability'])

# Step 2: 80/20 split, kept so that a separate test set exists for later cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: 5-fold cross-validation of a gradient boosting regressor on ALL rows.
model = GradientBoostingRegressor(random_state=42)
cv_scores = cross_val_score(model, X, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", np.mean(cv_scores))

# Step 4: refit on the entire dataset (not just the training part) and read
# the importances from that fit.
model.fit(X, y)
feature_importance = model.feature_importances_

# Pair importances with feature names, most important first.
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# Step 5: horizontal bar chart of the importances.
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.gca().set(xlabel='Importance Score', ylabel='Feature', title='Feature Importance')
plt.show()


In [None]:
from sklearn.base import clone
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 6: Evaluate the model on the test set.
# `model` was last fitted on the ENTIRE dataset, so it has already seen every
# row of X_test; scoring it there would report training error. An unfitted
# clone with the same hyper-parameters is therefore trained on the training
# split only and used for the hold-out metrics. `model` itself is untouched.
holdout_model = clone(model)
holdout_model.fit(X_train, y_train)
y_pred = holdout_model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2):", r2)


In [None]:
import pandas as pd

# Weight applied to each feature when the raw credit score is computed.
feature_weights = dict(
    (
        ('Married', 0.057974),
        ('LoanAmount', 0.105560),
        ('Loan_Amount_Term', 0.129796),
        ('Credit_History', 1.431605),
        ('Property_Area', 0.118846),
        ('ApplicantIncome', 0.031081),
    )
)

# Single example row whose columns match the weighted feature names.
features_df = pd.DataFrame(
    {
        'Married': [1.0],
        'LoanAmount': [0.201354],
        'Loan_Amount_Term': [0.72973],
        'Credit_History': [1.0],
        'Property_Area': [0.0],
        'ApplicantIncome': [0.05483],
    }
)


def calculate_credit_score(row):
    """Return the weighted sum of ``row``'s features.

    ``row`` must support ``row[name]`` for every key of the module-level
    ``feature_weights`` mapping; each value is multiplied by its weight and
    the products are added up in the mapping's order.
    """
    score = 0
    for feature, weight in feature_weights.items():
        score += row[feature] * weight
    return score

# Score every row of df_cleaned_scaled and store the result in a 'score' column.
row_scores = df_cleaned_scaled.apply(calculate_credit_score, axis=1)
df_cleaned_scaled['score'] = row_scores

print(df_cleaned_scaled)


In [None]:
df_cleaned_scaled.describe()

In [None]:
# Target range for the rescaled score.
min_score = 300
max_score = 900

# Observed extremes of the raw 'score' column.
min_current = df_cleaned_scaled['score'].min()
max_current = df_cleaned_scaled['score'].max()

# Min-max rescaling: the lowest raw score maps to min_score and the highest
# to max_score.
score_offset = (df_cleaned_scaled['score'] - min_current) * (max_score - min_score)
df_cleaned_scaled['score_scaled'] = min_score + score_offset / (max_current - min_current)

# Show the frame with the additional 'score_scaled' column.
print("Updated DataFrame with scaled score:")
df_cleaned_scaled


# **Score using credit utilization ratio**

In [None]:
# Three independent working frames without the two score columns.
score_columns = ['score', 'score_scaled']
df_cleaned_scaled1 = df_cleaned_scaled.drop(columns=score_columns)
df_cleaned_scaled2 = df_cleaned_scaled.drop(columns=score_columns)
df_cleaned_scaled3 = df_cleaned_scaled2.copy()

In [None]:
# Calculate credit utilization ratio (scaled loan amount per scaled loan term).
credit_utilization = df_cleaned_scaled1['LoanAmount'] / df_cleaned_scaled1['Loan_Amount_Term']
# Rows whose scaled Loan_Amount_Term is 0 divide by zero; treat the resulting
# +/-inf as missing instead of letting it distort the normalisation below.
credit_utilization = credit_utilization.replace([np.inf, -np.inf], np.nan)
df_cleaned_scaled1['Credit_Utilization_Ratio'] = credit_utilization

# Normalize credit utilization ratio to a 300-900 range (or any desired range).
# The ratio of two min-max-scaled columns is NOT confined to [0, 1], so it is
# min-max normalised first; otherwise the resulting score could fall far
# outside the intended range.
min_score = 300
max_score = 900
lowest_ratio = credit_utilization.min()
highest_ratio = credit_utilization.max()
if highest_ratio > lowest_ratio:
    utilization_unit = (credit_utilization - lowest_ratio) / (highest_ratio - lowest_ratio)
else:
    # All ratios identical (or none defined): nothing to spread out.
    utilization_unit = credit_utilization * 0.0
# Lower utilization gives a higher score: 0 -> max_score, 1 -> min_score.
normalized_scores_utilization = ((1 - utilization_unit) * (max_score - min_score)) + min_score

# Add normalized scores as a new column 'Credit_Score_Utilization' to the DataFrame
# (rows with an undefined ratio keep NaN).
df_cleaned_scaled1['Credit_Score_Utilization'] = normalized_scores_utilization

# Display the updated DataFrame
print("Updated DataFrame with Credit_Score_Utilization column:")
print(df_cleaned_scaled1[['LoanAmount', 'Loan_Amount_Term', 'Credit_Utilization_Ratio', 'Credit_Score_Utilization']])


In [None]:
df_cleaned_scaled3.describe()

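Altman Z-score
Altman Z-score = 1.2*(Working Capital/Total Assets) + 1.4*(Retained Earnings/Total Assets) + 3.3*(EBIT/Total Assets) + 0.6*(Market Value of Equity/Total Liabilities) + 1.0*(Sales/Total Assets)
(The cells below use stand-ins derived from the loan columns for the first four terms only.)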


In [None]:
# Build stand-in "financial ratios" from the scaled loan columns and combine
# them with the Altman coefficients 1.2 / 1.4 / 3.3 / 0.6.
import pandas as pd
import numpy as np

df_cleaned_scaled3['Working_Capital'] = df_cleaned_scaled3['ApplicantIncome'] - df_cleaned_scaled3['LoanAmount']/df_cleaned_scaled3['Loan_Amount_Term']
df_cleaned_scaled3['Retained_Earnings'] = df_cleaned_scaled3['ApplicantIncome'] / df_cleaned_scaled3['LoanAmount']
df_cleaned_scaled3['EBIT'] = df_cleaned_scaled3['Loan_Amount_Term'] * df_cleaned_scaled3['Credit_History']
df_cleaned_scaled3['Market_Value_of_Equity'] = df_cleaned_scaled3['Property_Area'] * df_cleaned_scaled3['Married']

# Calculate Altman Z-score
df_cleaned_scaled3['Altman_Z_Score'] = (1.2 * df_cleaned_scaled3['Working_Capital'] +
                                       1.4 * df_cleaned_scaled3['Retained_Earnings'] +
                                       3.3 * df_cleaned_scaled3['EBIT'] +
                                       0.6 * df_cleaned_scaled3['Market_Value_of_Equity'])

# Replace inf, -inf, and NaN values in 'Altman_Z_Score' (produced by the
# divisions above when a scaled denominator is 0) with the sentinel -10.
# The result is assigned back to the column: calling
# `df[col].replace(..., inplace=True)` operates on a temporary Series, which
# pandas deprecates and which leaves the frame unchanged under copy-on-write.
df_cleaned_scaled3['Altman_Z_Score'] = df_cleaned_scaled3['Altman_Z_Score'].replace(
    [np.inf, -np.inf, np.nan], -10
)

# The intermediate ratio columns are no longer needed once the score exists.
del df_cleaned_scaled3['Working_Capital']
del df_cleaned_scaled3['Retained_Earnings']
del df_cleaned_scaled3['EBIT']
del df_cleaned_scaled3['Market_Value_of_Equity']
df_cleaned_scaled3


In [None]:
# Second variant of the score on df_cleaned_scaled2: the same four stand-in
# ratios, but each term is additionally divided by ApplicantIncome (or, for
# the last term, by LoanAmount) before the coefficients are applied.
income = df_cleaned_scaled2['ApplicantIncome']
loan_amount = df_cleaned_scaled2['LoanAmount']
loan_term = df_cleaned_scaled2['Loan_Amount_Term']

df_cleaned_scaled2['Working_Capital'] = income - loan_amount / loan_term
df_cleaned_scaled2['Retained_Earnings'] = income / loan_amount
df_cleaned_scaled2['EBIT'] = loan_term * df_cleaned_scaled2['Credit_History']
df_cleaned_scaled2['Market_Value_of_Equity'] = df_cleaned_scaled2['Property_Area'] * df_cleaned_scaled2['Married']

# Weighted sum of the four normalised terms.
working_capital_term = 1.2 * (df_cleaned_scaled2['Working_Capital'] / df_cleaned_scaled2['ApplicantIncome'])
retained_earnings_term = 1.4 * (df_cleaned_scaled2['Retained_Earnings'] / df_cleaned_scaled2['ApplicantIncome'])
ebit_term = 3.3 * (df_cleaned_scaled2['EBIT'] / df_cleaned_scaled2['ApplicantIncome'])
equity_term = 0.6 * (df_cleaned_scaled2['Market_Value_of_Equity'] / df_cleaned_scaled2['LoanAmount'])
df_cleaned_scaled2['Altman_Z_Score'] = working_capital_term + retained_earnings_term + ebit_term + equity_term

# Show the helper columns together with the resulting score.
print("Updated DataFrame with Altman Z-score column:")
altman_columns = ['Working_Capital', 'Retained_Earnings', 'EBIT', 'Market_Value_of_Equity', 'Altman_Z_Score']
print(df_cleaned_scaled2[altman_columns])



# Cleaning **df_cleaned_scaled2**

In [None]:

# Remove the four intermediate ratio columns from df_cleaned_scaled2 in place.
for helper_column in ('Working_Capital', 'Retained_Earnings', 'EBIT', 'Market_Value_of_Equity'):
    del df_cleaned_scaled2[helper_column]



In [None]:
# Replace df_cleaned_scaled2 with an independent copy of df_cleaned_scaled3.
# The Altman_Z_Score variant previously computed on df_cleaned_scaled2 is
# discarded by this assignment.
df_cleaned_scaled2=df_cleaned_scaled3.copy()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 20-bin histogram of the Altman_Z_Score column of df_cleaned_scaled2, with a
# KDE curve overlaid.
plt.figure(figsize=(10, 6))
altman_axes = sns.histplot(data=df_cleaned_scaled2, x='Altman_Z_Score', bins=20, kde=True)
altman_axes.set_title('Distribution of Altman Z-Score')
altman_axes.set_xlabel('Altman Z-Score')
altman_axes.set_ylabel('Count')
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Features and label are taken from df_cleaned_scaled2.
feature_names = ['Married', 'ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area']
X = df_cleaned_scaled2[feature_names]
y = df_cleaned_scaled2['Loan_Status']

# One-hot encode whatever categorical columns get_dummies finds among the features.
X = pd.get_dummies(X, drop_first=True)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise with statistics learned on the training part only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier and predict the held-out labels.
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)
y_pred = logreg.predict(X_test_scaled)

# Accuracy plus the per-class precision/recall report.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

# Credit score = second probability column of the model, computed for every
# row (training and test rows alike).
all_rows_scaled = scaler.transform(X)
df_cleaned_scaled2['Credit_Score'] = logreg.predict_proba(all_rows_scaled)[:, 1]

# Show the frame including the new Credit_Score column.
print(df_cleaned_scaled2)


In [None]:
df_cleaned_scaled2

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Create a new DataFrame with the two columns needed for the plot
df_cleaned_scaled2_plot = df_cleaned_scaled2[['Altman_Z_Score', 'Loan_Status']]

# Stacked histogram of the Altman Z-score, one colour per Loan_Status value
plt.figure(figsize=(10, 6))
status_hist_axes = sns.histplot(data=df_cleaned_scaled2_plot, x='Altman_Z_Score', hue='Loan_Status', multiple='stack', bins=20)
plt.title('Distribution of Loan Status along Altman Z-Score')
plt.xlabel('Altman Z-Score')
plt.ylabel('Count')

# seaborn has already drawn the hue legend. Calling plt.legend() here would
# rebuild the legend from labelled artists only; the histogram bars carry no
# labels, so the hue legend would be replaced by an empty box. Only the title
# of the existing legend is set instead.
hue_legend = status_hist_axes.get_legend()
if hue_legend is not None:
    hue_legend.set_title('Loan_Status')
plt.show()


# Evaluating the linear regression model targeting the Altman_Z_Score

In [None]:
# Remove every row of df_cleaned_scaled2 that still contains a NaN, then store
# an independent copy of the result in df_cleaned_scaled3.
df_cleaned_scaled2 = df_cleaned_scaled2.dropna()
df_cleaned_scaled3 = df_cleaned_scaled2.copy()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Linear regression baseline for predicting Altman_Z_Score from df_cleaned_scaled2.

# Target column, and every remaining column as a feature
# NOTE(review): the features keep all other columns, including the Credit_Score
# added by the logistic-regression cell — confirm that is intended.
y = df_cleaned_scaled2['Altman_Z_Score']
X = df_cleaned_scaled2.drop(columns=['Altman_Z_Score'])

# 80/20 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Fit ordinary least squares on the training rows
linear_reg = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred = linear_reg.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared Score:", r2)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Evaluating the gradient boosting regression model targeting the Altman_Z_Score

In [None]:
# Gradient boosting regressor, seeded so repeated runs give the same model.
# It reuses the X_train/X_test/y_train/y_test split currently in scope.
gbr = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

# Held-out predictions and their error metrics
y_pred_gbr = gbr.predict(X_test)
mse_gbr, r2_gbr = mean_squared_error(y_test, y_pred_gbr), r2_score(y_test, y_pred_gbr)

print("Gradient Boosting Regression:")
print("Mean Squared Error:", mse_gbr)
print("R-squared Score:", r2_gbr)

# Evaluating the support vector regression model targeting the Altman_Z_Score

In [None]:
# Support vector regressor with default settings, fitted on the
# X_train/y_train split currently in scope.
svr = SVR().fit(X_train, y_train)

# Held-out predictions and their error metrics
y_pred_svr = svr.predict(X_test)
mse_svr, r2_svr = mean_squared_error(y_test, y_pred_svr), r2_score(y_test, y_pred_svr)

print("\nSupport Vector Regression:")
print("Mean Squared Error:", mse_svr)
print("R-squared Score:", r2_svr)

# Evaluating the decision tree regression model targeting the Altman_Z_Score

In [None]:
# Decision tree regressor, seeded so repeated runs give the same tree.
# It is fitted on the X_train/y_train split currently in scope.
dtr = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Held-out predictions and their error metrics
y_pred_dtr = dtr.predict(X_test)
mse_dtr, r2_dtr = mean_squared_error(y_test, y_pred_dtr), r2_score(y_test, y_pred_dtr)

print("\nDecision Tree Regression:")
print("Mean Squared Error:", mse_dtr)
print("R-squared Score:", r2_dtr)

# Evaluating all previous models with cross-validation applied

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
# 5-fold cross-validated MSE for the three regressors on df_cleaned_scaled3.
y = df_cleaned_scaled3['Altman_Z_Score']  # Target
X = df_cleaned_scaled3.drop(columns=['Altman_Z_Score'])  # Features

# Settings shared by every cross-validation run below
cv_options = dict(cv=5, scoring='neg_mean_squared_error')

# The three candidate models (seeded where the estimator takes a seed)
gbr = GradientBoostingRegressor(random_state=42)
svr = SVR()
dtr = DecisionTreeRegressor(random_state=42)

# The scorer returns negated MSE per fold, so flip the sign of each mean
gbr_cv_scores = cross_val_score(gbr, X, y, **cv_options)
gbr_cv_mse_mean = -gbr_cv_scores.mean()

svr_cv_scores = cross_val_score(svr, X, y, **cv_options)
svr_cv_mse_mean = -svr_cv_scores.mean()

dtr_cv_scores = cross_val_score(dtr, X, y, **cv_options)
dtr_cv_mse_mean = -dtr_cv_scores.mean()

# Report the mean cross-validated MSE per model
print("Gradient Boosting Regression - Mean MSE:", gbr_cv_mse_mean)
print("Support Vector Regression - Mean MSE:", svr_cv_mse_mean)
print("Decision Tree Regression - Mean MSE:", dtr_cv_mse_mean)

In [None]:
from sklearn.cluster import KMeans

# Cluster the one-dimensional 'score_scaled' values with KMeans and name each
# cluster 'poor' .. 'excellent' according to the rank of its center.

# Define the number of clusters (classes)
num_clusters = 4

# Class labels, ordered from the lowest-centered cluster to the highest
class_labels = ['poor', 'average', 'good', 'excellent']
if len(class_labels) != num_clusters:
    raise ValueError("class_labels must contain exactly one label per cluster")

# Fit KMeans model on the 'score_scaled' column
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(df_cleaned_scaled[['score_scaled']])

# Get cluster centers (representative values for each cluster), also in ascending order
cluster_centers = kmeans.cluster_centers_
sorted_cluster_centers = sorted(cluster_centers.ravel())

# The boundary between two neighbouring clusters lies halfway between their
# centers, not on a center. Putting the boundaries on the centers would split
# every cluster in two and leave the highest center unused.
boundaries = [
    (lower_center + upper_center) / 2
    for lower_center, upper_center in zip(sorted_cluster_centers, sorted_cluster_centers[1:])
]
edges = [-float('inf'), *boundaries, float('inf')]
class_intervals = {
    label: (edges[i], edges[i + 1])
    for i, label in enumerate(class_labels)
}

# Rank the KMeans cluster ids by center value so that the id with the
# smallest center maps to the first class label, and so on.
center_order = sorted(range(num_clusters), key=lambda cluster_id: cluster_centers[cluster_id, 0])
cluster_to_class = {cluster_id: class_labels[rank] for rank, cluster_id in enumerate(center_order)}

# Label each row from its fitted cluster id, so 'class' always agrees with
# kmeans.labels_ (labels_ has one entry per fitted row, in row order).
df_cleaned_scaled['class'] = [cluster_to_class[int(cluster_id)] for cluster_id in kmeans.labels_]

# Display the class intervals for each class label
for class_label, interval in class_intervals.items():
    print(f"{class_label}: {interval}")

# Display the updated DataFrame with class labels
print("Updated DataFrame with class labels:")
df_cleaned_scaled


## Evaluation

In [None]:
from sklearn.metrics import silhouette_score, completeness_score

# Score the 'class' assignment stored on df_cleaned_scaled.
score_column = df_cleaned_scaled[['score_scaled']]
assigned_classes = df_cleaned_scaled['class']

# Silhouette of the named classes over the 'score_scaled' values
silhouette_avg = silhouette_score(score_column, assigned_classes)

# Completeness of the raw KMeans labels, taking the named classes as the reference
completeness_avg = completeness_score(assigned_classes, kmeans.labels_)

# Display evaluation metrics
print(f"Silhouette Score: {silhouette_avg}")
print(f"Completeness Score: {completeness_avg}")


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Turn the continuous y_test / y_pred values currently in scope into 0/1
# labels and compare them as if they were classifier outputs.
# NOTE(review): y_test and y_pred are whatever an earlier cell last assigned — confirm
# they are the pair you mean to evaluate.
threshold = 0.5  # Cut-off: values >= threshold become 1, the rest 0

y_test_binary = (y_test >= threshold).astype(int)
y_pred_binary = (y_pred >= threshold).astype(int)

# Agreement between the binarised truth and the binarised predictions
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)
accuracy = accuracy_score(y_test_binary, y_pred_binary)

print("Confusion Matrix:", conf_matrix, sep="\n")
print("Accuracy:", accuracy)



In [None]:
# Evaluate the KMeans clustering using various internal validity metrics.
# The metrics must be computed on the data the model was fitted on, which is the
# 'score_scaled' column. The regression feature matrix X lives in another
# feature space and need not have one row per entry of kmeans.labels_.
kmeans_features = df_cleaned_scaled[['score_scaled']]

# Results are stored under kmeans_* names so they do not overwrite the
# sklearn.metrics functions of the same names imported in other cells.
kmeans_silhouette_score = metrics.silhouette_score(kmeans_features, kmeans.labels_)
kmeans_calinski_harabasz_score = metrics.calinski_harabasz_score(kmeans_features, kmeans.labels_)
kmeans_davies_bouldin_score = metrics.davies_bouldin_score(kmeans_features, kmeans.labels_)
inertia = kmeans.inertia_

print("Silhouette Score:", kmeans_silhouette_score)
print("Calinski-Harabasz Index:", kmeans_calinski_harabasz_score)
print("Davies-Bouldin Index:", kmeans_davies_bouldin_score)
print("Inertia:", inertia)

In [None]:
# Display the frame as the cell output (it includes the 'class' column assigned earlier)
df_cleaned_scaled

# Gaussian Mixture Model (GMM)

In [None]:
from sklearn.mixture import GaussianMixture

# Four-component Gaussian mixture on the 'score_scaled' column, seeded for repeatability
gmm_features = df_cleaned_scaled[['score_scaled']]
gmm = GaussianMixture(n_components=4, random_state=42)
gmm.fit(gmm_features)

# Component assignments for the fitted rows, then their silhouette score
gmm_assignments = gmm.predict(gmm_features)
gmm_silhouette_score = metrics.silhouette_score(gmm_features, gmm_assignments)

print("Silhouette Score for Gaussian Mixture Model:", gmm_silhouette_score)


# Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering on the 'score_scaled' column, using the same
# cluster count (num_clusters) as the KMeans cell
agglomerative_features = df_cleaned_scaled[['score_scaled']]
agglomerative = AgglomerativeClustering(n_clusters=num_clusters)
agglomerative.fit(agglomerative_features)

# Silhouette score of the resulting labels over the same column
agglomerative_silhouette_score = metrics.silhouette_score(agglomerative_features, agglomerative.labels_)

print("Silhouette Score for Agglomerative Clustering:", agglomerative_silhouette_score)


In [None]:
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score

# Compare KMeans, Agglomerative Clustering and GMM on 'score_scaled' using
# three internal validity metrics each.
score_features = df_cleaned_scaled[['score_scaled']]

# GMM exposes no labels_ attribute here, so ask it for the assignments once
gmm_labels = gmm.predict(score_features)

# KMeans
kmeans_davies_bouldin = davies_bouldin_score(score_features, kmeans.labels_)
kmeans_calinski_harabasz = calinski_harabasz_score(score_features, kmeans.labels_)
kmeans_silhouette = silhouette_score(score_features, kmeans.labels_)

# Agglomerative Clustering
agglomerative_davies_bouldin = davies_bouldin_score(score_features, agglomerative.labels_)
agglomerative_calinski_harabasz = calinski_harabasz_score(score_features, agglomerative.labels_)
agglomerative_silhouette = silhouette_score(score_features, agglomerative.labels_)

# GMM
gmm_davies_bouldin = davies_bouldin_score(score_features, gmm_labels)
gmm_calinski_harabasz = calinski_harabasz_score(score_features, gmm_labels)
gmm_silhouette = silhouette_score(score_features, gmm_labels)

# One report section per model, with a blank line between sections
metric_report = (
    ("KMeans:", kmeans_davies_bouldin, kmeans_calinski_harabasz, kmeans_silhouette),
    ("Agglomerative Clustering:", agglomerative_davies_bouldin, agglomerative_calinski_harabasz, agglomerative_silhouette),
    ("Gaussian Mixture Model (GMM):", gmm_davies_bouldin, gmm_calinski_harabasz, gmm_silhouette),
)
for position, (heading, db_index, ch_index, sil_value) in enumerate(metric_report):
    if position:
        print()
    print(heading)
    print("Davies-Bouldin Index:", db_index)
    print("Calinski-Harabasz Index:", ch_index)
    print("Silhouette Score:", sil_value)
