# Importing Libraries

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
import numpy as np # linear algebra
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the competition training data and preview its first rows.
TRAIN_CSV = '/kaggle/input/bluechip-summit-credit-worthiness-prediction/Train.csv'
train_df = pd.read_csv(TRAIN_CSV)

train_df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw training frame.
train_df.info()

In [None]:
# Summary statistics, transposed (one row per column) and rounded to 2 decimals.
train_df.describe().T.round(2)

In [None]:
# Columns treated as categorical for exploration ('Loan_Status' is deliberately left out).
cat_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']

# astype with a per-column mapping returns a new frame, so train_df itself is not modified.
train_df2 = train_df.astype({col: 'category' for col in cat_cols})

# Verify the changes
print("\nUpdated Data Types:")
print(train_df2.dtypes)

# EDA

In [None]:
# Plot 30-bin histograms for train_df2 on a 12x8 figure.
train_df2.hist(bins=30, figsize=(12, 8))
plt.show()

# Feature Engineering

In [None]:
# Inspect the distinct values present in the Total_Income column.
train_df2['Total_Income'].unique()

I will drop the 'Total_Income' feature because its distribution is not normal and it has only a few unique values, which makes it look more like a categorical variable; it also has little importance in our model.

In [None]:
# Derive household and affordability features on train_df2, then drop the raw
# columns they replace. The cell mutates train_df2, so re-run the cell that
# creates train_df2 before running this one a second time.

# Replace '3+' with '3' and convert to int
train_df2['Dependents'] = train_df2['Dependents'].replace('3+', '3').astype('int')

# Married was cast to category earlier; cast it to int so it can be compared numerically
# (assumes a numeric 0/1 coding — TODO confirm against the raw data).
train_df2['Married'] = train_df2['Married'].astype('int')

# Adults in the household: 2 when married, otherwise 1 (vectorised, no temporary column).
adults = np.where(train_df2['Married'] > 0, 2, 1)

# Family size = adults + dependents
train_df2['Family_Size'] = adults + train_df2['Dependents']

# 1 when the co-applicant reports a positive income, else 0 (missing values count as 0).
train_df2['Has_Coapplicant'] = (train_df2['CoapplicantIncome'] > 0).astype('int')

# LoanAmount is multiplied by 1,000 to express it in full currency units
# (presumably the raw column is in thousands — TODO confirm).
train_df2['Actual_LoanAmount'] = train_df2['LoanAmount'] * 1_000

# Combined income is the denominator of both ratios. A combined income of 0 would
# produce +/-inf, which the downstream imputer/scaler reject; mapping 0 to NaN
# lets the median imputer handle those rows instead.
combined_income = (train_df2['ApplicantIncome'] + train_df2['CoapplicantIncome']).replace(0, np.nan)

# Note: despite its name, the first ratio is loan amount divided by income.
train_df2['TotalIncome_LoanAmount_Ratio'] = train_df2['Actual_LoanAmount'] / combined_income
train_df2['FamilySize_TotalIncome_ratio'] = train_df2['Family_Size'] / combined_income

# Drop the raw columns that the engineered features replace
train_df2.drop(columns=['Married', 'Dependents', 'LoanAmount', 'Total_Income'], inplace=True)

In [None]:
# Preview the frame after feature engineering.
train_df2.head()

In [None]:
#train_df2['Education_Score'] = train_df2['Education'].map({1: 5, 0: 1}).astype('int')
#train_df2.info() 

In [None]:
# Applicant_Score is the sum of three per-column scores on a 1-5 scale:
#   Education:      0 (not graduate) -> 1, 1 (graduate) -> 5
#   Property_Area:  0 (rural) -> 1, 1 (semiurban) -> 3, 2 (urban) -> 5
#   Credit_History: 0 (does not meet guidelines) -> 1, 1 (meets guidelines) -> 5
score_maps = {
    'Education': {1: 5, 0: 1},
    'Property_Area': {0: 1, 1: 3, 2: 5},
    'Credit_History': {0: 1, 1: 5},
}

# Map each column to its integer score and add them up; no intermediate
# *_Score columns are kept on the frame.
train_df2['Applicant_Score'] = sum(
    train_df2[column].map(mapping).astype('int')
    for column, mapping in score_maps.items()
)

In [None]:
# Show five randomly chosen rows (no seed is set, so the rows differ between runs).
train_df2.sample(n=5)

In [None]:
# Dtypes and non-null counts after feature engineering.
train_df2.info()

In [None]:
# Columns to treat as categorical for modelling, including the engineered
# Family_Size and Has_Coapplicant ('Loan_Status' is deliberately left out).
cat_cols = ['Gender', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area', 'Family_Size', 'Has_Coapplicant']

# astype with a per-column mapping returns a new frame; train_df2 is left as it was.
train_df3 = train_df2.astype({col: 'category' for col in cat_cols})

# Verify the changes
print("\nUpdated Data Types:")
print(train_df3.dtypes)

**Correlation**

In [None]:
# Correlation is computed on float64/int64 columns only; category columns are excluded.
numeric_df = train_df3.select_dtypes(include=['float64', 'int64'])
corr_matrix = numeric_df.corr()

# Correlation of each numeric column with the target, strongest positive first.
loan_status_corr = corr_matrix["Loan_Status"]
loan_status_corr.sort_values(ascending=False)

# Transformation Pipelines

In [None]:
# The two identifier columns are removed before modelling.
id_columns = ['ID', 'Loan_ID']
train_df3 = train_df3.drop(columns=id_columns)

In [None]:
from sklearn.model_selection import train_test_split

TARGET = 'Loan_Status'


def _features_and_target(frame):
    """Return (features without the target column, copy of the target column)."""
    return frame.drop(columns=[TARGET]), frame[TARGET].copy()


# 80% train; the remaining 20% is halved into test and validation sets.
train_set, test_set = train_test_split(train_df3, test_size=0.2, random_state=42)
test_set2, val_set = train_test_split(test_set, test_size=0.5, random_state=42)

X_train, y_train = _features_and_target(train_set)
X_valid, y_valid = _features_and_target(val_set)
X_test, y_test = _features_and_target(test_set2)

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer

# Numeric columns: fill missing values with the median, then standardise.
num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                             StandardScaler())

# Categorical columns: fill missing values with the most frequent value, then
# encode each category as an integer. A category that was not present when the
# encoder was fitted (e.g. a rare Family_Size appearing only in validation,
# test or a CV fold) is encoded as -1 instead of raising an error.
# Alternative encoder: OneHotEncoder(handle_unknown="ignore")
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))

# Route columns to the two pipelines by dtype: numeric first, then object/category.
preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=['object', 'category'])),
)

In [None]:
# Fit the preprocessing on the training features and wrap the result in a
# DataFrame (transformer output names as columns, original row index) for inspection.
X_train_prepared = preprocessing.fit_transform(X_train)
prepared_columns = preprocessing.get_feature_names_out()

X_train_prepared_fr = pd.DataFrame(X_train_prepared, columns=prepared_columns, index=X_train.index)
X_train_prepared_fr.head(2)

In [None]:
#feature_names = preprocessing.get_feature_names_out()
#feature_names

# Classification Models

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: shared preprocessing followed by a default logistic regression.
# Options noted but not applied: max_iter=1000, class_weight='balanced', solver='liblinear'.
log_reg = LogisticRegression(random_state=42)
lr_clf = make_pipeline(preprocessing, log_reg)
lr_clf.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score

# cross_val_score clones the pipeline and refits it on every fold, so it never
# uses the model fitted above. Running it on the small validation split would
# train throw-away models on ~80% of that split; cross-validate on the
# training data instead.
lr_accuracy = cross_val_score(lr_clf, X_train, y_train,
                              scoring="accuracy", cv=5)

# The hold-out validation split is scored with the pipeline already fitted on X_train.
lr_valid_accuracy = lr_clf.score(X_valid, y_valid)
print(f"Validation accuracy (fitted model): {lr_valid_accuracy:.3f}")

lr_accuracy

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the fitted logistic-regression pipeline on the test split.
test_pred = lr_clf.predict(X_test)
conf_matrix = confusion_matrix(y_test, test_pred)

lr_test_report = classification_report(y_test, test_pred)
print(lr_test_report)

In [None]:
# Confusion matrix of the logistic regression on the test split (true labels vs predictions).
conf_matrix

In [None]:
# Same evaluation on the training split, to compare against the test-split report.
train_pred = lr_clf.predict(X_train)
conf_matrix2 = confusion_matrix(y_train, train_pred)

lr_train_report = classification_report(y_train, train_pred)
print(lr_train_report)

In [None]:
# Display whatever conf_matrix2 currently holds; a later cell reassigns this name,
# so the value shown depends on execution order.
conf_matrix2

The Logistic Regression model is underfitting

**SVC**

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with a polynomial kernel behind the shared preprocessing.
# Options noted but not applied: gamma='auto', class_weight='balanced', kernel='sigmoid'.
poly_svc = SVC(random_state=42, kernel='poly')
svc_clf = make_pipeline(preprocessing, poly_svc)
svc_clf.fit(X_train, y_train)

In [None]:
# cross_val_score clones and refits the pipeline on each fold, so it is run on
# the training data; running it on the validation split would only train
# throw-away models on that small split and ignore the model fitted above.
svc_accuracy = cross_val_score(svc_clf, X_train, y_train,
                               scoring="accuracy", cv=5)

# The hold-out validation split is scored with the pipeline already fitted on X_train.
svc_valid_accuracy = svc_clf.score(X_valid, y_valid)
print(f"Validation accuracy (fitted model): {svc_valid_accuracy:.3f}")

svc_accuracy

In [None]:
# Evaluate the fitted SVC pipeline on the test split.
# Note: this assignment replaces any value conf_matrix2 held before.
test_pred2 = svc_clf.predict(X_test)
conf_matrix2 = confusion_matrix(y_test, test_pred2)

svc_test_report = classification_report(y_test, test_pred2)
print(svc_test_report)

**RandomForestClassifier**

In [None]:
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import GridSearchCV

#param_grid = {
    #'max_depth': [5, 10, 15, 20, None],
    #'min_samples_split': [2, 5, 10],
    #'min_samples_leaf': [1, 2, 4],
    #'max_features': ['auto', 'sqrt', 0.7],
    #'n_estimators': [100, 200, 300]
#}

#grid_search = GridSearchCV(
    #estimator=RandomForestClassifier(random_state=42),
    #param_grid=param_grid,
    #cv=5,
    #scoring='accuracy'
#)
#grid_search.fit(X_train2, y_train2)


#best_params = grid_search.best_params_
#print("Best Parameters:", best_params)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (200 trees, 6 candidate features per split) behind the shared preprocessing.
# Options noted but not applied: n_estimators=900, min_samples_leaf, class_weight='balanced'.
forest = RandomForestClassifier(random_state=42, max_features=6, n_estimators=200)
rf_clf = make_pipeline(preprocessing, forest)
rf_clf.fit(X_train, y_train)

In [None]:
# cross_val_score clones and refits the pipeline on each fold, so it is run on
# the training data; running it on the validation split would only train
# throw-away models on that small split and ignore the model fitted above.
rf_accuracy = cross_val_score(rf_clf, X_train, y_train,
                              scoring="accuracy", cv=5)

# The hold-out validation split is scored with the pipeline already fitted on X_train.
rf_valid_accuracy = rf_clf.score(X_valid, y_valid)
print(f"Validation accuracy (fitted model): {rf_valid_accuracy:.3f}")

rf_accuracy

In [None]:
# Evaluate the fitted random-forest pipeline on the test split.
test_pred3 = rf_clf.predict(X_test)
conf_matrix3 = confusion_matrix(y_test, test_pred3)

rf_test_report = classification_report(y_test, test_pred3)
print(rf_test_report)

In [None]:
# Confusion matrix of the random forest on the test split.
print(conf_matrix3)

Feature Importance 

In [None]:
# The last pipeline step is the fitted forest; print its importances rounded to 4 decimals.
rf = rf_clf.steps[-1][1]
feature_importances = rf.feature_importances_
print(np.round(feature_importances, 4))

In [None]:
# Fitted steps of the random-forest pipeline: the column transformer and the forest.
fitted_preprocessor = rf_clf.steps[0][1]
rf = rf_clf.steps[-1][1]
feature_importances = rf.feature_importances_

# Take the output column names from the fitted transformer instead of rebuilding
# them by hand, so names and importances stay aligned even if the encoder is
# changed to one that emits several columns per feature. The "<transformer>__"
# prefix is stripped for readability.
feature_names = [
    name.split('__', 1)[-1]
    for name in fitted_preprocessor.get_feature_names_out()
]

# One row per feature, most important first.
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values('Importance', ascending=False)

# Percentages are computed from the unrounded importances; rounding is applied
# afterwards and only affects what is displayed.
importance_percentages = (
    feature_importance_df['Importance'] / feature_importance_df['Importance'].sum() * 100
)
feature_importance_df['Importance'] = feature_importance_df['Importance'].round(3)

# Print the results
print("Feature Importance Scores:")
print(feature_importance_df)

# Create a bar plot
plt.figure(figsize=(10, 6))
plt.bar(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xticks(rotation=45, ha='right')
plt.title('Feature Importance Scores')
plt.tight_layout()
plt.show()

# Print percentages
print("\nFeature Importance Percentages:")
for feature, percentage in zip(feature_importance_df['Feature'], importance_percentages):
    print(f"{feature:<30} {percentage:.1f}%")

**XGBClassifier**

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees (500 rounds, learning rate 0.2) behind the shared preprocessing.
booster = XGBClassifier(random_state=42, n_estimators=500, learning_rate=0.2)
xgb_clf = make_pipeline(preprocessing, booster)
xgb_clf.fit(X_train, y_train)

In [None]:
# Second XGBoost model trained outside the pipeline so that already-transformed
# data can be passed as eval_set for early stopping.

# Fit the shared column transformer on the training features.
# NOTE(review): presumably `preprocessor` is the same object as `preprocessing`
# (scikit-learn's fit conventionally returns self) — confirm.
preprocessor = preprocessing.fit(X_train)

# Transform both training and validation data with the transformer fitted on X_train
X_train_transformed = preprocessor.transform(X_train)
X_valid_transformed = preprocessor.transform(X_valid)

# Up to 1,000 rounds at learning rate 0.05, with early stopping after 5 rounds.
# NOTE(review): recent xgboost releases expect early_stopping_rounds on the
# constructor rather than in fit() — confirm against the installed version.
# NOTE(review): the validation split is used here for early stopping, so later
# scores of xgb_clf2 on that same split will be optimistic.
xgb_clf2 = XGBClassifier(random_state=42, n_estimators=1_000, learning_rate=0.05)
xgb_clf2.fit(
    X_train_transformed, 
    y_train,
    early_stopping_rounds=5,
    eval_set=[(X_train_transformed, y_train), (X_valid_transformed, y_valid)],
    verbose=False
)

# Alternative kept for reference: wrap the model in the pipeline again
#final_pipeline = make_pipeline(preprocessing, xgb_clf)

In [None]:
# cross_val_score clones and refits the pipeline on each fold, so it is run on
# the training data; running it on the validation split would only train
# throw-away models on that small split and ignore the model fitted above.
xgb_accuracy = cross_val_score(xgb_clf, X_train, y_train,
                               scoring="accuracy", cv=5)

# The hold-out validation split is scored with the pipeline already fitted on X_train.
xgb_valid_accuracy = xgb_clf.score(X_valid, y_valid)
print(f"Validation accuracy (fitted model): {xgb_valid_accuracy:.3f}")

xgb_accuracy

In [None]:
# cross_val_score clones and refits the estimator on each fold, so it is run on
# the (already transformed) training data rather than on the small validation
# split. The transformer was fitted on all of X_train, so fold scores are
# slightly optimistic with respect to imputation/scaling statistics.
xgb2_accuracy = cross_val_score(xgb_clf2, X_train_transformed, y_train,
                                scoring="accuracy", cv=5)

# Score of the fitted model on the validation split. This split was also passed
# as eval_set when xgb_clf2 was fitted, so treat the number as an upper bound.
xgb2_valid_accuracy = xgb_clf2.score(X_valid_transformed, y_valid)
print(f"Validation accuracy (fitted model): {xgb2_valid_accuracy:.3f}")

xgb2_accuracy

In [None]:
# Evaluate the fitted XGBoost pipeline on the test split.
test_pred4 = xgb_clf.predict(X_test)
conf_matrix4 = confusion_matrix(y_test, test_pred4)

xgb_test_report = classification_report(y_test, test_pred4)
print(xgb_test_report)

In [None]:
# Confusion matrix of the XGBoost pipeline on the test split.
print(conf_matrix4)

In [None]:
# Same evaluation on the training split, to compare against the test-split report.
train_pred5 = xgb_clf.predict(X_train)
conf_matrix5 = confusion_matrix(y_train, train_pred5)

xgb_train_report = classification_report(y_train, train_pred5)
print(xgb_train_report)

The XGB Classifier is overfitting the training set

In [None]:
def analyze_prediction_confidence(classifier, X, y_true, model_name="Classifier"):
    """Summarise and plot how confident a classifier is in its predictions.

    Confidence is the largest class probability returned by ``predict_proba``
    for each sample. The function prints the mean and median confidence and a
    ``describe()`` table split by whether the prediction was correct, then
    draws a histogram and a box plot of confidence by correctness.

    Parameters
    ----------
    classifier : estimator object
        Fitted classifier exposing ``predict_proba`` and ``predict``.
    X : array-like
        Input features.
    y_true : array-like
        True labels, compared element-wise with the predictions.
    model_name : str
        Label used in the printed header and in the plot titles.

    Returns
    -------
    pandas.DataFrame
        One row per sample with columns ``True_Label``, ``Predicted_Label``,
        ``Confidence`` and ``Correct``.
    """
    # Per-sample confidence = highest predicted class probability.
    class_probabilities = classifier.predict_proba(X)
    top_probability = np.max(class_probabilities, axis=1)

    predicted = classifier.predict(X)

    results_df = pd.DataFrame({
        'True_Label': y_true,
        'Predicted_Label': predicted,
        'Confidence': top_probability,
        'Correct': y_true == predicted,
    })

    # Text summary.
    header_lines = [
        f"\n{model_name} Confidence Analysis:",
        "--------------------------------",
        f"Average Confidence: {top_probability.mean():.3f}",
        f"Median Confidence: {np.median(top_probability):.3f}",
        "\nConfidence for Correct vs Incorrect Predictions:",
    ]
    print("\n".join(header_lines))
    print(results_df.groupby('Correct')['Confidence'].describe())

    # Two side-by-side panels: distribution (left) and box plot (right).
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    hist_ax, box_ax = axes

    sns.histplot(data=results_df, x='Confidence', hue='Correct',
                 bins=30, ax=hist_ax)
    hist_ax.set_title(f'{model_name} Confidence Distribution')
    hist_ax.set_xlabel('Confidence Score')
    hist_ax.set_ylabel('Count')

    sns.boxplot(data=results_df, x='Correct', y='Confidence', ax=box_ax)
    box_ax.set_title(f'{model_name} Confidence by Prediction Correctness')
    box_ax.set_xlabel('Prediction Correct')
    box_ax.set_ylabel('Confidence Score')

    plt.tight_layout()
    plt.show()

    return results_df


# Confidence analysis of the XGBoost pipeline on the validation split.
analyze_prediction_confidence(
    classifier=xgb_clf,
    X=X_valid,
    y_true=y_valid,
    model_name="XGBClassifier",
)

In [None]:
# Same analysis on the training split, for comparison with the validation split.
analyze_prediction_confidence(
    classifier=xgb_clf,
    X=X_train,
    y_true=y_train,
    model_name="XGBClassifier",
)

Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier


def _with_preprocessing(model):
    """Return a Pipeline that applies the shared ``preprocessing`` step before ``model``."""
    return Pipeline([
        ('preprocessor', preprocessing),
        ('classifier', model),
    ])


# One preprocessing + classifier pipeline per ensemble member.
rf_pipeline = _with_preprocessing(
    RandomForestClassifier(random_state=42, max_depth=10, n_estimators=300))
lr_pipeline = _with_preprocessing(
    LogisticRegression(random_state=42))
xgb_pipeline = _with_preprocessing(
    XGBClassifier(random_state=42, n_estimators=500, learning_rate=0.2))

# Soft voting over the three pipelines. Members are unweighted;
# pass e.g. weights=[2, 1, 1] to favour one of them.
ensemble_members = [
    ('rf', rf_pipeline),
    ('lr', lr_pipeline),
    ('xgb', xgb_pipeline),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

voting_clf.fit(X_train, y_train)

In [None]:
# Validation accuracy of each fitted ensemble member on its own.
fitted_members = voting_clf.named_estimators_
for member_name in fitted_members:
    member_score = fitted_members[member_name].score(X_valid, y_valid)
    print(member_name, "=", member_score)

In [None]:
# Score of the voting ensemble on the test split.
voting_clf.score(X_test, y_test)

In [None]:
# Confusion matrix of the voting ensemble on the test split.
test_pred6 = voting_clf.predict(X_test)
voting_conf_matrix = confusion_matrix(y_test, test_pred6)
print(voting_conf_matrix)

In [None]:
#from sklearn.ensemble import VotingClassifier
#voting_clf2 = VotingClassifier(
    #estimators=[
        #('rf', RandomForestClassifier(random_state=42, max_depth=10, n_estimators=300)),
        #('svc', SVC(random_state=42, probability=True, C=10)),
        #('xgb', XGBClassifier(random_state=42, n_estimators=500, learning_rate=0.1))
    #],
    #voting='soft',
    #weights=[2, 1, 1]  # Favor XGB if it's performing better
#)
#voting_clf2.fit(X_train, y_train)

# Submission

In [None]:
# Load the competition test data (the rows to predict) and preview it.
TEST_CSV = '/kaggle/input/bluechip-summit-credit-worthiness-prediction/Test.csv'
test_df = pd.read_csv(TEST_CSV)

test_df.head()

In [None]:
# Work on a separate frame without the two identifier columns; test_df stays untouched.
test_df1 = test_df.drop(columns=['Loan_ID', 'ID'])

In [None]:
# Apply to test_df1 the same feature engineering that was applied to the
# training frame, then drop the raw columns the new features replace. The cell
# mutates test_df1, so re-run the cell that creates test_df1 before running
# this one a second time.

# Replace '3+' with '3' and convert to int
test_df1['Dependents'] = test_df1['Dependents'].replace('3+', '3').astype('int')

# Cast Married to int so it can be compared numerically
# (assumes a numeric 0/1 coding — TODO confirm against the raw data).
test_df1['Married'] = test_df1['Married'].astype('int')

# Adults in the household: 2 when married, otherwise 1 (vectorised, no temporary column).
adults = np.where(test_df1['Married'] > 0, 2, 1)

# Family size = adults + dependents
test_df1['Family_Size'] = adults + test_df1['Dependents']

# 1 when the co-applicant reports a positive income, else 0 (missing values count as 0).
test_df1['Has_Coapplicant'] = (test_df1['CoapplicantIncome'] > 0).astype('int')

# LoanAmount is multiplied by 1,000 to express it in full currency units
# (presumably the raw column is in thousands — TODO confirm).
test_df1['Actual_LoanAmount'] = test_df1['LoanAmount'] * 1_000

# Combined income is the denominator of both ratios. A combined income of 0 would
# produce +/-inf, which the model's imputer/scaler reject; mapping 0 to NaN lets
# the median imputer handle those rows instead.
combined_income = (test_df1['ApplicantIncome'] + test_df1['CoapplicantIncome']).replace(0, np.nan)

# Note: despite its name, the first ratio is loan amount divided by income.
test_df1['TotalIncome_LoanAmount_Ratio'] = test_df1['Actual_LoanAmount'] / combined_income
test_df1['FamilySize_TotalIncome_ratio'] = test_df1['Family_Size'] / combined_income

# Drop the raw columns that the engineered features replace
test_df1.drop(columns=['Married', 'Dependents', 'LoanAmount', 'Total_Income'], inplace=True)

In [None]:
# Applicant_Score is the sum of three per-column scores on a 1-5 scale:
#   Education:      0 -> 1, 1 -> 5
#   Property_Area:  0 (rural) -> 1, 1 (semiurban) -> 3, 2 (urban) -> 5
#   Credit_History: 0 -> 1, 1 -> 5
score_maps = {
    'Education': {1: 5, 0: 1},
    'Property_Area': {0: 1, 1: 3, 2: 5},
    'Credit_History': {0: 1, 1: 5},
}

# Map each column to its integer score and add them up; no intermediate
# *_Score columns are kept on the frame.
test_df1['Applicant_Score'] = sum(
    test_df1[column].map(mapping).astype('int')
    for column, mapping in score_maps.items()
)

In [None]:
# Preview the engineered test frame.
test_df1.head()

In [None]:
# Cast the same columns to category on the test frame as on the training frame,
# so the dtype-based column selectors in the preprocessing route them identically.
test_df2 = test_df1.copy()

cat_cols = ['Gender', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area', 'Family_Size', 'Has_Coapplicant']

# Convert each categorical column while preserving the DataFrame structure
for col in cat_cols:
    test_df2[col] = test_df2[col].astype('category')

# Verify the changes on the frame that was just converted
# (previously this printed train_df2.dtypes, which says nothing about test_df2).
print("\nUpdated Data Types:")
print(test_df2.dtypes)

In [None]:
# Load the sample submission, which serves as the template for the output file.
SAMPLE_SUBMISSION_CSV = '/kaggle/input/bluechip-summit-credit-worthiness-prediction/Sample Submission.csv'
submission = pd.read_csv(SAMPLE_SUBMISSION_CSV)
submission.head()

In [None]:
# Fill the target column with predictions from the fitted random-forest pipeline.
# NOTE(review): assumes the rows of test_df2 are in the same order as the sample submission — confirm.
submission['Loan_Status'] = rf_clf.predict(test_df2)

In [None]:
# Preview the filled-in submission.
submission.head()

In [None]:
# Write the submission to submission.csv without the DataFrame index.
submission.to_csv('submission.csv', index=False)

Thank you for exploring this notebook! If you enjoyed it and found value in the content, please consider giving it an upvote ⬆️. Your support means a lot to me and encourages me to create more helpful notebooks.