In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [9]:
# Replace with your actual paths or use uploaded files in Colab
train_df = pd.read_csv("/content/Train_Dataset.csv")
test_df = pd.read_csv("/content/Test_Dataset.csv")

  train_df = pd.read_csv("/content/Train_Dataset.csv")
  test_df = pd.read_csv("/content/Test_Dataset.csv")


In [10]:
# Drop columns with more than 50% missing data
cols_to_drop = ['Own_House_Age', 'Score_Source_1', 'Social_Circle_Default']
train_df.drop(columns=cols_to_drop, inplace=True)
test_df.drop(columns=cols_to_drop, inplace=True)

In [11]:
numeric_cols = ['Client_Income', 'Credit_Amount', 'Loan_Annuity',
                'Population_Region_Relative', 'Age_Days', 'Employed_Days',
                'Registration_Days', 'ID_Days', 'Score_Source_3', 'Score_Source_2']

for col in numeric_cols:
    train_df[col] = pd.to_numeric(train_df[col].astype(str).str.replace(",", "").str.strip(), errors='coerce')
    test_df[col] = pd.to_numeric(test_df[col].astype(str).str.replace(",", "").str.strip(), errors='coerce')

In [12]:
# Separate numerical and categorical columns
numerical = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical.remove('Default')
categorical = train_df.select_dtypes(include='object').columns.tolist()

# Median imputation for numerical, mode for categorical
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

train_df[numerical] = num_imputer.fit_transform(train_df[numerical])
test_df[numerical] = num_imputer.transform(test_df[numerical])

train_df[categorical] = cat_imputer.fit_transform(train_df[categorical])
test_df[categorical] = cat_imputer.transform(test_df[categorical])

In [13]:
encoders = {}
for col in categorical:
    le = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
    le.fit(combined)
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    encoders[col] = le

In [14]:
X = train_df.drop("Default", axis=1)
y = train_df["Default"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(test_df)

In [16]:
rf = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
rf.fit(X_train_scaled, y_train)

In [17]:
y_val_pred = rf.predict(X_val_scaled)
y_val_proba = rf.predict_proba(X_val_scaled)[:, 1]

print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))
print("\nROC AUC Score:", roc_auc_score(y_val, y_val_proba))

Confusion Matrix:
 [[17795  4608]
 [  947  1022]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.79      0.86     22403
           1       0.18      0.52      0.27      1969

    accuracy                           0.77     24372
   macro avg       0.57      0.66      0.57     24372
weighted avg       0.89      0.77      0.82     24372


ROC AUC Score: 0.7336492720595558


In [18]:
# Predict on test data
test_preds = rf.predict(X_test_scaled)

# Create submission file
submission = pd.DataFrame({
    "UniqueID": test_df["UniqueID"],  # make sure this column exists
    "Default": test_preds
})

# Save to CSV
submission.to_csv("vehicle_loan_default_predictions.csv", index=False)
print("✅ Submission saved as 'vehicle_loan_default_predictions.csv'")

KeyError: 'UniqueID'

In [19]:
# Check for identifier column
possible_id_columns = [col for col in test_df.columns if 'id' in col.lower()]
print("Possible ID columns in test data:", possible_id_columns)

# Use the first ID-like column if found
if possible_id_columns:
    id_column = possible_id_columns[0]
else:
    id_column = None
    print("⚠️ No ID column found. Using index instead.")

# Prepare submission
submission = pd.DataFrame({
    id_column if id_column else "Index": test_df[id_column] if id_column else test_df.index,
    "Default": test_preds
})

# Rename column to 'UniqueID' if needed
if id_column and id_column.lower() != "uniqueid":
    submission.rename(columns={id_column: "UniqueID"}, inplace=True)

# Save
submission.to_csv("vehicle_loan_default_predictions.csv", index=False)
print("✅ Submission saved as 'vehicle_loan_default_predictions.csv'")

Possible ID columns in test data: ['ID', 'ID_Days']
✅ Submission saved as 'vehicle_loan_default_predictions.csv'


In [23]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.47.0-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.47.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m92.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInst

In [24]:
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import io

st.set_page_config(page_title="NBFC Loan Default Predictor", layout="wide")
st.title("🚗 NBFC Vehicle Loan Default Prediction App")

# Step 1: Upload files
train_file = st.file_uploader("Upload Train_Dataset.csv", type="csv")
test_file = st.file_uploader("Upload Test_Dataset.csv", type="csv")

if train_file and test_file:
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)
    st.success("✅ Files uploaded successfully!")

    if st.button("🚀 Train Model and Predict"):

        # Drop high-missing columns
        drop_cols = ['Own_House_Age', 'Score_Source_1', 'Social_Circle_Default']
        for col in drop_cols:
            if col in train_df.columns: train_df.drop(columns=col, inplace=True)
            if col in test_df.columns: test_df.drop(columns=col, inplace=True)

        # Convert numeric-looking object columns
        numeric_cols = ['Client_Income', 'Credit_Amount', 'Loan_Annuity',
                        'Population_Region_Relative', 'Age_Days', 'Employed_Days',
                        'Registration_Days', 'ID_Days', 'Score_Source_3', 'Score_Source_2']

        for col in numeric_cols:
            train_df[col] = pd.to_numeric(train_df[col].astype(str).str.replace(",", ""), errors='coerce')
            test_df[col] = pd.to_numeric(test_df[col].astype(str).str.replace(",", ""), errors='coerce')

        # Impute missing values
        numerical = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
        if 'Default' in numerical: numerical.remove('Default')
        categorical = train_df.select_dtypes(include='object').columns.tolist()

        num_imputer = SimpleImputer(strategy='median')
        cat_imputer = SimpleImputer(strategy='most_frequent')

        train_df[numerical] = num_imputer.fit_transform(train_df[numerical])
        test_df[numerical] = num_imputer.transform(test_df[numerical])

        train_df[categorical] = cat_imputer.fit_transform(train_df[categorical])
        test_df[categorical] = cat_imputer.transform(test_df[categorical])

        # Encode categoricals
        encoders = {}
        for col in categorical:
            le = LabelEncoder()
            combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
            le.fit(combined)
            train_df[col] = le.transform(train_df[col].astype(str))
            test_df[col] = le.transform(test_df[col].astype(str))
            encoders[col] = le

        # Split features and labels
        X = train_df.drop("Default", axis=1)
        y = train_df["Default"]

        # Train-test split
        from sklearn.model_selection import train_test_split
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

        # Scaling
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        X_test_scaled = scaler.transform(test_df)

        # Train Random Forest
        model = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
        model.fit(X_train_scaled, y_train)

        # Evaluate
        y_pred = model.predict(X_val_scaled)
        y_proba = model.predict_proba(X_val_scaled)[:, 1]

        st.subheader("📊 Evaluation on Validation Set")
        st.text("Confusion Matrix:")
        st.text(confusion_matrix(y_val, y_pred))
        st.text("\nClassification Report:")
        st.text(classification_report(y_val, y_pred))
        st.text(f"ROC AUC Score: {roc_auc_score(y_val, y_proba):.4f}")

        # Predict on test
        test_preds = model.predict(X_test_scaled)

        # Find ID column for submission
        possible_ids = [col for col in test_df.columns if 'id' in col.lower()]
        id_col = possible_ids[0] if possible_ids else None

        submission = pd.DataFrame({
            "UniqueID": test_df[id_col] if id_col else test_df.index,
            "Default": test_preds
        })

        csv_buffer = io.StringIO()
        submission.to_csv(csv_buffer, index=False)
        st.download_button("📥 Download Predictions", data=csv_buffer.getvalue(), file_name="vehicle_loan_default_predictions.csv", mime='text/csv')


2025-07-21 13:06:32.450 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


In [30]:
%%writefile my_app.py

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import io

st.set_page_config(page_title="NBFC Loan Default Predictor", layout="wide")
st.title("NBFC Vehicle Loan Default Prediction App")

# Step 1: Upload files
train_file = st.file_uploader("Upload Train_Dataset.csv", type="csv")
test_file = st.file_uploader("Upload Test_Dataset.csv", type="csv")

if train_file and test_file:
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)
    st.success("✅Files uploaded successfully!")

    if st.button("🚀Train Model and Predict"):

        # Drop high-missing columns
        drop_cols = ['Own_House_Age', 'Score_Source_1', 'Social_Circle_Default']
        for col in drop_cols:
            if col in train_df.columns: train_df.drop(columns=col, inplace=True)
            if col in test_df.columns: test_df.drop(columns=col, inplace=True)

        # Convert numeric-looking object columns
        numeric_cols = ['Client_Income', 'Credit_Amount', 'Loan_Annuity',
                        'Population_Region_Relative', 'Age_Days', 'Employed_Days',
                        'Registration_Days', 'ID_Days', 'Score_Source_3', 'Score_Source_2']

        for col in numeric_cols:
            train_df[col] = pd.to_numeric(train_df[col].astype(str).str.replace(",", ""), errors='coerce')
            test_df[col] = pd.to_numeric(test_df[col].astype(str).str.replace(",", ""), errors='coerce')

        # Impute missing values
        numerical = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
        if 'Default' in numerical: numerical.remove('Default')
        categorical = train_df.select_dtypes(include='object').columns.tolist()

        num_imputer = SimpleImputer(strategy='median')
        cat_imputer = SimpleImputer(strategy='most_frequent')

        train_df[numerical] = num_imputer.fit_transform(train_df[numerical])
        test_df[numerical] = num_imputer.transform(test_df[numerical])

        train_df[categorical] = cat_imputer.fit_transform(train_df[categorical])
        test_df[categorical] = cat_imputer.transform(test_df[categorical])

        # Encode categoricals
        encoders = {}
        for col in categorical:
            le = LabelEncoder()
            combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
            le.fit(combined)
            train_df[col] = le.transform(train_df[col].astype(str))
            test_df[col] = le.transform(test_df[col].astype(str))
            encoders[col] = le

        # Split features and labels
        X = train_df.drop("Default", axis=1)
        y = train_df["Default"]

        # Train-test split
        from sklearn.model_selection import train_test_split
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

        # Scaling
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        X_test_scaled = scaler.transform(test_df)

        # Train Random Forest
        model = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
        model.fit(X_train_scaled, y_train)

        # Evaluate
        y_pred = model.predict(X_val_scaled)
        y_proba = model.predict_proba(X_val_scaled)[:, 1]

        st.subheader("📊Evaluation on Validation Set")
        st.text("Confusion Matrix:")
        st.text(confusion_matrix(y_val, y_pred))
        st.text("\nClassification Report:")
        st.text(classification_report(y_val, y_pred))
        st.text(f"ROC AUC Score: {roc_auc_score(y_val, y_proba):.4f}")

        # Predict on test
        test_preds = model.predict(X_test_scaled)

        # Find ID column for submission
        possible_ids = [col for col in test_df.columns if 'id' in col.lower()]
        id_col = possible_ids[0] if possible_ids else None

        submission = pd.DataFrame({
            "UniqueID": test_df[id_col] if id_col else test_df.index,
            "Default": test_preds
        })

        csv_buffer = io.StringIO()
        submission.to_csv(csv_buffer, index=False)
        st.download_button("📥Download Predictions", data=csv_buffer.getvalue(), file_name="vehicle_loan_default_predictions.csv", mime='text/csv')


Overwriting my_app.py


In [31]:
!wget -q -O - ipv4.icanhazip.com

34.143.172.9


In [32]:
!npm install localtunnel

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K[1mnpm[22m [33mwarn[39m [94mdeprecated[39m debug@4.1.1: Debug versions >=3.2.0 <3.2.7 || >=4 <4.3.1 have a low-severity ReDos regression when used in a Node.js environment. It is recommended you upgrade to 3.2.7 or 4.3.1. (https://github.com/visionmedia/debug/issues/797)
[1G[0K⠙[1G[0K[1mnpm[22m [33mwarn[39m [94mdeprecated[39m axios@0.19.0: Critical security vulnerability fixed in v0.21.1. For more information, see https://github.com/axios/axios/pull/3410
[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G

In [33]:
!npm audit fix --force

[1mnpm[22m [33mwarn[39m [94musing --force[39m Recommended protections disabled.
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K[1mnpm[22m [33mwarn[39m [94maudit[39m Updating localtunnel to 2.0.2, which is a SemVer major change.
[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K
added 5 packages, removed 42 packages, changed 15 packages, and audited 23 packages in 2s
[1G[0K⠼[1G[0K
[1G[0K⠼[1G[0K3 packages are looking for funding
[1G[0K⠼[1G[0K  run `npm fund` for details
[1G[0K⠼[1G[0K
[1m# npm audit report[22m

[1maxios[22m  <=0.29.0
Severity: [31m[1mhigh[22m[39m
[1mAxios Cross-Site Request Forgery Vulnerability[22m - https://github.com/advisories/GHSA-wf5p-g6vw-rhxx
[1maxios Requests Vulnerable To Possible SSRF and Credential Leakage via Absolute URL[22m - https://github.com/advisories/GHSA-jr5f-v2jv-69x6
[33m[1mfix available[22m[39m via `npm audit fix --force`
Will install localtunnel@1.8.3

In [34]:
! streamlit run my_app.py & npx localtunnel --port 8501

[1G[0K
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
⠙[1G[0K⠹[1G[0K[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.143.172.9:8501[0m
[0m
your url is: https://lemon-hats-visit.loca.lt
  train_df = pd.read_csv(train_file)
  test_df = pd.read_csv(test_file)
  train_df = pd.read_csv(train_file)
  test_df = pd.read_csv(test_file)
[34m  Stopping...[0m
^C


In [35]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split

# Streamlit config
st.set_page_config(page_title="NBFC Loan Default Predictor", layout="wide")
st.title("🚗 NBFC Vehicle Loan Default Prediction App")

# File upload section
train_file = st.file_uploader("📤 Upload Train_Dataset.csv", type="csv")
test_file = st.file_uploader("📤 Upload Test_Dataset.csv", type="csv")

# Run pipeline if files uploaded
if train_file and test_file:
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)
    st.success("✅ Files uploaded successfully!")

    if st.button("🚀 Train Model and Predict"):

        # Drop high-missing columns
        drop_cols = ['Own_House_Age', 'Score_Source_1', 'Social_Circle_Default']
        for col in drop_cols:
            if col in train_df.columns: train_df.drop(columns=col, inplace=True)
            if col in test_df.columns: test_df.drop(columns=col, inplace=True)

        # Convert numeric-like columns
        numeric_cols = ['Client_Income', 'Credit_Amount', 'Loan_Annuity',
                        'Population_Region_Relative', 'Age_Days', 'Employed_Days',
                        'Registration_Days', 'ID_Days', 'Score_Source_3', 'Score_Source_2']
        for col in numeric_cols:
            train_df[col] = pd.to_numeric(train_df[col].astype(str).str.replace(",", ""), errors='coerce')
            test_df[col] = pd.to_numeric(test_df[col].astype(str).str.replace(",", ""), errors='coerce')

        # Handle missing values
        numerical = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
        if 'Default' in numerical: numerical.remove('Default')
        categorical = train_df.select_dtypes(include='object').columns.tolist()

        num_imputer = SimpleImputer(strategy='median')
        cat_imputer = SimpleImputer(strategy='most_frequent')

        train_df[numerical] = num_imputer.fit_transform(train_df[numerical])
        test_df[numerical] = num_imputer.transform(test_df[numerical])
        train_df[categorical] = cat_imputer.fit_transform(train_df[categorical])
        test_df[categorical] = cat_imputer.transform(test_df[categorical])

        # Encode categorical features
        encoders = {}
        for col in categorical:
            le = LabelEncoder()
            all_vals = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
            le.fit(all_vals)
            train_df[col] = le.transform(train_df[col].astype(str))
            test_df[col] = le.transform(test_df[col].astype(str))
            encoders[col] = le

        # Split train data
        X = train_df.drop("Default", axis=1)
        y = train_df["Default"]
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

        # Scale
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        X_test_scaled = scaler.transform(test_df)

        # Train Random Forest
        model = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
        model.fit(X_train_scaled, y_train)

        # Evaluation
        y_pred = model.predict(X_val_scaled)
        y_proba = model.predict_proba(X_val_scaled)[:, 1]
        cm = confusion_matrix(y_val, y_pred)
        report = classification_report(y_val, y_pred, output_dict=True)
        roc_auc = roc_auc_score(y_val, y_proba)

        # 📊 Confusion Matrix
        st.subheader("📊 Model Evaluation on Validation Set")
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No Default", "Default"], yticklabels=["No Default", "Default"], ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        ax.set_title("Confusion Matrix")
        st.pyplot(fig)

        # 📈 Classification Report
        st.markdown("### 🔍 Classification Report Summary")
        metrics_df = pd.DataFrame(report).transpose()
        st.dataframe(metrics_df.style.format({
            "precision": "{:.2f}",
            "recall": "{:.2f}",
            "f1-score": "{:.2f}",
            "support": "{:.0f}"
        }).highlight_max(axis=0, color='lightgreen'))

        # ROC AUC
        st.markdown(f"### 🧮 ROC AUC Score: **{roc_auc:.4f}**")

        # 📘 Interpretation
        st.markdown("""
#### 📘 Quick Interpretation:
- **Precision**: Out of predicted defaults, how many were correct? (Low means many false positives)
- **Recall**: Out of actual defaults, how many did we catch? (Important in risk modeling)
- **F1-score**: Balances precision and recall.
- **ROC AUC**: Area under the curve — 0.73 means decent discrimination between defaulters vs. non-defaulters.
""")

        # Prediction
        test_preds = model.predict(X_test_scaled)
        possible_ids = [col for col in test_df.columns if 'id' in col.lower()]
        id_col = possible_ids[0] if possible_ids else None

        submission = pd.DataFrame({
            "UniqueID": test_df[id_col] if id_col else test_df.index,
            "Default": test_preds
        })

        csv_buffer = io.StringIO()
        submission.to_csv(csv_buffer, index=False)
        st.download_button("📥 Download Prediction CSV", data=csv_buffer.getvalue(), file_name="vehicle_loan_default_predictions.csv", mime='text/csv')




In [42]:
%%writefile my_app.py

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split

# Streamlit config
st.set_page_config(page_title="NBFC Loan Default Predictor", layout="wide")
st.title("NBFC Vehicle Loan Default Prediction App")

# File upload section
train_file = st.file_uploader("📤Upload Train_Dataset.csv", type="csv")
test_file = st.file_uploader("📤Upload Test_Dataset.csv", type="csv")

# Run pipeline if files uploaded
if train_file and test_file:
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)
    st.success("✅Files uploaded successfully!")

    if st.button("🚀Train Model and Predict"):

        # Drop high-missing columns
        drop_cols = ['Own_House_Age', 'Score_Source_1', 'Social_Circle_Default']
        for col in drop_cols:
            if col in train_df.columns: train_df.drop(columns=col, inplace=True)
            if col in test_df.columns: test_df.drop(columns=col, inplace=True)

        # Convert numeric-like columns
        numeric_cols = ['Client_Income', 'Credit_Amount', 'Loan_Annuity',
                        'Population_Region_Relative', 'Age_Days', 'Employed_Days',
                        'Registration_Days', 'ID_Days', 'Score_Source_3', 'Score_Source_2']
        for col in numeric_cols:
            train_df[col] = pd.to_numeric(train_df[col].astype(str).str.replace(",", ""), errors='coerce')
            test_df[col] = pd.to_numeric(test_df[col].astype(str).str.replace(",", ""), errors='coerce')

        # Handle missing values
        numerical = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
        if 'Default' in numerical: numerical.remove('Default')
        categorical = train_df.select_dtypes(include='object').columns.tolist()

        num_imputer = SimpleImputer(strategy='median')
        cat_imputer = SimpleImputer(strategy='most_frequent')

        train_df[numerical] = num_imputer.fit_transform(train_df[numerical])
        test_df[numerical] = num_imputer.transform(test_df[numerical])
        train_df[categorical] = cat_imputer.fit_transform(train_df[categorical])
        test_df[categorical] = cat_imputer.transform(test_df[categorical])

        # Encode categorical features
        encoders = {}
        for col in categorical:
            le = LabelEncoder()
            all_vals = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
            le.fit(all_vals)
            train_df[col] = le.transform(train_df[col].astype(str))
            test_df[col] = le.transform(test_df[col].astype(str))
            encoders[col] = le

        # Split train data
        X = train_df.drop("Default", axis=1)
        y = train_df["Default"]
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

        # Scale
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        X_test_scaled = scaler.transform(test_df)

        # Train Random Forest
        model = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
        model.fit(X_train_scaled, y_train)

        # Evaluation
        y_pred = model.predict(X_val_scaled)
        y_proba = model.predict_proba(X_val_scaled)[:, 1]
        cm = confusion_matrix(y_val, y_pred)
        report = classification_report(y_val, y_pred, output_dict=True)
        roc_auc = roc_auc_score(y_val, y_proba)

        # 📊Confusion Matrix
        st.subheader("📊Model Evaluation on Validation Set")
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No Default", "Default"], yticklabels=["No Default", "Default"], ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        ax.set_title("Confusion Matrix")
        st.pyplot(fig)

        # 📈Classification Report
        st.markdown("### Classification Report Summary")
        metrics_df = pd.DataFrame(report).transpose()
        st.dataframe(metrics_df.style.format({
            "precision": "{:.2f}",
            "recall": "{:.2f}",
            "f1-score": "{:.2f}",
            "support": "{:.0f}"
        }).highlight_max(axis=0, color='lightgreen'))

        # ROC AUC
        st.markdown(f"### 🧮 ROC AUC Score: **{roc_auc:.4f}**")

        # 📘Interpretation
        st.markdown("""
#### 📘Quick Interpretation:
- **Precision**: Out of predicted defaults, how many were correct? (Low means many false positives)
- **Recall**: Out of actual defaults, how many did we catch? (Important in risk modeling)
- **F1-score**: Balances precision and recall.
- **ROC AUC**: Area under the curve — 0.73 means decent discrimination between defaulters vs. non-defaulters.
""")

        # Prediction
        test_preds = model.predict(X_test_scaled)
        possible_ids = [col for col in test_df.columns if 'id' in col.lower()]
        id_col = possible_ids[0] if possible_ids else None

        submission = pd.DataFrame({
            "UniqueID": test_df[id_col] if id_col else test_df.index,
            "Default": test_preds
        })

        csv_buffer = io.StringIO()
        submission.to_csv(csv_buffer, index=False)
        st.download_button("📥Download Prediction CSV", data=csv_buffer.getvalue(), file_name="vehicle_loan_default_predictions.csv", mime='text/csv')


Overwriting my_app.py


In [43]:
!wget -q -O - ipv4.icanhazip.com

34.143.172.9


In [44]:
!npm install localtunnel

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K[1mnpm[22m [33mwarn[39m [94mdeprecated[39m debug@4.1.1: Debug versions >=3.2.0 <3.2.7 || >=4 <4.3.1 have a low-severity ReDos regression when used in a Node.js environment. It is recommended you upgrade to 3.2.7 or 4.3.1. (https://github.com/visionmedia/debug/issues/797)
[1G[0K⠴[1G[0K[1mnpm[22m [33mwarn[39m [94mdeprecated[39m axios@0.19.0: Critical security vulnerability fixed in v0.21.1. For more information, see https://github.com/axios/axios/pull/3410
[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K
added 38 packages, removed 74 packages, changed 5 packages, and audited 60 packages in 3s
[1G[0K⠏[1G[0K
[1G[0K⠏[1G[0K5 packages are looking for funding
[1G[0K⠏[1G[0K  run `npm fund` for details
[1G[0K⠏[1G[0K
[31m[1m6[22m[39m vu

In [45]:
!npm audit fix --force

[1mnpm[22m [33mwarn[39m [94musing --force[39m Recommended protections disabled.
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K[1mnpm[22m [33mwarn[39m [94maudit[39m Updating localtunnel to 2.0.2, which is a SemVer major change.
[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K
added 5 packages, removed 42 packages, changed 15 packages, and audited 23 packages in 2s
[1G[0K⠴[1G[0K
[1G[0K⠴[1G[0K3 packages are looking for funding
[1G[0K⠴[1G[0K  run `npm fund` for details
[1G[0K⠴[1G[0K
[1m# npm audit report[22m

[1maxios[22m  <=0.29.0
Severity: [31m[1mhigh[22m[39m
[1mAxios Cross-Site Request Forgery Vulnerability[22m - https://github.com/advisories/GHSA-wf5p-g6vw-rhxx
[1maxios Requests Vulnerable To Possible SSRF and Credential Leakage via Absolute URL[22m - https://github.com/advisories/GHSA-jr5f-v2jv-69x6
[33m[1mfix available[22m[39m via `npm audit fix --force`
Will install localtunnel@1.8.3

In [46]:
! streamlit run my_app.py & npx localtunnel --port 8501

[1G[0K⠙
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.143.172.9:8501[0m
[0m
your url is: https://chubby-cats-fix.loca.lt
  train_df = pd.read_csv(train_file)
  test_df = pd.read_csv(test_file)
  train_df = pd.read_csv(train_file)
  test_df = pd.read_csv(test_file)
  train_df = pd.read_csv(train_file)
  test_df = pd.read_csv(test_file)
[34m  Stopping...[0m
^C
