In [1]:
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import pickle
import warnings
warnings.filterwarnings("ignore")

# Data Generation

In [3]:
# Seed NumPy's legacy global generator so the synthetic data set is reproducible.
np.random.seed(42)
num_samples = 10000  # Number of synthetic applicants; adjust as needed

# Vocabularies for the categorical columns.
yes_no = ['Y', 'N']
genders = ['M', 'F']
income_types = ['Working', 'Commercial_associate', 'Pensioner', 'State_servant', 'Student']
education_types = ['Higher_education', 'Secondary', 'Incomplete_higher', 'Lower_secondary', 'Academic_degree']
family_statuses = ['Civil_marriage', 'Married', 'Single', 'Separated', 'Widow']
housing_types = ['Rented_apartment','Municipal_apartment', 'With_parents', 'Co-op_apartment', 'Office_apartment']
days_per_year = 365

# The columns are filled one at a time, in a fixed order: every draw consumes the
# global random stream, so reordering these statements would change the data.
# Note that randint's upper bound is exclusive.
data = {}
data['CNT_CHILDREN'] = np.random.randint(0, 3, num_samples)  # 0, 1 or 2 children
data['AMT_INCOME_TOTAL'] = np.random.uniform(20000, 150000, num_samples)
data['NAME_INCOME_TYPE'] = np.random.choice(income_types, num_samples)
data['NAME_EDUCATION_TYPE'] = np.random.choice(education_types, num_samples)
data['NAME_FAMILY_STATUS'] = np.random.choice(family_statuses, num_samples)
data['NAME_HOUSING_TYPE'] = np.random.choice(housing_types, num_samples)
# Despite the column name, the stored value is a duration in days:
# a whole number of years from 1 to 29, multiplied by 365.
data['YEARS_EMPLOYED'] = days_per_year * np.random.randint(1, 30, num_samples)
data['FLAG_PHONE'] = np.random.choice(yes_no, num_samples)
data['CNT_FAM_MEMBERS'] = np.random.randint(1, 5, num_samples)  # 1 to 4 family members
data['CODE_GENDER'] = np.random.choice(genders, num_samples)
data['FLAG_OWN_CAR'] = np.random.choice(yes_no, num_samples)
data['FLAG_OWN_REALTY'] = np.random.choice(yes_no, num_samples)
# Binary target (approved or declined), drawn independently of every feature.
data['TARGET'] = np.random.choice([0, 1], num_samples)

df = pd.DataFrame(data)

# Train the Model 

In [4]:
# Separate features and labels
X = df.drop('TARGET', axis=1)
y = df['TARGET']

# Create dummy variables for categorical features. drop_first=True removes the
# first (alphabetically smallest) level of each column, which becomes the
# all-zeros baseline for that feature.
categorical_columns = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_PHONE', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE']
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Assemble the encoded reference table: the non-categorical columns of `df`
# (including TARGET) followed by the new indicator columns. `df` itself is left
# untouched so this cell can be re-run without first regenerating the data, and
# the result keeps proper numeric dtypes.
indicator_columns = [col for col in X.columns if col not in df.columns]
df_encoded = pd.concat([df.drop(columns=categorical_columns), X[indicator_columns]], axis=1)

# Split the dataset into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoostClassifier
clf_xgb = XGBClassifier(n_estimators=100, random_state=42)
clf_xgb.fit(X_train, y_train)

# Evaluate the model on the validation set
y_val_pred = clf_xgb.predict(X_val)

# Display evaluation metrics. TARGET is drawn at random in the data-generation
# step, independently of the features, so an accuracy close to 0.5 is expected.
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))

# Save the trained XGBoostClassifier model; the context manager guarantees the
# file is flushed and closed even if pickling fails.
with open('finalized_model_xgb.sav', 'wb') as model_file:
    pickle.dump(clf_xgb, model_file)

# Save dummy data for reference
df_encoded.to_csv('sample_training_data.csv', index=False)

Validation Accuracy: 0.5045
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.49      0.49       980
           1       0.51      0.52      0.52      1020

    accuracy                           0.50      2000
   macro avg       0.50      0.50      0.50      2000
weighted avg       0.50      0.50      0.50      2000



# Let's Deploy the Base Iteration Model in the Streamlit App

In [6]:
# Page title and one-line description, rendered by Streamlit as Markdown.
app_intro = """
# Credit Card Approval Prediction App
This app predicts the credit card approval probability
"""
st.write(app_intro)

# Heading shown above the input widgets.
st.header('User Input Parameters')
def user_input_features():
    """Render the applicant input widgets and return the answers as one row.

    Twelve Streamlit widgets are created: select boxes for the M/F and Y/N
    fields, numeric inputs for the count/amount/duration fields, and select
    boxes for the four NAME_* fields.

    Returns:
        pandas.DataFrame: a single row (index 0) with one column per widget,
        holding the raw, un-encoded values. The column order differs from the
        on-screen widget order.
    """
    yes_no = ('Y', 'N')

    # A dict literal is evaluated top to bottom, so the widgets appear on the
    # page in exactly the order listed here.
    answers = {
        'CODE_GENDER': st.selectbox("CODE_GENDER", ('M', 'F')),
        'FLAG_OWN_CAR': st.selectbox("FLAG_OWN_CAR", yes_no),
        'FLAG_OWN_REALTY': st.selectbox("FLAG_OWN_REALTY", yes_no),
        'FLAG_PHONE': st.selectbox("FLAG_PHONE", yes_no),
        'CNT_CHILDREN': st.number_input("CNT_CHILDREN", min_value=0, max_value=20, step=1),
        'AMT_INCOME_TOTAL': st.number_input("AMT_INCOME_TOTAL", min_value=0.0, max_value=2000000.0),
        'DAYS_EMPLOYED': st.number_input("DAYS_EMPLOYED", min_value=-20000, max_value=400000, step=1),
        'CNT_FAM_MEMBERS': st.number_input("CNT_FAM_MEMBERS", min_value=0, max_value=20, step=1),
        'NAME_INCOME_TYPE': st.selectbox(
            "NAME_INCOME_TYPE",
            ('Working', 'Commercial_associate', 'Pensioner', 'State_servant', 'Student'),
        ),
        'NAME_EDUCATION_TYPE': st.selectbox(
            "NAME_EDUCATION_TYPE",
            ('Higher_education', 'Secondary', 'Incomplete_higher', 'Lower_secondary', 'Academic_degree'),
        ),
        'NAME_FAMILY_STATUS': st.selectbox(
            "NAME_FAMILY_STATUS",
            ('Civil_marriage', 'Married', 'Single', 'Separated', 'Widow'),
        ),
        'NAME_HOUSING_TYPE': st.selectbox(
            "NAME_HOUSING_TYPE",
            ('Rented_apartment', 'Municipal_apartment', 'With_parents','Co-op_apartment', 'Office_apartment'),
        ),
    }

    # Column layout of the returned frame.
    column_order = [
        'CNT_CHILDREN',
        'AMT_INCOME_TOTAL',
        'NAME_INCOME_TYPE',
        'NAME_EDUCATION_TYPE',
        'NAME_FAMILY_STATUS',
        'NAME_HOUSING_TYPE',
        'DAYS_EMPLOYED',
        'FLAG_PHONE',
        'CNT_FAM_MEMBERS',
        'CODE_GENDER',
        'FLAG_OWN_CAR',
        'FLAG_OWN_REALTY',
    ]
    row = {column: answers[column] for column in column_order}
    return pd.DataFrame(row, index=[0])

# Collect the applicant's answers as a one-row DataFrame.
df_input = user_input_features()

st.subheader('User Input parameters')
st.write(df_input.to_dict())

# Model Loading
# The model is loaded before its feature names are read, so this section does
# not rely on a `clf_xgb` left over from the training cell and also works in a
# fresh process.
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# model files produced by this notebook or another trusted source.
with open('finalized_model_xgb.sav', 'rb') as model_file:
    clf_xgb = pickle.load(model_file)
model_features = clf_xgb.get_booster().feature_names

# The form reports employment duration as DAYS_EMPLOYED, while the training
# data stores the same quantity (a number of days) under YEARS_EMPLOYED.
# Map the form column onto the trained name so the value is actually used
# instead of being discarded and replaced by 0.
df_model_input = df_input
if ('DAYS_EMPLOYED' in df_model_input.columns
        and 'DAYS_EMPLOYED' not in model_features
        and 'YEARS_EMPLOYED' in model_features):
    df_model_input = df_model_input.rename(columns={'DAYS_EMPLOYED': 'YEARS_EMPLOYED'})

# Create dummy variables for categorical features.
# drop_first must NOT be used here: a single row has exactly one level per
# categorical column, so drop_first=True would remove every indicator column
# and all categorical answers would be ignored.
cat_features = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_PHONE', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE']
df_input_encoded = pd.get_dummies(df_model_input, columns=cat_features)

# Align with the model's feature names and order: indicators the model never
# saw (the baseline levels dropped during training) are removed, and indicators
# for levels the applicant did not select are added with value 0.
df_input_encoded = df_input_encoded.reindex(columns=model_features, fill_value=0)

# Model Inferencing
prediction = clf_xgb.predict(df_input_encoded)
prediction_proba = clf_xgb.predict_proba(df_input_encoded)

st.subheader('Prediction')
# `prediction` is an array with one entry for the single input row.
if prediction[0] == 0:
    st.write('Approved')
else:
    st.write('Declined')

st.subheader('Prediction Probability')
st.write(prediction_proba)