# Notebook details
- Code that writes a Streamlit app (`app.py`) and exposes it to the internet through localtunnel
- NOTE: this notebook MUST be run on Google Colab

# Install and import libraries

In [None]:
#Required libraries to install
# -q keeps pip's output short and -U upgrades any package that is already installed.
!pip install streamlit pyngrok transformers datasets evaluate bitsandbytes pinecone-client langchain langchain-community langchain-pinecone -qU pinecone-notebooks

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Upgrade bitsandbytes on its own; app.py configures 4-bit loading through BitsAndBytesConfig.
!pip install -U bitsandbytes



In [None]:
#Mount Drive
# app.py reads its model artifacts and test data from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# App

In [None]:
%%writefile app.py
import os
import warnings

import joblib
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import streamlit as st
import torch
from langchain import HuggingFacePipeline
# from langchain_community.vectorstores import Pinecone as PineconeStore
from langchain.vectorstores import Pinecone as PineconeStore
from langchain_pinecone import PineconeEmbeddings
from pinecone import Pinecone
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
warnings.filterwarnings("ignore")


@st.cache_resource(show_spinner="Loading model artifacts...")
def load_artifacts(pipeline_path, explainer_path):
    """Load the fitted prediction pipeline and its SHAP explainer from disk.

    Streamlit re-executes this script on every widget interaction, so the
    loaded objects are cached and shared across reruns instead of being
    unpickled from Drive each time.

    Args:
        pipeline_path: Path to the joblib-pickled prediction pipeline.
        explainer_path: Path to the joblib-pickled SHAP explainer.

    Returns:
        Tuple ``(prediction_pipeline, explainer)``.
    """
    # NOTE(review): joblib.load unpickles arbitrary objects; only point this at trusted files.
    return joblib.load(pipeline_path), joblib.load(explainer_path)


@st.cache_data(show_spinner=False)
def load_test_data(csv_path):
    """Read the test-set CSV once and serve it from Streamlit's data cache on later reruns."""
    return pd.read_csv(csv_path)


prediction_pipeline, explainer = load_artifacts(
    "/content/drive/MyDrive/Capstone/artifacts/xgb_pipeline.pkl",
    "/content/drive/MyDrive/Capstone/artifacts/xgb_shap_explainer.pkl",
)
X_test = load_test_data("/content/drive/MyDrive/Capstone/data/test_data_transformed.csv")

# Tokens
# Read credentials from the environment so real keys never have to be written
# into the notebook; the original placeholder strings remain as fallbacks.
pinecone_token = os.environ.get("PINECONE_API_KEY", "pinecone_token")
huggingface_token = os.environ.get("HF_TOKEN", "huggingface_token")

# Page header followed by a numeric selector bounded by the rows available in X_test.
st.title("CORAL-X")

last_record_index = len(X_test) - 1
idx = st.number_input(
    label="Select a record index from X_test",
    min_value=0,
    max_value=last_record_index,
    value=0,
)

if st.button("Predict and Explain"):
    # Double brackets keep the selected record as a one-row DataFrame.
    single_row = X_test.iloc[[idx]]

    #Predict outcome and probability
    prediction_proba = prediction_pipeline.predict_proba(single_row)[0][1]
    prediction_class = prediction_pipeline.predict(single_row)[0]

    loan_status_str = "Loan Rejected" if prediction_class == 0 else "Loan Approved"
    st.write(f"**Model Predicted:** {loan_status_str}")
    st.write(f"**Probability of Full Payment:** {prediction_proba:.4f}")

    #SHAP computation
    # The explainer is applied to the preprocessed features, so run only the
    # pipeline's "preprocessor" step on the selected row.
    preprocessor = prediction_pipeline.named_steps["preprocessor"]
    single_row_transformed = preprocessor.transform(single_row)

    # Sparse outputs expose toarray(); densify before passing to the explainer.
    if hasattr(single_row_transformed, "toarray"):
        single_row_transformed = single_row_transformed.toarray()

    single_shap_values = explainer.shap_values(single_row_transformed)[0]

    try:
        feature_names = preprocessor.get_feature_names_out()
    except AttributeError:
        # One positional name per *transformed* column: SHAP values are computed
        # on the preprocessed matrix, which can be wider than the raw row.
        rename_features = [f"{i}" for i in range(single_row_transformed.shape[1])]
    else:
        # Drop the "<transformer>__" prefix whatever its length, instead of
        # assuming it is exactly five characters long.
        rename_features = [name.split("__", 1)[-1] for name in feature_names]

    #Single DataFrame that includes the SHAP values, Probability, Target
    shap_dict = dict(zip(rename_features, single_shap_values))
    shap_dict["Prediction Proba"] = prediction_proba
    shap_dict["Target"] = prediction_class

    shap_row_df = pd.DataFrame([shap_dict])

    df_row = single_row.reset_index(drop=True).head(1)
    st.write("**Record Row:**", df_row)

    ##Prepare prompt for LLM

    #Convert state abbrevs

    # Full state / territory name -> two-letter postal abbreviation.
    us_state_to_abbrev = {
        "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
        "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
        "Florida": "FL", "Georgia": "GA", "Hawaii": "HI", "Idaho": "ID",
        "Illinois": "IL", "Indiana": "IN", "Iowa": "IA", "Kansas": "KS",
        "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
        "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN", "Mississippi": "MS",
        "Missouri": "MO", "Montana": "MT", "Nebraska": "NE", "Nevada": "NV",
        "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
        "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH", "Oklahoma": "OK",
        "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI", "South Carolina": "SC",
        "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX", "Utah": "UT",
        "Vermont": "VT", "Virginia": "VA", "Washington": "WA", "West Virginia": "WV",
        "Wisconsin": "WI", "Wyoming": "WY", "District of Columbia": "DC",
        "American Samoa": "AS", "Guam": "GU", "Northern Mariana Islands": "MP",
        "Puerto Rico": "PR", "United States Minor Outlying Islands": "UM",
        "Virgin Islands, U.S.": "VI",
    }
    # Inverse lookup: abbreviation -> full name.
    abbrev_to_us_state = {abbrev: name for name, abbrev in us_state_to_abbrev.items()}

    def get_addr_state(idx = 0):
      """Return the full state name for the selected record's `addr_state` value.

      The value is read from `single_row` and translated through
      `abbrev_to_us_state`. When it is not a known abbreviation the raw value
      is returned, so callers never interpolate "None" into a query or prompt.

      Args:
        idx: Accepted for call-site compatibility; not used, because the
          record has already been selected into `single_row`.

      Returns:
        The full state name, or the raw `addr_state` value if it is unmapped.
      """
      state_code = single_row["addr_state"].values[0]
      return abbrev_to_us_state.get(state_code, state_code)

    #Data dictionary
    # Column name -> plain-language description. The descriptions are inserted
    # verbatim into retrieval queries, so spelling matters ("bankruptcies" fixed).
    data_dict = {"loan_amnt": "Loan amount applied for by borrower",
            "int_rate": "Estimated interest rate on loan",
            "installment": "Monthly installments owed",
            "grade": "Estimated loan grade",
            "sub_grade": "Estimated loan sub-grade",
            "emp_length": "Employment length",
            "home_ownership": "Home ownership status",
            "annual_inc": "Annual income of borrower",
            "verification_status": "Income verification status",
            "purpose": "purpose of loan provided by borrower",
            "title": "Loan title provided by the borrower",
            "addr_state": "Residence state of borrower",
            "dti": "Monthly debt-to-income ratio",
            "open_acc": "Number of borrower's open credit lines",
            "pub_rec": "Number of derogatory public records",
            "revol_bal": "Total credit revolving balance",
            "revol_util": "Revolving line utilization rate",
            "total_acc": "Total number of borrower's credit lines",
            "initial_list_status": "Initial listing status of loan",
            "application_type": "Individual or joint application",
            "mort_acc": "Number of mortgage accounts",
            "pub_rec_bankruptcies": "Number of public record bankruptcies",
            "term_months": "Number of monthly payments in loan",
            "earliest_cr_line_year": "Year earliest credit line was opened",
            "earliest_cr_line_month": "Month earliest credit line was opened"
    }

    # Convert SHAPs to strings
    def get_features_shaps(shap_row, idx = 0):
      """Pair every raw feature of the selected record with its SHAP value.

      For each column of `single_row`, the SHAP value is looked up under the
      column's own name when present in `shap_row`; otherwise under
      "<column>_<value>". A missing "<column>_<value>" column raises KeyError.

      Args:
        shap_row: One-row DataFrame of SHAP values that also carries the
          "Prediction Proba" and "Target" columns (dropped here).
        idx: Accepted for call-site compatibility; not used in the body.

      Returns:
        Tuple of (text with one "feature_name/feature_value/shap_value" entry
        per feature, DataFrame with "Feature" and "SHAP value" columns).
      """
      shap_only = shap_row.drop(columns = ["Prediction Proba", "Target"])

      names, contributions, lines = [], [], []

      for col in single_row:
        raw_value = single_row[col].values[0]
        if col in shap_only:
          shown_value = raw_value
          contribution = shap_only[col].values[0]
        else:
          # No column under the raw name: look the SHAP value up as "<column>_<value>".
          contribution = shap_only[f"{col}_{raw_value}"].values[0]
          # State abbreviations are spelled out for the prompt text.
          shown_value = abbrev_to_us_state.get(raw_value) if col == "addr_state" else raw_value

        lines.append(f"feature_name: {col}, feature_value: {shown_value}, shap_value: {contribution}\n")
        names.append(col)
        contributions.append(contribution)

      return "\n".join(lines), pd.DataFrame({"Feature": names, "SHAP value": contributions})


    ##Pinecone setup
    # Names of the vector index and of the embedding model used for queries.
    index_name = "coral-x"
    model_name = 'multilingual-e5-large'

    # Look up the index host, then open a handle on that host. Any failure
    # propagates unchanged to the caller.
    pc = Pinecone(api_key=pinecone_token)
    host = pc.describe_index(index_name)["host"]
    index = pc.Index(host = host)
    embeddings = PineconeEmbeddings(model=model_name, pinecone_api_key=pinecone_token)

    ## Download model and tokenizer with 4bit config
    base_model = "meta-llama/Llama-3.2-3B-Instruct"

    @st.cache_resource(show_spinner="Loading language model...")
    def load_llm(model_id, hf_token):
      """Load the 4-bit quantized causal LM and its tokenizer, once per server process.

      Without caching, every click on the button would download/load the model
      again because Streamlit re-executes the script on each interaction.

      Args:
        model_id: Hugging Face model identifier.
        hf_token: Hugging Face access token for the model repository.

      Returns:
        Tuple ``(model, tokenizer)``.
      """
      quant_config = BitsAndBytesConfig(
          load_in_4bit=True,
          bnb_4bit_quant_type="nf4",
          bnb_4bit_compute_dtype=torch.float16,
          bnb_4bit_use_double_quant=False)

      llm = AutoModelForCausalLM.from_pretrained(
          model_id,
          quantization_config=quant_config,
          device_map={"": 0},
          # `token` replaces the deprecated `use_auth_token` and matches the tokenizer call.
          token=hf_token,
      )
      llm_tokenizer = AutoTokenizer.from_pretrained(
          model_id,
          trust_remote_code=True,
          token=hf_token,
      )
      # Reuse the end-of-sequence token for padding.
      llm_tokenizer.pad_token = llm_tokenizer.eos_token
      return llm, llm_tokenizer

    model, tokenizer = load_llm(base_model, huggingface_token)

    # Retrieve context + helper functions
    def rm_duplicate_lines(text, sep = "\n"):
      """Drop repeated lines from `text`, keeping the first occurrence of each.

      Args:
        text: Input text.
        sep: Separator used to split `text` into lines.

      Returns:
        The unique lines in first-seen order, each terminated by a newline
        character (a newline is used even when `sep` is something else).
      """
      # dict.fromkeys de-duplicates while preserving first-seen order.
      unique_lines = dict.fromkeys(text.split(sep))
      return "".join(f"{line}\n" for line in unique_lines)

    def format_docs(results, clean = False):
      """Concatenate the text of every match in a query result.

      Args:
        results: Query response exposing a "matches" sequence whose items
          carry the document text under ["metadata"]["text"].
        clean: When truthy, repeated lines are removed from the combined text
          via `rm_duplicate_lines`.

      Returns:
        The match texts in result order, each followed by a newline.
      """
      pieces = []
      for match in results["matches"]:
        # "".join leaves a plain string unchanged and concatenates a list of chunks.
        pieces.append("".join(match["metadata"]["text"]) + "\n")
      full_doc = "".join(pieces)

      if clean:
        return rm_duplicate_lines(text = full_doc)

      return full_doc

    def retrieve(query, namespace, embeddings = embeddings, top_k = 3, pc_index = index, clean = False):
      """Embed `query`, search one namespace of the vector index and return the matched text.

      Args:
        query: Natural-language query to embed.
        namespace: Index namespace to search.
        embeddings: Embedding client exposing `embed_query`.
        top_k: Number of matches to request.
        pc_index: Index handle to query. It is now actually used; previously
          the module-level `index` was queried regardless of this argument.
        clean: Passed to `format_docs` to remove repeated lines.

      Returns:
        The formatted text of the matches, as produced by `format_docs`.
      """
      results = pc_index.query(
          namespace = namespace,
          vector = embeddings.embed_query(query),
          top_k = top_k,
          include_metadata = True,
          include_values = False
      )

      return format_docs(results, clean = clean)

    def generate_context(idx = 0):
      """Assemble the retrieval context used in the explanation prompt.

      The context is built from three sources, in order:
      the top 5 "data-context-docs" matches for the feature/SHAP text,
      the top "data-context-docs" match for the borrower's state statistics,
      and the top "global-context-docs" match for each description in
      `data_dict` (each distinct document is added once).

      Args:
        idx: Forwarded to `get_features_shaps` and `get_addr_state`.

      Returns:
        The concatenated context text.
      """
      shap_query = get_features_shaps(shap_row = shap_row_df, idx = idx)[0]
      pieces = [
          retrieve(
              query = shap_query,
              namespace = "data-context-docs",
              top_k = 5,
              clean = True)
      ]

      state = get_addr_state(idx = idx)
      pieces.append(
          retrieve(
              query = f"What are the statistics for the state of {state}",
              namespace = "data-context-docs",
              top_k = 1,
              clean = True)
      )

      seen_docs = set()
      for description in data_dict.values():
        doc = retrieve(
            query = f"How does {description} affect loan eligibility?",
            namespace = "global-context-docs",
            top_k = 1,
            clean = True
        )
        # Several descriptions can resolve to the same document; add it only once.
        if doc in seen_docs:
          continue
        seen_docs.add(doc)
        pieces.append(doc)

      return "".join(pieces)

    # Prompt constructor
    def custom_prompt(context, idx = 0):
      """Build the full LLM prompt for the selected record.

      The prompt is the concatenation of: task description, instructions,
      the prediction and feature/SHAP listing, and the retrieved context. It
      ends with the "Output:\n" marker.

      Args:
        context: Retrieved context text to embed in the prompt.
        idx: Forwarded to `get_features_shaps` and `get_addr_state`.

      Returns:
        The assembled prompt string.
      """

      features_shaps,_ = get_features_shaps(shap_row = shap_row_df, idx = idx)
      ## Task Description
      system_prompt = f"""You are an assistant for a loan agent who is analyzing a borrower's loan application, \
    The agent has used an XGBoost machine learning model to predict probability of that borrower paying back the loan timely.
    Your task is to analyze as per the given instructions - the borrower features provided to the model, the model's prediction and the SHAP values generated by the model; and \
    provide a coherent and non-technical explanation on how the provided features influence the model's prediction.\n"""

      pred_features_intro = """\nPrediction, features and SHAP values input:\n"""
      pred_string = f"""- Prediction for the instance: {round(prediction_proba * 100, 2)}% probability of paying back the loan timely.\n"""
      features_string = f"""- Features values and SHAP values of the instance:\n{features_shaps}\n"""
      addr_state = get_addr_state(idx = idx)

      # Wording fixes relative to the earlier prompt: "do note compare" -> "do not compare",
      # and a sentence break after "decision making" so two instructions no longer run together.
      instructions = f"""Instructions:
    Rank the features based on the magnitude of SHAP values from highest to lowest, regardless of the direction, and pick the top 10 features.

    For each feature in the top 10 features:
    - Retrieve the feature definition for the feature.
    - Utilize the definition to find how the feature influences loan eligibility.
    - Utilize the feature's provided SHAP value to determine it's importance for decision making. \
    Make sure to interpret the magnitude and direction of the SHAP value correctly.
    - Determine how the feature and it's value contribute to the prediction, such as whether a higher value increases the loan repayment probability or decreases it.\
    Make sure you are considering the general context as well as the context of the particular borrower.
    - Output the following two sections for each feature before moving to another feature:
    Section 1- Feature Definition:
    - The title of the feature
    - The value of this feature for the borrower
    - A brief explanation of what this feature represents, and how it influences the loan repayment probability generally as well as specifically for the borrower.
    - Is there a median value for this feature available for the state of {addr_state}? \
    If yes, how does the borrower's value for this feature compare to the state median for this feature? If no median value is available then do not compare.
    The length of this section should be between 2 to 4 sentences.
    Section 2- SHAP Context:
    - The SHAP value for the feature
    - A brief explanation of the feature's influence on the model's prediction for this particular borrower as represented by the corresponding SHAP value.
    Make sure to interpret the magnitude and direction of the SHAP value correctly. The length of this section should be between 1 to 3 sentences.

    Once all features are covered, conclude with a brief summary of how the top 10 influencing features affect the decision and contribute to the model's prediction. This summary should be under 5 sentences.\n"""

      # The trailing "Output:\n" marker is what rag_generate searches for to strip the prompt.
      context_section = f"""Context:\n{context}\nOutput:\n"""

      prompt = system_prompt + instructions + pred_features_intro + pred_string + features_string + context_section
      return prompt

    #Set up huggingface pipeline
    # Generation settings: sampling temperature and the max_length value handed
    # to both the pipeline and the LangChain wrapper below.
    temperature = 0.35
    max_length = 7000

    # Text-generation pipeline built from the `model` and `tokenizer` objects, with truncation enabled.
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length, truncation = True)

    # LangChain wrapper around the pipeline; rag_pipeline calls its invoke().
    # NOTE(review): confirm that HuggingFacePipeline applies `model_kwargs` at
    # generation time for an already-built pipeline; if it does not, the
    # temperature set here has no effect -- TODO verify against the installed version.
    llm_agent = HuggingFacePipeline(
        pipeline=pipe,
        model_kwargs={"temperature": temperature, "max_length": max_length},
    )

    def rag_pipeline(idx = 0):
      """Retrieve context, build the prompt and run it through `llm_agent`.

      Args:
        idx: Forwarded to `generate_context` and `custom_prompt`.

      Returns:
        Whatever `llm_agent.invoke` returns for the assembled prompt.
      """
      retrieved_context = generate_context(idx = idx)
      return llm_agent.invoke(custom_prompt(context = retrieved_context, idx = idx))

    def rag_generate(idx = 0):
      """Run the RAG pipeline and return only the text after the prompt's "Output:" marker.

      Args:
        idx: Forwarded to `rag_pipeline`.

      Returns:
        The text following the first "Output:\n" in the pipeline result. If
        the marker is absent the whole result is returned unchanged (a failed
        `find` used to slice off the first seven characters instead).
      """
      marker = "Output:\n"
      result = rag_pipeline(idx = idx)
      marker_pos = result.find(marker)
      if marker_pos == -1:
        return result
      return result[marker_pos + len(marker):]

    # Generate the explanation for the selected record and render it as plain text.
    full_result = rag_generate(idx = idx)
    st.write("**Explanation:**\n")
    st.text(full_result)



    def top_shaps_plot(shap_df, idx):
      """Render a horizontal bar chart of the record's signed SHAP values.

      Negative contributions are drawn in red, all others in green.

      Args:
        shap_df: One-row SHAP DataFrame accepted by `get_features_shaps`.
        idx: Forwarded to `get_features_shaps`.
      """
      _,shaps = get_features_shaps(shap_row = shap_df, idx = idx)
      bar_colors = ["red" if value < 0 else "green" for value in shaps["SHAP value"]]

      #Plot
      fig, ax = plt.subplots(figsize=(6, 4))
      ax.barh(
          y=shaps["Feature"],
          width=shaps["SHAP value"],
          color=bar_colors
      )
      ax.set_title("SHAP values for features")
      # The bars show signed values, so the axis is no longer labelled "Absolute".
      ax.set_xlabel("SHAP Value")

      legend_items = [
          matplotlib.lines.Line2D([0], [0], marker='o', color='w',
                                  label='Positive Effect', markerfacecolor='green', markersize=10),
          matplotlib.lines.Line2D([0], [0], marker='o', color='w',
                                  label='Negative Effect', markerfacecolor='red', markersize=10),
      ]
      ax.legend(handles=legend_items)

      st.write("**SHAP value plot:**\n")
      st.pyplot(fig)
      # Release the figure; otherwise pyplot keeps one open figure per button click.
      plt.close(fig)

    top_shaps_plot(shap_df = shap_row_df, idx = idx)

Overwriting app.py


In [None]:
#Fetch the IP address of this runtime as reported by ifconfig.me (entered as the tunnel password below)
!curl ifconfig.me

35.229.43.146

# Host app on local tunnel
When the tunnel page asks for a password, enter the IP address printed by the `curl ifconfig.me` cell above.

In [None]:
# Start the Streamlit server in the background (&) and open a localtunnel to port 8501.
!streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.229.43.146:8501[0m
[0m
[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0Kyour url is: https://empty-pets-drive.loca.lt
2025-04-05 21:19:22.463 Uncaught exception GET /_stcore/stream (127.0.0.1)
HTTPServerRequest(protocol='http', host='empty-pets-drive.loca.lt', method='GET', uri='/_stcore/stream', version='HTTP/1.1', remote_ip='127.0.0.1')
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/streamlit/web/bootstrap.py", line 347, in run
    if asyncio.get_running_loop().is_running():
       ^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: no running event loop

During handling of the above exception, another exception occurred:

Traceback (most recent