# **Important Libraries**

In [None]:
import re, string, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from collections import Counter
from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag


from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE


warnings.filterwarnings("ignore")
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

# ***Load DataFrame***

In [None]:
from datasets import load_dataset

ds = load_dataset("bilalRahib/fiqa-personal-finance-dataset")

README.md: 0.00B [00:00, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


data/train-00000-of-00001.parquet:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1987 [00:00<?, ? examples/s]

In [None]:
dfs = [ds[split].to_pandas() for split in ds.keys()]

df = pd.concat(dfs, ignore_index=True)
df

Unnamed: 0,input,output
0,Does U.S. tax code call for small business own...,I am going to keep things very simple and expl...
1,What are 'business fundamentals'?,From http://financial-dictionary.thefreedictio...
2,Business Investment Loss from prior year,You need to give specific dates! In the United...
3,Would the purchase of a car for a business thr...,You don't say what country you live in. If it'...
4,Why is “cheque cashing” a legitimate business?,"In my experience (in the US), the main draw of..."
...,...,...
1982,How do you save money on clothes and shoes for...,I'm all for thrift stores and yard sales. When...
1983,Lump sum annuity distribution — do I owe estat...,"If you are the beneficiary of an annuity, you ..."
1984,Am I required to have a lawyer create / overse...,"This is not intended as legal advice, and only..."
1985,What does it mean to a life insurance policy h...,A stock insurance company is structured like a...


In [None]:
df = df.rename(columns={"input": "question", "output": "answer"})
df


Unnamed: 0,question,answer
0,Does U.S. tax code call for small business own...,I am going to keep things very simple and expl...
1,What are 'business fundamentals'?,From http://financial-dictionary.thefreedictio...
2,Business Investment Loss from prior year,You need to give specific dates! In the United...
3,Would the purchase of a car for a business thr...,You don't say what country you live in. If it'...
4,Why is “cheque cashing” a legitimate business?,"In my experience (in the US), the main draw of..."
...,...,...
1982,How do you save money on clothes and shoes for...,I'm all for thrift stores and yard sales. When...
1983,Lump sum annuity distribution — do I owe estat...,"If you are the beneficiary of an annuity, you ..."
1984,Am I required to have a lawyer create / overse...,"This is not intended as legal advice, and only..."
1985,What does it mean to a life insurance policy h...,A stock insurance company is structured like a...


## ***Explore Data***

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  1987 non-null   object
 1   answer    1987 non-null   object
dtypes: object(2)
memory usage: 31.2+ KB


**Check Null & Drop Them**

In [None]:
df.isnull().sum()

Unnamed: 0,0
question,0
answer,0


**Check Duplicates & Drop Them**

In [None]:
df.duplicated().sum()

np.int64(0)

***Investigate the Data***

In [None]:
df["question"].head(10).tolist()

['Does U.S. tax code call for small business owners to count business purchases as personal income?',
 "What are 'business fundamentals'?",
 'Business Investment Loss from prior year',
 'Would the purchase of a car for a business through the use of a business loan be considered a business expense?',
 'Why is “cheque cashing” a legitimate business?',
 'Valuing a small business to invest in ',
 'Borrow money to invest in a business venture with equity?',
 "Why can't online transactions be completed outside of business hours?",
 'Full-time work + running small side business: Best business structure for taxes?',
 'New vending route business, not sure how to determine taxes']

In [None]:
df["answer"].head(5).tolist()

['I am going to keep things very simple and explain the common-sense reason why the accountant is right: Also, my sister in law owns a small restaurant, where they claim their accountant informed them of the same thing, where a portion of their business purchases had to be counted as taxable personal income. In this case, they said their actual income for the year (through their paychecks) was around 40-50K, but because of this detail, their taxable income came out to be around 180K, causing them to owe a huge amount of tax (30K ish). Consider them and a similarly situated couple that didn\'t make these purchases. Your sister in law is better off in that she has the benefit of these purchases (increasing the value of her business and her expected future income), but she\'s worse off because she got less pay. Presumably, she thought this was a fair trade, otherwise she wouldn\'t have made those purchases. So why should she pay any less in taxes? There\'s no reason making fair trades sho

# **Clean Text**

## 💵 **Currency Normalization Function**

This function standardizes currency mentions in text so that amounts like 40K, 30k ish, $50M, or 200,000 USD become consistent and structured.

✨ **Examples of conversion:**

40K → 40000 USD

30k ish → 30000 USD approx.

$50M → 50000000 USD

200,000 EUR → 200000 EUR

⚙️ **How it works**

1️⃣ Regex Matching 🕵️

Detects currency formats with symbols ($, €, £) or codes (USD, EUR, GBP).

Supports unit suffixes (K, M, B).

2️⃣ Group Logic 📦

3 groups → e.g., $50K → (symbol, number, unit).

2 groups → e.g., 50K USD → (number, unit + code).

3️⃣ Number Cleaning 🧹

Removes commas and periods (e.g., 200,000 → 200000).

4️⃣ Unit Multipliers 📊

K → ×1,000

M → ×1,000,000

B → ×1,000,000,000

5️⃣ Currency Detection 💱

$ or USD → USD

€ or EUR → EUR

£ or GBP → GBP

Defaults to USD

6️⃣ Approximation 🤏

If the original text has "ish" (e.g., 30k ish) → adds "approx.".

In [None]:
def normalize_currency_amount(match):
    """Normalize individual currency amounts"""
    # Different patterns will have different group structures
    groups = match.groups()

    # Determine pattern type based on group count
    if len(groups) == 3:  # Pattern: ([$€£])\s*(\d+)\s*([KkMmBb]?)
        currency_symbol, amount, unit = groups
        approx = ""
    elif len(groups) == 2:  # Pattern: (\d+)\s*([KkMmBb]?)\s*(USD|EUR|GBP)
        amount, unit = groups
        currency_symbol = ""
        approx = "approx." if "ish" in match.group(0).lower() else ""
    else:
        return match.group(0)

    amount = amount.replace(",", "").replace(".", "")

    try:
        n = float(amount)
    except:
        return match.group(0)

    # Apply unit multiplier
    if unit and unit.upper() == "K":
        n *= 1000
    elif unit and unit.upper() == "M":
        n *= 1_000_000
    elif unit and unit.upper() == "B":
        n *= 1_000_000_000

    # Determine currency
    if currency_symbol == "$" or "USD" in match.group(0).upper():
        currency = "USD"
    elif currency_symbol == "€" or "EUR" in match.group(0).upper():
        currency = "EUR"
    elif currency_symbol == "£" or "GBP" in match.group(0).upper():
        currency = "GBP"
    else:
        currency = "USD"

    return f"{int(n)} {currency} {approx}".strip()

## **📊 Currency Range Normalization Function**

This function standardizes currency ranges so that amounts like 40–50K, $5–10M, or 10,000–20,000 USD are converted into a consistent structured format.

✨ **Examples of conversion:**

40–50K → 40000–50000 USD

$5–10M → 5000000–10000000 USD

€1000–2000 → 1000–2000 EUR

⚙️ **How it works**

1️⃣ Regex Matching 🕵️

-Detects numeric ranges with optional unit suffixes (K, M, B).

-Supports currency symbols ($, €, £) or codes (USD, EUR, GBP).

2️⃣ Group Logic 📦

-3 groups → e.g., 40–50K → (start, end, unit).

-4 groups → e.g., $5–10M → (symbol, start, symbol, end).

3️⃣ Number Conversion 🔢

-Converts string numbers into floats, cleans commas.

4️⃣ Unit Multipliers 📊

-K → ×1,000

-M → ×1,000,000

-B → ×1,000,000,000 (💡 can be added if needed).

5️⃣ Currency Detection 💱

-$ or USD → USD

-€ or EUR → EUR

-£ or GBP → GBP

-Defaults to USD

In [None]:
def normalize_currency_range(match):
    """Normalize currency ranges"""
    groups = match.groups()

    if len(groups) == 3:  # Pattern: (\d+)[-–](\d+)\s*([KkMmBb])
        start, end, unit = groups
        currency_symbol = ""
    elif len(groups) == 4:  # Pattern: ([$€£])\s*(\d+)\s*[-–]\s*([$€£])\s*(\d+)
        currency_symbol, start, _, end = groups
    else:
        return match.group(0)

    try:
        start_val = float(start.replace(",", ""))
        end_val = float(end.replace(",", ""))
    except:
        return match.group(0)

    # Apply unit multiplier
    if unit and unit.upper() == "K":
        start_val *= 1000
        end_val *= 1000
    elif unit and unit.upper() == "M":
        start_val *= 1_000_000
        end_val *= 1_000_000

    # Determine currency
    if currency_symbol == "$" or "USD" in match.group(0).upper():
        currency = "USD"
    elif currency_symbol == "€" or "EUR" in match.group(0).upper():
        currency = "EUR"
    elif currency_symbol == "£" or "GBP" in match.group(0).upper():
        currency = "GBP"
    else:
        currency = "USD"

    return f"{int(start_val)}–{int(end_val)} {currency}"

## 🧹 **Financial Text Cleaning**

⚙️ **Steps it performs**

1️⃣ Remove unwanted characters & HTML ✂️

-Gets rid of stray symbols (│ | — ( ) . -)

-Strips out raw HTML tags <...>

2️⃣ Remove URLs & filler phrases 🌐

-Deletes links (http://..., www...)

-Removes unnecessary phrases like “see, for starters at least”

3️⃣ Standardize abbreviations & spelling 🔤

-Converts U.S. → US

-Normalizes check-cashing / check cashing

4️⃣ Normalize currency amounts 💵 💶 💷

-Ranges with units:

Example: 5K–10K → 5000–10000 USD

-Simple amounts with symbols:

Example: $100 → 100 USD

Example: €50 → 50 EUR

-Amounts with units:

Example: $5K → 5000 USD

Example: €2.5M → 2500000 EUR

5️⃣ Whitespace cleanup 📏

-Collapses multiple spaces into one

-Trims leading/trailing spaces

In [None]:
def clean_financial_text(text):
    """
    Comprehensive financial text cleaning and normalization
    """
    if not isinstance(text, str):
        return text

    # 1) Remove unwanted characters and HTML
    text = re.sub(r'[|\ǀ│’|"|—().-]', '', text)
    text = re.sub(r'<.*?>', '', text)

    # 2) Remove URLs and specific phrases
    text = re.sub(r'http\S+|www\S+', ' ', text)
    text = re.sub(r'see,? for starters at least,?', '', text, flags=re.IGNORECASE)

    # 3) Standardize abbreviations
    text = re.sub(r"\bU\.S\.\b", "US", text)
    text = re.sub(r'check[- ]cashing', 'check cashing', text, flags=re.IGNORECASE)

    # 4) Normalize currency patterns - SIMPLIFIED version

    # Handle ranges with units (5K-10K, 1M-2M)
    text = re.sub(r'(\d+[,.]?\d*)[-–](\d+[,.]?\d*)\s*([KkMmBb])\b',
                 lambda m: f"{normalize_single_amount(m.group(1) + m.group(3))}–{normalize_single_amount(m.group(2) + m.group(3))}", text)

    # Handle simple amounts with symbols ($100, €50)
    text = re.sub(r'([$€£])\s*(\d+[,.]?\d*)\b',
                 lambda m: f"{m.group(2).replace(',', '')} {get_currency_code(m.group(1))}", text)

    # Handle amounts with units ($5K, €2.5M)
    text = re.sub(r'([$€£])\s*(\d+[,.]?\d*)\s*([KkMmBb])\b',
                 lambda m: normalize_single_amount(m.group(2) + m.group(3) + " " + get_currency_code(m.group(1))), text)

    # 6) Clean whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

## 🔢 **Normalize Single Amount Helper**

-This function normalize_single_amount(amount_str) converts shorthand financial numbers into their full numeric values.

⚙️ **Steps it performs**

1️⃣ Extracts numbers & units 🔍

-Captures the number part (5, 2.5, 10)

-Detects shorthand units (K, M, B)

2️⃣ Converts shorthand → full value 🔄

-K → multiply by 1,000

-M → multiply by 1,000,000

-B → multiply by 1,000,000,000

3️⃣ Outputs normalized integer ✅

-Returns the expanded numeric string

In [None]:
def normalize_single_amount(amount_str):
    """Helper to normalize single amount strings"""
    amount_str = amount_str.upper()

    # Extract number and unit
    num_match = re.search(r'(\d+[,.]?\d*)', amount_str)
    unit_match = re.search(r'([KMB])', amount_str)

    if not num_match:
        return amount_str

    amount = float(num_match.group(1).replace(",", ""))

    if unit_match:
        unit = unit_match.group(1)
        if unit == "K":
            amount *= 1000
        elif unit == "M":
            amount *= 1_000_000
        elif unit == "B":
            amount *= 1_000_000_000

    return str(int(amount))

## 💱 **Get Currency Code**

-This function get_currency_code(symbol) maps currency symbols to their ISO currency codes.

⚙️ **Steps it performs**

1️⃣ Checks a lookup dictionary 🗂️

-symbol_map = {'$': 'USD', '€': 'EUR', '£': 'GBP'}


2️⃣ Returns the code based on the symbol provided.

3️⃣ Defaults to USD 💵 if the symbol isn’t found.

In [None]:
def get_currency_code(symbol):
    """Map currency symbol to code"""
    symbol_map = {'$': 'USD', '€': 'EUR', '£': 'GBP'}
    return symbol_map.get(symbol, 'USD')

## 🧹**Normalize DataFrame Text**

-This function normalize_dataframe_text(df, text_columns) cleans and applies financial text normalization 🏦 to specific text columns in a Pandas DataFrame.

⚙️ **Steps it performs**

1️⃣ Iterates through given columns 🗂️

-Only processes columns that actually exist in the DataFrame.

2️⃣ Basic cleaning ✨

-Converts values to string

-Removes leading/trailing spaces

-Collapses multiple spaces → single space


3️⃣ Applies financial text cleaning 💱

-Runs your clean_financial_text() function

-Normalizes numbers, currency ranges, removes links, unwanted chars, etc.

4️⃣ Returns cleaned DataFrame 📊

In [None]:
def normalize_dataframe_text(df, text_columns):
    """
    Apply text normalization to specified columns in dataframe
    """
    for col in text_columns:
        if col in df.columns:
            # Basic cleaning first
            df[col] = df[col].astype(str).str.strip().str.replace("\s+", " ", regex=True)
            # Then apply financial normalization
            df[col] = df[col].apply(clean_financial_text)

    return df

**Cleaned Data**

In [None]:
df = normalize_dataframe_text(df, ['question', 'answer'])
df

Unnamed: 0,question,answer
0,Does US tax code call for small business owner...,I am going to keep things very simple and expl...
1,What are 'business fundamentals'?,From The facts that affect a company's underly...
2,Business Investment Loss from prior year,You need to give specific dates! In the United...
3,Would the purchase of a car for a business thr...,You don't say what country you live in If it's...
4,Why is “cheque cashing” a legitimate business?,"In my experience in the US, the main draw of c..."
...,...,...
1982,How do you save money on clothes and shoes for...,I'm all for thrift stores and yard sales When ...
1983,Lump sum annuity distribution do I owe estate ...,"If you are the beneficiary of an annuity, you ..."
1984,Am I required to have a lawyer create / overse...,"This is not intended as legal advice, and only..."
1985,What does it mean to a life insurance policy h...,A stock insurance company is structured like a...


In [None]:
df["question"].head(10).tolist()

['Does US tax code call for small business owners to count business purchases as personal income?',
 "What are 'business fundamentals'?",
 'Business Investment Loss from prior year',
 'Would the purchase of a car for a business through the use of a business loan be considered a business expense?',
 'Why is “cheque cashing” a legitimate business?',
 'Valuing a small business to invest in',
 'Borrow money to invest in a business venture with equity?',
 "Why can't online transactions be completed outside of business hours?",
 'Fulltime work + running small side business: Best business structure for taxes?',
 'New vending route business, not sure how to determine taxes']

In [None]:
df["answer"].head(5).tolist()

["I am going to keep things very simple and explain the commonsense reason why the accountant is right: Also, my sister in law owns a small restaurant, where they claim their accountant informed them of the same thing, where a portion of their business purchases had to be counted as taxable personal income In this case, they said their actual income for the year through their paychecks was around 4050K, but because of this detail, their taxable income came out to be around 180K, causing them to owe a huge amount of tax 30K ish Consider them and a similarly situated couple that didn't make these purchases Your sister in law is better off in that she has the benefit of these purchases increasing the value of her business and her expected future income, but she's worse off because she got less pay Presumably, she thought this was a fair trade, otherwise she wouldn't have made those purchases So why should she pay any less in taxes? There's no reason making fair trades should reduce anyone

# **Visualization of Top `20` Words Appear**

In [None]:
stop_words = set(stopwords.words("english"))
all_words = []
for text in df["question"]:
    for word in text.split():
        if word not in stop_words:
            all_words.append(word)

word_freq = Counter(all_words)
most_common_words = word_freq.most_common(20)  # top 20 words

df_words = pd.DataFrame(most_common_words, columns=["word", "count"])
df_words

Unnamed: 0,word,count
0,I,383
1,How,355
2,What,276
3,stock,157
4,Is,134
5,Why,129
6,money,85
7,Can,85
8,company,82
9,US,74


In [None]:
fig = px.bar(df_words,x="count",y="word",orientation="h",color="count",color_continuous_scale="Viridis",  # Stylish gradient
             title="Top 20 Most Frequent Words in Reviews")

fig.update_layout(
    xaxis_title="Count",
    yaxis_title="Word",
    template="plotly_dark",
    yaxis={'categoryorder':'total ascending'}
)
fig.show()


In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

top_n = 80
items = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:top_n]
words, counts = zip(*items)
dfw = pd.DataFrame({"word": words, "count": counts})

min_size, max_size = 12, 48
sizes = np.interp(dfw["count"], (dfw["count"].min(), dfw["count"].max()), (min_size, max_size))

n_cols = 6
n_rows = int(np.ceil(len(dfw) / n_cols))
x = np.tile(np.arange(n_cols), n_rows)[:len(dfw)]
y = -np.repeat(np.arange(n_rows), n_cols)[:len(dfw)]

fig = go.Figure(go.Scatter(
    x=x, y=y, mode="text",
    text=dfw["word"],
    textfont=dict(size=sizes),
    hovertext=[f"{w}: {c}" for w, c in zip(dfw["word"], dfw["count"])],
    hoverinfo="text"
))
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)
fig.update_layout(
    title="Interactive Tag Wall (descending sizes, hover for counts)",
    plot_bgcolor="white", margin=dict(t=60, l=0, r=0, b=0)
)
fig.show()


In [None]:
stop_words = set(stopwords.words("english"))
all_words = []
for text in df["answer"]:
    for word in text.split():
        if word not in stop_words:
            all_words.append(word)

word_freq = Counter(all_words)
most_common_words = word_freq.most_common(20)  # top 20 words

df_words = pd.DataFrame(most_common_words, columns=["word", "count"])
df_words

Unnamed: 0,word,count
0,I,2791
1,The,2087
2,If,1902
3,would,1892
4,money,1657
5,USD,1614
6,tax,1390
7,stock,1295
8,You,1266
9,get,1196


In [None]:
fig = px.bar(df_words,x="count",y="word",orientation="h",color="count",color_continuous_scale="Viridis",  # Stylish gradient
             title="Top 20 Most Frequent Words in Reviews")

fig.update_layout(
    xaxis_title="Count",
    yaxis_title="Word",
    template="plotly_dark",
    yaxis={'categoryorder':'total ascending'}
)
fig.show()


In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

top_n = 80
items = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:top_n]
words, counts = zip(*items)
dfw = pd.DataFrame({"word": words, "count": counts})

min_size, max_size = 12, 48
sizes = np.interp(dfw["count"], (dfw["count"].min(), dfw["count"].max()), (min_size, max_size))

n_cols = 6
n_rows = int(np.ceil(len(dfw) / n_cols))
x = np.tile(np.arange(n_cols), n_rows)[:len(dfw)]
y = -np.repeat(np.arange(n_rows), n_cols)[:len(dfw)]

fig = go.Figure(go.Scatter(
    x=x, y=y, mode="text",
    text=dfw["word"],
    textfont=dict(size=sizes),
    hovertext=[f"{w}: {c}" for w, c in zip(dfw["word"], dfw["count"])],
    hoverinfo="text"
))
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)
fig.update_layout(
    title="Interactive Tag Wall (descending sizes, hover for counts)",
    plot_bgcolor="white", margin=dict(t=60, l=0, r=0, b=0)
)
fig.show()


In [None]:
!pip install -q faiss-cpu sentence-transformers transformers streamlit



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m78.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import faiss
import torch
import numpy as np
import os
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer

In [None]:
df

Unnamed: 0,question,answer
0,Does US tax code call for small business owner...,I am going to keep things very simple and expl...
1,What are 'business fundamentals'?,From The facts that affect a company's underly...
2,Business Investment Loss from prior year,You need to give specific dates! In the United...
3,Would the purchase of a car for a business thr...,You don't say what country you live in If it's...
4,Why is “cheque cashing” a legitimate business?,"In my experience in the US, the main draw of c..."
...,...,...
1982,How do you save money on clothes and shoes for...,I'm all for thrift stores and yard sales When ...
1983,Lump sum annuity distribution do I owe estate ...,"If you are the beneficiary of an annuity, you ..."
1984,Am I required to have a lawyer create / overse...,"This is not intended as legal advice, and only..."
1985,What does it mean to a life insurance policy h...,A stock insurance company is structured like a...


In [None]:
df.to_csv("cleaned_fiqa.csv", index=False)


In [None]:
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

qa_model = pipeline(
    "question-answering",
    model="deepset/bert-base-cased-squad2",
    tokenizer="deepset/bert-base-cased-squad2",
    device=0 if torch.cuda.is_available() else -1
)

summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=0 if torch.cuda.is_available() else -1
)

summarizer_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Device set to use cuda:0


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [None]:
# ======================
# Safe truncate helper
# ======================
def safe_truncate_for_summarizer(text, tokenizer, safety_margin=50):
    max_tokens = getattr(tokenizer, "model_max_length", 1024)
    if max_tokens is None or max_tokens <= 0 or max_tokens > 4096:
        max_tokens = 1024
    allowed_tokens = max_tokens - safety_margin
    tokens = tokenizer.encode(text, truncation=False)
    if len(tokens) <= allowed_tokens:
        return text
    return tokenizer.decode(tokens[:allowed_tokens], skip_special_tokens=True)

In [151]:
# ======================
# Formatter
# ======================
def format_response(direct_answer, explanation):
    return f"""
💡 ** Answer:** {direct_answer.strip()}

📖 **Explanation:** {explanation.strip()}
"""

In [None]:
# ======================
# FAISS index
# ======================
INDEX_FILE = "fiqa.index"
EMBEDDINGS_FILE = "fiqa_embeddings.npy"

if os.path.exists(INDEX_FILE) and os.path.exists(EMBEDDINGS_FILE):
    index = faiss.read_index(INDEX_FILE)
    embeddings = np.load(EMBEDDINGS_FILE)
else:
    corpus = (df["question"] + " " + df["answer"]).tolist()
    embeddings = embedder.encode(corpus, convert_to_numpy=True)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    faiss.write_index(index, INDEX_FILE)
    np.save(EMBEDDINGS_FILE, embeddings)

In [None]:
# ======================
# Retrieval + Answer
# ======================
def retrieve_answer(query, top_k=3):
    query_vec = embedder.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_vec, top_k)

    retrieved_contexts = []
    for i in indices[0]:
        q = df.iloc[i]["question"]
        a = df.iloc[i]["answer"]
        retrieved_contexts.append(f"Q: {q}\nA: {a}")
    context_text = " ".join(retrieved_contexts)

    # Step 1: QA
    try:
        result = qa_model(question=query, context=context_text)
        direct_answer = result.get("answer", "").strip()
    except Exception as e:
        print("QA extraction error:", e)
        direct_answer = "No short answer found."

    # Step 2: Explanation
    explanation = ""
    try:
        summary_input = f"Question: {query}\nContext: {context_text}"
        truncated_input = safe_truncate_for_summarizer(summary_input, summarizer_tokenizer)
        summ = summarizer(
            truncated_input,
            max_length=120,
            min_length=30,
            do_sample=False
        )
        explanation = summ[0]["summary_text"].strip()
    except Exception as e:
        print("Summarization failed:", e)
        explanation = "Explanation unavailable."

    return format_response(direct_answer, explanation), retrieved_contexts



In [None]:
# 5.1 Quick Test
# ===========================
query = "Can I Accept Gold?"
answer, refs = retrieve_answer(query)

print("🤖 Advisor Answer:\n", answer)
print("\n📚 References:")
for r in refs:
    print("-", r)

🤖 Advisor Answer:
 
💡 **Direct Answer:** Can I Accept Gold?
A: Of course you can accept gold

📖 **Explanation:** Gold is not an investment Aside from a few industrial uses, it has no productive value It is, at best, a hedge against inflation. buying tiny amounts of gold as coin or bullion from a retail dealer will always involve a fairly significant spread from the commodity spot price.


📚 References:
- Q: Can I Accept Gold?
A: Of course you can accept gold as payment Would anyone pay in gold? Would it have tax consequences on your federal taxes? These additional questions are offtopic on this site about personal finance
- Q: I want to invest in Gold Where do I go and buy it?
A: I do not know anything about retail investing in India, since I am in the US However, there are a couple of general things to keep in mind about gold that should be largely independent of country First, gold is not an investment Aside from a few industrial uses, it has no productive value It is, at best, a hed

In [None]:
# 5.2 Quick Test
# ===========================
query = "Is it better to rent or buy a house?"
answer, refs = retrieve_answer(query)

print("🤖 Advisor Answer:\n", answer)
print("\n📚 References:")
for r in refs:
    print("-", r)

🤖 Advisor Answer:
 
💡 **Direct Answer:** buy

📖 **Explanation:** If a house can't be sold due to the crowded market you will be forced to rent the house. One benefit to buying a house in a market that is easy to rent a house would be if you are forced to move quickly, then you aren't stuck being 3 months into a 12 month lease. A portion of the tax savings is 'lost' to the fact that you have a standard deduction of nearly 6000 USD in 2012.


📚 References:
- Q: Buying a house for a shorter term
A: If there are a lot of houses for sale, can you be sure that in a year or two you can sell yours? How long does the average house in that area stay on the market before it is sold? What percentage of houses never get sold? If it can't be sold due to the crowded market you will be forced to rent the house The question for you then is how much rental income can you get? Compare the rental income to your monthly cost of owning, and managing the house One benefit to buying a house in a market that i

In [None]:
# 5.3 Quick Test
# ===========================
query = "How much should I save for retirement?"
response, refs = retrieve_answer(query)

print("🤖 Advisor Answer:\n", response)
print("\n📚 References:")
for r in refs:
    print("-", r)

🤖 Advisor Answer:
 
💡 **Direct Answer:** 15%

📖 **Explanation:** If you plan to retire early and spend the same amount of money every year adjusted for inflation, then you need to save twentytimes your yearly spending to satisfy the 4% Safe Withdrawal rule of thumb. In America, people who retire in their sixties tend to reduce their spending by 30% This is for a host of reasons like not eating out as much, not driving to work, paid off mortgages, and their children being adults now.


📚 References:
- Q: When should I start saving/investing for my retirement?
A: My basic rule I tell everyone who will listen is to always live like you're a college student if you could make it on 20000 a year, when you get your first real job at 40000 eg, put all the rest into savings to start 401k, IRA, etc Gradually increase your lifestyle expenses after you hit major savings goals 3+ month emergency fund, house down payment, etc Any time you get a raise, start by socking it all into your employer's 401

In [None]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [None]:
from bert_score import score

# Example: reference vs model answer
references = ["Of course you can accept gold as payment. Would anyone pay in gold? Would it have tax consequences on your federal taxes? These additional questions are off-topic on this site about personal finance."]
candidates = ["Of course you can accept goldزGold is not an investment Aside from a few industrial uses, it has no productive value It is, at best, a hedge against inflation. buying tiny amounts of gold as coin or bullion from a retail dealer will always involve a fairly significant spread from the commodity spot price."]

P, R, F1 = score(candidates, references, lang="en", verbose=True)
print("BERTScore F1:", F1.mean().item())

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.20 seconds, 4.91 sentences/sec
BERTScore F1: 0.8534860610961914


In [118]:
!pip install -q streamlit streamlit-audiorecorder transformers sentence-transformers faiss-cpu gTTS librosa plotly


In [202]:
%%writefile app.py
# Finance Advisor — chats-only sidebar + mic + magical pink
# - Messages render *inside* bubbles
# - Green composer is the actual input
# - RAG: MiniLM + FAISS + SQuAD2 QA + BART summary
# - Donut chart (transparent), finance robot SVG, chats history

import os, time, uuid
from io import BytesIO
from html import escape

import numpy as np
import pandas as pd
import streamlit as st
import plotly.graph_objects as go

import faiss
import torch
import librosa

from sentence_transformers import SentenceTransformer
from transformers import pipeline, Wav2Vec2ForCTC, Wav2Vec2Processor, AutoTokenizer
from gtts import gTTS
from audiorecorder import audiorecorder  # 🎤

# -------------------------------
# Page config + Global CSS theme
# -------------------------------
st.set_page_config(
    page_title="Finance Advisor",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded",
)

def _rerun():
    if hasattr(st, "rerun"):
        st.rerun()
    elif hasattr(st, "experimental_rerun"):
        st.experimental_rerun()

MAGICAL_CSS = """
<style>
:root{
  --bg1:#fff0f6; --bg2:#f3e8ff; --bg3:#fff7fb;
  --panel:#ffffff; --text:#1f2a44; --muted:#6b5566;
  --accent:#d946ef; --accent2:#a78bfa; --border:rgba(167,139,250,0.30);
  --bubbleUser:#e6faf5; --bubbleBot:#ffffff; --shadow: 0 16px 44px rgba(167,139,250,0.18);
  --composeBG:#e9fbf4; --composeBorder:rgba(52,199,142,0.35);
}
[data-testid="stHeader"]{ display:none !important; }
html, body, .stApp {
  background:
    radial-gradient(120% 160% at 0% 0%, var(--bg1) 0%, rgba(255,240,246,0) 70%),
    radial-gradient(120% 160% at 100% 0%, var(--bg2) 0%, rgba(243,232,255,0) 65%),
    radial-gradient(160% 220% at 50% 100%, var(--bg3) 0%, rgba(255,247,251,0) 72%),
    #fff9fd; color: var(--text);
}
[data-testid="stSidebar"]{
  background: linear-gradient(180deg, #fff7fb 0%, #f6ecff 100%);
  border-right: 1px solid var(--border);
}

/* Cards & typography */
.glass{ background: var(--panel); border: 1px solid var(--border); box-shadow: var(--shadow);
  -webkit-backdrop-filter: blur(14px); backdrop-filter: blur(14px); border-radius: 24px; }
.title{ font-weight:900; font-size:2rem; letter-spacing:.2px;
  background: linear-gradient(90deg, var(--accent), var(--accent2));
  -webkit-background-clip:text; background-clip:text; color:transparent; }
.subtitle{ color:var(--muted); }

/* Chat bubbles */
.chat-wrap{ padding: 6px 8px 12px; }
.bubble{ max-width: 86%; padding:14px 18px; margin:12px 0; border-radius:22px;
  border:1px solid var(--border); box-shadow:0 10px 24px rgba(167,139,250,0.12); }
.user{ margin-left:auto; background: var(--bubbleUser); color:#0e2730; border-color: rgba(88,196,183,.25); }
.assistant{ margin-right:auto; background: var(--bubbleBot); color:var(--text); }

/* Assistant answer formatting */
.ans .hdr{margin:0 0 6px 0}
.ans b{color:#1f2a44}

/* Text input itself is the green pill */
.stTextInput>div>div{
  border-radius: 999px !important;
  background: var(--composeBG) !important;
  border:1px solid var(--composeBorder) !important;
  box-shadow: 0 10px 26px rgba(52,199,142,0.12);
}
.stTextInput>div>div>input{
  background: transparent !important;
  color: var(--text) !important;
  padding: 10px 16px; height: 42px;
}
.stTextInput input::placeholder{ color:#3c6b5f; opacity:.75; }

/* Buttons */
.stButton>button{
  background: linear-gradient(135deg, var(--accent), var(--accent2)); color:#fff; border:none;
  border-radius: 999px; padding:10px 16px; box-shadow:0 6px 18px rgba(167,139,250,0.25);
}
.stButton>button:hover{ filter: brightness(1.05); transform: translateY(-1px); }

/* Mic small circle */
.mic-btn .stButton>button{
  width:42px; height:42px; padding:0; display:flex; align-items:center; justify-content:center; border-radius:50%;
}

/* Mic panel */
.mic-panel{ margin-top:8px; border-radius:18px; border:1px solid var(--border);
  box-shadow:0 14px 36px rgba(167,139,250,0.22); padding:10px 12px; background:#fff; }

/* Sidebar chats list */
.chat-active{
  margin:10px 0 6px; padding:10px 12px; border-radius:12px; background:#fff; border:1px solid var(--border);
  box-shadow:0 6px 18px rgba(167,139,250,0.12); font-weight:700; color:var(--text);
}
.sidebar-history{
  margin-top:10px; background:#fff; border:1px solid var(--border);
  border-radius:16px; padding:10px; max-height:38vh; overflow:auto;
  box-shadow:0 6px 18px rgba(167,139,250,0.12);
}
.qa-li{ border-bottom:1px dashed var(--border); padding:8px 2px; }
.qa-li:last-child{ border-bottom:none; }
.qa-q{ font-weight:700; color:var(--text); }
.qa-a{ color:var(--muted); font-size:.92rem; margin-top:2px; }
.small{ color:var(--muted); font-size:.84rem; }

/* Footer */
.footer{ text-align:center; color: var(--muted); border-top:1px solid var(--border);
  background: linear-gradient(90deg, #fff0f6 0%, #f3e8ff 100%);
  padding: 8px 12px; border-radius: 16px; margin-top: 12px; }

/* spacers */
.spacer-12{height:12px;} .spacer-16{height:16px;} .spacer-24{height:24px;}
</style>
"""
st.markdown(MAGICAL_CSS, unsafe_allow_html=True)

# -----------------------
# Session State
# -----------------------
if "chats" not in st.session_state:
    cid = str(uuid.uuid4())[:8]
    st.session_state.chats = {
        cid: {
            "title": "New Chat",
            "messages": [
                {"role": "assistant", "content": "<p>Hello! How can I assist you today?</p>", "is_html": True, "refs": []}
            ],
            "created_ts": time.time(),
        }
    }
    st.session_state.current_chat_id = cid

if "mic_open" not in st.session_state:
    st.session_state.mic_open = False
if "last_user_fp" not in st.session_state:
    st.session_state.last_user_fp = None
if "last_voice_fp" not in st.session_state:
    st.session_state.last_voice_fp = None

def _fingerprint(text: str) -> int:
    return hash((" ".join(str(text).strip().split())).lower())

def _current_chat():
    return st.session_state.chats[st.session_state.current_chat_id]

def _new_chat():
    cid = str(uuid.uuid4())[:8]
    st.session_state.chats[cid] = {
        "title":"New Chat",
        "messages":[{"role":"assistant","content":"<p>Hello! How can I assist you today?</p>","is_html":True,"refs":[]}],
        "created_ts":time.time()
    }
    st.session_state.current_chat_id = cid

def _derive_title(text:str)->str:
    t = " ".join(text.strip().split())
    return (t[:28]+"…") if len(t)>31 else (t if t else "New Chat")

def _append(role, content, via_voice=False, refs=None, is_html=False, preview=None):
    chat = _current_chat()
    chat["messages"].append({
        "role": role, "content": content, "via_voice": via_voice, "refs": refs or [], "is_html": is_html,
        "preview": preview or (content if not is_html else "")
    })
    if role == "user":
        user_count = sum(1 for m in chat["messages"] if m["role"]=="user")
        if user_count == 1:
            chat["title"] = _derive_title(content)

def pairs_from_messages(msgs):
    pairs = []
    i = 0
    while i < len(msgs):
        if msgs[i]["role"] == "user":
            q = msgs[i]["preview"] if msgs[i].get("preview") else msgs[i]["content"]
            j = i + 1
            while j < len(msgs) and msgs[j]["role"] != "assistant":
                j += 1
            a_msg = msgs[j] if j < len(msgs) and msgs[j]["role"] == "assistant" else None
            a = (a_msg.get("preview") if a_msg and a_msg.get("preview") else "") if a_msg else ""
            pairs.append((q, a))
            i = j + 1 if a_msg else i + 1
        else:
            i += 1
    return pairs

def short(txt, n=90):
    t = " ".join(str(txt).split())
    return (t[:n] + "…") if len(t) > n else t

# ---------------
# Data (FIQA CSV)
# ---------------
@st.cache_resource(show_spinner=True)
def load_data():
    if not os.path.exists("cleaned_fiqa.csv"):
        raise FileNotFoundError("cleaned_fiqa.csv not found next to app.py")
    df = pd.read_csv("cleaned_fiqa.csv")
    rename = {}
    if "input" in df.columns: rename["input"] = "question"
    if "output" in df.columns: rename["output"] = "answer"
    if rename: df = df.rename(columns=rename)
    if "question" not in df.columns or "answer" not in df.columns:
        raise ValueError("CSV must have columns 'question' and 'answer' (or 'input'/'output').")
    return df.drop_duplicates().reset_index(drop=True)

try:
    df = load_data()
    DATA_OK = True
except Exception as e:
    st.error(f"🚨 Data loading error: {e}")
    DATA_OK = False
    df = pd.DataFrame(columns=["question","answer"])

# -------------
# Build models
# -------------
@st.cache_resource(show_spinner=True)
def build_models():
    embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    qa_device = 0 if torch.cuda.is_available() else -1
    qa_model = pipeline(
        "question-answering",
        model="deepset/bert-base-cased-squad2",
        tokenizer="deepset/bert-base-cased-squad2",
        device=qa_device
    )
    sum_device = 0 if torch.cuda.is_available() else -1
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=sum_device)
    summarizer_tok = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    return embedder, qa_model, summarizer, summarizer_tok, processor, asr_model

if DATA_OK:
    embedder, qa_model, summarizer, summarizer_tok, processor, asr_model = build_models()
else:
    embedder = qa_model = summarizer = summarizer_tok = processor = asr_model = None

# ------------
# FAISS index
# ------------
INDEX_FILE = "fiqa.index"
EMB_FILE   = "fiqa_embeddings.npy"

@st.cache_resource(show_spinner=True)
def build_or_load_index(corpus, _embedder):
    if os.path.exists(INDEX_FILE) and os.path.exists(EMB_FILE):
        index = faiss.read_index(INDEX_FILE)
        embeddings = np.load(EMB_FILE)
    else:
        embeddings = _embedder.encode(corpus, convert_to_numpy=True, show_progress_bar=True)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        faiss.write_index(index, INDEX_FILE)
        np.save(EMB_FILE, embeddings)
    return index, embeddings

if DATA_OK:
    corpus = (df["question"].astype(str) + " " + df["answer"].astype(str)).tolist()
    index, _ = build_or_load_index(corpus, embedder)
else:
    index = None

# ------------
# RAG helpers
# ------------
def safe_truncate_for_summarizer(text, tokenizer, safety_margin=50):
    max_tokens = getattr(tokenizer, "model_max_length", 1024)
    if not max_tokens or max_tokens <= 0 or max_tokens > 4096:
        max_tokens = 1024
    allowed = max_tokens - safety_margin
    tokens = tokenizer.encode(text, truncation=False)
    if len(tokens) <= allowed:
        return text
    return tokenizer.decode(tokens[:allowed], skip_special_tokens=True)

def retrieve_answer(query, top_k=3, summarizer_max_len=120, summarizer_min_len=30):
    """Return direct, explanation, refs(list)"""
    if not (DATA_OK and embedder is not None and index is not None):
        return ("Please add your dataset (cleaned_fiqa.csv) to enable finance Q&A.", "Explanation unavailable.", [])

    q_vec = embedder.encode([query], convert_to_numpy=True)
    _, indices = index.search(q_vec, top_k)

    retrieved = []
    for i in indices[0]:
        q = str(df.iloc[i]["question"])
        a = str(df.iloc[i]["answer"])
        retrieved.append(f"Q: {q}\nA: {a}")
    context_text = " ".join(retrieved)

    # QA
    try:
        res = qa_model(question=query, context=context_text)
        direct = (res.get("answer") or "").strip() or "I couldn't extract a short answer automatically."
    except Exception:
        direct = "I couldn't extract a short answer automatically."

    # Summary
    try:
        summary_input = f"Question: {query}\nContext: {context_text}"
        truncated = safe_truncate_for_summarizer(summary_input, summarizer_tok)
        summ = summarizer(truncated, max_length=summarizer_max_len, min_length=summarizer_min_len, do_sample=False)
        explain = summ[0]["summary_text"].strip()
    except Exception:
        explain = "Explanation unavailable. Here are the retrieved contexts."

    return direct, explain, retrieved

def build_answer_html(direct: str, explanation: str) -> str:
    return f"""
    <div class="ans">
      <p class="hdr">💡 <b>Direct Answer:</b> {escape(direct)}</p>
      <p>📖 <b>Explanation:</b> {escape(explanation)}</p>
    </div>
    """

def transcribe_audiosegment_to_text(seg, wav_path="input.wav"):
    seg.export(wav_path, format="wav")
    audio_input, sr = librosa.load(wav_path, sr=16000)
    input_values = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values
    with torch.no_grad():
        logits = asr_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]

def tts_bytes(text, lang="en"):
    buf = BytesIO()
    gTTS(text=text, lang=lang).write_to_fp(buf)
    return buf.getvalue()

# ----------
# Donut fig
# ----------
def donut_fig():
    labels = ['Housing','Transport','Food','Savings','Fun','Other']
    values = [35, 15, 20, 10, 10, 10]
    colors = ['#FBCFE8','#C4B5FD','#FDE68A','#86EFAC','#93C5FD','#FCA5A5']
    fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=0.62)])
    fig.update_traces(
        textinfo='percent',
        hovertemplate='%{label}: %{value}%<extra></extra>',
        marker=dict(colors=colors, line=dict(color='white', width=1))
    )
    fig.update_layout(
        showlegend=True,
        legend=dict(orientation="h", y=-0.1, font=dict(size=12, color="#1f2a44")),
        margin=dict(t=10,b=10,l=10,r=10),
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(color="#1f2a44")
    )
    return fig

# -----------
# SIDEBAR UI (Chats + History)
# -----------
with st.sidebar:
    st.markdown("### Chats")
    if st.button("➕ New Chat", use_container_width=True, key="btn_new_chat"):
        _new_chat()
        _rerun()

    # Active chat badge (not a button)
    active_chat = _current_chat()
    st.markdown(f"<div class='chat-active'>🗂️ {escape(active_chat['title'])}</div>", unsafe_allow_html=True)

    # Other chats as buttons
    for cid, meta in sorted(st.session_state.chats.items(), key=lambda x: x[1]["created_ts"], reverse=True):
        if cid == st.session_state.current_chat_id:
            continue
        label = meta["title"] or "New Chat"
        if st.button("➤ " + label, key=f"open_{cid}", use_container_width=True):
            st.session_state.current_chat_id = cid
            _rerun()

    # History of current chat (Q→A)
    st.markdown("### History")
    pairs = pairs_from_messages(active_chat["messages"])
    if not pairs:
        st.caption("No messages yet.")
    else:
        st.markdown('<div class="sidebar-history">', unsafe_allow_html=True)
        for q, a in pairs[::-1][:50]:
            q_sh = short(q, 100)
            a_sh = short(a or '— pending —', 120)
            st.markdown(
                f"<div class='qa-li'><div class='qa-q'>Q: {escape(q_sh)}</div>"
                f"<div class='qa-a'>A: {escape(a_sh)}</div></div>",
                unsafe_allow_html=True
            )
        st.markdown("</div>", unsafe_allow_html=True)

# -----------------
# MAIN LAYOUT AREA
# -----------------
left, center, right = st.columns([0.02, 0.66, 0.32])

with center:
    st.markdown('<div class="glass" style="padding:18px;">', unsafe_allow_html=True)

    # Header — nicer finance robot SVG
    finance_bot = """
    <svg width="64" height="64" viewBox="0 0 180 180" xmlns="http://www.w3.org/2000/svg" aria-hidden="true">
      <defs>
        <linearGradient id="g1" x1="0" y1="0" x2="1" y2="1">
          <stop offset="0" stop-color="#D946EF"/>
          <stop offset="1" stop-color="#A78BFA"/>
        </linearGradient>
        <linearGradient id="g2" x1="0" y1="0" x2="0" y2="1">
          <stop offset="0" stop-color="#FBCFE8"/>
          <stop offset="1" stop-color="#E9D5FF"/>
        </linearGradient>
      </defs>
      <!-- Body -->
      <rect x="28" y="52" rx="20" ry="20" width="124" height="92" fill="#FFFFFF" stroke="url(#g1)" stroke-width="4"/>
      <!-- Head bar -->
      <rect x="54" y="28" rx="10" ry="10" width="72" height="18" fill="url(#g1)"/>
      <!-- Antenna coin -->
      <circle cx="90" cy="24" r="6" fill="#FFD166"/>
      <!-- Eyes -->
      <circle cx="72" cy="90" r="11" fill="#8B5CF6"/>
      <circle cx="108" cy="90" r="11" fill="#8B5CF6"/>
      <!-- Mouth -->
      <rect x="64" y="110" rx="6" ry="6" width="52" height="10" fill="#E9D5FF"/>
      <!-- Finance coin -->
      <circle cx="44" cy="128" r="16" fill="url(#g2)" stroke="#A78BFA" stroke-width="3"/>
      <text x="44" y="132" text-anchor="middle" font-size="14" font-weight="700" fill="#7C3AED">$</text>
      <!-- Chart -->
      <polyline points="96,132 112,118 126,124 148,102" fill="none" stroke="#34C78E" stroke-width="5"/>
      <circle cx="96" cy="132" r="4" fill="#34C78E"/>
      <circle cx="112" cy="118" r="4" fill="#34C78E"/>
      <circle cx="126" cy="124" r="4" fill="#34C78E"/>
      <circle cx="148" cy="102" r="4" fill="#34C78E"/>
    </svg>
    """
    cA, cB = st.columns([0.12, 0.88])
    with cA: st.markdown(finance_bot, unsafe_allow_html=True)
    with cB:
        st.markdown('<div class="title">Finance Advisor</div>', unsafe_allow_html=True)
        st.markdown('<div class="subtitle">Clear answers on budgeting, investing, taxes, and loans.</div>', unsafe_allow_html=True)

    st.markdown('<div style="height:10px;border-bottom:1px solid var(--border);"></div>', unsafe_allow_html=True)

    # Chat history: render bubbles (single call per message)
    st.markdown('<div class="chat-wrap">', unsafe_allow_html=True)
    for m in _current_chat()["messages"]:
        cls = "assistant" if m["role"] == "assistant" else "user"
        # content
        if m.get("is_html"):
            inner = m["content"]
        else:
            inner = escape(str(m["content"])).replace("\n", "<br/>")
        st.markdown(f'<div class="bubble {cls}">{inner}</div>', unsafe_allow_html=True)

        if m["role"] == "assistant" and m.get("refs"):
            with st.expander("📚 References"):
                for r in m["refs"]:
                    if "\nA:" in r and r.startswith("Q: "):
                        q = r.split("\n",1)[0][3:].strip()
                        a = r.split("\n",1)[1][3:].strip()
                        a_short = short(a, 260)
                        st.markdown(f"<div class='ans'><p><b>Q:</b> {escape(q)}</p><p>{escape(a_short)}</p></div>", unsafe_allow_html=True)
                    else:
                        st.markdown(f"<div class='ans'>{escape(r)}</div>", unsafe_allow_html=True)
    st.markdown('</div>', unsafe_allow_html=True)

    st.markdown('<div class="spacer-16"></div>', unsafe_allow_html=True)

    # Composer row (input is the green pill by CSS). No extra bar above it.
    col1, col2, col3 = st.columns([0.74, 0.10, 0.16])
    with col1:
        user_text = st.text_input("Type your message…", placeholder="Type your message…",
                                  label_visibility="collapsed", key="composer_text")
    with col2:
        st.markdown('<div class="mic-btn">', unsafe_allow_html=True)
        mic_clicked = st.button("🎤", key="mic_button", help="Record voice")
        st.markdown('</div>', unsafe_allow_html=True)
    with col3:
        send_clicked = st.button("Send", key="send_button")

    if mic_clicked:
        st.session_state.mic_open = not st.session_state.mic_open

    # Handle text send + save to history
    if user_text and send_clicked:
        fp = _fingerprint(user_text)
        if st.session_state.last_user_fp != fp:
            _append("user", user_text, via_voice=False, is_html=False, preview=user_text)
            st.session_state.last_user_fp = fp
            with st.spinner("Thinking…"):
                direct, explain, refs = retrieve_answer(user_text)
                html_ans = build_answer_html(direct, explain)
            _append("assistant", html_ans, via_voice=False, refs=refs, is_html=True,
                    preview=f"Direct: {direct}")
            try:
                st.audio(tts_bytes(direct), format="audio/mp3", autoplay=True)
            except Exception:
                pass
            _rerun()

    # Mic panel
    if st.session_state.mic_open:
        st.markdown('<div class="mic-panel">', unsafe_allow_html=True)
        st.markdown("**🎙️ Recorder**")
        voice = audiorecorder("🎤 Start", "⏹ Stop")
        if len(voice) > 0:
            try:
                transcription = transcribe_audiosegment_to_text(voice)
                fp = _fingerprint(transcription)
                if st.session_state.last_voice_fp != fp:
                    _append("user", transcription, via_voice=True, is_html=False, preview=transcription)
                    st.session_state.last_voice_fp = fp
                    with st.spinner("Thinking…"):
                        direct, explain, refs = retrieve_answer(transcription)
                        html_ans = build_answer_html(direct, explain)
                    _append("assistant", html_ans, via_voice=False, refs=refs, is_html=True,
                            preview=f"Direct: {direct}")
                    try:
                        st.audio(tts_bytes(direct), format="audio/mp3", autoplay=True)
                    except Exception:
                        pass
                    st.success("Voice captured ✓")
                    _rerun()
            except Exception as e:
                st.error(f"ASR failed: {e}")
        st.markdown('</div>', unsafe_allow_html=True)

with right:
    st.markdown('<div class="glass" style="padding:16px;">', unsafe_allow_html=True)
    st.markdown("**Spending snapshot**")
    st.plotly_chart(donut_fig(), use_container_width=True, config={"displayModeBar": False})
    st.markdown('</div>', unsafe_allow_html=True)

    st.markdown('<div class="glass" style="padding:16px; margin-top:12px;">', unsafe_allow_html=True)
    st.markdown("**Goals**")
    st.markdown("- Build an emergency fund\n- Save for a vacation")
    st.markdown('</div>', unsafe_allow_html=True)

# Footer
st.markdown('<div class="footer">Finance Advisor • Calm, clear guidance.</div>', unsafe_allow_html=True)


Overwriting app.py


In [168]:
from pyngrok import ngrok
ngrok.set_auth_token("32QCzlk2KRXMeUta1w56akOljT4_YqBGQiKSMPeG82iJB55T")


In [203]:
# Expose the Streamlit app through ngrok
public_url = ngrok.connect('8501')
print(f"Streamlit app is live at: {public_url}")


Streamlit app is live at: NgrokTunnel: "https://26f89c197bfb.ngrok-free.app" -> "http://localhost:8501"


In [None]:
!streamlit run /content/app.py &



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.197.27.133:8501[0m
[0m
2025-09-14 04:41:07.328190: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757824867.368913   68403 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757824867.380980   68403 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757824867.411559   68403 computation_placer.cc:177] computation placer already regist

In [110]:
!pip install streamlit-audiorecorder transformers gTTS librosa




In [None]:
!apt-get update && apt-get install -y ffmpeg
!pip install gtts librosa

0% [Working]            Get:1 https://cli.github.com/packages stable InRelease [3,917 B]
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Connecting to security.                                                                               Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,006 kB]
Get:5 https://cli.github.com/packages stable/main amd64 Packages [346 B]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:7 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:9 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,797 kB]
Hit:11 https://ppa.launchpadconten

In [None]:
pip install streamlit-audiorecorder


Collecting streamlit-audiorecorder
  Downloading streamlit_audiorecorder-0.0.6-py3-none-any.whl.metadata (4.5 kB)
Downloading streamlit_audiorecorder-0.0.6-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.7/487.7 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: streamlit-audiorecorder
Successfully installed streamlit-audiorecorder-0.0.6


In [None]:
!pip install pyngrok


Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.3.0


In [None]:
!ls
