In [1]:
# Install into the running kernel's environment (%pip rather than !pip).
# The extras spec is quoted so the shell never treats the brackets as a glob.
# seaborn is listed explicitly because app.py imports it.
%pip install -q "bertopic[visualization]" sentence-transformers umap-learn hdbscan transformers torch datasets nltk matplotlib seaborn wordcloud pycountry pandas

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# %pip targets the kernel's environment; -q keeps the download log out of the notebook.
%pip install -q streamlit transformers torch

Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.46.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hI

In [7]:
%%writefile app.py
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import torch
import requests
from io import StringIO
import pycountry
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from wordcloud import WordCloud
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Streamlit re-executes this script from the top on every interaction, so only
# contact the NLTK download server when the stopwords corpus is not on disk yet.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

# === Setup === #
@st.cache_resource
def load_resources():
    """Build the (unfitted) BERTopic model and the sentiment pipeline.

    The stop-word list combines NLTK's English stop words, lower-cased city
    names from the world-cities CSV, lower-cased pycountry country names, and a
    fixed set of airline-domain words.

    Returns:
        tuple: ``(topic_model, sentiment_pipe)`` where ``topic_model`` is a
        configured ``BERTopic`` instance (not yet fitted) and
        ``sentiment_pipe`` is a ``transformers`` "sentiment-analysis" pipeline.

    Raises:
        requests.RequestException: if the city list cannot be downloaded
            (timeout, connection failure, or a non-2xx HTTP status).
    """
    # Stopwords and City/Country
    url = "https://raw.githubusercontent.com/datasets/world-cities/master/data/world-cities.csv"
    # Bound the request and fail loudly on an HTTP error instead of trying to
    # parse an error page as CSV.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    df_cities = pd.read_csv(StringIO(response.text))
    # dropna() keeps a missing name from ending up as a non-string stop word.
    city_names = df_cities['name'].dropna().str.lower().unique().tolist()
    country_names = [country.name.lower() for country in pycountry.countries]
    # NOTE(review): multi-word city/country names presumably never match the
    # vectorizer's single-word tokens, and city names that are also ordinary
    # words will be filtered out of topic keywords — confirm this is intended.
    custom_stopwords = list(set(
        stopwords.words('english') + city_names + country_names +
        ['flight', 'airline', 'seat', 'boarding', 'ticket', 'airport', 'staff', 'delta', 'emirates', 'lufthansa', 'united']
    ))

    # Models
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    vectorizer_model = CountVectorizer(stop_words=custom_stopwords, min_df=5)
    # random_state is fixed so the UMAP reduction is repeatable between runs.
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', prediction_data=True)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        language="english",
        calculate_probabilities=True,
        verbose=False
    )

    sentiment_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"
    sentiment_pipe = pipeline(
        "sentiment-analysis",
        model=AutoModelForSequenceClassification.from_pretrained(sentiment_checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(sentiment_checkpoint),
        # First CUDA device when one is available, otherwise CPU (-1).
        device=0 if torch.cuda.is_available() else -1
    )

    return topic_model, sentiment_pipe

# === Load data === #
@st.cache_data
def load_data():
    """Read the review CSV and return a one-column frame of review text.

    Rows with a missing ``customer_review`` are dropped, and each remaining
    review is converted to ``str`` and cut to its first 800 characters.

    Returns:
        pandas.DataFrame: a frame with the single column ``customer_review``.
    """
    reviews = pd.read_csv("airline_reviews_cleaned_dataset.csv")[['customer_review']].dropna()
    return reviews.assign(
        customer_review=reviews['customer_review'].astype(str).str[:800]
    )

# === Sentiment Mapper === #
# Maps the raw label strings emitted by the sentiment pipeline to readable names.
label_map = {
    "LABEL_0": "negative",
    "LABEL_1": "neutral",
    "LABEL_2": "positive",
}


def classify_sentiment(texts, sentiment_pipe):
    """Run ``sentiment_pipe`` over ``texts`` and return readable sentiment names.

    Args:
        texts: the input passed straight through to ``sentiment_pipe``.
        sentiment_pipe: a callable returning one dict per text, each with a
            ``'label'`` key that is present in ``label_map``.

    Returns:
        list[str]: one of "negative" / "neutral" / "positive" per prediction,
        in the order the pipeline returned them.

    Raises:
        KeyError: if a prediction carries a label missing from ``label_map``.
    """
    sentiments = []
    for prediction in sentiment_pipe(texts):
        sentiments.append(label_map[prediction['label']])
    return sentiments

# === Streamlit App === #
st.set_page_config(page_title="BERTopic + Sentiment Explorer", layout="wide")
st.title("✈️ Airline Review Analyzer with BERTopic & Sentiment")

topic_model, sentiment_pipe = load_resources()
df = load_data()

# === Fit model once and cache === #
# The fitted model and the labelled frame live in st.session_state, so the
# expensive fit below only runs when this session has not done it yet.
if "topics_fitted" in st.session_state:
    df = st.session_state['df']
    topic_model = st.session_state['topic_model']
else:
    with st.spinner("Fitting BERTopic model on full data..."):
        review_texts = df['customer_review'].tolist()
        topics, probs = topic_model.fit_transform(review_texts)
        df['topic'] = topics
        df['sentiment'] = classify_sentiment(review_texts, sentiment_pipe)
        st.session_state['df'] = df
        st.session_state['topic_model'] = topic_model
        st.session_state['topics_fitted'] = True

# === BERTopic Summary Button === #
if st.button("Show Topic Insights"):
    # Per-topic review count and sentiment shares, joined with BERTopic's
    # generated topic names.
    topic_info = topic_model.get_topic_info()
    topic_summary = (
        df.groupby('topic')
        .agg(
            review_count=('customer_review', 'count'),
            positive=('sentiment', lambda x: (x == 'positive').mean()),
            neutral=('sentiment', lambda x: (x == 'neutral').mean()),
            negative=('sentiment', lambda x: (x == 'negative').mean()),
            top_sentiment=('sentiment', lambda x: x.value_counts().idxmax()),
        )
        .reset_index()
        .merge(topic_info[['Topic', 'Name']], left_on='topic', right_on='Topic')
        .drop(columns=['Topic'])
    )

    st.success("Topic modeling and sentiment classification loaded!")

    st.subheader("🔍 Top 10 Most Negative Topics")
    topic_summary_sorted = topic_summary.sort_values('negative', ascending=False).head(10)
    st.dataframe(topic_summary_sorted[['Name', 'negative', 'review_count']])

    if topic_summary_sorted.empty:
        # Nothing to plot or sample; avoids an IndexError on iloc[0] below.
        st.info("No topics available to summarize.")
    else:
        # Explicit figure/axes so the figure can be closed after rendering;
        # otherwise every click leaves another open pyplot figure behind.
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.barplot(data=topic_summary_sorted, x='Name', y='negative', palette='Reds_r', ax=ax)
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.set_title("Top 10 Topics by % Negative Sentiment")
        st.pyplot(fig)
        plt.close(fig)

        worst_row = topic_summary_sorted.iloc[0]
        worst_topic = worst_row['Name']
        topic_id = worst_row['topic']
        st.markdown(f"### 👎 Negative Review Samples for Topic: **{worst_topic}**")
        negative_reviews = df.loc[
            (df['topic'] == topic_id) & (df['sentiment'] == 'negative'),
            'customer_review',
        ]
        if negative_reviews.empty:
            st.write("No negative reviews found for this topic.")
        else:
            # Series.sample(3) raises when fewer than 3 rows exist, so cap the
            # sample size at the number of reviews available.
            sample_reviews = negative_reviews.sample(min(3, len(negative_reviews)))
            for r in sample_reviews:
                st.write(f"- {r[:300]}...")

# === Try Your Own Review === #
st.subheader("🧪 Try Your Own Review")
user_input = st.text_area("Enter a customer review", height=150)

if st.button("Analyze My Review"):
    review_text = user_input.strip()
    if not review_text:
        # Do not send an empty string through the topic and sentiment models.
        st.warning("Please enter a review before analyzing.")
    else:
        predicted_topics, _ = topic_model.transform([review_text])
        topic_id = int(predicted_topics[0])
        # BERTopic uses topic -1 for outliers; label it as such instead of
        # showing the outlier bucket's keywords as if they were a real topic.
        topic_words = topic_model.get_topic(topic_id) if topic_id != -1 else None
        topic_name = ", ".join(w for w, _ in topic_words) if topic_words else "Unknown/Outlier"

        # Free-text input has no length cap, so truncate at the token level to
        # stay within the model's 512-token input limit.
        sentiment = sentiment_pipe([review_text], truncation=True, max_length=512)[0]
        st.markdown("**📝 Review:**")
        st.write(user_input)
        st.markdown(f"**🏷️ Topic:** {topic_name}")
        st.markdown(f"**Sentiment:** {label_map[sentiment['label']]} ({round(sentiment['score'], 2)})")


Overwriting app.py


In [4]:
# %pip installs into the kernel's environment; -q keeps the log short.
%pip install -q streamlit transformers torch
# localtunnel exposes the local Streamlit port through a public URL.
!npm install -g localtunnel

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K
added 22 packages in 3s
[1G[0K⠧[1G[0K
[1G[0K⠧[1G[0K3 packages are looking for funding
[1G[0K⠧[1G[0K  run `npm fund` for details
[1G[0K⠧[1G[0K

In [5]:
# Your public IP is the password requested by the localtunnel landing page.
# Fetched over HTTPS; -f makes curl fail on an HTTP error instead of printing
# an error page, -sS hides the progress meter but still shows real errors.
!curl -fsS https://ipv4.icanhazip.com

34.125.129.36


In [None]:
# Pin Streamlit to the same port the tunnel forwards, so a busy 8501 fails
# loudly instead of Streamlit silently moving to another port. Headless mode
# skips the first-run prompt and the attempt to open a browser.
!streamlit run app.py --server.port 8501 --server.headless true & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.129.36:8501[0m
[0m
[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0Kyour url is: https://ten-ads-clean.loca.lt
2025-07-13 22:04:14.177769: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752444254.202581    2452 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752444254.210220    2452 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has