In [1]:
# Colab cell (bash)
!pip install -q streamlit scikit-learn sentence-transformers pyngrok joblib matplotlib seaborn scipy


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m107.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [19]:
%%writefile streamlit_app.py
# NOTE: this file is generated by a %%writefile notebook cell (everything in
# the cell after the magic line is written verbatim); edit the cell, not the file.
# streamlit_app.py
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sentence_transformers import SentenceTransformer
from scipy.sparse import hstack, csr_matrix

st.set_page_config(page_title="Hybrid Sentiment Demo", layout="wide")
sns.set_style("whitegrid")

# ---- User inputs (change paths here if needed) ----
ELEC_PATH = "/content/Amazon_Reviews_Filtered.csv"   # training CSV (Electronics)
BOOKS_PATH = "/content/filtered_reviews.csv"         # testing CSV (Books)
TFIDF_MAX_FEATURES = 5000
# ----------------------------------------------------

@st.cache_data(show_spinner=False)
def load_data(elec_path, books_path):
    """Read the two review CSV files into DataFrames.

    Args:
        elec_path: Path of the first CSV (used by the app as training data).
        books_path: Path of the second CSV (used by the app as testing data).

    Returns:
        A ``(first_frame, second_frame)`` tuple, in the same order as the
        arguments. Any error raised by ``pd.read_csv`` propagates to the caller.
    """
    # The generator is consumed left to right, so the first path is read first.
    first_frame, second_frame = (
        pd.read_csv(csv_path) for csv_path in (elec_path, books_path)
    )
    return first_frame, second_frame

@st.cache_resource(show_spinner=False)
def prepare_models(train_df, test_df, text_col_train='reviewText', text_col_test='reviewText', max_tfidf=5000):
    """Train four sentiment classifiers on ``train_df`` and score them on ``test_df``.

    Ratings in the ``overall`` column are binarised: ``<= 2`` becomes 0
    (negative) and ``>= 4`` becomes 1 (positive). Rows with any other rating,
    an unparseable rating, or a missing text/rating are dropped.

    Naive Bayes is fitted on TF-IDF features only; Logistic Regression,
    LinearSVC and Random Forest are fitted on TF-IDF horizontally stacked with
    ``all-MiniLM-L6-v2`` sentence embeddings.

    Args:
        train_df: DataFrame holding the training reviews and an ``overall`` column.
        test_df: DataFrame holding the evaluation reviews and an ``overall`` column.
        text_col_train: Name of the review-text column in ``train_df``.
        text_col_test: Name of the review-text column in ``test_df``.
        max_tfidf: ``max_features`` passed to the TF-IDF vectorizer.

    Returns:
        dict with the fitted ``vectorizer``, ``sbert`` encoder and the models
        (``nb``, ``lr``, ``svm``, ``rf``), the test matrices ``X_test_tfidf`` and
        ``X_test_hybrid``, the label array ``y_test``, and ``results``: a mapping
        of model display name to ``accuracy``/``precision``/``recall``/``f1``
        (positive class) plus the raw ``y_pred`` array.

    Raises:
        ValueError: If either split is empty after filtering, or the training
            split does not contain both classes.
    """

    def to_label(x):
        """Map a raw rating to 0 (<= 2), 1 (>= 4) or NaN (neutral/unparseable)."""
        try:
            xv = float(x)
        except (TypeError, ValueError):
            # Only conversion failures mean "not a rating"; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed.
            return np.nan
        if xv <= 2.0:
            return 0
        if xv >= 4.0:
            return 1
        return np.nan

    def labelled(df, text_col):
        """Return a copy of ``df`` reduced to usable rows with an int ``label`` column."""
        rows = df[[text_col, 'overall']].dropna().copy()
        rows[text_col] = rows[text_col].astype(str)
        rows['label'] = rows['overall'].apply(to_label)
        rows = rows.dropna(subset=['label'])
        rows['label'] = rows['label'].astype(int)
        return rows

    train = labelled(train_df, text_col_train)
    test = labelled(test_df, text_col_test)

    # Fail early with a readable message instead of an opaque error from deep
    # inside the vectorizer or a classifier.
    if train.empty or test.empty:
        raise ValueError(
            "No usable reviews left after dropping neutral/missing ratings "
            f"(train rows: {len(train)}, test rows: {len(test)})."
        )
    if train['label'].nunique() < 2:
        raise ValueError("Training data must contain both positive and negative reviews.")

    X_train_texts = train[text_col_train].values
    y_train = train['label'].values
    X_test_texts = test[text_col_test].values
    y_test = test['label'].values

    # TF-IDF (vocabulary learned from the training texts only)
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_tfidf)
    X_train_tfidf = vectorizer.fit_transform(X_train_texts)
    X_test_tfidf = vectorizer.transform(X_test_texts)

    # SBERT (embedding)
    sbert = SentenceTransformer('all-MiniLM-L6-v2')
    X_train_bert = sbert.encode(list(X_train_texts), convert_to_numpy=True, show_progress_bar=False)
    X_test_bert = sbert.encode(list(X_test_texts), convert_to_numpy=True, show_progress_bar=False)

    # Hybrid: request CSR directly so each estimator does not have to convert
    # the default COO result on every fit/predict call.
    X_train_hybrid = hstack([X_train_tfidf, csr_matrix(X_train_bert)], format='csr')
    X_test_hybrid = hstack([X_test_tfidf, csr_matrix(X_test_bert)], format='csr')

    # Models
    nb = MultinomialNB()
    lr = LogisticRegression(max_iter=2000)
    svm = LinearSVC()
    rf = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)

    # (display name, estimator, training matrix, evaluation matrix).
    # NB uses TF-IDF only -- presumably because MultinomialNB requires
    # non-negative features and sentence embeddings can be negative.
    model_specs = (
        ('Naive Bayes', nb, X_train_tfidf, X_test_tfidf),
        ('Logistic Regression', lr, X_train_hybrid, X_test_hybrid),
        ('SVM', svm, X_train_hybrid, X_test_hybrid),
        ('Random Forest', rf, X_train_hybrid, X_test_hybrid),
    )

    # Train, predict and score each model in turn.
    results = {}
    for name, model, X_fit, X_eval in model_specs:
        model.fit(X_fit, y_train)
        y_pred = model.predict(X_eval)
        results[name] = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, zero_division=0),
            "recall": recall_score(y_test, y_pred, zero_division=0),
            "f1": f1_score(y_test, y_pred, zero_division=0),
            "y_pred": y_pred,
        }

    artifacts = {
        "vectorizer": vectorizer,
        "sbert": sbert,
        "nb": nb,
        "lr": lr,
        "svm": svm,
        "rf": rf,
        "X_test_tfidf": X_test_tfidf,
        "X_test_hybrid": X_test_hybrid,
        "y_test": y_test,
        "results": results
    }
    return artifacts

# --- UI ---
st.title("📦 Cross-Category Sentiment Demo — TF-IDF + SBERT (Hybrid)")
st.write("Train on Electronics (train CSV) and test on Books (test CSV).")

# Load both CSVs while a spinner is shown; on failure report the error on the
# page and stop this script run so nothing below executes without data.
with st.spinner("Loading data..."):
    try:
        df_elec, df_books = load_data(ELEC_PATH, BOOKS_PATH)
    except Exception as load_error:
        st.error(f"Could not load CSV files. Error: {load_error}")
        st.stop()

# Preview the first five rows of each dataset side by side.
st.markdown("### Dataset samples")
col1, col2 = st.columns(2)
sample_panels = (
    (col1, "Electronics (train) sample", df_elec),
    (col2, "Books (test) sample", df_books),
)
for panel, caption, sample_source in sample_panels:
    with panel:
        st.write(caption)
        st.dataframe(sample_source.head(5))

# Build (or fetch from Streamlit's resource cache) the fitted models and
# evaluation results while a spinner is shown.
with st.spinner("Training models and building artifacts (may take 1-2 minutes)..."):
    artifacts = prepare_models(
        df_elec,
        df_books,
        'reviewText',
        'reviewText',
        TFIDF_MAX_FEATURES,
    )

# Pull out the pieces the rest of the page works with.
vectorizer, sbert, nb, lr, svm, rf, y_test, results = (
    artifacts[key]
    for key in ("vectorizer", "sbert", "nb", "lr", "svm", "rf", "y_test", "results")
)

st.subheader("Model comparison (tested on Books)")

# One row per model, ordered best F1 first, with a fresh 0..n-1 index.
metric_rows = []
for model_name, scores in results.items():
    metric_rows.append({
        "Model": model_name,
        "Accuracy": scores["accuracy"],
        "Precision": scores["precision"],
        "Recall": scores["recall"],
        "F1": scores["f1"],
    })
metrics_df = pd.DataFrame(metric_rows)
metrics_df = metrics_df.sort_values("F1", ascending=False)
metrics_df = metrics_df.reset_index(drop=True)

# Show every score column with three decimals.
score_format = {column: "{:.3f}" for column in ("Accuracy", "Precision", "Recall", "F1")}
st.dataframe(metrics_df.style.format(score_format))

# --- Attractive comparison plots (F1 & Accuracy) ---
sns.set_style("whitegrid")
plt.rcParams.update({'font.size': 12})


def _annotated_metric_bars(axis, frame, metric, xlabel, title, palette):
    """Draw one horizontal bar per model for ``metric`` on ``axis``, with value labels.

    Bars are ordered ascending by ``metric``, the x-axis is fixed to [0, 1],
    and each bar is annotated with its value to three decimals.
    """
    ordered = frame.sort_values(metric, ascending=True)
    # Colour each bar from the palette via hue="Model" with the legend off;
    # passing `palette` without `hue` is deprecated as of seaborn 0.13.
    sns.barplot(x=metric, y="Model", hue="Model", data=ordered, ax=axis,
                palette=palette, legend=False)
    axis.set_xlim(0, 1)
    axis.set_xlabel(xlabel)
    axis.set_title(title)
    for row, val in enumerate(ordered[metric].values):
        axis.text(val + 0.01, row, f"{val:.3f}", va='center', fontweight='bold', color='black')


fig, ax = plt.subplots(1, 2, figsize=(14, 5), constrained_layout=True)

# Left: F1 scores; right: accuracy (different palette).
_annotated_metric_bars(ax[0], metrics_df, "F1", "F1 Score", "F1 Score (Test)", "viridis")
_annotated_metric_bars(ax[1], metrics_df, "Accuracy", "Accuracy", "Accuracy (Test)", "rocket")

st.pyplot(fig)
# Streamlit re-executes this script on every interaction; close the figure so
# pyplot does not keep one more open figure per rerun.
plt.close(fig)

# Announce best model (by F1)
# metrics_df is sorted by F1 in descending order, so its first row is the winner.
best_model_name = metrics_df["Model"].iloc[0]
best_model_banner = f"Best model (by F1 on Books test): **{best_model_name}**"
st.success(best_model_banner)

# --- Remove confusion matrix display per request (do not show it) ---

# ---------------------------
# Real-time Prediction Section
# ---------------------------
st.subheader("🧠 Real-time Sentiment Prediction")

review = st.text_area("Enter a review to predict sentiment:", height=120)

if st.button("Predict"):
    if not review.strip():
        st.error("⚠️ Please enter a review.")
    else:
        # Feature extraction: same TF-IDF vocabulary and SBERT encoder that
        # were fitted/loaded during training.
        tf_feat = vectorizer.transform([review])
        bert_feat = sbert.encode([review], convert_to_numpy=True)
        hybrid_feat = hstack([tf_feat, csr_matrix(bert_feat)], format="csr")

        mapping = {1: "Positive 😊", 0: "Negative 😞"}

        # (display name, fitted model, feature matrix it was trained on).
        live_models = (
            ("Naive Bayes", nb, tf_feat),
            ("Logistic Regression", lr, hybrid_feat),
            ("SVM", svm, hybrid_feat),
            ("Random Forest", rf, hybrid_feat),
        )
        per_model = {
            name: mapping[int(model.predict(features)[0])]
            for name, model, features in live_models
        }

        # Test-set accuracy per model, indexed by name for direct lookup.
        accuracy_by_model = metrics_df.set_index("Model")["Accuracy"]

        # Show per-model predictions
        st.markdown("### 📊 Model-wise Predictions")
        df_live = pd.DataFrame({
            "Model": list(per_model),
            "Prediction": list(per_model.values()),
            "Accuracy": [accuracy_by_model[name] for name in per_model],
        })

        st.table(df_live.style.format({"Accuracy": "{:.3f}"}))

        # 🎯 Display only best model prediction
        best_model_name = metrics_df.loc[metrics_df["F1"].idxmax(), "Model"]
        best_pred = per_model[best_model_name]
        best_acc = accuracy_by_model[best_model_name]

        st.markdown("---")
        st.markdown(f"### 🏆 Best Model: **{best_model_name}**")
        st.markdown(f"**Predicted Sentiment:** {best_pred}")
        st.markdown(f"**Model Accuracy:** `{best_acc:.3f}`")

        # Visualization of accuracies. These are the accuracies measured on
        # the Books test set, not anything computed from the single review
        # typed above, so the title says so.
        fig, ax = plt.subplots(figsize=(6, 3))
        # hue="Model" + legend=False colours each bar from the palette;
        # `palette` without `hue` is deprecated as of seaborn 0.13.
        sns.barplot(x="Accuracy", y="Model", hue="Model", data=df_live,
                    palette="coolwarm", ax=ax, legend=False)
        ax.set_xlim(0, 1)
        ax.set_title("Model Accuracy on Books Test Set")
        st.pyplot(fig)
        # Close the figure so repeated clicks/reruns do not accumulate figures.
        plt.close(fig)






In [16]:
!pip install -q pyngrok
from pyngrok import ngrok
ngrok.set_auth_token("33xQ32F2JBb6URiZ5bHskpcdcbG_81xbFL1TqcVcLy9fzGhS2")

In [17]:
# Start streamlit in background and open ngrok tunnel
from pyngrok import ngrok
import subprocess, time, os, signal
import socket

STREAMLIT_PORT = 8501            # Streamlit's default port
STARTUP_TIMEOUT_S = 60           # give up if the server is not listening by then
STREAMLIT_LOG = "streamlit.log"  # the server's stdout/stderr end up here

# Kill old tunnels if any
ngrok.kill()

# Start the app. Output goes to a log file rather than subprocess.PIPE: a pipe
# that nobody reads eventually fills up and blocks the child process.
cmd = ["streamlit", "run", "streamlit_app.py", f"--server.port={STREAMLIT_PORT}", "--server.headless=true"]
with open(STREAMLIT_LOG, "w") as log_file:
    # The child inherits its own handle, so ours can be closed right away.
    proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT)

# Wait until the server accepts connections instead of sleeping a fixed time,
# and notice immediately if Streamlit died during startup.
deadline = time.monotonic() + STARTUP_TIMEOUT_S
while True:
    if proc.poll() is not None:
        raise RuntimeError(
            f"Streamlit exited during startup (code {proc.returncode}); see {STREAMLIT_LOG}."
        )
    try:
        socket.create_connection(("127.0.0.1", STREAMLIT_PORT), timeout=1).close()
        break
    except OSError:
        if time.monotonic() >= deadline:
            proc.kill()
            raise TimeoutError(
                f"Streamlit did not start listening on port {STREAMLIT_PORT} "
                f"within {STARTUP_TIMEOUT_S}s; see {STREAMLIT_LOG}."
            ) from None
        time.sleep(0.5)

# Open ngrok tunnel to the Streamlit port
public_url = ngrok.connect(STREAMLIT_PORT, "http")
print("Streamlit public URL:", public_url.public_url)
print("If the page shows a spinner, wait 10-30 seconds and refresh the browser.")


Streamlit public URL: https://unrecruited-marcelle-lang.ngrok-free.dev
If the page shows a spinner, wait 10-30 seconds and refresh the browser.


In [20]:
# Stop ngrok and kill streamlit
from pyngrok import ngrok
ngrok.kill()
# kill child process (if still running)
import psutil, os
# Ask process_iter to pre-fetch the command line: for processes we are not
# allowed to inspect it yields None instead of raising, so one inaccessible
# process cannot abort the whole sweep.
for p in psutil.process_iter(['cmdline']):
    cmdline = p.info.get('cmdline') or []
    if 'streamlit' not in ' '.join(cmdline).lower():
        continue
    try:
        p.kill()
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        # Best effort: the process already exited or belongs to someone else.
        continue
print("Stopped Streamlit and ngrok.")


Stopped Streamlit and ngrok.
