<a href="https://colab.research.google.com/github/NeelimaKawatra/Clustering_Task/blob/main/Clustery.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing Packages

In [36]:
# Install the notebook's third-party dependencies with pip (shell escape).
# No versions are pinned, so the installed versions depend on when this runs.
print("üì¶ Installing required packages...")
!pip install pandas openpyxl bertopic sentence-transformers umap-learn hdbscan wordcloud matplotlib plotly

# NOTE: this message is printed unconditionally; it does not check pip's exit status.
print("‚úÖ All packages installed successfully!")

üì¶ Installing required packages...
‚úÖ All packages installed successfully!


Importing Libraries

In [37]:
print("üìö Importing libraries...")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from google.colab import files
import io
import re
import string
from collections import Counter
import warnings
import wordcloud
warnings.filterwarnings('ignore')

# Text preprocessing libraries
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

print("‚úÖ All libraries imported successfully!")

üìö Importing libraries...
‚úÖ All libraries imported successfully!


For Streamlit

In [38]:

# Install Streamlit and the pyngrok package via pip (shell escape).
!pip install streamlit pyngrok
# Install the localtunnel npm package into the current working directory.
# NOTE(review): presumably used to expose the Streamlit app from Colab - confirm.
!npm install localtunnel

[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º[1G[0K‚†¥[1G[0K‚†¶[1G[0K‚†ß[1G[0K‚†á[1G[0K‚†è[1G[0K‚†ã[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º[1G[0K
up to date, audited 23 packages in 1s
[1G[0K‚†º[1G[0K
[1G[0K‚†º[1G[0K3 packages are looking for funding
[1G[0K‚†º[1G[0K  run `npm fund` for details
[1G[0K‚†º[1G[0K
2 [31m[1mhigh[22m[39m severity vulnerabilities

To address all issues (including breaking changes), run:
  npm audit fix --force

Run `npm audit` for details.
[1G[0K‚†º[1G[0K

Logs folder

In [39]:
# Create /content if it is missing (-p makes this a no-op when it already exists).
!mkdir -p /content
# Create an empty /content/logs.txt, or refresh its timestamp if it already exists.
!touch /content/logs.txt


Defining the Streamlit app (preprocessing, clustering helpers, and pages)

In [61]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import time
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re
import string
import nltk

# Make sure the NLTK tokenizer and stopword resources exist locally
@st.cache_resource
def download_nltk_data():
    """Ensure the NLTK 'punkt' and 'stopwords' resources are present.

    Both resources are looked up first. If either lookup raises
    ``LookupError``, both packages are downloaded quietly.

    Returns:
        bool: Always ``True``.
    """
    required_paths = ('tokenizers/punkt', 'corpora/stopwords')
    try:
        for resource_path in required_paths:
            nltk.data.find(resource_path)
    except LookupError:
        # A single missing resource triggers a download of both packages.
        for package_name in ('punkt', 'stopwords'):
            nltk.download(package_name, quiet=True)
    return True

# Heavy clustering dependencies. Record whether they could be imported instead of
# assuming they are installed: page_clustering() checks PACKAGES_AVAILABLE and shows
# install instructions when it is False, which only works if a missing package does
# not crash the module at import time.
try:
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
    from umap import UMAP
    from hdbscan import HDBSCAN
    PACKAGES_AVAILABLE = True
except ImportError:
    # Keep the names defined so the module still loads; they must not be used
    # while PACKAGES_AVAILABLE is False.
    BERTopic = SentenceTransformer = UMAP = HDBSCAN = None
    PACKAGES_AVAILABLE = False

@st.cache_resource
def load_sentence_transformer(model_name):
    """Build a SentenceTransformer for ``model_name``.

    The function is wrapped in ``st.cache_resource``.

    Args:
        model_name: Model identifier passed straight to ``SentenceTransformer``.

    Returns:
        The constructed ``SentenceTransformer`` instance.
    """
    encoder = SentenceTransformer(model_name)
    return encoder

def basic_text_cleaning(text):
    """Lowercase a value and strip URLs, e-mail addresses and extra whitespace.

    Args:
        text: Any scalar value; it is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text, or ``""`` when ``pd.isna(text)`` is true.
    """
    if pd.isna(text):
        return ""

    cleaned = str(text).lower()

    # Applied in order: URLs, e-mail-like tokens, then whitespace collapsing.
    substitutions = (
        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ''),
        (r'\S+@\S+', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def advanced_text_cleaning(text, remove_stopwords=True, remove_punctuation=True, min_length=2):
    """Clean text with optional punctuation and stopword removal.

    The value is first passed through ``basic_text_cleaning``. It is then
    tokenized, optionally stripped of English stopwords, and filtered to drop
    short tokens and digit-only tokens.

    Args:
        text: Value to clean; NaN/None and ``""`` yield ``""``.
        remove_stopwords: Drop English stopwords when true.
        remove_punctuation: Delete every ``string.punctuation`` character
            before tokenizing when true.
        min_length: Minimum number of characters a token needs to be kept.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    if pd.isna(text) or text == "":
        return ""

    # Ensure NLTK data is downloaded before importing the corpus helpers
    download_nltk_data()
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    # Basic cleaning first
    text = basic_text_cleaning(text)

    # Remove punctuation if requested
    if remove_punctuation:
        text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize; `except Exception` (not a bare except) keeps the best-effort
    # fallback without swallowing KeyboardInterrupt/SystemExit.
    try:
        tokens = word_tokenize(text)
    except Exception:
        # Fallback if NLTK fails (e.g. tokenizer data missing)
        tokens = text.split()

    # Remove stopwords if requested
    if remove_stopwords:
        try:
            stop_words = set(stopwords.words('english'))
        except Exception:
            # Fallback if NLTK stopwords fail: small built-in list
            stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}
        tokens = [token for token in tokens if token not in stop_words]

    # Keep tokens that are long enough and not purely digits
    tokens = [token for token in tokens if len(token) >= min_length and not token.isdigit()]

    return ' '.join(tokens)

def analyze_text_quality(texts):
    """Summarize character-length and word-count statistics for a text collection.

    Entries that are NaN/None or blank after ``str(...).strip()`` count as empty
    and are excluded from the length and word statistics.

    Args:
        texts: Sized iterable of values (must support ``len``).

    Returns:
        dict: Keys ``total_texts``, ``empty_texts``, ``avg_length``,
        ``min_length``, ``max_length``, ``unique_texts``, ``avg_words``,
        ``min_words`` and ``max_words``. Every statistic is 0 when no usable
        text exists.
    """
    total = len(texts)
    usable = [str(item) for item in texts if pd.notna(item) and str(item).strip()]

    if not usable:
        zeroed = dict.fromkeys(
            ('avg_length', 'min_length', 'max_length', 'unique_texts',
             'avg_words', 'min_words', 'max_words'),
            0,
        )
        return {'total_texts': total, 'empty_texts': total, **zeroed}

    char_lengths = [len(entry) for entry in usable]
    # Words are whitespace-separated chunks; usable entries are never blank
    word_counts = [len(entry.split()) for entry in usable]

    return {
        'total_texts': total,
        'empty_texts': total - len(usable),
        'avg_length': np.mean(char_lengths),
        'min_length': min(char_lengths),
        'max_length': max(char_lengths),
        'unique_texts': len(set(usable)),
        'avg_words': np.mean(word_counts),
        'min_words': min(word_counts),
        'max_words': max(word_counts),
    }

def has_text_content(series):
    """Check if a column contains meaningful text content.

    A column qualifies when more than 30% of its non-null values are longer
    than two characters AND at least one value contains a space.

    Args:
        series: pandas Series to inspect.

    Returns:
        bool: ``True`` when the column looks like free text, else ``False``.
    """
    # Accept both object columns and pandas' dedicated string dtypes; a plain
    # `dtype == 'object'` comparison rejects the latter even though they hold text.
    is_text_dtype = (
        pd.api.types.is_object_dtype(series)
        or pd.api.types.is_string_dtype(series)
    )
    if not is_text_dtype:
        return False

    # Remove null values
    text_data = series.dropna().astype(str)

    if len(text_data) == 0:
        return False

    # Values longer than two characters, and values containing a space
    meaningful_text = text_data[text_data.str.len() > 2].count()
    has_words = text_data.str.contains(' ', na=False).sum()

    # bool() so callers get a plain Python bool rather than a NumPy scalar
    return bool((meaningful_text > len(text_data) * 0.3) and (has_words > 0))

def get_optimal_parameters(n_texts):
    """Pick clustering parameters according to the number of texts.

    Three size tiers are used: fewer than 50 texts, fewer than 200 texts, and
    everything else. All tiers use the 'all-MiniLM-L6-v2' embedding model.

    Args:
        n_texts: Number of texts to be clustered.

    Returns:
        dict: ``min_cluster_size``, ``min_samples``, ``n_neighbors``,
        ``n_components`` and ``embedding_model``.
    """
    # (exclusive upper bound, cluster-size floor, size divisor,
    #  min_samples, n_neighbors, n_components)
    bounded_tiers = (
        (50, 3, 15, 2, 5, 5),
        (200, 5, 25, 3, 10, 8),
    )

    # Defaults are the settings for the largest tier
    floor, divisor, min_samples, n_neighbors, n_components = 8, 40, 4, 15, 10
    for limit, *tier in bounded_tiers:
        if n_texts < limit:
            floor, divisor, min_samples, n_neighbors, n_components = tier
            break

    return {
        'min_cluster_size': max(floor, n_texts // divisor),
        'min_samples': min_samples,
        'n_neighbors': n_neighbors,
        'n_components': n_components,
        'embedding_model': 'all-MiniLM-L6-v2',
    }

def classify_confidence(probabilities, high_threshold=0.7, low_threshold=0.3):
    """Split probabilities into high / medium / low confidence masks.

    Args:
        probabilities: Value(s) supporting element-wise comparison, e.g. a
            NumPy array.
        high_threshold: Values at or above this are high confidence.
        low_threshold: Values below this are low confidence.

    Returns:
        tuple: ``(high, medium, low)`` comparison results, where medium means
        ``low_threshold <= p < high_threshold``.
    """
    is_low = probabilities < low_threshold
    is_high = probabilities >= high_threshold
    is_medium = (probabilities < high_threshold) & (probabilities >= low_threshold)
    return is_high, is_medium, is_low

def create_wordcloud_for_cluster(texts, cluster_id, title):
    """Render a small word cloud figure from a cluster's texts.

    Args:
        texts: Iterable of strings belonging to the cluster.
        cluster_id: Cluster identifier (not used by this function).
        title: Optional figure title; a falsy value leaves the plot untitled.

    Returns:
        A matplotlib figure, or ``None`` when the combined text is blank or
        when generation fails (a Streamlit warning is shown in that case).
    """
    try:
        # Merge the cluster, lowercase it, then turn punctuation into spaces
        merged = ' '.join(texts).lower()
        merged = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', ' ', merged))

        if not merged.strip():
            return None

        # Compact settings so several clouds fit side by side
        cloud_options = dict(
            width=300,
            height=200,
            background_color='white',
            max_words=30,
            colormap='viridis',
            relative_scaling=0.5,
        )
        cloud = WordCloud(**cloud_options).generate(merged)

        figure, axes = plt.subplots(figsize=(4, 2))
        axes.imshow(cloud, interpolation='bilinear')
        if title:
            axes.set_title(title, fontsize=10, fontweight='bold')
        axes.axis('off')
        return figure
    except Exception as error:
        st.warning(f"Could not generate word cloud for {title}: {str(error)}")
        return None
def display_wordclouds(topic_model, topics, texts):
    """Show one word cloud per cluster in a four-column Streamlit grid.

    Topic ``-1`` is skipped. Each cell shows the cluster id, its size, the
    first three keywords from ``topic_model.get_topic`` and the word cloud.

    Args:
        topic_model: Object exposing ``get_topic(topic_id)`` that returns
            ``(word, score)`` pairs.
        topics: Topic assignment for each text, aligned with ``texts``.
        texts: Indexable collection of the clustered texts.
    """
    st.subheader("üé® Word Clouds for Each Cluster")
    st.write("Visual representation of the most frequent words in each cluster:")

    per_row = 4
    cluster_ids = sorted(topic for topic in set(topics) if topic != -1)

    for row_start in range(0, len(cluster_ids), per_row):
        columns = st.columns(per_row)
        row_ids = cluster_ids[row_start:row_start + per_row]

        for column, topic_id in zip(columns, row_ids):
            with column:
                # Texts assigned to this cluster
                members = [texts[pos] for pos, assigned in enumerate(topics) if assigned == topic_id]

                # First three keywords for the caption
                top_terms = [word for word, _score in topic_model.get_topic(topic_id)[:3]]

                st.write(f"**Cluster {topic_id}** ({len(members)})")
                st.caption(f"{', '.join(top_terms)}")

                figure = create_wordcloud_for_cluster(members, topic_id, "")

                if figure is None:
                    st.info("No word cloud")
                else:
                    st.pyplot(figure, clear_figure=True)

def get_user_satisfaction_choice():
    """Get user's satisfaction with clustering results.

    Renders two side-by-side buttons and reports which one was pressed during
    the current script run.

    Returns:
        str | None: ``"happy"`` when the export button was pressed,
        ``"manual"`` when the manual-adjustment button was pressed, and
        ``None`` when neither was pressed.
    """
    st.markdown("---")
    st.subheader("üéØ How do you feel about these clustering results?")

    col1, col2 = st.columns(2)

    satisfaction_choice = None

    with col1:
        if st.button("‚úÖ **Happy with clustering**\nExport results as-is",
                    type="primary",
                    use_container_width=True,
                    help="The clusters look good! Download the results."):
            satisfaction_choice = "happy"

    with col2:
        if st.button("üîß **Want manual adjustments**\nReview and improve clusters",
                    use_container_width=True,
                    help="I want to review and manually adjust some clusters."):
            satisfaction_choice = "manual"

    # The unreachable statements that used to follow this return (a pasted copy
    # of classify_confidence's body referencing undefined names) were removed.
    return satisfaction_choice

def run_bertopic_clustering(texts, params):
    """Cluster texts with BERTopic, falling back to ``fallback_clustering`` on error.

    Args:
        texts: Sequence of texts to cluster.
        params: Dict providing ``n_neighbors``, ``n_components``,
            ``min_cluster_size``, ``min_samples`` and ``embedding_model``.

    Returns:
        tuple: ``(topic_model, topics, probabilities)``. When BERTopic raises,
        or yields no probabilities, the result of
        ``fallback_clustering(texts, embedding_model)`` is returned instead.
    """
    n_texts = len(texts)

    # Debug output about the incoming data
    print(f"Debug: Dataset size: {len(texts)}")
    print(f"Debug: Sample texts: {texts[:3]}")
    print(f"Debug: Params: {params}")

    # UMAP: cap neighbours and components at one less than the dataset size
    size_cap = n_texts - 1
    umap_settings = {
        'n_neighbors': min(params['n_neighbors'], size_cap),
        'n_components': min(params['n_components'], size_cap),
        'min_dist': 0.0,
        'metric': 'cosine',
        'random_state': 42,
    }
    umap_model = UMAP(**umap_settings)

    # HDBSCAN: scale the limits down for small datasets, with hard floors
    hdbscan_settings = {
        'min_cluster_size': max(2, min(params['min_cluster_size'], n_texts // 3)),
        'min_samples': max(1, min(params['min_samples'], n_texts // 5)),
        'metric': 'euclidean',
        'cluster_selection_method': 'eom',
    }
    hdbscan_model = HDBSCAN(**hdbscan_settings)

    # Embedding model comes from the cached loader
    embedding_model = load_sentence_transformer(params['embedding_model'])

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        verbose=True,
        calculate_probabilities=True,
        nr_topics="auto",
    )

    try:
        topics, probabilities = topic_model.fit_transform(texts)
        print(f"Debug: Generated {len(set(topics))} topics")
        print(f"Debug: Topic distribution: {dict(zip(*np.unique(topics, return_counts=True)))}")

        # Treat missing probabilities as a failed run
        if probabilities is None or len(probabilities) == 0:
            raise ValueError("No prediction data was generated")
    except Exception as error:
        print(f"Debug: BERTopic error: {str(error)}")
        # Any failure above switches to the simpler clustering path
        print("Debug: Trying fallback clustering...")
        return fallback_clustering(texts, embedding_model)

    return topic_model, topics, probabilities

def fallback_clustering(texts, embedding_model):
    """Fallback clustering using KMeans if BERTopic fails.

    Embeds the texts and tries several cluster counts, keeping the one with the
    best silhouette score (default 2 when none can be scored). It then derives
    pseudo-probabilities from each point's distance to its nearest centroid.

    Args:
        texts: Sequence of texts to cluster.
        embedding_model: Object exposing ``encode(texts)`` that returns the
            embedding matrix.

    Returns:
        tuple: ``(mock_topic_model, topics, probabilities)``. The mock model
        exposes ``get_topic_info()`` and ``get_topic(topic_id)``; the latter
        returns placeholder keywords, not real ones.
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    # Generate embeddings
    embeddings = embedding_model.encode(texts)

    # Try different numbers of clusters
    best_score = -1
    best_k = 2

    for k in range(2, min(len(texts)//2, 8)):
        try:
            candidate = KMeans(n_clusters=k, random_state=42)
            cluster_labels = candidate.fit_predict(embeddings)
            score = silhouette_score(embeddings, cluster_labels)
        except Exception:
            # Best effort: skip cluster counts that cannot be fitted or scored,
            # without swallowing KeyboardInterrupt/SystemExit.
            continue
        if score > best_score:
            best_score = score
            best_k = k

    # Final clustering with best k
    kmeans = KMeans(n_clusters=best_k, random_state=42)
    topics = kmeans.fit_predict(embeddings)

    # Mock probabilities: closer to the nearest centroid = higher value
    distances = kmeans.transform(embeddings)
    min_distances = np.min(distances, axis=1)
    max_dist = np.max(min_distances)
    if max_dist > 0:
        probabilities = 1 - (min_distances / max_dist)
    else:
        # Every point sits exactly on its centroid; avoid 0/0 producing NaN
        probabilities = np.ones_like(min_distances)

    # Create mock topic model
    class MockTopicModel:
        """Minimal stand-in for the topic-model methods used by this file."""

        def __init__(self, texts, topics):
            self.topics = topics
            self.texts = texts

        def get_topic_info(self):
            """Return a DataFrame with one ``Topic``/``Count`` row per cluster."""
            topic_counts = dict(zip(*np.unique(self.topics, return_counts=True)))
            return pd.DataFrame([
                {'Topic': topic, 'Count': count}
                for topic, count in topic_counts.items()
            ])

        def get_topic(self, topic_id):
            """Return placeholder ``(keyword, score)`` pairs for any topic id."""
            return [("keyword1", 0.5), ("keyword2", 0.4), ("keyword3", 0.3)]

    return MockTopicModel(texts, topics), topics, probabilities

def page_upload():
    """Page 1: File Upload, Column Selection, and Preprocessing

    Renders the upload page: reads a CSV/Excel file, previews it, lets the user
    pick a column, shows text-quality statistics, applies the chosen
    preprocessing level and, once the user confirms, stores the results in
    ``st.session_state`` and switches to the clustering page.

    Session-state keys written on confirmation: ``survey_data``,
    ``text_column``, ``processed_texts``, ``preprocessing_details``,
    ``original_stats``, ``processed_stats`` (only when preprocessing was
    applied) and ``current_page``.

    Any exception raised while reading or processing the upload is reported to
    the user as a file-reading error.
    """
    st.title("üîç Welcome to Clustery: Short Text Clustering")
    st.markdown("---")

    # File upload section
    st.subheader("üìÅ Please upload your file")

    uploaded_file = st.file_uploader(
        "Choose your data file",
        type=['csv', 'xlsx', 'xls'],
        help="Upload a CSV or Excel file containing your data"
    )

    if uploaded_file is not None:
        try:
            # Read the file; compare the extension case-insensitively so that
            # e.g. "DATA.CSV" is not sent to the Excel reader.
            if uploaded_file.name.lower().endswith('.csv'):
                df = pd.read_csv(uploaded_file)
            else:
                df = pd.read_excel(uploaded_file)

            st.success(f"‚úÖ File uploaded successfully!")

            # Show top 5 rows
            st.subheader("üìã Top 5 rows of your data:")
            st.dataframe(df.head(), use_container_width=True)

            # Show basic file info
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("üìä Total Rows", len(df))
            with col2:
                st.metric("üìä Total Columns", len(df.columns))
            with col3:
                st.metric("üíæ Memory Usage", f"{df.memory_usage(deep=True).sum() / 1024:.1f} KB")

            st.markdown("---")

            # Column selection
            st.subheader("üéØ Which column would you like to cluster?")

            # Identify potential text columns
            text_columns = []
            for col in df.columns:
                if has_text_content(df[col]):
                    text_columns.append(col)

            if text_columns:
                st.info(f"üí° Recommended text columns: {', '.join(text_columns)}")

            selected_column = st.selectbox(
                "Select a column:",
                df.columns,
                help="Choose the column containing the text you want to cluster"
            )

            # Check if selected column has text content
            if selected_column:
                if has_text_content(df[selected_column]):
                    # Text quality analysis (computed on non-null values only)
                    st.subheader("üìä Text Quality Analysis")

                    original_texts = df[selected_column].dropna()
                    original_stats = analyze_text_quality(original_texts)

                    col1, col2, col3, col4 = st.columns(4)
                    with col1:
                        st.metric("Total Responses", original_stats['total_texts'])
                    with col2:
                        st.metric("Empty/Invalid", original_stats['empty_texts'])
                    with col3:
                        st.metric("Avg Length", f"{original_stats['avg_length']:.1f} chars")
                    with col4:
                        st.metric("Avg Words", f"{original_stats['avg_words']:.1f}")

                    # Show sample data (each entry truncated to 150 characters)
                    st.subheader(f"üìñ Sample data from '{selected_column}':")
                    sample_data = df[selected_column].dropna().head(5)
                    for i, text in enumerate(sample_data, 1):
                        st.write(f"**{i}.** {str(text)[:150]}{'...' if len(str(text)) > 150 else ''}")

                    # Text length distribution
                    if len(original_texts) > 0:
                        text_lengths = [len(str(text)) for text in original_texts]
                        fig = plt.figure(figsize=(10, 4))
                        plt.hist(text_lengths, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
                        plt.title('Distribution of Text Lengths')
                        plt.xlabel('Text Length (characters)')
                        plt.ylabel('Count')
                        st.pyplot(fig, clear_figure=True)
                        # Release the figure; otherwise one accumulates in
                        # pyplot's registry on every script rerun.
                        plt.close(fig)

                    st.markdown("---")

                    # Preprocessing section
                    st.subheader("üîß Text Preprocessing Options")

                    preprocessing_option = st.radio(
                        "Choose preprocessing level:",
                        [
                            "No preprocessing (use original text)",
                            "Basic cleaning (URLs, emails, whitespace)",
                            "Advanced cleaning (+ stopwords, punctuation)",
                            "Custom preprocessing"
                        ],
                        help="Preprocessing can improve clustering quality by removing noise"
                    )

                    # Custom preprocessing options (these variables exist only
                    # when the custom level is selected, which is also the only
                    # branch below that reads them)
                    if preprocessing_option == "Custom preprocessing":
                        st.write("**Custom Options:**")
                        col1, col2 = st.columns(2)
                        with col1:
                            remove_stopwords = st.checkbox("Remove stopwords", value=True)
                            remove_punctuation = st.checkbox("Remove punctuation", value=True)
                        with col2:
                            min_length = st.slider("Minimum word length", 1, 5, 2)

                    # Process text based on selection
                    if preprocessing_option == "No preprocessing (use original text)":
                        processed_texts = [str(text) if pd.notna(text) else "" for text in df[selected_column]]
                        preprocessing_details = "No preprocessing applied"

                    elif preprocessing_option == "Basic cleaning (URLs, emails, whitespace)":
                        processed_texts = [basic_text_cleaning(text) for text in df[selected_column]]
                        preprocessing_details = "Basic cleaning: URLs, emails, whitespace"

                    elif preprocessing_option == "Advanced cleaning (+ stopwords, punctuation)":
                        processed_texts = [advanced_text_cleaning(text, remove_stopwords=True, remove_punctuation=True) for text in df[selected_column]]
                        preprocessing_details = "Advanced cleaning: URLs, emails, stopwords, punctuation, short words"

                    else:  # Custom preprocessing
                        processed_texts = [advanced_text_cleaning(text, remove_stopwords=remove_stopwords,
                                                                 remove_punctuation=remove_punctuation,
                                                                 min_length=min_length) for text in df[selected_column]]
                        preprocessing_details = f"Custom: stopwords={remove_stopwords}, punctuation={remove_punctuation}, min_length={min_length}"

                    # Filter out empty texts and texts of two characters or fewer
                    original_count = len(processed_texts)
                    processed_texts = [text.strip() for text in processed_texts if text.strip() and len(text.strip()) > 2]
                    filtered_count = len(processed_texts)

                    # Show before/after comparison
                    if preprocessing_option != "No preprocessing (use original text)":
                        st.subheader("üîç Before/After Comparison")

                        processed_stats = analyze_text_quality(processed_texts)

                        col1, col2 = st.columns(2)
                        with col1:
                            st.write("**Original:**")
                            st.metric("Total texts", original_stats['total_texts'])
                            st.metric("Avg length", f"{original_stats['avg_length']:.1f} chars")
                            st.metric("Avg words", f"{original_stats['avg_words']:.1f}")

                        with col2:
                            st.write("**Processed:**")
                            st.metric("Valid texts", filtered_count, f"{filtered_count - original_stats['total_texts']:+d}")
                            st.metric("Avg length", f"{processed_stats['avg_length']:.1f} chars", f"{processed_stats['avg_length'] - original_stats['avg_length']:+.1f}")
                            st.metric("Avg words", f"{processed_stats['avg_words']:.1f}", f"{processed_stats['avg_words'] - original_stats['avg_words']:+.1f}")

                        # Show sample processed texts
                        with st.expander("üìÑ Sample Processed Texts"):
                            for i, text in enumerate(processed_texts[:5], 1):
                                st.write(f"**{i}.** {text[:150]}{'...' if len(text) > 150 else ''}")

                    else:
                        # For no preprocessing, just show the count
                        st.info(f"üìä Ready for clustering: {filtered_count} texts (removed {original_count - filtered_count} empty/short texts)")

                    st.markdown("---")

                    # Proceed button
                    if len(processed_texts) >= 3:  # Minimum texts needed for clustering
                        st.success(f"‚úÖ Ready for clustering with {len(processed_texts)} texts!")

                        if st.button("üöÄ Let's do clustering!", type="primary", use_container_width=True):
                            # Store data in session state for the clustering page
                            st.session_state['survey_data'] = df
                            st.session_state['text_column'] = selected_column
                            st.session_state['processed_texts'] = processed_texts
                            st.session_state['preprocessing_details'] = preprocessing_details
                            st.session_state['original_stats'] = original_stats
                            if preprocessing_option != "No preprocessing (use original text)":
                                st.session_state['processed_stats'] = analyze_text_quality(processed_texts)
                            st.session_state['current_page'] = 'clustering'

                            st.balloons()
                            st.success("üéâ Great! Moving to clustering setup...")
                            st.rerun()
                    else:
                        st.error(f"‚ùå Need at least 3 texts for clustering. Current: {len(processed_texts)}")
                        st.write("Try:")
                        st.write("- Using less aggressive preprocessing")
                        st.write("- Selecting a different column")
                        st.write("- Checking your data quality")

                else:
                    # Warning for non-text column
                    st.error("‚ö†Ô∏è **This column doesn't have text to work on!**")
                    st.write("Please select a column that contains text responses suitable for clustering.")

                    # Show some sample data to help user understand
                    st.write(f"Sample data from '{selected_column}':")
                    sample_data = df[selected_column].dropna().head(3)
                    for i, value in enumerate(sample_data, 1):
                        st.write(f"**{i}.** {value}")

        except Exception as e:
            # Covers the whole upload/preview/preprocess flow, not only file reading
            st.error(f"‚ùå Error reading file: {str(e)}")
            st.write("Please make sure your file is a valid CSV or Excel file.")
            st.write("Common issues:")
            st.write("- File might be corrupted")
            st.write("- File might be too large")
            st.write("- File format might not be supported")
            st.write("- File might require specific encoding")

def _clamp_slider_default(value, low, high):
    """Return ``value`` limited to the inclusive range ``[low, high]``.

    Used so that a recommended parameter lying outside a slider's fixed
    bounds is still accepted as that slider's default.
    """
    return max(low, min(high, value))


def _build_cluster_results(texts, topics, probabilities, topic_model):
    """Assemble the per-response export table for a finished clustering run.

    Args:
        texts: The responses that were clustered.
        topics: Topic id assigned to each response; -1 is labelled "outlier".
        probabilities: Confidence value stored alongside each response.
        topic_model: Fitted model exposing ``get_topic(topic_id)``; the result
            is unpacked here as ``(word, score)`` pairs.

    Returns:
        pandas.DataFrame with columns ``text``, ``cluster``, ``confidence`` and
        ``cluster_label``. A label is the first three topic words joined by
        underscores, or ``"outlier"`` for topic -1.
    """
    results_df = pd.DataFrame({
        'text': texts,
        'cluster': topics,
        'confidence': probabilities
    })

    cluster_labels = {}
    for topic_num in set(topics):
        if topic_num != -1:
            words = topic_model.get_topic(topic_num)[:3]
            cluster_labels[topic_num] = "_".join([word for word, score in words])
        else:
            cluster_labels[topic_num] = "outlier"

    results_df['cluster_label'] = results_df['cluster'].map(cluster_labels)
    return results_df


def page_clustering():
    """Page 2: BERTopic Clustering

    Renders the clustering page of the app:

    1. Loads the texts to cluster from ``st.session_state`` (the preprocessed
       texts when present, otherwise a stripped copy of the raw column).
    2. Shows the recommended parameters and optionally lets the user
       customize them with sliders.
    3. Runs ``run_bertopic_clustering`` when the start button is pressed and
       stores the outcome in session state.
    4. Displays statistics, confidence breakdown, per-cluster details, word
       clouds and export / navigation options for a completed run.

    Session-state keys read: ``survey_data``, ``text_column``,
    ``processed_texts``, ``preprocessing_details``, ``original_stats``,
    ``processed_stats``.
    Session-state keys written: ``current_page``, ``topic_model``, ``topics``,
    ``probabilities``, ``texts``, ``clustering_complete``, ``final_results``.

    Returns:
        None. The function returns early when required packages or uploaded
        data are missing, or when clustering raises an exception.
    """
    st.title("üîç Clustery: BERTopic Clustering")

    # Back button
    if st.button("‚Üê Back to Upload", help="Go back to file upload"):
        st.session_state['current_page'] = 'upload'
        st.rerun()

    # Check if packages are available
    if not PACKAGES_AVAILABLE:
        st.error("‚ùå **Missing required packages!**")
        st.write("Please install the following packages:")
        st.code("pip install bertopic sentence-transformers umap-learn hdbscan")
        st.info("After installation, restart this page.")
        return

    # Get data from session state
    if 'survey_data' not in st.session_state or 'text_column' not in st.session_state:
        st.error("‚ùå **No data found!** Please upload data first.")
        if st.button("üîÑ Go to Upload"):
            st.session_state['current_page'] = 'upload'
            st.rerun()
        return

    df = st.session_state['survey_data']
    text_column = st.session_state['text_column']

    # Use processed texts if available, otherwise prepare from original
    if 'processed_texts' in st.session_state:
        texts = st.session_state['processed_texts']
        preprocessing_details = st.session_state.get('preprocessing_details', 'Unknown preprocessing')

        st.success(f"‚úÖ **Data loaded!** Using {len(texts)} processed responses from column '{text_column}'")
        st.info(f"üîß **Preprocessing applied:** {preprocessing_details}")

        # Show preprocessing summary if available
        if 'original_stats' in st.session_state and 'processed_stats' in st.session_state:
            with st.expander("üìä Preprocessing Summary"):
                original_stats = st.session_state['original_stats']
                processed_stats = st.session_state['processed_stats']

                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Original texts", original_stats['total_texts'])
                    st.metric("Processed texts", len(texts))
                with col2:
                    st.metric("Original avg length", f"{original_stats['avg_length']:.1f}")
                    st.metric("Processed avg length", f"{processed_stats['avg_length']:.1f}")
                with col3:
                    st.metric("Original avg words", f"{original_stats['avg_words']:.1f}")
                    st.metric("Processed avg words", f"{processed_stats['avg_words']:.1f}")
    else:
        # Fallback to original processing: drop missing values and keep only
        # entries longer than two characters after stripping whitespace
        texts = df[text_column].dropna().astype(str).tolist()
        texts = [text.strip() for text in texts if len(text.strip()) > 2]
        st.success(f"‚úÖ **Data loaded!** Using {len(texts)} responses from column '{text_column}'")
        st.warning("‚ö†Ô∏è No preprocessing was applied. Consider going back to add preprocessing for better results.")

    # Show data summary
    with st.expander("üìä Data Summary"):
        st.write(f"**Total responses:** {len(texts)}")
        st.write(f"**Column:** {text_column}")
        st.write(f"**Average length:** {np.mean([len(text) for text in texts]):.1f} characters")
        if 'preprocessing_details' in st.session_state:
            st.write(f"**Preprocessing:** {st.session_state['preprocessing_details']}")
        st.write("**Sample responses:**")
        for i, text in enumerate(texts[:3]):
            st.write(f"{i+1}. {text[:100]}{'...' if len(text) > 100 else ''}")

    st.markdown("---")

    # Get optimal parameters
    optimal_params = get_optimal_parameters(len(texts))

    st.subheader("üîß Clustering Configuration")

    col1, col2 = st.columns([3, 1])

    with col1:
        st.write("**Recommended parameters for your dataset:**")
        for key, value in optimal_params.items():
            st.write(f"- **{key.replace('_', ' ').title()}:** {value}")

    with col2:
        use_optimal = st.radio(
            "Parameter choice:",
            ["Use recommended", "Customize"],
            help="Recommended settings work best for most datasets"
        )

    # Parameter selection
    if use_optimal == "Customize":
        st.subheader("‚öôÔ∏è Custom Parameters")

        # Slider defaults are clamped to the slider bounds: st.slider rejects a
        # default outside [min, max], and the recommended values depend on the
        # dataset size.
        col1, col2 = st.columns(2)
        with col1:
            min_cluster_size = st.slider(
                "Min Cluster Size", 2, 20,
                _clamp_slider_default(optimal_params['min_cluster_size'], 2, 20))
            n_neighbors = st.slider(
                "UMAP Neighbors", 2, 30,
                _clamp_slider_default(optimal_params['n_neighbors'], 2, 30))
            embedding_model = st.selectbox("Embedding Model",
                                         ['all-MiniLM-L6-v2', 'all-mpnet-base-v2'],
                                         index=0)

        with col2:
            min_samples = st.slider(
                "Min Samples", 1, 10,
                _clamp_slider_default(optimal_params['min_samples'], 1, 10))
            n_components = st.slider(
                "UMAP Components", 2, 20,
                _clamp_slider_default(optimal_params['n_components'], 2, 20))

        params = {
            'min_cluster_size': min_cluster_size,
            'min_samples': min_samples,
            'n_neighbors': n_neighbors,
            'n_components': n_components,
            'embedding_model': embedding_model
        }
    else:
        params = optimal_params

    st.markdown("---")

    # Clustering button
    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        if st.button("üöÄ Start BERTopic Clustering", type="primary", use_container_width=True):

            progress_bar = st.progress(0)
            status_text = st.empty()

            try:
                # The first three status updates are emitted back to back; the
                # actual work happens inside run_bertopic_clustering below.
                status_text.text("üîÑ Initializing BERTopic...")
                progress_bar.progress(10)

                status_text.text("üîÑ Loading sentence transformer model...")
                progress_bar.progress(30)

                status_text.text("üîÑ Running clustering algorithm...")
                progress_bar.progress(50)

                # Run clustering
                topic_model, topics, probabilities = run_bertopic_clustering(texts, params)
                progress_bar.progress(80)

                # Store results so they survive Streamlit reruns
                st.session_state['topic_model'] = topic_model
                st.session_state['topics'] = topics
                st.session_state['probabilities'] = probabilities
                st.session_state['texts'] = texts
                st.session_state['clustering_complete'] = True

                progress_bar.progress(100)
                status_text.text("‚úÖ Clustering completed!")

                st.balloons()
                st.success("üéâ **Clustering successful!**")

            except Exception as e:
                st.error(f"‚ùå **Error during clustering:** {str(e)}")
                st.write("This might be due to:")
                st.write("- Dataset too small")
                st.write("- Text responses too similar")
                st.write("- Parameter settings need adjustment")
                # Stop here so results from an earlier run are not shown
                # underneath the error message
                return

    # Show results if clustering is complete
    if st.session_state.get('clustering_complete', False):

        st.markdown("---")
        st.header("üìä Clustering Results")

        # Always read the stored run so the display matches what was clustered
        topic_model = st.session_state['topic_model']
        topics = st.session_state['topics']
        probabilities = st.session_state['probabilities']
        texts = st.session_state['texts']

        # Basic statistics
        unique_topics = len(set(topics))
        outliers = sum(1 for t in topics if t == -1)
        clustered = len(texts) - outliers

        col1, col2, col3, col4 = st.columns(4)
        with col1:
            # Topic -1 is counted in unique_topics but is not a real cluster
            st.metric("üóÇÔ∏è Total Clusters", unique_topics - (1 if outliers > 0 else 0))
        with col2:
            st.metric("‚úÖ Clustered", clustered)
        with col3:
            st.metric("‚ùì Outliers", outliers)
        with col4:
            cluster_rate = (clustered / len(texts)) * 100
            st.metric("üìà Success Rate", f"{cluster_rate:.1f}%")

        # Confidence analysis
        st.subheader("üéØ Confidence Analysis")

        high_conf, medium_conf, low_conf = classify_confidence(probabilities)

        col1, col2, col3 = st.columns(3)
        with col1:
            high_count = sum(high_conf)
            high_pct = (high_count/len(probabilities)*100)
            st.metric("üü¢ High Confidence", f"{high_count}", f"{high_pct:.1f}%")
            st.caption("Probability ‚â• 0.7")

        with col2:
            med_count = sum(medium_conf)
            med_pct = (med_count/len(probabilities)*100)
            st.metric("üü° Medium Confidence", f"{med_count}", f"{med_pct:.1f}%")
            st.caption("Probability 0.3-0.7")

        with col3:
            low_count = sum(low_conf)
            low_pct = (low_count/len(probabilities)*100)
            st.metric("üî¥ Low Confidence", f"{low_count}", f"{low_pct:.1f}%")
            st.caption("Probability < 0.3")

        # Topic details
        st.subheader("üìù Cluster Details")

        topic_info = topic_model.get_topic_info()
        if len(topic_info) > 0:
            # Filter out outliers for main display (a no-op when topic -1 is absent)
            main_topics = topic_info[topic_info['Topic'] != -1]

            for idx, row in main_topics.iterrows():
                topic_num = row['Topic']
                topic_size = row['Count']

                # Get topic words
                topic_words = topic_model.get_topic(topic_num)
                top_words = [word for word, score in topic_words[:5]]

                # Get sample texts for this topic
                topic_indices = [i for i, t in enumerate(topics) if t == topic_num]
                topic_texts = [texts[i] for i in topic_indices]
                topic_probs = [probabilities[i] for i in topic_indices]

                with st.expander(f"üìã **Cluster {topic_num}** ({topic_size} responses) - {', '.join(top_words[:3])}"):

                    col1, col2 = st.columns([2, 1])

                    with col1:
                        st.write("**üî§ Top Keywords:**")
                        st.write(", ".join(top_words))

                        st.write("**üìÑ Sample Responses:**")
                        # Show top 5 responses with highest confidence
                        sorted_samples = sorted(zip(topic_texts, topic_probs), key=lambda x: x[1], reverse=True)
                        for i, (text, prob) in enumerate(sorted_samples[:5]):
                            confidence_emoji = "üü¢" if prob >= 0.7 else "üü°" if prob >= 0.3 else "üî¥"
                            st.write(f"{confidence_emoji} {text} *(conf: {prob:.2f})*")

                    with col2:
                        avg_confidence = np.mean(topic_probs)
                        st.metric("Avg Confidence", f"{avg_confidence:.2f}")

                        high_conf_in_topic = sum(1 for p in topic_probs if p >= 0.7)
                        st.metric("High Confidence Items", high_conf_in_topic)

        # Display word clouds
        display_wordclouds(topic_model, topics, texts)

        # User satisfaction choice; only the values "happy" and "manual" are acted on
        satisfaction_choice = get_user_satisfaction_choice()

        if satisfaction_choice == "happy":
            st.success("üéâ Great! Preparing your results for export...")

            results_df = _build_cluster_results(texts, topics, probabilities, topic_model)
            st.session_state['final_results'] = results_df

            col1, col2, col3 = st.columns([1, 2, 1])
            with col2:
                st.download_button(
                    "üì• Download Results CSV",
                    results_df.to_csv(index=False),
                    "clustering_results.csv",
                    "text/csv",
                    type="primary",
                    use_container_width=True
                )

        elif satisfaction_choice == "manual":
            st.info("üîß Proceeding to manual review configuration...")
            st.session_state['current_page'] = 'review'
            st.rerun()

        # Original buttons (still available)
        st.markdown("---")
        st.subheader("üîÑ Other Options")

        col1, col2, col3 = st.columns(3)

        with col1:
            if st.button("‚úÖ **Quick Export**", use_container_width=True):
                results_df = _build_cluster_results(texts, topics, probabilities, topic_model)

                st.session_state['final_results'] = results_df
                st.success("üéâ Results ready for export!")
                st.download_button(
                    "üì• Download Results CSV",
                    results_df.to_csv(index=False),
                    "clustering_results.csv",
                    "text/csv"
                )

        with col2:
            if st.button("üîß **Advanced Review**", use_container_width=True):
                st.session_state['current_page'] = 'review'
                st.rerun()

        with col3:
            if st.button("üîÑ **Start over**", use_container_width=True):
                # Clear session state (iterate over a copy of the keys because
                # entries are deleted during the loop)
                for key in list(st.session_state.keys()):
                    del st.session_state[key]
                st.session_state['current_page'] = 'upload'
                st.rerun()

def page_review():
    """Page 3: Manual Review Configuration

    Thin wrapper used by the page router: delegates all rendering to
    ``page_confidence_review()`` and returns nothing itself.
    """
    page_confidence_review()

def page_manual_review():
    """Page 4: Actual Manual Review

    As written, this function only renders a single bullet line; it contains
    no review logic of its own.
    """
    # NOTE(review): the original comment here read "Already implemented above".
    # If an earlier `def page_manual_review` exists in this file, this later
    # definition replaces it and the router would render only the line below.
    # TODO confirm and remove whichever definition is the leftover.
    pass  # no-op; kept from the original stub
    st.write("- üè∑Ô∏è Rename clusters")

def main():
    """Configure the Streamlit page and render the currently selected page.

    The active page name is kept in ``st.session_state['current_page']`` and
    defaults to ``'upload'``. Names without a registered page render nothing.
    """
    st.set_page_config(page_title="Clustery", layout="wide")

    # Seed the router state on the first run of a session
    if 'current_page' not in st.session_state:
        st.session_state['current_page'] = 'upload'

    # Page name -> render function
    routes = {
        'upload': page_upload,
        'clustering': page_clustering,
        'review': page_review,
        'manual_review': page_manual_review,
    }

    active_page = st.session_state.get('current_page', 'upload')
    render_page = routes.get(active_page)
    if render_page is not None:
        render_page()

# Script entry point: build the app only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Overwriting app.py


In [None]:
# Install tools for creating public URL access to Streamlit.
# NOTE(review): the earlier "For Streamlit" cell already runs
# `pip install streamlit pyngrok` and `npm install localtunnel`, so this cell
# looks redundant — confirm before removing it.
!pip install pyngrok
!npm install localtunnel


[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º[1G[0K‚†¥[1G[0K‚†¶[1G[0K‚†ß[1G[0K
up to date, audited 23 packages in 793ms

3 packages are looking for funding
  run `npm fund` for details

2 high severity vulnerabilities

To address all issues (including breaking changes), run:
  npm audit fix --force

Run `npm audit` for details.

In [62]:
# Three commands chained with `&`:
#   1. start the Streamlit app in the background, sending its output to /content/logs.txt
#   2. open a localtunnel to port 8501 in the background (prints the public URL)
#   3. print this machine's public IPv4 address
# NOTE(review): 8501 is presumably the port Streamlit serves on here, and the
# printed IP is presumably what the loca.lt landing page asks for — confirm.
!streamlit run app.py &>/content/logs.txt & npx localtunnel --port 8501 & curl ipv4.icanhazip.com

34.143.163.48
[1G[0K‚†ô[1G[0Kyour url is: https://late-phones-play.loca.lt


In [52]:
#!wget -q -O - ipv4.icanhazip.com
#!streamlit run app.py & npx localtunnel --port 8501