In [1]:
!pip install syllapy textstat sumy pyngrok streamlit pymupdf docx2txt


Collecting syllapy
  Downloading syllapy-0.7.2-py3-none-any.whl.metadata (854 bytes)
Collecting textstat
  Downloading textstat-0.7.5-py3-none-any.whl.metadata (15 kB)
Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting streamlit
  Downloading streamlit-1.42.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.0.32-py3-none-any.whl.metadata (3.6 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collec

In [2]:
import kagglehub
import os
import shutil

# Download latest version; `path` is the local directory the download was placed in.
path = kagglehub.dataset_download("nltkdata/gutenberg")

# Define the desired path
desired_path = '/content/dataset'

# Create the directory if it doesn't exist
os.makedirs(desired_path, exist_ok=True)

# Copy (rather than move) the download into the desired folder. Moving into an
# existing directory raises on a second run because the target already exists,
# and it empties the download location; copying with dirs_exist_ok=True keeps
# this cell re-runnable while producing the same <desired_path>/<version> layout.
target_path = os.path.join(desired_path, os.path.basename(os.path.normpath(path)))
shutil.copytree(path, target_path, dirs_exist_ok=True)

print(f"Dataset saved to {desired_path}")


Downloading from https://www.kaggle.com/api/v1/datasets/download/nltkdata/gutenberg?dataset_version_number=1...


100%|██████████| 4.13M/4.13M [00:00<00:00, 6.06MB/s]

Extracting files...





Dataset saved to /content/dataset


In [3]:
import kagglehub
import os
import shutil
import requests
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio  # For setting the renderer
import nltk
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from wordcloud import WordCloud
from sklearn.manifold import TSNE
from transformers import pipeline, AutoTokenizer, AutoModel
import torch

In [7]:

%%writefile app.py
import streamlit as st
import os
import requests
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import nltk
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from wordcloud import WordCloud
from sklearn.manifold import TSNE
from transformers import pipeline, AutoTokenizer, AutoModel
import torch
import fitz  # For PDF extraction
import docx2txt  # For DOCX extraction

# -------------------------------
# Page Configuration
# -------------------------------
# Configure the browser tab title and use the full page width.
st.set_page_config(page_title="Advanced Text Analysis Dashboard", layout="wide")
# Inject custom CSS (font, gradients, button, header and fixed footer styling).
# unsafe_allow_html=True is required so the <style> tag is not escaped.
# NOTE(review): `.reportview-container` and `.sidebar .sidebar-content` look like
# legacy Streamlit class names - confirm they still match the DOM of the installed
# Streamlit version, otherwise the two background gradients are silently ignored.
st.markdown(
    """
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;700&display=swap');
    body, html {
        font-family: 'Roboto', sans-serif;
    }
    /* Vibrant colorful background gradients */
    .reportview-container {
        background: linear-gradient(135deg, #f6d365, #fda085);
    }
    .sidebar .sidebar-content {
        background: linear-gradient(135deg, #a1c4fd, #c2e9fb);
    }
    /* Colorful button styling */
    .stButton>button {
        background-color: #ff6f61;
        color: white;
        padding: 10px 20px;
        border: none;
        border-radius: 10px;
        font-weight: bold;
        transition: background-color 0.3s ease;
    }
    .stButton>button:hover {
        background-color: #ff3b2e;
    }
    /* Header styling */
    h1 {
        color: #333;
        text-align: center;
        font-size: 3em;
        margin-bottom: 20px;
        text-shadow: 2px 2px 5px rgba(0,0,0,0.2);
    }
    h2, h3 {
        color: #333;
    }
    /* Footer styling */
    .footer {
        position: fixed;
        left: 0;
        bottom: 0;
        width: 100%;
        background-color: #f6d365;
        color: #333;
        text-align: center;
        padding: 10px 0;
        font-size: 0.9em;
        border-top: 1px solid #ccc;
    }
    </style>
    """,
    unsafe_allow_html=True,
)

# Main page heading.
st.title("Advanced Text Analysis Dashboard")

# -------------------------------
# Download NLTK Data
# -------------------------------
# Tokenizer data needed by nltk.word_tokenize / nltk.sent_tokenize further down.
nltk.download('punkt')
try:
    # Best effort: presumably only some NLTK releases ship/need 'punkt_tab'.
    nltk.download('punkt_tab')
except Exception:
    # Deliberately non-fatal, but narrowed from a bare `except:` so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    pass

# ===============================
# Analysis Functions
# ===============================

def load_dataset(dataset_path):
    """Read every regular file directly inside `dataset_path` as text.

    Args:
        dataset_path: Directory containing the comparison documents.

    Returns:
        A `(dataset_texts, file_names)` pair of parallel lists, ordered by file
        name. Both lists are empty when the directory is missing.

    Files are decoded as ISO-8859-1. Unreadable files are reported with
    `st.error` and skipped; sub-directories are skipped silently.
    """
    dataset_texts, file_names = [], []
    # isdir (not exists) so that a path pointing at a plain file also takes the
    # warning branch instead of crashing in os.listdir.
    if not os.path.isdir(dataset_path):
        st.warning(f"Dataset path '{dataset_path}' not found. Skipping dataset comparison.")
        return dataset_texts, file_names
    # Sorted so the heatmap columns come out in a stable order across runs.
    for file in sorted(os.listdir(dataset_path)):
        file_path = os.path.join(dataset_path, file)
        if not os.path.isfile(file_path):
            continue
        try:
            with open(file_path, "r", encoding="ISO-8859-1") as f:
                content = f.read()
        except Exception as e:
            st.error(f"Skipping {file}: {e}")
            continue
        dataset_texts.append(content)
        file_names.append(file)
    return dataset_texts, file_names

def extract_text_from_url(url, timeout=10):
    """Download `url` and return the text of its <p> elements joined by spaces.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The concatenated paragraph text, or "" when the request fails or the
        response status is not 200.
    """
    try:
        # Without a timeout a stalled server would hang the app indefinitely.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Covers malformed/empty URLs, DNS and connection errors, and timeouts.
        st.error(f"Could not fetch URL: {exc}")
        return ""
    if response.status_code != 200:
        return ""
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = (p.get_text() for p in soup.find_all('p'))
    return ' '.join(text for text in paragraphs if text)

def count_syllables(word):
    """Estimate the syllable count of `word` with a vowel-group heuristic.

    Each run of consecutive vowels (a, e, i, o, u, y) counts once; a trailing
    "e" is discounted when more than one run was found. The result is never
    lower than 1.
    """
    lowered = word.lower()
    vowel_runs = 0
    inside_run = False
    for letter in lowered:
        is_vowel = letter in "aeiouy"
        # Only the first vowel of a run starts a new syllable.
        if is_vowel and not inside_run:
            vowel_runs += 1
        inside_run = is_vowel
    if vowel_runs > 1 and lowered.endswith("e"):
        vowel_runs -= 1
    return max(vowel_runs, 1)

def compute_readability_scores(text):
    """Compute Flesch-Kincaid grade and a Dale-Chall style score for `text`.

    Returns:
        dict with keys "Flesch_Kincaid_Grade" and "Dale_Chall". Both are 0 when
        the text contains no words.

    Note: the "easy word" list below is a short built-in list, so most words are
    classified as difficult and the Dale-Chall value is only a rough indicator.
    """
    tokens = nltk.word_tokenize(text)
    # Count only alphabetic tokens as words. The tokenizer also emits punctuation
    # tokens, which previously inflated words-per-sentence and deflated
    # syllables-per-word because syllables were already counted on alphabetic
    # tokens only.
    words = [token for token in tokens if token.isalpha()]
    sentences = nltk.sent_tokenize(text)
    total_words = len(words)
    total_sentences = len(sentences)
    total_syllables = sum(count_syllables(word) for word in words)

    if total_sentences == 0 or total_words == 0:
        fk_grade = 0
    else:
        fk_grade = 0.39 * (total_words / total_sentences) + 11.8 * (total_syllables / total_words) - 15.59

    dale_chall_easy_words = {
        "a", "about", "all", "and", "are", "as", "at", "be", "but", "by", "can",
        "for", "if", "in", "is", "it", "of", "on", "or", "that", "the", "to",
        "was", "with", "i", "you", "he", "she", "we", "they", "this", "there"
    }
    difficult_words = [word for word in words if word.lower() not in dale_chall_easy_words]
    difficult_word_percentage = (len(difficult_words) / total_words) * 100 if total_words > 0 else 0
    average_sentence_length = total_words / total_sentences if total_sentences > 0 else 0
    raw_score = 0.1579 * difficult_word_percentage + 0.0496 * average_sentence_length
    # Adjustment applied when more than 5% of the words are difficult.
    if difficult_word_percentage > 5:
        raw_score += 3.6365

    return {"Flesch_Kincaid_Grade": fk_grade, "Dale_Chall": raw_score}

def _bert_sentence_embeddings(sentences):
    """Return one mean-pooled MiniLM embedding (1-D numpy array) per sentence."""
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
    bert_model = AutoModel.from_pretrained(model_name)
    embeddings = []
    for sentence in sentences:
        # No padding: each sentence is encoded on its own. Padding to 512 tokens
        # made the plain mean below average over pad positions as well, which
        # pulled every sentence embedding towards the same vector.
        inputs = bert_tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )
        with torch.no_grad():
            outputs = bert_model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return embeddings


def _word2vec_sentence_embeddings(sentences, word2vec_model):
    """Return the mean word vector per sentence (zeros when no word is known)."""
    embeddings = []
    for sentence in sentences:
        word_vectors = [
            word2vec_model.wv[word]
            for word in simple_preprocess(sentence)
            if word in word2vec_model.wv
        ]
        if word_vectors:
            embeddings.append(np.mean(word_vectors, axis=0))
        else:
            embeddings.append(np.zeros(word2vec_model.vector_size))
    return embeddings


def jacobs_semantic_complexity(text, method="word2vec", word2vec_model=None):
    """Measure how similar each sentence is to the one that follows it.

    Args:
        text: Input text; it is split into sentences with NLTK.
        method: "bert" to embed sentences with MiniLM, anything else to average
            Word2Vec word vectors.
        word2vec_model: Trained Word2Vec model; required unless method is "bert".

    Returns:
        dict with "similarities" (cosine similarity of each consecutive sentence
        pair), "avg_similarity" and "std_similarity". For texts with fewer than
        two sentences the list is empty and both statistics are None.

    Raises:
        ValueError: if the Word2Vec method is selected without a model.
    """
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < 2:
        return {"avg_similarity": None, "std_similarity": None, "similarities": []}

    if method == "bert":
        embeddings = _bert_sentence_embeddings(sentences)
    else:
        if word2vec_model is None:
            raise ValueError("A Word2Vec model must be provided when using method='word2vec'.")
        embeddings = _word2vec_sentence_embeddings(sentences, word2vec_model)

    similarities = [
        cosine_similarity([current], [following])[0][0]
        for current, following in zip(embeddings, embeddings[1:])
    ]
    avg_similarity = np.mean(similarities)
    std_similarity = np.std(similarities)
    return {"avg_similarity": avg_similarity, "std_similarity": std_similarity, "similarities": similarities}

def plot_semantic_complexity(metrics):
    """Build a line chart of consecutive-sentence similarities.

    Args:
        metrics: dict as returned by `jacobs_semantic_complexity`.

    Returns:
        A Plotly figure with a dashed red line at the average similarity, or
        None (after showing an info message) when there are no similarities.
    """
    pair_similarities = metrics.get("similarities", [])
    if not pair_similarities:
        st.info("Not enough sentences to compute semantic complexity.")
        return None

    mean_similarity = metrics.get("avg_similarity", 0)
    pair_indices = list(range(len(pair_similarities)))
    figure = px.line(
        x=pair_indices,
        y=pair_similarities,
        labels={'x': 'Sentence Pair Index', 'y': 'Cosine Similarity'},
        title="Jacobs' Semantic Complexity (Sentence Similarities)",
        markers=True
    )
    # Reference line so individual pairs can be read as above/below average.
    figure.add_hline(
        y=mean_similarity,
        line_dash="dash",
        line_color="red",
        annotation_text=f"Average Similarity: {mean_similarity:.2f}",
        annotation_position="bottom right"
    )
    return figure

def analyze_emotions(text):
    """Count keyword hits per emotion category in `text`.

    Matching is case-insensitive and works on alphabetic tokens, so keywords
    followed by punctuation ("love," or "war!") are counted as well.

    Returns:
        dict mapping "Love", "Hate" and "Conflict" to their keyword counts.
    """
    import re

    categories = {
        "Love": ["love", "affection", "joy"],
        "Hate": ["hate", "anger"],
        "Conflict": ["fight", "war"]
    }
    # One pass over the text instead of one list.count() scan per keyword.
    word_counts = Counter(re.findall(r"[a-z]+", text.lower()))
    return {emotion: sum(word_counts[keyword] for keyword in keywords)
            for emotion, keywords in categories.items()}

from transformers import pipeline


@st.cache_resource
def _load_emotion_classifier():
    """Build the emotion classification pipeline once and reuse it.

    Streamlit re-executes this script on every user interaction; caching the
    pipeline as a resource avoids reloading the model each time.
    """
    return pipeline(
        "text-classification",
        model="j-hartmann/emotion-english-distilroberta-base",
        return_all_scores=True,
        device=-1
    )


emotion_classifier = _load_emotion_classifier()

def analyze_emotions_pretrained(text):
    """Score `text` with the pretrained emotion classifier.

    Returns:
        dict mapping each emotion label to its score. Input is truncated to
        512 tokens.
    """
    raw_output = emotion_classifier(text, truncation=True, max_length=512)
    # The pipeline may wrap the per-label results in an outer list; unwrap it.
    is_nested = isinstance(raw_output, list) and raw_output and isinstance(raw_output[0], list)
    label_scores = raw_output[0] if is_nested else raw_output
    return {entry['label']: entry['score'] for entry in label_scores}

def analyze_emotions_over_time(text):
    """Score every sentence of `text` with the emotion classifier.

    Returns:
        dict mapping each tracked emotion label to a list with one score per
        sentence (0.0 when the classifier did not return that label).
    """
    tracked_labels = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]

    per_sentence_scores = []
    for sentence in nltk.sent_tokenize(text):
        output = emotion_classifier(sentence, truncation=True, max_length=512)
        # Unwrap the outer list when the pipeline returns a nested result.
        if isinstance(output, list) and output and isinstance(output[0], list):
            output = output[0]
        per_sentence_scores.append({item['label']: item['score'] for item in output})

    return {
        label: [scores.get(label, 0.0) for scores in per_sentence_scores]
        for label in tracked_labels
    }

def plot_emotions_over_time(emotions_dict):
    """Plot one line per emotion across sentence positions.

    Args:
        emotions_dict: Mapping of emotion label to a list of per-sentence
            scores; the length of the first list defines the x axis.

    Returns:
        A Plotly figure with one lines+markers trace per emotion.
    """
    first_series = next(iter(emotions_dict.values()))
    positions = list(range(len(first_series)))

    traces = [
        go.Scatter(x=positions, y=series, mode='lines+markers', name=label)
        for label, series in emotions_dict.items()
    ]
    figure = go.Figure(data=traces)
    figure.update_layout(
        title="Emotion Changes Over Time",
        xaxis_title="Sentence Index",
        yaxis_title="Emotion Score",
        legend_title="Emotions"
    )
    return figure

@st.cache_resource
def _load_sentiment_pipeline():
    """Build the sentiment pipeline once and reuse it.

    Streamlit re-executes this script on every user interaction; caching the
    pipeline as a resource avoids reloading the model each time.
    """
    return pipeline(
        "sentiment-analysis",
        model="cardiffnlp/twitter-roberta-base-sentiment",
        return_all_scores=True
    )


sentiment_analysis_pipeline = _load_sentiment_pipeline()

def analyze_sentiment_pretrained(text):
    """Score `text` with the pretrained sentiment model.

    Returns:
        dict mapping "negative" / "neutral" / "positive" to their scores. Labels
        outside the LABEL_0..2 mapping are passed through unchanged.
    """
    readable_names = {
        "LABEL_0": "negative",
        "LABEL_1": "neutral",
        "LABEL_2": "positive"
    }
    raw_output = sentiment_analysis_pipeline(text, truncation=True, max_length=512)
    # Unwrap the outer list when the pipeline returns a nested result.
    if isinstance(raw_output, list) and raw_output and isinstance(raw_output[0], list):
        raw_output = raw_output[0]

    sentiment_scores = {}
    for entry in raw_output:
        label = entry['label']
        sentiment_scores[readable_names.get(label, label)] = entry['score']
    return sentiment_scores

def plot_sentiment_bar(sentiment_scores):
    """Draw a bar chart of the negative, positive and neutral scores.

    Args:
        sentiment_scores: Mapping of sentiment name to score; missing names are
            plotted as 0.

    Returns:
        A Plotly bar figure with one coloured bar per sentiment.
    """
    categories = ["negative", "positive", "neutral"]
    heights = []
    for category in categories:
        heights.append(sentiment_scores.get(category, 0))

    figure = px.bar(
        x=categories,
        y=heights,
        labels={'x': 'Sentiment', 'y': 'Score'},
        title='Overall Sentiment Distribution',
        color=categories
    )
    figure.update_layout(xaxis_title="Sentiment", yaxis_title="Score")
    return figure

def track_sentiment_over_time(text):
    """Plot net sentiment (positive minus negative score) per sentence.

    Returns:
        A Plotly line figure indexed by sentence position.
    """
    sentences = nltk.sent_tokenize(text)
    per_sentence = (analyze_sentiment_pretrained(sentence) for sentence in sentences)
    net_sentiments = [
        scores.get("positive", 0) - scores.get("negative", 0)
        for scores in per_sentence
    ]
    return px.line(
        x=list(range(len(sentences))),
        y=net_sentiments,
        labels={'x': 'Sentence Index', 'y': 'Net Sentiment'},
        title='Sentiment Changes Over Time',
        markers=True
    )

@st.cache_resource
def _load_summarization_pipeline():
    """Build the summarization pipeline once and reuse it.

    Streamlit re-executes this script on every user interaction; caching the
    pipeline as a resource avoids reloading the model each time.
    """
    return pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        device=-1
    )


summarization_pipeline = _load_summarization_pipeline()

def generate_summary(text, max_length=130, min_length=30):
    """Summarize `text` with the summarization pipeline.

    Args:
        text: Text to summarize.
        max_length: Upper bound on the generated summary length (in tokens).
        min_length: Lower bound on the generated summary length (in tokens).

    Returns:
        The generated summary string.
    """
    # truncation=True clips inputs that exceed the model's input window; without
    # it a long text makes the pipeline raise instead of returning a summary.
    summary = summarization_pipeline(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        truncation=True
    )
    return summary[0]['summary_text']

from gensim.models import Word2Vec
def train_word2vec(texts):
    """Train a Word2Vec model on the given texts.

    Args:
        texts: Iterable of raw text documents.

    Returns:
        A trained Word2Vec model (100 dimensions, window 5, min_count 1).
    """
    # gensim ignores tokens beyond 10,000 in a single training "sentence".
    # Passing a whole book as one sentence therefore trained on its opening
    # only, so long documents are fed in chunks. Texts at or under the limit
    # are passed exactly as before.
    max_tokens_per_sentence = 10000
    sentences = []
    for text in texts:
        tokens = simple_preprocess(text)
        # max(..., 1) keeps one (empty) entry for empty texts, as before.
        for start in range(0, max(len(tokens), 1), max_tokens_per_sentence):
            sentences.append(tokens[start:start + max_tokens_per_sentence])
    return Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

from sklearn.metrics.pairwise import cosine_similarity
def _mean_word_vector(text, model):
    """Average the Word2Vec vectors of the known words in `text`.

    Returns a zero vector when no token of `text` is in the vocabulary, so the
    caller never receives the NaN that np.mean yields for an empty list.
    """
    vectors = [model.wv[w] for w in simple_preprocess(text) if w in model.wv]
    if not vectors:
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)


def compute_similarity(text, dataset_texts, model):
    """Cosine similarity between `text` and every document in `dataset_texts`.

    Args:
        text: The input text.
        dataset_texts: List of documents to compare against.
        model: Trained Word2Vec model supplying the word vectors.

    Returns:
        A list with one similarity score per document, in input order. Documents
        (or an input text) without any known word score 0.
    """
    if not dataset_texts:
        return []
    text_vector = _mean_word_vector(text, model)
    doc_matrix = np.vstack([_mean_word_vector(doc, model) for doc in dataset_texts])
    # One batched call: row 0 holds the similarity of the text to each document.
    return list(cosine_similarity([text_vector], doc_matrix)[0])

def plot_word_cloud(text, max_words=100):
    """Render a static word cloud of `text` as a Plotly image figure.

    Args:
        text: Source text for the cloud.
        max_words: Maximum number of words drawn.
    """
    cloud_options = dict(
        width=800,
        height=400,
        background_color='white',
        max_words=max_words,
        colormap='viridis',
    )
    rendered_cloud = WordCloud(**cloud_options).generate(text)

    figure = px.imshow(rendered_cloud)
    # Axes carry no meaning for an image, so hide them.
    figure.update_layout(
        title=f'Word Cloud (Top {max_words} Words)',
        xaxis_visible=False,
        yaxis_visible=False
    )
    return figure

def animated_wordcloud(text, max_words=100):
    """
    Build an animated word cloud: one frame per word budget, growing from 10
    words up to `max_words`, with a "Play" button to run through the frames.
    """
    def _render(word_budget):
        # Same cloud settings for every frame; only the word budget varies.
        return WordCloud(width=800, height=400, background_color='white',
                         max_words=word_budget, colormap='viridis').generate(text)

    # Roughly ten frames, but never a step smaller than 5 words.
    step = max(5, int(max_words/10))
    frames = [
        go.Frame(data=[go.Image(z=_render(budget).to_array())], name=str(budget))
        for budget in range(10, max_words+1, step)
    ]

    # Figure shown before the animation starts: the smallest cloud.
    opening_cloud = _render(10)

    play_button = {
        "label": "Play",
        "method": "animate",
        "args": [None, {"frame": {"duration": 500, "redraw": True}, "fromcurrent": True}]
    }
    layout = go.Layout(
        title="Animated Word Cloud",
        xaxis=dict(visible=False),
        yaxis=dict(visible=False),
        updatemenus=[{"type": "buttons", "buttons": [play_button]}]
    )
    return go.Figure(
        data=[go.Image(z=opening_cloud.to_array())],
        layout=layout,
        frames=frames
    )

def plot_word_frequency(text, top_n=10):
    """Show the `top_n` most frequent whitespace-separated tokens as a treemap."""
    most_common = Counter(text.split()).most_common(top_n)

    tokens, frequencies = [], []
    for token, frequency in most_common:
        tokens.append(token)
        frequencies.append(frequency)

    # Every tile hangs off the same "Words" parent.
    return px.treemap(
        names=tokens,
        parents=["Words"] * len(most_common),
        values=frequencies,
        title='Word Frequency'
    )

def plot_similarity(similarity_scores, file_names):
    """Draw a one-row heatmap of the input text's similarity to each file.

    Args:
        similarity_scores: One score per dataset file.
        file_names: File names used as column labels, in the same order.
    """
    axis_labels = {"x": "Dataset Files", "y": "Input Text", "color": "Similarity"}
    # Wrap the scores in a list so the heatmap has a single row.
    heatmap = px.imshow(
        [similarity_scores],
        labels=axis_labels,
        x=file_names,
        y=['Input Text']
    )
    heatmap.update_layout(title='Text Similarity Heatmap', coloraxis_showscale=True)
    return heatmap

from sklearn.manifold import TSNE
def plot_tsne(model):
    """Project the first 100 vocabulary words of `model` to 2-D with t-SNE.

    Args:
        model: Trained Word2Vec model.

    Returns:
        A Plotly scatter figure with one labelled point per word.

    Raises:
        ValueError: if the vocabulary has fewer than two words.
    """
    words = list(model.wv.index_to_key[:100])
    if len(words) < 2:
        raise ValueError("t-SNE needs at least two words in the vocabulary.")
    vectors_array = np.array([model.wv[word] for word in words])
    # Perplexity must stay below the number of samples; keep the usual 15 but
    # clamp it for small vocabularies instead of letting TSNE raise.
    perplexity = min(15, len(words) - 1)
    tsne_results = TSNE(n_components=2, perplexity=perplexity, random_state=42).fit_transform(vectors_array)
    df = {
        "x": tsne_results[:, 0],
        "y": tsne_results[:, 1],
        "word": words
    }
    fig = px.scatter(
        df,
        x="x",
        y="y",
        text="word",
        title="t-SNE Visualization of Word Embeddings",
        width=700,
        height=600
    )
    fig.update_traces(textposition='top center', marker=dict(size=8, color='blue'))
    return fig

# ===============================
# Streamlit App Layout & UI
# ===============================

# Sidebar: Input Options
st.sidebar.header("Input Options")
input_option = st.sidebar.selectbox("Select Input Type", ["Enter Text", "Enter URL", "Upload File"])


def _limit_words(text, max_words=1000):
    """Collapse whitespace in `text` and keep at most `max_words` words."""
    return " ".join(text.split()[:max_words])


# --- Input text, URL, or file ---
if input_option == "Enter Text":
    user_text = _limit_words(
        st.sidebar.text_area("Enter your text here (max 1000 words)", height=200)
    )
elif input_option == "Enter URL":
    url_input = st.sidebar.text_input("Enter a URL")
    if st.sidebar.button("Fetch URL"):
        with st.spinner("Fetching and processing URL..."):
            fetched_text = extract_text_from_url(url_input)
        # The button is only True during the run triggered by the click. Keep the
        # fetched text in session state so it survives later reruns (toggling a
        # checkbox, moving a slider, ...) instead of resetting to "".
        st.session_state["fetched_url_text"] = _limit_words(fetched_text)
    user_text = st.session_state.get("fetched_url_text", "")
elif input_option == "Upload File":
    uploaded_file = st.sidebar.file_uploader("Upload PDF or DOCX file", type=["pdf", "docx"])
    user_text = ""
    if uploaded_file is not None:
        # Compare extensions case-insensitively so "REPORT.PDF" is accepted too.
        lowered_name = uploaded_file.name.lower()
        if lowered_name.endswith(".pdf"):
            try:
                with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
                    user_text = "".join(page.get_text() for page in doc)
            except Exception as e:
                st.error(f"Error processing PDF file: {e}")
                user_text = ""
        elif lowered_name.endswith(".docx"):
            try:
                user_text = docx2txt.process(uploaded_file)
            except Exception as e:
                st.error(f"Error processing DOCX file: {e}")
                user_text = ""
        else:
            st.error("Unsupported file type.")
        user_text = _limit_words(user_text)

# Word count: only shown once some input text is available. The "/1000" reflects
# the 1000-word cap applied when the input was read.
if user_text:
    word_count = len(user_text.split())
    st.sidebar.success(f"Word Count: {word_count}/1000")

# Sidebar: Analysis Options
# Each checkbox toggles one analysis in the tabs below; `value` is the initial state.
st.sidebar.header("Analysis Options")
run_sentiment = st.sidebar.checkbox("Sentiment Analysis", value=True)
run_sentiment_over_time = st.sidebar.checkbox("Track Sentiment Over Time", value=True)
run_readability = st.sidebar.checkbox("Readability Metrics", value=True)
run_summary = st.sidebar.checkbox("Generate Summary", value=False)
run_emotions = st.sidebar.checkbox("Emotion Analysis", value=True)
run_emotions_overtime = st.sidebar.checkbox("Emotion Over Time", value=False)
run_semantic = st.sidebar.checkbox("Semantic Complexity Metrics", value=True)
# Selects the BERT-based sentence embeddings instead of Word2Vec in the semantic tab.
use_bert = st.sidebar.checkbox("Use BERT for Semantic Complexity", value=False)
run_dataset = st.sidebar.checkbox("Compare with Dataset", value=False)

# Main content
# Everything below renders only when there is input text to analyse.
if user_text:
    st.subheader("Input Text")
    st.write(user_text)

    # One tab per analysis; the sections below address them by position
    # (tabs[0] .. tabs[6]), so this order must match.
    tabs = st.tabs([
        "Sentiment",
        "Readability",
        "Summary",
        "Emotions",
        "Semantic Complexity",
        "WordCloud & Frequency",
        "Dataset Comparison"
    ])

    # ----- Tab 1: Sentiment -----
    with tabs[0]:
        st.header("Sentiment Analysis")
        if not run_sentiment:
            st.info("Enable 'Sentiment Analysis' from the sidebar.")
        else:
            with st.spinner("Analyzing sentiment..."):
                sentiment_scores = analyze_sentiment_pretrained(user_text)

            # Labels the model did not return fall back to 0.
            pos, neg, neu = (
                sentiment_scores.get(label, 0)
                for label in ("positive", "negative", "neutral")
            )
            st.write("**Overall Sentiment:**")
            st.write(f"- Positive: {pos:.2f}\n- Negative: {neg:.2f}\n- Neutral: {neu:.2f}\n")

            # The per-sentence trend is optional and shown above the overall bars.
            if run_sentiment_over_time:
                st.subheader("Sentiment Over Time")
                fig_time = track_sentiment_over_time(user_text)
                st.plotly_chart(fig_time, use_container_width=True)

            st.subheader("Overall Sentiment Distribution")
            fig_sent = plot_sentiment_bar(sentiment_scores)
            st.plotly_chart(fig_sent, use_container_width=True)

    # ----- Tab 2: Readability -----
    with tabs[1]:
        st.header("Readability Metrics")
        if not run_readability:
            st.info("Enable 'Readability Metrics' from the sidebar.")
        else:
            with st.spinner("Computing readability metrics..."):
                readability = compute_readability_scores(user_text)
            fk_score = readability['Flesch_Kincaid_Grade']
            dc_score = readability['Dale_Chall']

            # (exclusive upper bound, description) pairs, checked in order; the
            # fallback passed to next() covers scores at or above the last bound.
            fk_levels = [
                (5, "Elementary school level."),
                (8, "Middle school level."),
                (12, "High school level."),
                (16, "College-level text."),
            ]
            level_desc = next(
                (desc for bound, desc in fk_levels if fk_score < bound),
                "Very advanced, possibly post-graduate level.",
            )
            st.write("### Flesch-Kincaid Grade Level")
            st.write(f"- **Score**: {fk_score:.2f}")
            st.write(f"- **Interpretation**: {level_desc}")
            st.write("*Typical newspapers range around 8-10, academic papers can be 14+.*")

            dc_levels = [
                (5, "Easily understandable by average 4th-grade student."),
                (8, "Conversational / typical magazine-level text."),
                (11, "College-level text."),
            ]
            dc_desc = next(
                (desc for bound, desc in dc_levels if dc_score < bound),
                "Advanced or very difficult text.",
            )
            st.write("### Dale-Chall Readability Score")
            st.write(f"- **Score**: {dc_score:.2f}")
            st.write(f"- **Interpretation**: {dc_desc}")
            st.write("*Dale-Chall above ~10 indicates advanced-level reading.*")

    # ----- Tab 3: Summary -----
    with tabs[2]:
        st.header("Summary")
        if not run_summary:
            st.info("Enable 'Generate Summary' from the sidebar.")
        else:
            # Any failure during generation is reported in the tab rather than
            # aborting the whole page.
            try:
                with st.spinner("Generating summary..."):
                    summary_text = generate_summary(user_text)
                st.subheader("Generated Summary")
                st.write(summary_text)

                original_word_count = len(user_text.split())
                summary_word_count = len(summary_text.split())
                if original_word_count > 0:
                    # Fraction of words removed by summarizing.
                    compression = 1 - (summary_word_count / original_word_count)
                    compression_note = (
                        f"**Compression Ratio**: Reduced word count by "
                        f"{compression*100:.1f}% (from {original_word_count} to {summary_word_count})."
                    )
                    st.write(compression_note)
            except Exception as e:
                st.error(f"Summary generation failed: {e}")

    # ----- Tab 4: Emotions -----
    with tabs[3]:
        st.header("Emotion Analysis")
        if not run_emotions:
            st.info("Enable 'Emotion Analysis' from the sidebar.")
        else:
            with st.spinner("Analyzing emotions..."):
                basic_emotions = analyze_emotions(user_text)
                pretrained_emotions = analyze_emotions_pretrained(user_text)

            st.subheader("Basic Keyword-based Emotion Counts")
            for emotion_name in basic_emotions:
                st.write(f"- {emotion_name}: {basic_emotions[emotion_name]}")

            st.subheader("Pretrained Emotion Scores")
            # Strongest emotion first.
            sorted_emotions = sorted(
                pretrained_emotions.items(),
                key=lambda label_score: label_score[1],
                reverse=True,
            )
            for emotion_name, emotion_score in sorted_emotions:
                st.write(f"- {emotion_name}: {emotion_score:.2f}")

            st.markdown("---")

            if run_emotions_overtime:
                st.subheader("Emotion Changes Over Time")
                with st.spinner("Analyzing emotions sentence by sentence..."):
                    emotions_dict = analyze_emotions_over_time(user_text)
                fig_emotions_time = plot_emotions_over_time(emotions_dict)
                st.plotly_chart(fig_emotions_time, use_container_width=True)
                st.write("""
                    **How to Interpret**:
                    - Each line corresponds to an emotion's intensity across consecutive sentences.
                    - Spikes indicate where that emotion is strongly expressed.
                    - Flat or near-zero lines suggest minimal expression of that emotion.
                """)

    # ----- Tab 5: Semantic Complexity -----
    with tabs[4]:
        st.header("Semantic Complexity Metrics")
        if run_semantic:
            with st.spinner("Computing semantic complexity..."):
                if use_bert:
                    complexity = jacobs_semantic_complexity(user_text, method="bert")
                else:
                    model = train_word2vec([user_text])
                    complexity = jacobs_semantic_complexity(user_text, method="word2vec", word2vec_model=model)
            avg_sim = complexity.get("avg_similarity")
            std_sim = complexity.get("std_similarity")
            # Both statistics are None for texts with fewer than two sentences;
            # formatting None with ':.2f' raised a TypeError. In that case the
            # plot helper below shows the "not enough sentences" message.
            if avg_sim is not None and std_sim is not None:
                st.write(
                    f"**Average Sentence-to-Sentence Similarity**: {avg_sim:.2f}\n\n"
                    f"**Standard Deviation**: {std_sim:.2f}"
                )

            fig_sem = plot_semantic_complexity(complexity)
            if fig_sem is not None:
                st.plotly_chart(fig_sem, use_container_width=True)
                st.write(f"""
                    **Understanding the Graph**:
                    - **Cosine Similarity (Blue Line)** measures how similar each pair of consecutive sentences is.
                      Values closer to 1.0 indicate near-identical meaning.
                    - The **Red Dashed Line** is the average similarity score ({avg_sim:.2f}), a reference
                      for above/below-average coherence.
                    - **Dips** suggest abrupt shifts in meaning; **peaks** indicate strong continuity.
                """)
        else:
            st.info("Enable 'Semantic Complexity Metrics' from the sidebar.")

    # ----- Tab 6: WordCloud & Frequency -----
    with tabs[5]:
        st.header("Word Cloud & Word Frequency")
        max_words_wc = st.slider("Number of words to display in the Word Cloud",
                                 min_value=20, max_value=300, value=100, step=10)
        # The button is True only for the run triggered by the click, so the
        # animated cloud is shown for that run and the static one otherwise.
        animate_wc = st.button("Animate Word Cloud")
        build_cloud = animated_wordcloud if animate_wc else plot_word_cloud
        st.plotly_chart(build_cloud(user_text, max_words=max_words_wc), use_container_width=True)

        st.write("Top 10 Word Frequency")
        st.plotly_chart(plot_word_frequency(user_text, top_n=10), use_container_width=True)

    # ----- Tab 7: Dataset Comparison -----
    with tabs[6]:
        st.header("Dataset Comparison")
        if not run_dataset:
            st.info("Enable 'Compare with Dataset' from the sidebar to see dataset comparison.")
        else:
            dataset_path = st.text_input("Enter Dataset Path", value="/content/dataset/1/gutenberg")
            if dataset_path:
                dataset_texts, file_names = load_dataset(dataset_path)
                if not dataset_texts:
                    st.warning("No dataset files to compare.")
                else:
                    st.write("""
                        We compare your input text to each file in the dataset by computing
                        average Word2Vec embeddings and measuring cosine similarity.
                        Scores near 1.0 indicate strong similarity, while near 0.0 indicates dissimilar text.
                    """)
                    # Train on the dataset plus the input so every input word has a vector.
                    model = train_word2vec(dataset_texts + [user_text])
                    similarity_scores = compute_similarity(user_text, dataset_texts, model)
                    st.plotly_chart(plot_similarity(similarity_scores, file_names), use_container_width=True)

                    st.write("""
                        **t-SNE Visualization**:
                        This plot shows a 2D projection of the Word2Vec embeddings. It helps us see
                        how words are grouped in semantic space.
                    """)
                    st.plotly_chart(plot_tsne(model), use_container_width=True)

else:
    # No input text yet: show a prompt instead of the analysis tabs.
    st.info("Please enter some text, a URL, or upload a file (limited to 1000 words) from the sidebar to begin analysis.")

# Footer markup; the "footer" class is styled by the CSS injected at the top of the page.
footer_html = """
    <div class="footer">
        Advanced Text Analysis Dashboard &copy; 2025 | Developed with Streamlit
    </div>
    """
st.markdown(footer_html, unsafe_allow_html=True)


Overwriting app.py


In [8]:
from pyngrok import ngrok
import os
import time
from getpass import getpass

# Never store the authtoken in the notebook: read it from the NGROK_AUTHTOKEN
# environment variable, or prompt for it without echoing.
# NOTE(review): any token that was ever saved in this cell should be treated as
# leaked and revoked in the ngrok dashboard.
NGROK_AUTHTOKEN = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(NGROK_AUTHTOKEN)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [10]:
# Port shared by the tunnel and the Streamlit server.
STREAMLIT_PORT = 8501

# Kill any existing ngrok tunnels
ngrok.kill()
time.sleep(5)  # Give it a few seconds to close

# Create an HTTP tunnel to the Streamlit port
public_url = ngrok.connect(STREAMLIT_PORT, proto="http")
print("Streamlit app running at:", public_url)

# Run Streamlit in the background, pinned to the tunnelled port. Without an
# explicit port a re-run of this cell can start a second server on a different
# port than the one the tunnel forwards to.
os.system(f"streamlit run app.py --server.port {STREAMLIT_PORT} &")

Streamlit app running at: NgrokTunnel: "https://f900-35-204-154-139.ngrok-free.app" -> "http://localhost:8501"


0