In [1]:
import ipywidgets as widgets
from IPython.display import display

# Minimal widget demo: a text box, a button, and an output area that echoes
# the text box's value whenever the button is clicked.
text = widgets.Text(value="Hello", description="Input:")
button = widgets.Button(description="Click Me")
output = widgets.Output()


def on_button_clicked(b, _text=text, _output=output):
    """Echo the current value of the demo text box into the demo output area.

    Args:
        b: The button instance passed by ipywidgets when it is clicked (unused).
        _text: Text widget to read from. Bound at definition time; not meant
            to be supplied by callers.
        _output: Output widget to print into. Bound at definition time; not
            meant to be supplied by callers.

    The widgets are captured as default arguments on purpose: a later cell in
    this notebook rebinds the global name ``output``, and a plain global lookup
    at click time would send this demo's message to that other cell's output
    area instead of the one displayed below this cell.
    """
    with _output:
        print(f"You entered: {_text.value}")


button.on_click(on_button_clicked)
display(text)
display(button)
display(output)

Text(value='Hello', description='Input:')

Button(description='Click Me', style=ButtonStyle())

Output()

In [2]:
# Import libraries
import praw
import pandas as pd
from dotenv import load_dotenv
import os
import logging
import ipywidgets as widgets
from IPython.display import display, clear_output
from geopy.geocoders import Nominatim
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
import folium
from folium.plugins import MarkerCluster

# Configure logging: append timestamped records to a local log file.
logging.basicConfig(
    filename="health_trends.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Load Reddit credentials from the environment (optionally via a local .env
# file) instead of embedding them in the notebook. Secrets committed to a
# notebook leak to everyone who can read it; any secret that was previously
# hardcoded here should be treated as compromised and rotated.
# Expected variables: REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET and, optionally,
# REDDIT_USER_AGENT.
load_dotenv()
CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET")
USER_AGENT = os.getenv(
    "REDDIT_USER_AGENT",
    "windows:my-cool-app:v1.0 (by /u/Happy-Syllabub-6994)",
)
if not CLIENT_ID or not CLIENT_SECRET:
    # Fail fast with an actionable message rather than a confusing API error
    # the first time the search button is clicked.
    raise RuntimeError(
        "Missing Reddit credentials: set REDDIT_CLIENT_ID and "
        "REDDIT_CLIENT_SECRET in the environment or in a .env file."
    )

# Initialize geolocator for location inference
geolocator = Nominatim(user_agent="health_trends_app")
# Place names searched for (case-insensitively, as whole words) in post text.
locations = ["New York", "California", "London", "Toronto", "Sydney", "Texas", "Florida", "Paris", "Tokyo", "Mumbai"]

# Build the interactive controls: a keyword box, a trigger button, and the
# output area that the analysis prints and plots into.
output = widgets.Output()

search_button = widgets.Button(
    tooltip="Click to search Reddit and analyze posts",
    button_style="success",
    description="Search and Analyze",
)

keyword_input = widgets.Text(
    description="Keyword:",
    placeholder="Enter a keyword (e.g., flu, fever)",
    value="flu",
    layout=widgets.Layout(width="500px"),
)

# Define the analysis pipeline
def _clean_text(text):
    """Return ``text`` with URLs, @mentions, #hashtags and punctuation removed."""
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"@\w+|\#\w+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text.strip()


def _fetch_posts(keyword, limit=100):
    """Search the health-related subreddits for ``keyword``.

    Args:
        keyword: Search term sent to Reddit.
        limit: Maximum number of search results to request.

    Returns:
        A list of dicts with the keys ``date`` (epoch seconds), ``text``
        (title plus body), ``subreddit``, ``username`` and ``keyword``.
    """
    reddit = praw.Reddit(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        user_agent=USER_AGENT,
    )
    reddit.read_only = True
    # No `reddit.user.me()` probe here: in read-only mode PRAW documents that
    # call as returning None, so it cannot confirm authentication. Bad
    # credentials surface as an exception from the search request below.
    subreddit = reddit.subreddit("health+anxiety+mentalhealth+askdocs")
    return [
        {
            "date": post.created_utc,
            "text": post.title + " " + (post.selftext or ""),
            "subreddit": post.subreddit.display_name,
            # Deleted accounts have no author object.
            "username": post.author.name if post.author else "Unknown",
            "keyword": keyword,
        }
        for post in subreddit.search(keyword, limit=limit)
    ]


def _make_location_finder():
    """Build a ``text -> (location, latitude, longitude)`` lookup function.

    The returned function scans a text for the first entry of ``locations``
    that appears as a whole word (case-insensitive) and can be geocoded. It
    returns ``(None, None, None)`` when nothing matches. Each place name is
    geocoded at most once per finder (failures included), so the geocoder is
    not called again for every post that mentions the same place.
    """
    patterns = [
        (name, re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE))
        for name in locations
    ]
    coords_by_name = {}

    def geocode_once(name):
        if name not in coords_by_name:
            try:
                geo = geolocator.geocode(name)
            except Exception as exc:  # best effort: skip places that fail to geocode
                logging.warning("Geocoding failed for %s: %s", name, exc)
                geo = None
            coords_by_name[name] = (geo.latitude, geo.longitude) if geo else None
        return coords_by_name[name]

    def find(text):
        for name, pattern in patterns:
            if pattern.search(text):
                coords = geocode_once(name)
                if coords is not None:
                    return name, coords[0], coords[1]
        return None, None, None

    return find


def _assign_topics(df, n_topics=3, n_top_words=5):
    """Add a ``topic`` column to ``df`` (in place) using LDA on ``cleaned_text``.

    Prints the top words of every topic. When the corpus is empty or too small
    for the vectorizer settings, ``topic`` is left as None for every row and a
    message is printed instead of raising.
    """
    df["topic"] = None
    texts = df["cleaned_text"].dropna()
    if texts.empty:
        print("No valid texts for topic modeling.")
        return

    vectorizer = CountVectorizer(stop_words="english", max_df=0.95, min_df=2)
    try:
        doc_term_matrix = vectorizer.fit_transform(texts.tolist())
    except ValueError as exc:
        # Raised for tiny corpora (empty vocabulary, or max_df below min_df).
        # Topic modeling is optional, so skip it and keep the rest of the run.
        print(f"Skipping topic modeling: {exc}")
        logging.warning("Topic modeling skipped: %s", exc)
        return

    lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    doc_topics = lda_model.fit_transform(doc_term_matrix)
    # Each document gets the topic with the highest weight.
    df.loc[texts.index, "topic"] = doc_topics.argmax(axis=1)

    print("Topic modeling completed")
    print("Identified Topics:")
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, weights in enumerate(lda_model.components_):
        top_words = [feature_names[i] for i in weights.argsort()[::-1][:n_top_words]]
        print(f"Topic {topic_idx}: {', '.join(top_words)}")
    logging.info("Topic modeling completed")


def _daily_post_counts(df):
    """Return a one-column frame (``post_count``) of posts per calendar day.

    Days without posts are included with a count of 0 so the series is
    regularly spaced; a period of 7 then really means one week.
    """
    counts = df.set_index("date").resample("D").size()
    return counts.rename("post_count").to_frame()


def _plot_posts_over_time(daily_posts, keyword):
    """Line plot of the daily post counts."""
    # Pass the axes explicitly: DataFrame.plot() would otherwise open its own
    # figure and leave the one created here empty.
    fig, ax = plt.subplots(figsize=(10, 6))
    daily_posts.plot(ax=ax)
    ax.set_title(f"{keyword.capitalize()}-Related Posts Over Time")
    ax.set_xlabel("Date")
    ax.set_ylabel("Number of Posts")
    plt.show()


def _plot_count_bars(counts, title, xlabel):
    """Bar chart of a value-counts Series."""
    fig, ax = plt.subplots(figsize=(8, 5))
    counts.plot(kind="bar", ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Number of Posts")
    plt.show()


def _show_location_map(df):
    """Display a clustered folium map of the posts that have coordinates."""
    import html  # stdlib; local import keeps this block self-contained

    location_df = df.dropna(subset=["latitude", "longitude"])
    if location_df.empty:
        print("No location data available for mapping.")
        return

    post_map = folium.Map(location=[0, 0], zoom_start=2)
    marker_cluster = MarkerCluster().add_to(post_map)
    for _, row in location_df.iterrows():
        # Post text is untrusted and the popup is rendered as HTML, so escape it.
        popup_text = html.escape(f"{row['location']}: {row['text'][:100]}...")
        folium.Marker(
            location=[row["latitude"], row["longitude"]],
            popup=popup_text,
        ).add_to(marker_cluster)
    display(post_map)


def _plot_decomposition(decomposition):
    """Stacked plots of the observed, trend, seasonal and residual series."""
    components = [
        ("Observed", decomposition.observed),
        ("Trend", decomposition.trend),
        ("Seasonal", decomposition.seasonal),
        ("Residual", decomposition.resid),
    ]
    fig, axes = plt.subplots(len(components), 1, figsize=(12, 8))
    for ax, (label, series) in zip(axes, components):
        ax.plot(series, label=label)
        ax.legend(loc="best")
    fig.tight_layout()
    plt.show()


def on_button_clicked(b):
    """Run the full search-and-analyze pipeline for the keyword in the text box.

    Steps: fetch Reddit posts, clean the text, infer locations, score
    sentiment, model topics, build the daily time series, render the plots and
    map, and write the analyzed frame to ``analyzed_posts.csv``. All printed
    messages and figures go to the ``output`` widget; progress and errors are
    also written to the log.

    Args:
        b: The button instance passed by ipywidgets when it is clicked (unused).
    """
    with output:
        clear_output()
        keyword = keyword_input.value.strip()
        if not keyword:
            print("Please enter a keyword.")
            return

        print(f"Collecting posts for keyword: {keyword}")
        logging.info("Collecting posts for keyword: %s", keyword)

        try:
            posts = _fetch_posts(keyword)
            if not posts:
                print("No posts found for this keyword.")
                return

            df = pd.DataFrame(posts)
            df["date"] = pd.to_datetime(df["date"], unit="s")
            print(f"Collected {len(posts)} posts")
            logging.info("Collected %d posts", len(posts))

            # Step 1: Clean the data
            df["cleaned_text"] = df["text"].apply(_clean_text)
            df["subreddit"] = df["subreddit"].fillna("Unknown")
            df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)
            print("Data cleaned")
            logging.info("Data cleaned")

            # Step 2: Location inference
            find_location = _make_location_finder()
            df["location"], df["latitude"], df["longitude"] = zip(
                *(find_location(text) for text in df["text"])
            )
            print("Location data added")
            logging.info("Location data added")

            # Step 3: Sentiment analysis
            df["sentiment"] = df["cleaned_text"].apply(
                lambda text: TextBlob(text).sentiment.polarity
            )
            df["sentiment_category"] = df["sentiment"].apply(
                lambda x: "Positive" if x > 0.1 else "Negative" if x < -0.1 else "Neutral"
            )
            print("Sentiment analysis completed")
            logging.info("Sentiment analysis completed")

            # Step 4: Topic modeling with scikit-learn LDA
            _assign_topics(df)

            # Step 5: Time series analysis
            daily_posts = _daily_post_counts(df)
            period = 7
            # seasonal_decompose needs at least two complete cycles of data.
            if len(daily_posts) >= 2 * period:
                decomposition = seasonal_decompose(
                    daily_posts["post_count"], model="additive", period=period
                )
            else:
                decomposition = None
            print("Time series analysis completed")
            logging.info("Time series analysis completed")

            # Step 6: Visualizations
            _plot_posts_over_time(daily_posts, keyword)
            _plot_count_bars(
                df["sentiment_category"].value_counts(),
                "Sentiment Distribution",
                "Sentiment",
            )

            topic_counts = df["topic"].value_counts()
            if topic_counts.empty:
                # Happens when topic modeling was skipped; nothing to plot.
                print("No topics available to plot.")
            else:
                _plot_count_bars(topic_counts, "Topic Distribution", "Topic")

            _show_location_map(df)

            if decomposition is not None:
                _plot_decomposition(decomposition)
            else:
                print("Not enough data for time series decomposition.")

            df.to_csv("analyzed_posts.csv", index=False)
            print("Analysis complete! Results saved to 'analyzed_posts.csv'")
            logging.info("Analysis complete")

        except praw.exceptions.RedditAPIException as e:
            print(f"Reddit API error: {e}")
            logging.error("Reddit API error: %s", e)
        except Exception as e:
            print(f"Error: {e}")
            # logging.exception keeps the traceback in the log file.
            logging.exception("Error: %s", e)

# Register the click handler, then render the controls in order:
# keyword box, search button, output area.
search_button.on_click(on_button_clicked)

for widget in (keyword_input, search_button, output):
    display(widget)

Text(value='flu', description='Keyword:', layout=Layout(width='500px'), placeholder='Enter a keyword (e.g., fl…

Button(button_style='success', description='Search and Analyze', style=ButtonStyle(), tooltip='Click to search…

Output()