In [3]:
import pandas as pd

df = pd.read_csv('/Users/ritushetkar/env_capstone/skincare_analysis/data/profile_posts_unnested.csv')

In [4]:
df.columns

Index(['id', 'text', 'textLanguage', 'createTimeISO', 'authorMeta_id',
       'authorMeta_profileUrl', 'authorMeta_nickName', 'authorMeta_verified',
       'authorMeta_signature', 'authorMeta_commerceUserInfo_category',
       'authorMeta_privateAccount', 'authorMeta_roomId', 'authorMeta_ttSeller',
       'authorMeta_following', 'authorMeta_friends', 'authorMeta_fans',
       'authorMeta_heart', 'authorMeta_video', 'authorMeta_digg',
       'authorMeta_region', 'musicMeta_musicName', 'musicMeta_musicAuthor',
       'musicMeta_musicOriginal', 'musicMeta_musicAlbum', 'musicMeta_musicId',
       'webVideoUrl', 'mediaUrls', 'shareCount', 'playCount', 'collectCount',
       'commentCount', 'mentions', 'detailedMentions_id',
       'detailedMentions_nickName', 'hashtags_name', 'effectStickers_id',
       'effectStickers_name', 'effectStickers_stickerStats_useCount',
       'slideshowImageLinks_tiktokLink', 'slideshowImageLinks_downloadLink',
       'note', 'locationMeta_city', 'locationMeta_

In [5]:

df['createTimeISO'] = pd.to_datetime(df['createTimeISO'])
# Create new columns
df['date'] = df['createTimeISO'].dt.date
df['month'] = df['createTimeISO'].dt.strftime('%b')   # e.g., 'Feb'
df['year'] = df['createTimeISO'].dt.year
df['week'] = df['createTimeISO'].dt.isocalendar().week



In [6]:
df_text_filtered = df[df['textLanguage'].isin(['en', 'de'])]

In [7]:
df_text_filtered=df_text_filtered.reset_index()

In [8]:
# Are there any rows with empty hashtags
empty_hashtags_count = df_text_filtered[df_text_filtered['hashtags_name'].str.strip() == ''].shape[0]
print("Empty hashtag rows:", empty_hashtags_count)


Empty hashtag rows: 0


In [9]:
hashtag_trends = (
    df_text_filtered.groupby(['year', 'week', 'hashtags_name'])
    .size()
    .reset_index(name='count')
    .sort_values(['year', 'week', 'count'], ascending=[False, False, False])
)

hashtag_trends

Unnamed: 0,year,week,hashtags_name,count
861,2025,13,foryou,13
862,2025,13,foryoupage,9
864,2025,13,fürdich,9
865,2025,13,grwm,6
856,2025,13,beach,4
...,...,...,...,...
7,2024,1,momlife,1
8,2024,1,momoffour,1
9,2024,1,newborn,1
10,2024,1,recap,1


In [10]:
top_hashtags_this_week = (
    hashtag_trends[hashtag_trends['week'] == df_text_filtered['week'].max()]
    .groupby('hashtags_name')['count']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
top_hashtags_this_week


hashtags_name
fyp          2
mädchenwg    2
ski          1
skiseason    1
Name: count, dtype: int64

In [11]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display

# --- Assume df_text_filtered is already prepared with columns: 'year', 'month', 'week', 'hashtags_name'

# Widgets
year_dropdown = widgets.Dropdown(
    options=sorted(df_text_filtered['year'].unique()),
    description='Year:',
    value=df_text_filtered['year'].max()
)

month_dropdown = widgets.Dropdown(
    options=["All"] + sorted(df_text_filtered['month'].dropna().unique()),
    description='Month:',
    value="All"
)

week_dropdown = widgets.Dropdown(
    options=["All"] + sorted(df_text_filtered['week'].dropna().unique()),
    description='Week:',
    value="All"
)

# Output area
output = widgets.Output()

def update_graphs(change=None):
    with output:
        output.clear_output()
        
        # Filter by year
        filtered = df_text_filtered[df_text_filtered['year'] == year_dropdown.value]
        
        # Optional month filter
        if month_dropdown.value != "All":
            filtered = filtered[filtered['month'] == month_dropdown.value]
        
        # Optional week filter
        if week_dropdown.value != "All":
            filtered = filtered[filtered['week'] == week_dropdown.value]
        
        # Count hashtags
        top_hashtags = (
        filtered['hashtags_name']
        .value_counts()
        .head(10)
        .reset_index()
        .rename(columns={'index': 'hashtags_name'})
        )
        top_hashtags.columns = ['hashtags_name', 'count']  # Make column names explicit


        if top_hashtags.empty:
            print("No data available for this selection.")
            return

        # --- Matplotlib + Seaborn Bar Chart ---
        plt.figure(figsize=(10, 6))
        sns.barplot(
            data=top_hashtags,
            x='hashtags_name',
            y='count',
            palette='viridis'
        )
        
        # Add count labels on top of bars
        for index, row in top_hashtags.iterrows():
            plt.text(index, row['count'] + 1, str(row['count']), ha='center', va='bottom', fontsize=10)
        
        # Build dynamic title
        title = f"Top Hashtags in {year_dropdown.value}"
        if month_dropdown.value != "All":
            title += f" - {month_dropdown.value}"
        if week_dropdown.value != "All":
            title += f" - Week {week_dropdown.value}"
        
        plt.title(title, fontsize=14)
        plt.xlabel("Hashtag")
        plt.ylabel("Mentions")
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()

        # Show the data table
        display(top_hashtags)

# Trigger when widgets are changed
year_dropdown.observe(update_graphs, names='value')
month_dropdown.observe(update_graphs, names='value')
week_dropdown.observe(update_graphs, names='value')

# Display UI
display(widgets.VBox([year_dropdown, month_dropdown, week_dropdown]))
display(output)

# Initial render
update_graphs()


VBox(children=(Dropdown(description='Year:', index=1, options=(np.int32(2024), np.int32(2025)), value=np.int32…

Output()

In [12]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display

# --- Add a new column: total engagement per post
df_text_filtered['engagement_score'] = ((
    df_text_filtered['diggCount'] + 
    df_text_filtered['shareCount'] + 
    df_text_filtered['playCount'] + 
    df_text_filtered['collectCount'] + 
    df_text_filtered['commentCount']
)/5) # aberage with weights 1


# --- Widgets
year_dropdown = widgets.Dropdown(
    options=sorted(df_text_filtered['year'].unique()),
    description='Year:',
    value=df_text_filtered['year'].max()
)

month_dropdown = widgets.Dropdown(
    options=["All"] + sorted(df_text_filtered['month'].dropna().unique()),
    description='Month:',
    value="All"
)

week_dropdown = widgets.Dropdown(
    options=["All"] + sorted(df_text_filtered['week'].dropna().unique()),
    description='Week:',
    value="All"
)

output = widgets.Output()

# --- Update function
def update_graphs(change=None):
    with output:
        output.clear_output()
        
        # Filter by dropdowns
        filtered = df_text_filtered[df_text_filtered['year'] == year_dropdown.value]
        if month_dropdown.value != "All":
            filtered = filtered[filtered['month'] == month_dropdown.value]
        if week_dropdown.value != "All":
            filtered = filtered[filtered['week'] == week_dropdown.value]

        # Group and sum engagement per hashtag
        top_engagement = (
            filtered.groupby('hashtags_name')['engagement_score']
            .sum()
            .sort_values(ascending=False)
            .head(10)
            .reset_index()
        )

        if top_engagement.empty:
            print("No data available for this selection.")
            return

        # --- Plot
        plt.figure(figsize=(10, 6))
        sns.barplot(
            data=top_engagement,
            x='hashtags_name',
            y='engagement_score',
            palette='magma'
        )

        # Add value labels
        for index, row in top_engagement.iterrows():
            plt.text(index, row['engagement_score'] + 0.05 * row['engagement_score'], 
                     int(row['engagement_score']), ha='center', fontsize=9)

        # Title
        title = f"Top Hashtags by Engagement in {year_dropdown.value}"
        if month_dropdown.value != "All":
            title += f" - {month_dropdown.value}"
        if week_dropdown.value != "All":
            title += f" - Week {week_dropdown.value}"

        plt.title(title, fontsize=14)
        plt.xlabel("Hashtag")
        plt.ylabel("Total Engagement")
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis='y', linestyle='--', alpha=0.6)
        plt.tight_layout()
        plt.show()

        # Show data table
        display(top_engagement)

# Attach interactivity
year_dropdown.observe(update_graphs, names='value')
month_dropdown.observe(update_graphs, names='value')
week_dropdown.observe(update_graphs, names='value')

# Display UI
display(widgets.VBox([year_dropdown, month_dropdown, week_dropdown]))
display(output)

# Initial call
update_graphs()


KeyError: 'diggCount'