In [None]:
import pandas as pd
import numpy as np

# STEP 1: Load your dataset (replace with actual path or query)
df = pd.read_csv("/Users/ritushetkar/env_capstone/data/hashtags_posts.csv", parse_dates=['createTimeISO'])



# STEP 3: Aggregate to hashtag-day level
df['post_date'] = df['createTimeISO'].dt.date

daily_hashtags = (
    df.groupby(['hashtag_name', 'post_date'])
      .agg(
          post_volume=('post_id', 'nunique'),
          total_diggs=('diggCount', 'sum'),
          total_shares=('shareCount', 'sum'),
          total_comments=('commentCount', 'sum'),
          total_plays=('playCount', 'sum'),
          avg_duration=('video_duration', 'mean'),
          avg_fans=('author_fans', 'mean')
      )
      .reset_index()
)

daily_hashtags['post_date'] = pd.to_datetime(daily_hashtags['post_date'])
daily_hashtags = daily_hashtags.sort_values(['hashtag_name', 'post_date'])

for col in ['post_volume', 'total_diggs', 'total_shares', 'total_comments', 'total_plays']:
    daily_hashtags[f'{col}_7d'] = (
        daily_hashtags.groupby('hashtag_name')[col]
        .transform(lambda x: x.rolling(window=7, min_periods=1).sum())
    )

# STEP 5: Add change rates
daily_hashtags['volume_change_7d'] = (
    daily_hashtags.groupby('hashtag_name')['post_volume_7d']
    .pct_change().replace([np.inf, -np.inf], np.nan).fillna(0)
)

# STEP 6: Label as trending (if volume jumped >50% day-over-day)
daily_hashtags['is_trending'] = (daily_hashtags['volume_change_7d'] > 0.5).astype(int)

# STEP 7: Fill missing values
daily_hashtags.fillna(0, inplace=True)

# FINAL FEATURE SET
features = [
    'post_volume_7d', 'total_diggs_7d', 'total_shares_7d', 
    'total_comments_7d', 'total_plays_7d', 'avg_duration', 'avg_fans', 'volume_change_7d'
]

X = daily_hashtags[features]
y = daily_hashtags['is_trending']



In [None]:
df.columns

In [None]:
# Ensure 'post_date' is a datetime object (in case it's a string)
df['post_date'] = pd.to_datetime(df['post_date'])

# Get unique dates and sort them descending
unique_dates = df['post_date'].dropna().drop_duplicates().sort_values(ascending=True)

print(unique_dates)

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
import shap

explainer = shap.Explainer(model)
shap_values = explainer(X_test)

shap.summary_plot(shap_values, X_test)


In [None]:
daily_hashtags[daily_hashtags['is_trending']==1].head(15)

In [None]:
import pandas as pd

# 1. Get predicted probabilities for the positive class (trending = 1)
probs = model.predict_proba(X_test)[:, 1]

# 2. Create a DataFrame with predictions
pred_df = daily_hashtags.loc[X_test.index].copy()
pred_df['trending_probability'] = probs

# 3. Filter for today's hashtags (or latest date)
latest_date = pred_df['post_date'].max()
today_hashtags = pred_df[pred_df['post_date'] == latest_date]

# 4. Get top 10 most likely trending hashtags
top_trending = (
    today_hashtags[['hashtag_name', 'trending_probability']]
    .sort_values(by='trending_probability', ascending=False)
    .drop_duplicates('hashtag_name')
    .head(20)
)

top_trending


In [None]:
import pandas as pd

# 1. Get predicted probabilities for the positive class (trending = 1)
probs = model.predict_proba(X_test)[:, 1]

# 2. Create a DataFrame with predictions
pred_df = daily_hashtags.loc[X_test.index].copy()
pred_df['trending_probability'] = probs

# 3. Filter for today's hashtags (or latest date)
latest_date = pred_df['post_date'].max()
today_hashtags = pred_df[pred_df['post_date'] == latest_date]

# 4. Get top 20 most likely trending hashtags
top_trending = (
    today_hashtags[['hashtag_name', 'trending_probability']]
    .sort_values(by='trending_probability', ascending=False)
    .drop_duplicates('hashtag_name')
    .head(20)
)

# 5. Show the result
print(top_trending)


In [None]:
import pandas as pd

# 1. Get predicted probabilities for the positive class (trending = 1)
probs = model.predict_proba(X_test)[:, 1]

# 2. Create a DataFrame with predictions
pred_df = daily_hashtags.loc[X_test.index].copy()
pred_df['trending_probability'] = probs

# 3. Filter for hashtags from the last 7 days
latest_date = pred_df['post_date'].max()
start_date = latest_date - pd.Timedelta(days=6)  # last 7 days including today

week_hashtags = pred_df[(pred_df['post_date'] >= start_date) & (pred_df['post_date'] <= latest_date)]

# 4. Get top 20 most likely trending hashtags (across all those days)
top_trending_week = (
    week_hashtags[['hashtag_name', 'post_date', 'trending_probability']]
    .sort_values(by='trending_probability', ascending=False)
    .drop_duplicates('hashtag_name')  # ensures each hashtag appears only once
    .head(20)
)

# 5. Show the result
print(top_trending_week)


In [None]:
import pandas as pd

# 1. Get predicted probabilities for the positive class (trending = 1)
probs = model.predict_proba(X_test)[:, 1]

# 2. Create a DataFrame with predictions
pred_df = daily_hashtags.loc[X_test.index].copy()
pred_df['trending_probability'] = probs

# 3. Filter for hashtags from the last 30 days
latest_date = pred_df['post_date'].max()
start_date_month = latest_date - pd.Timedelta(days=29)  # last 30 days including today

month_hashtags = pred_df[
    (pred_df['post_date'] >= start_date_month) & (pred_df['post_date'] <= latest_date)
]

# 4. Get top 20 most likely trending hashtags across the month
top_trending_month = (
    month_hashtags[['hashtag_name', 'post_date', 'trending_probability']]
    .sort_values(by='trending_probability', ascending=False)
    .drop_duplicates('hashtag_name')  # only one row per hashtag
    .head(20)
)

# 5. Show the result
print(top_trending_month)


In [None]:
import pandas as pd

# STEP 1: Predict trending probabilities
probs = model.predict_proba(X_test)[:, 1]
pred_df = daily_hashtags.loc[X_test.index].copy()
pred_df['trending_probability'] = probs

# STEP 2: Define date ranges
latest_date = pred_df['post_date'].max()
last_7_days = latest_date - pd.Timedelta(days=6)
last_30_days = latest_date - pd.Timedelta(days=29)

# STEP 3: Get Top N for Weekly
top_week = (
    pred_df[pred_df['post_date'] >= last_7_days]
    .sort_values(by='trending_probability', ascending=False)
    .drop_duplicates('hashtag_name')
    .head(20)
    .assign(source='weekly')
)

# STEP 4: Get Top N for Monthly
top_month = (
    pred_df[pred_df['post_date'] >= last_30_days]
    .sort_values(by='trending_probability', ascending=False)
    .drop_duplicates('hashtag_name')
    .head(20)
    .assign(source='monthly')
)

# STEP 5: Merge to compare
combined = pd.concat([top_week, top_month])
combined_summary = (
    combined.groupby('hashtag_name')
    .agg(
        sources=('source', lambda x: ', '.join(sorted(set(x)))),
        max_probability=('trending_probability', 'max'),
        most_recent_date=('post_date', 'max')
    )
    .reset_index()
    .sort_values(by='max_probability', ascending=False)
)

print(combined_summary)



In [None]:
import matplotlib.pyplot as plt

top_combined = combined_summary.head(20)  # visualize top 20 from comparison

plt.figure(figsize=(12, 6))
plt.barh(top_combined['hashtag_name'], top_combined['max_probability'], color='skyblue')
plt.gca().invert_yaxis()
plt.xlabel('Max Trending Probability')
plt.title('Top Hashtags (Weekly vs Monthly Trending Overlap)')
plt.tight_layout()
plt.show()


# HASHTAGS with XGboost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import shap

# STEP 1: Load dataset
df = pd.read_csv("/Users/ritushetkar/env_capstone/data/hashtags_posts.csv", parse_dates=['createTimeISO'])

# STEP 2: Aggregate to hashtag-day level
df['post_date'] = df['createTimeISO'].dt.date

daily_hashtags = (
    df.groupby(['hashtag_name', 'post_date'])
      .agg(
          post_volume=('post_id', 'nunique'),
          total_diggs=('diggCount', 'sum'),
          total_shares=('shareCount', 'sum'),
          total_comments=('commentCount', 'sum'),
          total_plays=('playCount', 'sum'),
          avg_duration=('video_duration', 'mean'),
          avg_fans=('author_fans', 'mean')
      )
      .reset_index()
)

# STEP 3: Rolling features (7-day window)
daily_hashtags['post_date'] = pd.to_datetime(daily_hashtags['post_date'])
daily_hashtags = daily_hashtags.sort_values(['hashtag_name', 'post_date'])

for col in ['post_volume', 'total_diggs', 'total_shares', 'total_comments', 'total_plays']:
    daily_hashtags[f'{col}_7d'] = (
        daily_hashtags.groupby('hashtag_name')[col]
        .transform(lambda x: x.rolling(window=7, min_periods=1).sum())
    )

# STEP 4: Growth signal
daily_hashtags['volume_change_7d'] = (
    daily_hashtags.groupby('hashtag_name')['post_volume_7d']
    .pct_change().replace([np.inf, -np.inf], np.nan).fillna(0)
)

# STEP 5: Label trending (volume surge > 50%)
daily_hashtags['is_trending'] = (daily_hashtags['volume_change_7d'] > 0.5).astype(int)

# STEP 6: Prepare features
daily_hashtags.fillna(0, inplace=True)

#features = [
 #   'post_volume_7d', 'total_diggs_7d', 'total_shares_7d',
  #  'total_comments_7d', 'total_plays_7d', 'avg_duration', 'avg_fans', 'volume_change_7d'
#]

features = [
    'post_volume_7d', 'total_diggs_7d', 'total_shares_7d',
    'total_comments_7d', 'total_plays_7d', 'avg_duration', 'avg_fans'
]

X = daily_hashtags[features]
y = daily_hashtags['is_trending']

# STEP 7: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# STEP 8: Model training
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)



# STEP 10: SHAP for interpretability
explainer = shap.Explainer(model)
shap_values = explainer(X_test)




In [74]:
# STEP 9: Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      7399
           1       0.97      0.91      0.94       937

    accuracy                           0.99      8336
   macro avg       0.98      0.96      0.97      8336
weighted avg       0.99      0.99      0.99      8336



In [None]:
shap.summary_plot(shap_values, X_test)

In [73]:
# STEP 11: Predict trending probabilities
probs = model.predict_proba(X_test)[:, 1]
pred_df = daily_hashtags.loc[X_test.index].copy()
pred_df['trending_probability'] = probs


In [76]:

# STEP 12: Top trending for the latest date with minimum post filter
MIN_POSTS = 10 # set your minimum post threshold here

latest_date = pred_df['post_date'].max()
today_trends = (
    pred_df[
        (pred_df['post_date'] == latest_date) &
        (pred_df['post_volume_7d'] >= MIN_POSTS)
    ]
    .sort_values(by='trending_probability', ascending=False)
    .drop_duplicates('hashtag_name')
    .head(20)
)

print("🔥 Top Trending Hashtags Today (with at least 5 posts in the last 7 days)")
print(today_trends[['hashtag_name', 'post_volume_7d', 'trending_probability']])



🔥 Top Trending Hashtags Today (with at least 5 posts in the last 7 days)
            hashtag_name  post_volume_7d  trending_probability
34382  skincarethatworks            11.0              0.000626
33017           skincare           181.0              0.000098
21213     koreanskincare            44.0              0.000033


In [None]:
with MIN Posts as 0
🔥 Top Trending Hashtags Today (with at least 5 posts in the last 7 days)
                hashtag_name  post_volume_7d  trending_probability
30249           ringanafresh             2.0              0.962396
12775                frische             5.0              0.044179
4676             beautyhacks             8.0              0.011727
32144  sensitiveskinfriendly             6.0              0.005687
17617              hautliebe             8.0              0.003768
24504                momlife             7.0              0.003364
5174             beautytipps             9.0              0.002100
34049         skincarereview             8.0              0.001337
34382      skincarethatworks            11.0              0.000626
33017               skincare           181.0              0.000098
21213         koreanskincare            44.0              0.000033


In [None]:
with MIN Posts as 5
Top Trending Hashtags Today (with at least 5 posts in the last 7 days)
                hashtag_name  post_volume_7d  trending_probability
12775                frische             5.0              0.044179
4676             beautyhacks             8.0              0.011727
32144  sensitiveskinfriendly             6.0              0.005687
17617              hautliebe             8.0              0.003768
24504                momlife             7.0              0.003364
5174             beautytipps             9.0              0.002100
34049         skincarereview             8.0              0.001337
34382      skincarethatworks            11.0              0.000626
33017               skincare           181.0              0.000098
21213         koreanskincare            44.0              0.000033


In [None]:
with 10 min posts
 Top Trending Hashtags Today (with at least 5 posts in the last 7 days)
            hashtag_name  post_volume_7d  trending_probability
34382  skincarethatworks            11.0              0.000626
33017           skincare           181.0              0.000098
21213     koreanskincare            44.0              0.000033


In [None]:
 with 20 min posts
Top Trending Hashtags Today (with at least 5 posts in the last 7 days)
         hashtag_name  post_volume_7d  trending_probability
33017        skincare           181.0              0.000098
21213  koreanskincare            44.0              0.000033


In [None]:
with 50 min posts 
Top Trending Hashtags Today (with at least 5 posts in the last 7 days)
      hashtag_name  post_volume_7d  trending_probability
33017     skincare           181.0              0.000098


In [None]:

# Get the most recent date in the prediction set
latest_date = pred_df['post_date'].max()
week_start = latest_date - pd.Timedelta(days=6)
month_start = latest_date - pd.Timedelta(days=29)

# WEEKLY (Step 13)
week_trends = (
    pred_df[
        (pred_df['post_date'] >= week_start) &
        (pred_df['post_volume_7d'] >= MIN_POSTS)
    ]
    .sort_values(by='trending_probability', ascending=False)
    .drop_duplicates('hashtag_name')
    .head(20)
)

# MONTHLY (Step 14)
month_trends = (
    pred_df[
        (pred_df['post_date'] >= month_start) &
        (pred_df['post_volume_7d'] >= MIN_POSTS)
    ]
    .sort_values(by='trending_probability', ascending=False)
    .drop_duplicates('hashtag_name')
    .head(20)
)

# STEP 15: Weekly vs Monthly comparison
week_trends['source'] = 'weekly'
month_trends['source'] = 'monthly'
comparison = pd.concat([week_trends, month_trends])

comparison_summary = (
    comparison.groupby('hashtag_name')
    .agg(
        sources=('source', lambda x: ', '.join(sorted(set(x)))),
        max_probability=('trending_probability', 'max'),
        most_recent_date=('post_date', 'max')
    )
    .reset_index()
    .sort_values(by='max_probability', ascending=False)
)

print("📊 Weekly vs Monthly Trending Comparison:")
print(comparison_summary[comparison_summary['max_probability']>0.4].head(20))




