In [3]:
import pandas as pd
import numpy as np

# STEP 1: Load your dataset
df = pd.read_csv("/Users/ritushetkar/env_capstone/data/hashtags_posts.csv", parse_dates=['createTimeISO'])

# STEP 2: Preprocessing
df['week'] = df['createTimeISO'].dt.to_period('W').dt.start_time
df['engagement'] = df[['diggCount', 'shareCount', 'commentCount']].sum(axis=1)

# STEP 3: Explode hashtags (if multiple hashtags per post)
df['hashtag_name'] = df['hashtag_name'].fillna('')
df['hashtag_list'] = df['hashtag_name'].apply(lambda x: x.split(',') if ',' in x else [x])
df_exploded = df.explode('hashtag_list')
df_exploded['hashtag_list'] = df_exploded['hashtag_list'].str.strip().str.lower()
df_exploded = df_exploded[df_exploded['hashtag_list'] != '']  # remove empty tags

# STEP 4: Weekly hashtag stats
weekly_stats = (
    df_exploded.groupby(['hashtag_list', 'week'])
    .agg(post_count=('post_id', 'count'),
         avg_engagement=('engagement', 'mean'))
    .reset_index()
)

# STEP 5: Compute % change in engagement for last 2 weeks
latest_weeks = sorted(weekly_stats['week'].unique())[-2:]

latest_df = weekly_stats[weekly_stats['week'].isin(latest_weeks)]
pivot = latest_df.pivot(index='hashtag_list', columns='week', values='avg_engagement').dropna()

pivot['engagement_change'] = ((pivot[latest_weeks[1]] - pivot[latest_weeks[0]]) / pivot[latest_weeks[0]]) * 100

# STEP 6: Top creators per hashtag
top_creators = (
    df_exploded.groupby(['hashtag_list', 'author_nickName'])
    .agg(total_engagement=('engagement', 'sum'))
    .reset_index()
    .sort_values(['hashtag_list', 'total_engagement'], ascending=[True, False])
    .drop_duplicates('hashtag_list')
    .set_index('hashtag_list')['author_nickName']
)



# STEP 7: Final leaderboard
hashtag_counts = df_exploded['hashtag_list'].value_counts()
leaderboard = pivot.copy()
leaderboard['post_count'] = hashtag_counts

# 🔽 FILTER HERE
MIN_POST_THRESHOLD = 50
leaderboard = leaderboard[leaderboard['post_count'] >= MIN_POST_THRESHOLD]

leaderboard['trend'] = np.where(leaderboard['engagement_change'] > 0, '📈', '📉')
leaderboard['engagement_change_fmt'] = leaderboard['engagement_change'].apply(lambda x: f"{x:+.0f}%")
leaderboard['top_creator'] = leaderboard.index.map(top_creators)



leaderboard['trend'] = np.where(leaderboard['engagement_change'] > 0, '📈', '📉')
leaderboard['engagement_change_fmt'] = leaderboard['engagement_change'].apply(lambda x: f"{x:+.0f}%")
leaderboard['top_creator'] = leaderboard.index.map(top_creators)

# STEP 8: Sort and display
final = leaderboard.reset_index().rename(columns={
    'hashtag_list': 'Hashtag',
    'post_count': 'Post Count',
    'engagement_change_fmt': 'Avg. Engagement',
    'trend': 'Trendline',
    'top_creator': 'Top Creator(s)'
})[['Hashtag', 'Post Count', 'Avg. Engagement', 'Trendline', 'Top Creator(s)']]

final['Rank'] = final['Post Count'].rank(method='first', ascending=False).astype(int)
final = final.sort_values('Rank')

# Reorder columns
final = final[['Rank', 'Hashtag', 'Post Count', 'Avg. Engagement', 'Trendline', 'Top Creator(s)']]

# Display
print(final.head(10))  # or st.dataframe(final) in Streamlit


week  Rank          Hashtag  Post Count Avg. Engagement Trendline  \
91       1         skincare        3061            -48%         📉   
97       2  skincareroutine        1788            -38%         📉   
53       3       hautpflege        1766            -89%         📉   
34       4              fyp        1352           +323%         📈   
61       5   koreanskincare         833           +171%         📈   
44       6           glowup         560          +2028%         📈   
101      7     skincaretips         554           +585%         📈   
57       8          kbeauty         537           +381%         📈   
83       9         selfcare         522            -48%         📉   
9       10           beauty         454            -86%         📉   

week       Top Creator(s)  
91                  manel  
97               amyflamy  
53                    Sri  
34                heylina  
61                heylina  
44    Masculine Vouge USA  
101              amyflamy  
57        Skinca

  df['week'] = df['createTimeISO'].dt.to_period('W').dt.start_time


In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

# === PART 1: Predict Trending Hashtags === #

# Load dataset
df = pd.read_csv("/Users/ritushetkar/env_capstone/data/hashtags_posts.csv", parse_dates=['createTimeISO'])

# Preprocess time and hashtags
df['week'] = df['createTimeISO'].dt.to_period('W').dt.start_time
df['engagement'] = df[['diggCount', 'shareCount', 'commentCount']].sum(axis=1)
df['hashtag_name'] = df['hashtag_name'].fillna('')
df['hashtag_list'] = df['hashtag_name'].apply(lambda x: x.split(',') if ',' in x else [x])
df_exploded = df.explode('hashtag_list')
df_exploded['hashtag_list'] = df_exploded['hashtag_list'].str.strip().str.lower()
df_exploded = df_exploded[df_exploded['hashtag_list'] != '']

# Weekly aggregation
weekly = (
    df_exploded.groupby(['hashtag_list', 'week'])
    .agg(post_count=('post_id', 'count'),
         avg_engagement=('engagement', 'mean'),
         sponsored_ratio=('isSponsored', 'mean'))
    .reset_index()
)

# Sort weeks and calculate growth
weekly = weekly.sort_values(['hashtag_list', 'week'])
weekly['prev_post_count'] = weekly.groupby('hashtag_list')['post_count'].shift(1)
weekly['growth_rate'] = (weekly['post_count'] - weekly['prev_post_count']) / (weekly['prev_post_count'] + 1e-6)
weekly['is_trending'] = (weekly['growth_rate'] > 0.5).astype(int)
weekly = weekly.dropna()

# Encode categorical
le = LabelEncoder()
weekly['hashtag_encoded'] = le.fit_transform(weekly['hashtag_list'])

# Features and labels
X = weekly[['hashtag_encoded', 'post_count', 'avg_engagement', 'growth_rate', 'sponsored_ratio']]
y = weekly['is_trending']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report
part1_report = classification_report(y_test, y_pred, output_dict=True)


# === PART 2: Skincare Hashtag Classifier === #

# Create simple labeled dataset (manually labeled or based on rules)
def label_skincare_related(tag):
    skincare_keywords = ['skin', 'spf', 'acne', 'moisturizer', 'sunscreen', 'kbeauty', 'glow']
    return int(any(kw in tag.lower() for kw in skincare_keywords))

df_tags = df_exploded[['hashtag_list', 'text']].dropna().drop_duplicates()
df_tags['label'] = df_tags['hashtag_list'].apply(label_skincare_related)

# Use sentence-transformers to embed text context (hashtag + caption)
model = SentenceTransformer('all-MiniLM-L6-v2')
df_tags['combined_text'] = df_tags['hashtag_list'] + " " + df_tags['text']
embeddings = model.encode(df_tags['combined_text'].tolist(), show_progress_bar=True)

# Train classifier
X_embed = np.array(embeddings)
y_embed = df_tags['label'].values
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_embed, y_embed, test_size=0.2, random_state=42)

clf2 = LogisticRegression(max_iter=1000)
clf2.fit(X_train2, y_train2)
y_pred2 = clf2.predict(X_test2)

# Report
part2_report = classification_report(y_test2, y_pred2, output_dict=True)



  df['week'] = df['createTimeISO'].dt.to_period('W').dt.start_time


Batches:   0%|          | 0/1835 [00:00<?, ?it/s]

In [10]:
from sklearn.metrics import classification_report

# Print Trending Hashtag Classifier Results
print("=== Trending Hashtag Prediction Report ===")
print(classification_report(y_test, y_pred))

# Print Skincare Classifier Results
print("\n=== Skincare Hashtag Classifier Report ===")
print(classification_report(y_test2, y_pred2))


=== Trending Hashtag Prediction Report ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2434
           1       1.00      1.00      1.00       626

    accuracy                           1.00      3060
   macro avg       1.00      1.00      1.00      3060
weighted avg       1.00      1.00      1.00      3060


=== Skincare Hashtag Classifier Report ===
              precision    recall  f1-score   support

           0       0.88      0.93      0.90      8462
           1       0.78      0.67      0.72      3282

    accuracy                           0.86     11744
   macro avg       0.83      0.80      0.81     11744
weighted avg       0.85      0.86      0.85     11744



In [12]:
# Convert the dictionary report into a DataFrame
df_trending_report = pd.DataFrame(part1_report).transpose()
df_skincare_report = pd.DataFrame(part2_report).transpose()

# Show top rows
print("=== Trending Hashtag Report ===")
print(df_trending_report.head())

print("\n=== Skincare Hashtag Report ===")
print(df_skincare_report.head())



=== Trending Hashtag Report ===
              precision  recall  f1-score  support
0                   1.0     1.0       1.0   2434.0
1                   1.0     1.0       1.0    626.0
accuracy            1.0     1.0       1.0      1.0
macro avg           1.0     1.0       1.0   3060.0
weighted avg        1.0     1.0       1.0   3060.0

=== Skincare Hashtag Report ===
              precision    recall  f1-score       support
0              0.879056  0.928504  0.903103   8462.000000
1              0.784391  0.670628  0.723062   3282.000000
accuracy       0.856437  0.856437  0.856437      0.856437
macro avg      0.831723  0.799566  0.813083  11744.000000
weighted avg   0.852600  0.856437  0.852789  11744.000000


In [13]:
# Add model predictions back to the test set
X_test_with_preds = X_test.copy()
X_test_with_preds['actual'] = y_test
X_test_with_preds['predicted'] = y_pred

# Map encoded hashtags back to original names
X_test_with_preds['hashtag'] = le.inverse_transform(X_test_with_preds['hashtag_encoded'])

# Show trending predictions
print("=== Hashtags Predicted as Trending ===")
print(X_test_with_preds[X_test_with_preds['predicted'] == 1][['hashtag', 'actual', 'predicted']].head())

print("\n=== Hashtags Predicted as Not Trending ===")
print(X_test_with_preds[X_test_with_preds['predicted'] == 0][['hashtag', 'actual', 'predicted']].head())


=== Hashtags Predicted as Trending ===
          hashtag  actual  predicted
20065         pov       1          1
11609       hauls       1          1
11144       haare       1          1
27413  velvetskin       1          1
9043      fypdong       1          1

=== Hashtags Predicted as Not Trending ===
               hashtag  actual  predicted
9128          fypviral       0          0
7007   eatyourskincare       0          0
12898          hygiene       0          0
16814            model       0          0
12715   hyaluronicacid       0          0


In [16]:
# Add predictions to the test set
# Create indices to keep track of original rows
indices = np.arange(len(df_tags))

# Split embeddings and indices together
X_train2, X_test2, y_train2, y_test2, idx_train, idx_test = train_test_split(
    X_embed, y_embed, indices, test_size=0.2, random_state=42
)

# Use idx_test to select the right rows from df_tags
df_test = df_tags.iloc[idx_test].copy()
df_test['actual'] = y_test2
df_test['predicted'] = y_pred2

df_test['actual'] = y_test2
df_test['predicted'] = y_pred2

print("=== Hashtags Predicted as Skincare ===")
print(df_test[df_test['predicted'] == 1][['hashtag_list', 'text', 'actual']].head())

print("\n=== Hashtags Predicted as Non-Skincare ===")
print(df_test[df_test['predicted'] == 0][['hashtag_list', 'text', 'actual']].head())


=== Hashtags Predicted as Skincare ===
          hashtag_list                                               text  \
66117         skincare  not once but TWICE😭😭 god forbid a girl just wa...   
35618     harmancheema  Skincare tips I wish I knew sooner #skin #clea...   
40066          kbeauty  Honestly, the Mandelic Acid Gentle Exfoliating...   
11035  skincareroutine  Summer skin loading 🌴🌴#skincare #beauty #skinc...   
44434      glowingskin  RIP to my eye, stay tuned💀 but anyway Do my ni...   

       actual  
66117       1  
35618       0  
40066       1  
11035       1  
44434       1  

=== Hashtags Predicted as Non-Skincare ===
            hashtag_list                                               text  \
59673  porenverfeinerung  ✨ FUTURACONTOUR – Die innovative Unterspritzun...   
36430         hautpflege  Ich suche 10 türkisch sprechenden Frauen, die ...   
29848  hautpflegeroutine  Natürlich, sanft und effektiv – die perfekte P...   
67319            goviral  Lieblingstag der