In [None]:
import pandas as pd

# Read the comments dataset and keep only rows where both user_name and
# token_ticker are present.
df = pd.read_csv("comments_with_sentiment.csv").dropna(
    subset=["user_name", "token_ticker"]
)

# Group once by token and reuse the grouping for both metrics.
tokens_grouped = df.groupby("token_ticker")

# Count of non-null comment_id values for each token.
comment_counts = tokens_grouped["comment_id"].count().reset_index(name="total_comments")

# Count of distinct user_name values for each token.
unique_users = tokens_grouped["user_name"].nunique().reset_index(name="unique_users")

# Join the two per-token metrics and derive the engagement ratio
# (total comments divided by unique users).
engagement_df = comment_counts.merge(unique_users, on="token_ticker").assign(
    comments_per_user=lambda tbl: tbl["total_comments"] / tbl["unique_users"]
)

print(engagement_df.to_string(index=False))


                                                           token_ticker  total_comments  unique_users  comments_per_user
                                                                #186120             519           108           4.805556
                                                                 #TRUMP              28             6           4.666667
                                                                   #cat              39            18           2.166667
                                                                  #love             199            34           5.852941
                                                               #moodeng               3             3           1.000000
                                                              $1stAmend             107            38           2.815789
                                                                 $ALEXA              18            16           1.125000
                                

In [6]:
# Identify anomalies in comments_per_user using z-scores
from scipy.stats import zscore

# Standardize comments_per_user across all tokens. With its default arguments,
# scipy's zscore uses the population standard deviation (ddof=0) and
# propagates NaN values.
engagement_df["z_score_comments_per_user"] = zscore(engagement_df["comments_per_user"])

# Cut-off, in standard deviations, beyond which a token is flagged as anomalous.
Z_SCORE_THRESHOLD = 2

# Label every token "Normal" first, then overwrite the two tails using boolean
# masks. This replaces a per-row Python lambda with vectorized assignments and
# produces the same labels: strictly above +threshold -> "High", strictly below
# -threshold -> "Low"; values exactly at the threshold and NaN z-scores remain
# "Normal".
z_scores = engagement_df["z_score_comments_per_user"]
engagement_df["anomaly"] = "Normal"
engagement_df.loc[z_scores > Z_SCORE_THRESHOLD, "anomaly"] = "High"
engagement_df.loc[z_scores < -Z_SCORE_THRESHOLD, "anomaly"] = "Low"

# Show the five tokens with the highest z-scores.
print(engagement_df.sort_values(by="z_score_comments_per_user", ascending=False).head())


     token_ticker  total_comments  unique_users  comments_per_user  \
595          CUNT            1000            24          41.666667   
2838          gub             872            56          15.571429   
2140      SLOWDEN              15             1          15.000000   
32           $DWT             997            93          10.720430   
3038        quant            1000            96          10.416667   

      z_score_comments_per_user anomaly  
595                   27.059291    High  
2838                   9.232661    High  
2140                   8.842297    High  
32                     5.918764    High  
3038                   5.711251    High  
