Importing Libraries

In [None]:
import json
from datetime import datetime
from collections import defaultdict
import pandas as pd

Importing Dataset

In [None]:
pendle_df = pd.read_json('dataset/pendle.json')
# virtual_df = pd.read_json('virtual.json')

In [None]:
pendle_df.iloc[0]

Unique userName

In [None]:
# Pull the userName out of every author payload; payloads that are not dicts yield None
author_user_names = pendle_df['author'].map(
    lambda author: author.get('userName') if isinstance(author, dict) else None
)
unique_author_names = author_user_names.dropna().unique()

# Same names as a set
unique_author_names_set = set(unique_author_names)

print(unique_author_names_set)

In [None]:
len(unique_author_names_set)

Unique Author Dataset

In [None]:
# Flatten the nested author payloads into one column per author field
author_data = pd.json_normalize(pendle_df["author"])

# Keep the first row seen for each userName. The explicit .copy() makes this an
# independent frame: later cells add columns to unique_users_df, and writing to
# the un-copied result of drop_duplicates raises SettingWithCopyWarning.
unique_users_df = author_data.drop_duplicates(subset='userName', keep='first').copy()

In [None]:
unique_users_df.head()

Grouping Tweets based on userNames

In [None]:
# Work on a copy so the raw frame stays untouched
tempdf = pendle_df.copy()

# Author handle per tweet; non-dict author payloads yield None
tempdf['userName'] = tempdf['author'].apply(lambda x: x.get('userName') if isinstance(x, dict) else None)

# Map each userName to its tweets as a list of record dicts. Iterating the groups
# directly (instead of groupby(...).apply(...)) keeps the 'userName' key in every
# record and avoids the pandas 2.2 deprecation of apply operating on the
# grouping column.
grouped_tweets = {
    user: user_tweets.to_dict(orient='records')
    for user, user_tweets in tempdf.groupby('userName')
}

In [None]:
grouped_tweets["0318Ki"]

Finding Account Engagement Ratio

In [None]:
# Treat a missing view count as a single view. Assign the result back instead of
# calling fillna(inplace=True) on the column selection: the chained in-place form
# emits a FutureWarning in pandas 2.x and does not update tempdf under Copy-on-Write.
tempdf['viewCount'] = tempdf['viewCount'].fillna(1)

In [None]:
interaction_cols = ['retweetCount', 'replyCount', 'likeCount', 'quoteCount', 'bookmarkCount']

# A missing counter counts as zero interactions
tempdf[interaction_cols] = tempdf[interaction_cols].fillna(0)

# Per-tweet total across all interaction counters
tempdf['totalInteractions'] = tempdf[interaction_cols].sum(axis=1)

# Author follower count, defaulting to 1 when the payload or the field is missing
tempdf['followers'] = tempdf['author'].apply(
    lambda author: author.get('followers', 1) if isinstance(author, dict) else 1
)

# Interactions per view; a zero view count is replaced by 1 so the division is defined
safe_views = tempdf['viewCount'].replace(0, 1)
tempdf['engagementRatio'] = tempdf['totalInteractions'] / safe_views

# Mean per-tweet engagement ratio for every user, as {userName: ratio}
engagement_ratios = (
    tempdf
    .groupby('userName')['engagementRatio']
    .mean()
    .to_dict()
)

In [None]:
# Engagement Ratio for user 0800Degen
engagement_ratios["0800Degen"]

Account Age

In [None]:
# Convert 'createdAt' to datetime format (directly on the series)
unique_users_df['accountCreatedAt'] = pd.to_datetime(unique_users_df['createdAt'], format="%a %b %d %H:%M:%S +0000 %Y", errors='coerce')

max_date = unique_users_df['accountCreatedAt'].max().date()  # Get max date only
cutoff_date = unique_users_df[unique_users_df['accountCreatedAt'].dt.date == max_date]['accountCreatedAt'].max()
cutoff_date = cutoff_date.tz_localize(None)  # Remove timezone if needed

cutoff_date

In [None]:
def calculate_account_age_score(df, cutoff_date):
    """Score each account's age relative to the oldest account in ``df``.

    Adds three columns to ``df`` in place: ``accountCreatedAt`` (timezone-naive
    UTC timestamp), ``accountAgeDays`` (fractional days between creation and
    ``cutoff_date``, floored at 0) and ``ageScore`` (age divided by the largest
    age in the frame, in [0, 1]).

    Args:
        df: DataFrame with ``userName`` and ``createdAt`` columns. ``createdAt``
            is parsed with the ``"%a %b %d %H:%M:%S %z %Y"`` layout; values
            that do not match become NaT and score 0.
        cutoff_date: Reference instant. Anything ``pd.Timestamp`` accepts
            (``pd.Timestamp``, ``datetime.datetime``, ISO string); timezone-aware
            values are converted to UTC before the timezone is dropped.

    Returns:
        dict mapping ``userName`` to ``ageScore``.
    """
    # Explicit format + utc=True gives one deterministic parse path instead of
    # per-element format guessing
    created_at = pd.to_datetime(
        df['createdAt'],
        format="%a %b %d %H:%M:%S %z %Y",
        errors='coerce',
        utc=True,
    )
    df['accountCreatedAt'] = created_at.dt.tz_localize(None)

    # Normalise the cutoff to a timezone-naive Timestamp so the subtraction
    # below works for plain datetimes and strings as well
    cutoff_date = pd.Timestamp(cutoff_date)
    if cutoff_date.tz is not None:
        cutoff_date = cutoff_date.tz_convert('UTC').tz_localize(None)

    # Account age in fractional days
    df['accountAgeDays'] = (cutoff_date - df['accountCreatedAt']).dt.total_seconds() / (24 * 3600)

    # Accounts created after the cutoff get age 0 rather than a negative age
    df['accountAgeDays'] = df['accountAgeDays'].clip(lower=0)

    # Denominator: the largest age in the frame; fall back to 1 when it is
    # missing (no parseable dates) or zero (all accounts created at the cutoff)
    total_days = df['accountAgeDays'].max()
    if pd.isnull(total_days) or total_days == 0:
        total_days = 1

    df['ageScore'] = df['accountAgeDays'].div(total_days).clip(upper=1).fillna(0)

    account_ages = df.set_index('userName')['ageScore'].to_dict()

    return account_ages

account_ages = calculate_account_age_score(unique_users_df, cutoff_date)

In [None]:
# Account Age for user 0Cyberbully
account_ages["0Cyberbully"]

Profile Completeness

In [None]:
def calculate_profile_completeness(df):
    """Compute a weighted profile-completeness score per user.

    Weights: profile picture 0.1, cover picture 0.1, description 0.1,
    ``canDm`` 0.2, ``isVerified`` 0.5. The score is written to
    ``df['completeness_score']`` in place.

    A text field counts as present only when it is non-null and not blank, and
    a missing ``canDm`` / ``isVerified`` value counts as False (the previous
    ``astype(int)`` raised on missing values).

    Args:
        df: DataFrame with ``userName``, ``profilePicture``, ``coverPicture``,
            ``description``, ``canDm`` and ``isVerified`` columns.

    Returns:
        dict mapping ``userName`` to ``completeness_score``.
    """
    def _is_filled(column):
        # 1 where the value is non-null and not an empty/whitespace-only string
        values = df[column]
        blank = values.astype(str).str.strip().eq('')
        return (values.notnull() & ~blank).astype(int)

    def _is_true(column):
        # 1 where the flag is truthy; missing values count as 0
        return df[column].map(lambda v: bool(v) if pd.notnull(v) else False).astype(int)

    df['completeness_score'] = (
        _is_filled('profilePicture') * 0.1 +
        _is_filled('coverPicture') * 0.1 +
        _is_filled('description') * 0.1 +
        _is_true('canDm') * 0.2 +
        _is_true('isVerified') * 0.5
    )

    profile_scores = df.set_index('userName')['completeness_score'].to_dict()

    return profile_scores

# Calculate profile completeness scores
profile_scores = calculate_profile_completeness(unique_users_df)

In [None]:
# Profile Completeness for user 0XunoYou
profile_scores["0XunoYou"]

Media Status Ratio

In [None]:
def calculate_media_status_ratio(df):
    """Return {userName: mediaCount / (statusesCount + 1)}.

    Missing counts are treated as 0. The ratio is also stored on ``df`` as the
    ``mediaStatusRatio`` column.
    """
    media_posts = df['mediaCount'].fillna(0)
    # +1 keeps the denominator non-zero for accounts with no statuses
    statuses_plus_one = df['statusesCount'].fillna(0) + 1
    df['mediaStatusRatio'] = media_posts / statuses_plus_one
    return df.set_index('userName')['mediaStatusRatio'].to_dict()

# Calculate media status ratios
media_status_ratios = calculate_media_status_ratio(unique_users_df)

In [None]:
# Media Ratio for user 0x3bands
media_status_ratios["0x3bands"]

In [None]:
grouped_tweets["0318Ki"]

Account frequency of tweets (daily, weekly, monthly and average)

In [None]:
def calculate_tweet_frequencies(df):
    """Compute per-user tweet frequencies.

    Adds a ``tweetCreatedAt`` column (parsed from ``createdAt``) to ``df`` in
    place; callers may rely on this column afterwards.

    Args:
        df: DataFrame with ``userName`` and ``createdAt`` columns, where
            ``createdAt`` follows ``"%a %b %d %H:%M:%S %z %Y"``. Values that do
            not match are coerced to NaT; they still count towards the tweet
            total but not towards the distinct day/week/month counts.

    Returns:
        dict mapping ``userName`` to a 4-tuple
        ``(tweets per distinct day, tweets per distinct ISO week,
        tweets per distinct calendar month, tweets per day of the
        first-to-last span)``.
    """
    df['tweetCreatedAt'] = pd.to_datetime(df['createdAt'], format="%a %b %d %H:%M:%S %z %Y", errors='coerce')

    user_frequencies = {}
    for user, group in df.groupby('userName'):
        total_tweets = len(group)

        # Use timezone-naive wall-clock stamps for the calendar bucketing:
        # to_period() on tz-aware data warns about dropping the timezone
        stamps = group['tweetCreatedAt'].dropna().dt.tz_localize(None)

        unique_days = stamps.dt.date.nunique()

        # Distinct (ISO year, ISO week) pairs. Counting the week number alone
        # would merge e.g. week 1 of 2023 with week 1 of 2024.
        iso = stamps.dt.isocalendar()
        unique_weeks = len(iso[['year', 'week']].drop_duplicates())

        unique_months = stamps.dt.to_period('M').nunique()

        # Inclusive length of the first-to-last span; 1 when no stamp parsed
        if stamps.empty:
            total_days = 1
        else:
            total_days = (stamps.max() - stamps.min()).days + 1

        daily_frequency = total_tweets / unique_days if unique_days else 0
        weekly_frequency = total_tweets / unique_weeks if unique_weeks else 0
        monthly_frequency = total_tweets / unique_months if unique_months else 0
        avg_rate_of_tweets = total_tweets / total_days if total_days else 0

        user_frequencies[user] = (
            daily_frequency,
            weekly_frequency,
            monthly_frequency,
            avg_rate_of_tweets
        )

    return user_frequencies

# Calculate tweet frequencies
account_frequency = calculate_tweet_frequencies(tempdf)

In [None]:
# Account Frequency(daily,weekly,monthly and average) for user AndreiMX_
account_frequency["AndreiMX_"]

Advanced Tweet Analysis (persistence_score and activity_score)

In [None]:
from datetime import datetime
import pandas as pd

def analyze_user_tweets(user_tweet_dict):
    """Summarise each user's tweeting window and activity.

    Args:
        user_tweet_dict: dict mapping a user name to a list of tweet dicts, each
            with a ``createdAt`` string in ``"%a %b %d %H:%M:%S +0000 %Y"``
            format. Users with an empty list are skipped.

    Returns:
        DataFrame with one row per user and the columns ``user_name``,
        ``first_tweet_date``, ``last_tweet_date``, ``days_active`` (distinct
        calendar days with a tweet), ``total_tweets``, ``time_span`` (whole
        elapsed days between first and last tweet, 0 replaced by 1),
        ``active_days_ratio`` (``days_active`` over the inclusive number of
        calendar days from first to last tweet, so it never exceeds 1) and
        ``tweets_per_active_day``. An empty input yields an empty frame with
        the same columns.

    Raises:
        ValueError: if a ``createdAt`` value does not match the expected format.
    """
    columns = ['user_name', 'first_tweet_date', 'last_tweet_date', 'days_active', 'total_tweets']
    result = []

    for user, tweets in user_tweet_dict.items():
        if not tweets:
            continue

        dates = [datetime.strptime(tweet['createdAt'], "%a %b %d %H:%M:%S +0000 %Y") for tweet in tweets]

        result.append({
            'user_name': user,
            'first_tweet_date': min(dates),
            'last_tweet_date': max(dates),
            'days_active': len(set(date.date() for date in dates)),
            'total_tweets': len(tweets),
        })

    # Passing the column list keeps the schema stable when no user has tweets
    df = pd.DataFrame(result, columns=columns)
    df['first_tweet_date'] = pd.to_datetime(df['first_tweet_date'])
    df['last_tweet_date'] = pd.to_datetime(df['last_tweet_date'])

    # Whole elapsed days between first and last tweet; 0 becomes 1
    df['time_span'] = (df['last_tweet_date'] - df['first_tweet_date']).dt.days.replace(0, 1)

    # Inclusive count of calendar days covered by the window. Dividing by the
    # elapsed-days span instead can give ratios above 1 (e.g. tweets at 23:00
    # and 01:00 the next day are two active days over one elapsed day).
    calendar_days = (
        df['last_tweet_date'].dt.normalize() - df['first_tweet_date'].dt.normalize()
    ).dt.days + 1
    df['active_days_ratio'] = df['days_active'] / calendar_days
    df['tweets_per_active_day'] = df['total_tweets'] / df['days_active']

    return df

# Generate the DataFrame
result_df = analyze_user_tweets(grouped_tweets)

# Display result
result_df.head()

In [None]:
tweet_dates_dict = result_df.set_index("user_name")[["first_tweet_date", "last_tweet_date"]].apply(tuple, axis=1).to_dict()

In [None]:
# Make sure the date columns are datetimes
result_df['first_tweet_date'] = pd.to_datetime(result_df['first_tweet_date'])
result_df['last_tweet_date'] = pd.to_datetime(result_df['last_tweet_date'])

# Whole elapsed days between each account's first and last tweet (0 becomes 1)
result_df['time_span'] = (result_df['last_tweet_date'] - result_df['first_tweet_date']).dt.days
result_df['time_span'] = result_df['time_span'].replace(0, 1)

# Persistence: share of the dataset's overall day span on which the account tweeted
total_days_in_dataset = (result_df['last_tweet_date'].max() - result_df['first_tweet_date'].min()).days + 1
result_df['persistence_score'] = result_df['days_active'] / total_days_in_dataset

# Tweets per active day; infinite results from a zero divisor are reset to 0
result_df['tweets_per_active_day'] = result_df['total_tweets'] / result_df['days_active']
result_df['tweets_per_active_day'] = result_df['tweets_per_active_day'].replace([float('inf'), -float('inf')], 0)

# Tweets per day over the whole dataset duration
result_df['tweets_per_day_ratio'] = result_df['total_tweets'] / total_days_in_dataset

# Weighted raw activity score
result_df['activity_score'] = (
    (result_df['tweets_per_active_day'] * 0.8) +
    (result_df['tweets_per_day_ratio'] * 0.2)
)

# Min-max scale into [0.1, 1.0] so no account ends up at exactly zero. When every
# account has the same raw score the range is 0; dividing by it would turn the
# whole column into NaN, so all accounts get the lower bound 0.1 instead.
activity_min = result_df['activity_score'].min()
activity_range = result_df['activity_score'].max() - activity_min
if activity_range > 0:
    activity_scaled = (result_df['activity_score'] - activity_min) / activity_range
else:
    activity_scaled = result_df['activity_score'] * 0.0
result_df['activity_score'] = activity_scaled * 0.9 + 0.1

# Display the updated DataFrame
result_df.tail()

In [None]:
account_scores = result_df.set_index("user_name")[["persistence_score", "activity_score"]].apply(tuple, axis=1).to_dict()

In [None]:
result_df[result_df["user_name"] == "cz_volume"]

Average Time between Tweets

In [None]:
# Sort by user and timestamp
sorted_df = tempdf.sort_values(by=['userName', 'tweetCreatedAt'])

# Calculate the time difference between consecutive tweets
sorted_df['timeDiff'] = sorted_df.groupby('userName')['tweetCreatedAt'].diff().dt.total_seconds() / 3600  # In hours

# Calculate the average time difference per author (ignoring NaNs)
avg_time_per_author = sorted_df.groupby('userName')['timeDiff'].mean().reset_index(name='avgTimeBetweenTweets')
# For single tweet users the average time will be 0
avg_time_per_author['avgTimeBetweenTweets'] = avg_time_per_author['avgTimeBetweenTweets'].fillna(0)

avg_time_dict = avg_time_per_author.set_index('userName')['avgTimeBetweenTweets'].to_dict()

In [None]:
avg_time_dict["0Cyberbully"]

Account's Content Originality Ratio

In [None]:
def calculate_originality_ratio(df):
    """Score how original each user's tweets are, normalised to [0, 1].

    Every tweet gets a base score by type (retweet 0, reply 1, quote 2,
    otherwise 3; checked in that order) plus 0.5 when it carries media. The
    per-tweet score is written to ``df['tweetScore']`` in place. Each user's
    mean score is then min-max normalised across users.

    Missing values (None / NaN) in the flag and media columns count as "not
    set". A plain truthiness test treats NaN as True, which would classify a
    tweet with a missing ``isRetweet`` as a retweet and award the media bonus
    to tweets without media.

    Args:
        df: DataFrame with ``userName``, ``isRetweet``, ``isReply``,
            ``isQuote`` and ``media`` columns.

    Returns:
        dict mapping ``userName`` to the normalised originality ratio; 0 for
        every user when all users share the same ratio.
    """
    def _is_set(value):
        # Truthiness that treats None/NaN as False and sized containers by length
        if value is None:
            return False
        if hasattr(value, '__len__'):
            return len(value) > 0
        if pd.isna(value):
            return False
        return bool(value)

    def _flag(column):
        return df[column].map(_is_set).astype(bool)

    # Apply the masks from lowest to highest priority so that
    # retweet > reply > quote > original, as in an if/elif chain
    base_score = pd.Series(3.0, index=df.index)
    base_score = base_score.mask(_flag('isQuote'), 2.0)
    base_score = base_score.mask(_flag('isReply'), 1.0)
    base_score = base_score.mask(_flag('isRetweet'), 0.0)

    df['tweetScore'] = base_score + _flag('media') * 0.5

    grouped = df.groupby('userName').agg(
        totalScore=('tweetScore', 'sum'),
        totalTweets=('tweetScore', 'count')
    ).reset_index()

    grouped['originalityRatio'] = grouped['totalScore'] / grouped['totalTweets']

    min_ratio = grouped['originalityRatio'].min()
    max_ratio = grouped['originalityRatio'].max()

    # A zero range (single user or identical ratios) yields NaN, mapped to 0
    grouped['normalizedOriginalityRatio'] = (
        (grouped['originalityRatio'] - min_ratio) / (max_ratio - min_ratio)
    ).fillna(0)

    result_dict = grouped.set_index('userName')['normalizedOriginalityRatio'].to_dict()

    return result_dict

originality_df = tempdf.copy()
content_originality_ratio = calculate_originality_ratio(originality_df)

In [None]:
content_originality_ratio["0800Degen"]

Human Source Devices Ratio

In [None]:
def calculate_human_device_ratio(df):
    """Return, per userName, the share of tweets whose ``source`` is in the
    list of client names below."""
    possible_human_source = [
        "Twitter Web App",
        "Twitter for Android",
        "Twitter for iPhone",
        "Twitter for iPad",
        "Twitter for Mac",
        "TweetDeck",
        "TweetDeck Web App"
    ]

    # Flag every tweet once, then aggregate the flags user by user
    from_human_client = df['source'].isin(possible_human_source)

    human_device_ratio = {}
    for username, flags in from_human_client.groupby(df['userName']):
        tweet_count = len(flags)
        if tweet_count > 0:
            human_device_ratio[username] = flags.sum() / tweet_count
        else:
            human_device_ratio[username] = 0

    return human_device_ratio

human_device_ratio = calculate_human_device_ratio(tempdf)

In [None]:
human_device_ratio["0x100s"]

Average Reach

In [None]:
# Average view count per author; a NaN mean falls back to 0
avg_reach_series = tempdf.groupby("userName")["viewCount"].mean().fillna(0)

# Min-max normalise to [0, 1]. When every author has the same average the range
# is 0 and the plain formula would produce NaN for everyone, so all authors get
# 0.0 in that case.
reach_min = avg_reach_series.min()
reach_range = avg_reach_series.max() - reach_min
if reach_range > 0:
    avg_reach_normalized = (avg_reach_series - reach_min) / reach_range
else:
    avg_reach_normalized = avg_reach_series * 0.0

# {userName: normalised average reach}
avg_reach = avg_reach_normalized.to_dict()


In [None]:
avg_reach["0800Degen"]

Follower to Following Ratio

In [None]:
# Followers per followed account; the +1 keeps the denominator non-zero for
# accounts that follow nobody
followers_per_following = unique_users_df["followers"].div(unique_users_df["following"].add(1))
unique_users_df["follower_following_ratio"] = followers_per_following

# {userName: raw follower-to-following ratio}
follower_to_following_ratio = (
    unique_users_df
    .set_index('userName')['follower_following_ratio']
    .to_dict()
)

In [None]:
unique_users_df["follower_following_ratio"]

In [None]:
# Apply Min-Max Normalization to follower-to-following ratio
unique_users_df["follower_following_ratio_normalized"] = (
    (unique_users_df["follower_following_ratio"] - unique_users_df["follower_following_ratio"].min()) / 
    (unique_users_df["follower_following_ratio"].max() - unique_users_df["follower_following_ratio"].min())
)

# Convert to dictionary format (if needed)
follower_to_following_ratio_normalized = unique_users_df.set_index("userName")["follower_following_ratio_normalized"].to_dict()

# Display sample normalized values
unique_users_df[["userName", "follower_following_ratio", "follower_following_ratio_normalized"]].head()

In [None]:
unique_users_df["follower_following_ratio_normalized"].min()

In [None]:
follower_to_following_ratio["0800Degen"]

In [None]:
follower_to_following_ratio_normalized["0800Degen"]

Importing Pendle LLM Analysis Dataset

In [None]:
pendle_llm = pd.read_csv("dataset/pendle_llm_analysis.csv")

In [None]:
pendle_llm.head()

Account Emotional and Statistical Tweet Ratio

In [None]:
def calculate_emotion_stat_ratio(tweet_df, analysis_df):
    """Compute each author's share of emotional and statistical tweets.

    Args:
        tweet_df: DataFrame with an ``id`` column and an ``author`` column
            holding dicts with a ``userName`` key. Rows whose author is not a
            dict (or has no ``userName``) are left out of the result.
        analysis_df: DataFrame with ``id`` and ``tweet_type`` columns.

    Returns:
        dict mapping ``userName`` to
        ``{'emotionalRatio': float, 'statisticalRatio': float}``, computed over
        the tweets present in both frames (inner join on ``id``).
    """
    merged_df = pd.merge(tweet_df, analysis_df, on='id')

    # Same defensive extraction as elsewhere in this notebook: a non-dict
    # author yields None, and groupby drops None keys
    user_names = merged_df['author'].apply(
        lambda author: author.get('userName') if isinstance(author, dict) else None
    )

    result = {}
    for user, group in merged_df.groupby(user_names):
        tweet_types = group['tweet_type']
        result[user] = {
            'emotionalRatio': float((tweet_types == 'emotional').mean()),
            'statisticalRatio': float((tweet_types == 'statistical').mean()),
        }

    return result

emotional_statistical_ratio = calculate_emotion_stat_ratio(tempdf,pendle_llm)

In [None]:
emotional_statistical_ratio

In [None]:
result_df["activity_score"].describe()

In [None]:
emotional_statistical_ratio["0800Degen"]

Fetching Additional Columns

In [None]:
# {userName: (isBlueVerified,)}; the one-element tuple is what later lookups index with [0]
blueCheck_dict = {
    user: (is_blue,)
    for user, is_blue in zip(unique_users_df["userName"], unique_users_df["isBlueVerified"])
}

Making Single Day User Column and Account Activity Score

In [None]:
result_df["single_day_user"] = result_df["first_tweet_date"].dt.date == result_df["last_tweet_date"].dt.date

In [None]:
# Accounts whose first and last tweet fall on the same day lose 0.05
single_day_penalty = result_df["single_day_user"] * 0.05

# Weighted blend: 75% activity, 25% persistence, minus the single-day penalty
result_df["account_activity"] = (
    result_df["activity_score"] * 0.75
    + result_df["persistence_score"] * 0.25
    - single_day_penalty
)

# {user_name: (account_activity,)}
account_activity_dict = {
    user: (score,)
    for user, score in zip(result_df["user_name"], result_df["account_activity"])
}

Making Author's Authenticity Dataset

In [None]:
# Fix the row order once. Iterating a set of strings gives a different order in
# every interpreter session, which would make the row order (and the index
# written to CSV) change between runs.
usernames = sorted(unique_author_names_set)

# One row per author. Users missing from a lookup fall back to a neutral default.
# blueCheck_dict and account_activity_dict hold one-element tuples, so their
# defaults must be tuples too: .get(user, 0)[0] raises TypeError for a missing user.
author_df = pd.DataFrame({
    'username': usernames,
    'blue_verification_badge': [blueCheck_dict.get(user, (False,))[0] for user in usernames],
    'account_age': [account_ages.get(user, 0) for user in usernames],
    'profile_completeness': [profile_scores.get(user, 0) for user in usernames],
    'media_status_ratio': [media_status_ratios.get(user, 0) for user in usernames],
    # Holds the blended account_activity score
    'tweets_frequency': [account_activity_dict.get(user, (0,))[0] for user in usernames],
    'content_originality_ratio': [content_originality_ratio.get(user, 0) for user in usernames],
    'human_source_device_ratio': [human_device_ratio.get(user, 0) for user in usernames],
    'follower_to_following_ratio': [follower_to_following_ratio_normalized.get(user, 0) for user in usernames],
    'engagement_ratio': [engagement_ratios.get(user, 0) for user in usernames],
    'avg_reach': [avg_reach.get(user, 0) for user in usernames],
})

author_df.head()

In [None]:
author_df["account_age"].describe()

Ranking score

In [None]:
def _min_max(series):
    """Scale a numeric Series to [0, 1]; a constant Series maps to 0.0 instead of NaN."""
    low = series.min()
    span = series.max() - low
    if span > 0:
        return (series - low) / span
    return series * 0.0


# Verification Trust Score (weights sum to 1.0)
author_df["verification_trust"] = (
    author_df["blue_verification_badge"].astype(int) * 0.20 +
    author_df["account_age"] * 0.20 +
    author_df["profile_completeness"] * 0.10 +
    author_df["media_status_ratio"] * 0.10 +
    author_df["tweets_frequency"] * 0.10 +
    author_df["content_originality_ratio"] * 0.20 +
    author_df["human_source_device_ratio"] * 0.10
)

# Follower Quality Score (weights sum to 1.0)
# NOTE(review): engagement_ratio is not normalised before weighting, unlike the
# other two inputs - confirm this is intended.
author_df["follower_quality"] = (
    author_df["follower_to_following_ratio"] * 0.35 +
    author_df["engagement_ratio"] * 0.50 +
    author_df["avg_reach"] * 0.15
)

# Min-max scale both scores; _min_max guards the zero-range case that would
# otherwise turn a whole column into NaN
author_df["verification_trust"] = _min_max(author_df["verification_trust"])
author_df["follower_quality"] = _min_max(author_df["follower_quality"])

# Final Ranking Score: 40% verification trust, 60% follower quality
author_df["ranking_score"] = author_df["verification_trust"] * 0.40 + author_df["follower_quality"] * 0.60

# Sort by ranking_score in descending order
author_df_sorted = author_df.sort_values(by="ranking_score", ascending=False)

# Display top 5 ranked accounts
author_df_sorted[["username", "verification_trust", "follower_quality", "ranking_score"]].head()

In [None]:
author_df.to_csv("ranked_author.csv")