In [8]:
import os
import json
from collections import defaultdict

In [9]:
def calculate_cascade_metrics(tweets):
    """Compute per-conversation cascade size, depth and width.

    Tweets are grouped by their 'conversation_id'. For every conversation:

    * size  -- number of distinct 'author_id' values in the conversation.
    * depth -- number of tweets carrying an 'in_reply_to_user_id' key
      (an approximation: it counts replies, not the longest reply chain).
    * width -- the largest number of distinct authors replying to any single
      user within the conversation, or 0 when the conversation has no replies.

    Args:
        tweets: iterable of mappings, each with 'conversation_id' and
            'author_id' keys and optionally 'in_reply_to_user_id'.

    Returns:
        A tuple ``(cascade_sizes, cascade_depths, cascade_widths)`` of dicts
        keyed by conversation id.
    """
    # Bucket the tweets of each conversation together.
    by_conversation = defaultdict(list)
    for item in tweets:
        by_conversation[item['conversation_id']].append(item)

    cascade_sizes, cascade_depths, cascade_widths = {}, {}, {}
    for conv_id, thread in by_conversation.items():
        # Size: how many different people took part.
        cascade_sizes[conv_id] = len({entry['author_id'] for entry in thread})

        # Depth: a reply is any tweet that has the key at all (presence check,
        # the value itself is not inspected).
        replies = [entry for entry in thread if 'in_reply_to_user_id' in entry]
        cascade_depths[conv_id] = len(replies)

        # Width: group repliers by the user they answered, keep the biggest group.
        repliers_by_target = {}
        for reply in replies:
            target = reply['in_reply_to_user_id']
            repliers_by_target.setdefault(target, set()).add(reply['author_id'])
        cascade_widths[conv_id] = max(
            (len(group) for group in repliers_by_target.values()),
            default=0,  # no replies means width is 0
        )

    return cascade_sizes, cascade_depths, cascade_widths

def calculate_virality(tweets):
    """Score each tweet's virality as retweets plus likes.

    Args:
        tweets: iterable of mappings, each with an 'id' key and a
            'public_metrics' mapping holding 'retweet_count' and 'like_count'.

    Returns:
        dict mapping tweet id to ``retweet_count + like_count``. If an id
        occurs more than once, the last occurrence wins.
    """
    def _engagement(entry):
        # Only retweets and likes count; other public metrics are ignored.
        counts = entry['public_metrics']
        return counts['retweet_count'] + counts['like_count']

    scores = {}
    for entry in tweets:
        scores[entry['id']] = _engagement(entry)
    return scores

In [10]:
def process_directory(directory):
    """Analyse every ``.json`` tweet file in ``directory`` and write a summary.

    Each file is expected to hold a JSON list of tweet objects and is treated
    as one hashtag (the file name without ``.json``). For every non-empty file
    the conversation with the largest cascade size, depth and width and the
    tweet with the highest virality are recorded. Virality values are then
    divided by the highest virality seen across all files, and the list of
    per-file records is written to ``analysis_results.json`` in the current
    working directory.

    Notes:
        * The "tweet_id" fields of the depth, size and breadth entries hold a
          conversation id, because those metrics are keyed by conversation.
          Only the virality entry holds an actual tweet id.
        * "biggest_conversations" and "biggest_cascade_size" report the same
          conversation and value under different key names.
        * If ``directory`` is the current working directory, a previously
          written ``analysis_results.json`` will be picked up as input.

    Args:
        directory: path of the folder containing the ``.json`` files.

    Returns:
        None. The result is written to ``analysis_results.json``.
    """
    suffix = ".json"
    results = []
    all_virality = {}  # virality of every tweet seen, across all files

    # First pass: calculate per-file metrics and collect all virality scores.
    # os.listdir returns entries in arbitrary order; sorting keeps the output
    # reproducible between runs and machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(suffix):
            continue

        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(os.path.join(directory, filename), "r", encoding="utf-8") as file:
            data = json.load(file)

        if not data:
            # A file without tweets has no "biggest" conversation or tweet,
            # and max() over the empty metric dicts would raise ValueError.
            continue

        cascade_sizes, cascade_depths, cascade_widths = calculate_cascade_metrics(data)
        virality = calculate_virality(data)
        all_virality.update(virality)

        # Identify the conversation/tweet with the highest value for each metric.
        biggest_size = max(cascade_sizes, key=cascade_sizes.get)
        biggest_depth = max(cascade_depths, key=cascade_depths.get)
        biggest_width = max(cascade_widths, key=cascade_widths.get)
        biggest_virality = max(virality, key=virality.get)

        results.append({
            # Slice off the suffix: str.rstrip(".json") strips a *character
            # set* and would turn "jason.json" into "ja".
            "hashtag": filename[:-len(suffix)],
            "biggest_conversations": [{"conversation_id": biggest_size, "size": cascade_sizes[biggest_size]}],
            "biggest_cascade_depth": [{"tweet_id": biggest_depth, "value": cascade_depths[biggest_depth]}],
            "biggest_cascade_size": [{"tweet_id": biggest_size, "value": cascade_sizes[biggest_size]}],
            "biggest_cascade_breadth": [{"tweet_id": biggest_width, "value": cascade_widths[biggest_width]}],
            "biggest_cascade_virality": [{"tweet_id": biggest_virality, "value": virality[biggest_virality]}],
        })

    # Second pass: normalize virality scores by the global maximum.
    # default=0 covers a directory with no usable files; a maximum of 0 means
    # every score is already 0, so skip the division instead of dividing by zero.
    max_virality = max(all_virality.values(), default=0)
    if max_virality:
        for result in results:
            for tweet in result["biggest_cascade_virality"]:
                tweet["value"] /= max_virality

    # Save the results to a new JSON file.
    with open("analysis_results.json", "w", encoding="utf-8") as file:
        json.dump(results, file, indent=2)

In [11]:
# Analyse the .json files in the "data" folder (relative to the working
# directory); the summary is written to analysis_results.json.
process_directory("data")