In [None]:
# Pull every row of the tweets table; 'created_at' is parsed into datetimes
query = "SELECT * FROM tweets"
tweets_df = pd.read_sql_query(sql=query, con=conn, parse_dates=['created_at'])
print(tweets_df.dtypes)


# Pull every row of the users table; 'author_created' is parsed into datetimes
query = "SELECT * FROM users"
users_df = pd.read_sql_query(sql=query, con=conn, parse_dates=['author_created'])
print(users_df.dtypes)

In [None]:
# Preview the first rows of the users table loaded above
users_df.head()

In [None]:
def aggregate_data(df):
    """
    Flatten tweet context annotations into one row per (tweet, annotation) pair.

    Each value in the ``context`` column is parsed as a JSON list of
    annotations. Every annotation contributes one output row carrying the
    tweet's identifiers together with the annotation's domain and entity
    id/name. No grouping or counting is performed here.

    Args:
        df (pd.DataFrame): Tweet data with at least the columns ``tweet_id``,
            ``author_id`` and ``context`` (JSON text).

    Returns:
        pd.DataFrame: One row per annotation with the columns ``tweet_id``,
        ``author_id``, ``domain_id``, ``domain_name``, ``entity_id`` and
        ``entity_name``. The columns are always present, even when no
        annotation is found, so downstream group-bys keep working.

    Raises:
        KeyError: If a required column is missing, or an annotation lacks the
            ``domain``/``entity`` keys.
        json.JSONDecodeError: If a ``context`` value is text but not valid JSON.
    """
    output_columns = ['tweet_id', 'author_id', 'domain_id', 'domain_name', 'entity_id', 'entity_name']

    records = []
    # Iterate the three needed columns directly; much cheaper than iterrows()
    for tweet_id, author_id, raw_context in zip(df['tweet_id'], df['author_id'], df['context']):
        # Tweets without annotations can surface as NULL/NaN instead of JSON text
        if not isinstance(raw_context, (str, bytes, bytearray)) or not raw_context.strip():
            continue

        # A stored JSON `null` decodes to None; treat it as "no annotations"
        context_list = json.loads(raw_context) or []
        for item in context_list:
            domain = item['domain']
            entity = item['entity']
            records.append({
                'tweet_id': tweet_id,
                'author_id': author_id,
                'domain_id': domain['id'],
                'domain_name': domain['name'],
                'entity_id': entity['id'],
                'entity_name': entity['name'],
            })

    # Explicit columns guarantee a stable schema for an empty result
    return pd.DataFrame(records, columns=output_columns)

# Build the per-annotation table (one row per tweet/context item) from the tweets
subset_context_item_df = aggregate_data(df=tweets_df)

In [None]:
def calculate_top_counts(data_frame, group_by_cols, aggregate_col, top_n):
    """
    Rank groups by how many distinct values of one column they contain.

    Parameters:
        data_frame (pandas.DataFrame): Source rows.
        group_by_cols (list): Column names that define the groups.
        aggregate_col (str): Column whose distinct values are counted per group.
        top_n (int): How many of the highest-ranked groups to keep.

    Returns:
        pandas.DataFrame: The grouping columns plus ``aggregate_col`` (now
        holding the distinct count), ordered from largest to smallest count
        and truncated to ``top_n`` rows.
    """
    # Distinct count of `aggregate_col` within each group
    distinct_per_group = data_frame.groupby(group_by_cols)[aggregate_col].nunique()

    # Bring the group keys back as columns, then order by count (largest first)
    ranked = distinct_per_group.reset_index().sort_values(aggregate_col, ascending=False)

    return ranked.head(top_n)

# Rank domains, entities and authors by their number of distinct tweets
top_20_domain_counts = calculate_top_counts(
    data_frame=subset_context_item_df, group_by_cols=["domain_name"], aggregate_col="tweet_id", top_n=20
)
top_50_entity_counts = calculate_top_counts(
    data_frame=subset_context_item_df, group_by_cols=["entity_name"], aggregate_col="tweet_id", top_n=50
)
top_25_user_counts = calculate_top_counts(
    data_frame=subset_context_item_df, group_by_cols=["author_id"], aggregate_col="tweet_id", top_n=25
)

In [None]:
# Apply the seaborn theme BEFORE creating the figure: style settings only take
# effect on axes created afterwards, so setting it later leaves this figure
# unstyled on a fresh kernel.
sns.set_theme(style="whitegrid")
sns.set_color_codes("pastel")

fig, axs = plt.subplots(3, figsize = (25, 35))
fig.suptitle('Volume of Tweets by Domain & Entity (Context Annotations)')

# seaborn sorts numeric category levels by value, which would discard the
# descending-count ranking; string ids keep the bars in ranked order.
top_25_user_counts_for_plot = top_25_user_counts.assign(
    author_id=top_25_user_counts['author_id'].astype(str)
)

# In every frame the 'tweet_id' column holds the number of distinct tweets per group
sns.barplot(ax=axs[0], data=top_20_domain_counts, x='domain_name', y='tweet_id', color="b")
sns.barplot(ax=axs[1], data=top_50_entity_counts, x='tweet_id', y='entity_name', color="b", orient='h')
sns.barplot(ax=axs[2], data=top_25_user_counts_for_plot, x='tweet_id', y='author_id', color="b", orient='h')

axs[0].set(xlabel='Domain', ylabel='Unique tweets', title='Top 20 Domains')
axs[1].set(xlabel='Unique tweets', ylabel='Entity', title='Top 50 Entities')
axs[2].set(xlabel='Unique tweets', ylabel='Author ID', title='Top 25 Authors')

# Rotate the x-axis labels. tick_params only changes the rotation; it does not
# freeze the tick label text the way set_xticklabels(get_xticklabels()) does
# on the numeric axes.
for ax in axs:
    ax.tick_params(axis='x', labelrotation=60)

plt.tight_layout()  # Adjust spacing to avoid label overlap

plt.show()

In [None]:
# Define the bucket edges for follower and following counts
buckets = [0, 100, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000, 5000000, 10000000]

# pd.cut's default bins are right-closed and exclude the lowest edge, so users
# with a count of exactly 0 (and anyone above the last edge) would be silently
# dropped. Close the first bin on the left and add an open-ended top bin so
# every non-negative count lands in a bucket.
bucket_edges = buckets + [float('inf')]
bucket_labels = (
    [f'[{buckets[0]:,}, {buckets[1]:,}]']
    + [f'({lo:,}, {hi:,}]' for lo, hi in zip(buckets[1:-1], buckets[2:])]
    + [f'> {buckets[-1]:,}']
)

# Count the number of users in each bucket (sort_index restores bucket order)
follower_counts = (
    pd.cut(users_df['followers_count'], bucket_edges, labels=bucket_labels, include_lowest=True)
    .value_counts()
    .sort_index()
)
following_counts = (
    pd.cut(users_df['following_count'], bucket_edges, labels=bucket_labels, include_lowest=True)
    .value_counts()
    .sort_index()
)

# Create subplots
fig, axs = plt.subplots(2, 1, figsize=(10, 18))

# One panel per metric: (axes, bucketed counts, title, bar colour)
distribution_panels = [
    (axs[0], follower_counts, 'Distribution of Follower Counts', 'blue'),
    (axs[1], following_counts, 'Distribution of Following Counts', 'red'),
]
for ax, bucket_counts, panel_title, bar_color in distribution_panels:
    positions = range(len(bucket_counts))
    ax.bar(positions, bucket_counts.values, width=0.4, align='center', alpha=0.5, color=bar_color)
    ax.set_xlabel('Bucket')
    ax.set_ylabel('User Count')
    ax.set_title(panel_title)
    # Fix the tick positions first so each label lines up with its bar
    ax.set_xticks(positions)
    ax.set_xticklabels(bucket_counts.index, rotation=45)

# Adjust spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Cast the 'verified' flag to integers so it can be compared against 0/1
users_df['verified'] = users_df['verified'].astype(int)

# Split users by verification status
verified_users = users_df[users_df['verified'] == 1]
non_verified_users = users_df[users_df['verified'] == 0]

# Top 25 verified users: by followers, then by tweet volume
top_verified_followed_users = verified_users.sort_values(by='followers_count', ascending=False).head(25)
top_verified_tweet_users = verified_users.sort_values(by='tweet_count', ascending=False).head(25)

# Top 25 non-verified users: by followers, then by tweet volume
top_non_verified_followed_users = non_verified_users.sort_values(by='followers_count', ascending=False).head(25)
top_non_verified_tweet_users = non_verified_users.sort_values(by='tweet_count', ascending=False).head(25)

# 2x2 grid: rows = verified / non-verified, columns = followers / tweets
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))

# Each panel: (axes, ranked users, metric column, x-axis label, title, bar colour)
ranking_panels = [
    (axs[0, 0], top_verified_followed_users, 'followers_count', 'Followers Count',
     'Top 25 Most Followed Verified Users', 'blue'),
    (axs[0, 1], top_verified_tweet_users, 'tweet_count', 'Tweet Count',
     'Top 25 Verified Users with Highest Tweet Counts', 'green'),
    (axs[1, 0], top_non_verified_followed_users, 'followers_count', 'Followers Count',
     'Top 25 Most Followed Non-Verified Users', 'blue'),
    (axs[1, 1], top_non_verified_tweet_users, 'tweet_count', 'Tweet Count',
     'Top 25 Non-Verified Users with Highest Tweet Counts', 'green'),
]

for panel_ax, ranked_users, metric_col, metric_label, panel_title, bar_colour in ranking_panels:
    panel_ax.barh(ranked_users['username'], ranked_users[metric_col], color=bar_colour)
    panel_ax.set_xlabel(metric_label)
    panel_ax.set_ylabel('Username')
    panel_ax.set_title(panel_title)
    # Put the highest-ranked user at the top of the panel
    panel_ax.invert_yaxis()

# Adjust the spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()