In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [2]:
# Load each dataset as an Arrow Table and immediately materialise it as a
# pandas DataFrame; the Arrow Tables stay bound to their own names.
tweets = pq.read_table('data/tweets_light.parquet')
df_tweets = tweets.to_pandas()

retweets = pq.read_table('data/retweets_light.parquet')
df_retweets = retweets.to_pandas()

users = pq.read_table('data/users_tw+rt_light.parquet')
df_users = users.to_pandas()

In [None]:
# Align both join keys as 64-bit integers before merging.
# The users' "id" must not be cast to float: float64 has a 53-bit mantissa, so
# IDs above 2**53 are rounded and distinct accounts can collapse onto the same
# key (or fail to match the integer author_id). The width is spelled out because
# plain `int` follows the platform default, which is 32-bit on Windows with
# NumPy < 2.
# NOTE(review): assumes "author_id" and "id" are stored losslessly in the parquet
# files (integers or digit strings) — confirm.
df_tweets['author_id'] = df_tweets['author_id'].astype('int64')

# NaN cannot live in an int64 column and a user without an id can never match an
# integer author_id in the inner join below, so drop those rows before casting.
df_users = df_users.dropna(subset=['id']).astype({'id': 'int64'})

# Inner join: keep only tweets whose author is present in the users table.
df_users_tweets = pd.merge(df_tweets, df_users, left_on="author_id", right_on="id")

In [None]:
# "id_y" is the users-side join key left over from the merge, so it is dropped;
# the remaining suffixed/generic columns get names that say what they describe.
df_users_tweets = (
    df_users_tweets
    .drop(columns="id_y")
    .rename(columns={
        "created_at_x": "tweet_created_at",
        "id_x": "original_post_id",
        "created_at_y": "account_created_at",
        "name": "author_name",
        "username": "author_username",
    })
)

In [None]:
# Join every tweet (with its author columns) to its retweets: a tweet's
# original_post_id is matched against the retweets' post_id (inner join).
# The right-hand key "post_id" is then redundant and is removed together with
# the url / location / verified columns.
df = (
    df_users_tweets
    .merge(df_retweets, left_on="original_post_id", right_on="post_id")
    .drop(columns=["post_id", "url", "location", "verified"])
)

In [None]:
# Store both ID columns as 64-bit integers. The width is explicit because plain
# `int` maps to the platform's default integer, which is only 32-bit on Windows
# with NumPy < 2 and cannot hold IDs of the magnitude seen here (~1e18).
df = df.astype({'original_post_id': 'int64', 'retweeter_id': 'int64'})

In [None]:
# Discard the tweet-content, engagement-metric and account-detail columns listed
# here.
# NOTE(review): "name" was already renamed to "author_name" earlier in the
# notebook, so this drop relies on a separate "name" column being present
# (presumably from the retweets table) — confirm, otherwise it raises KeyError.
df = df.drop(
    labels=[
        'lang',
        'text',
        'possibly_sensitive',
        'referenced_id',
        'reference_type',
        'public_metrics.like_count',
        'public_metrics.quote_count',
        'public_metrics.reply_count',
        'public_metrics.retweet_count',
        'account_created_at',
        'description',
        'name',
    ],
    axis="columns",
)

In [None]:
# Remove every row that has a missing value in any of the remaining columns.
df = df.dropna()

In [None]:
# Summary statistics (count, mean, min, quartiles, max, std) for df.
# The ID columns are identifiers, so their mean/std carry no meaning; count, min
# and max are the useful sanity checks here.
df.describe()

Unnamed: 0,tweet_created_at,original_post_id,author_id,retweeter_id
count,29235029,29235030.0,29235030.0,29235030.0
mean,2020-06-25 17:59:32.733049,1.276245e+18,1.18561e+17,4.458498e+17
min,2017-12-31 23:11:09,9.476212e+17,5893702.0,12.0
25%,2019-03-09 14:13:47,1.1044e+18,14060260.0,575423900.0
50%,2020-06-02 07:19:34,1.267752e+18,150725700.0,2895642000.0
75%,2021-09-21 22:00:00,1.440548e+18,1024976000.0,9.851892e+17
max,2022-12-31 22:19:02,1.609328e+18,1.555225e+18,1.666974e+18
std,,1.928046e+17,3.106293e+17,5.506337e+17


In [None]:
# Report the (rows, columns) size of df.
print("df has shape:",df.shape)

df has shape: (29235029, 6)


In [None]:
# Preview the first rows of df (one row per tweet/retweeter pair).
df.head()

Unnamed: 0,tweet_created_at,original_post_id,author_id,author_name,author_username,retweeter_id
0,2018-12-31 22:46:18,1079886497279561728,622354597,Salvo Di Grazia,MedBunker,951848540
1,2018-12-31 22:46:18,1079886497279561728,622354597,Salvo Di Grazia,MedBunker,135554444
2,2018-12-31 22:46:18,1079886497279561728,622354597,Salvo Di Grazia,MedBunker,433418060
3,2018-12-31 22:46:18,1079886497279561728,622354597,Salvo Di Grazia,MedBunker,1668533642
4,2018-12-31 22:46:18,1079886497279561728,622354597,Salvo Di Grazia,MedBunker,1623208790


In [None]:
checkpoint_path = "data/df_checkpoint.parquet"

# Write df to disk the first time through. When a checkpoint file is already
# present it takes precedence: the in-memory df is replaced by the stored copy
# and nothing is written, so delete the file to force a refresh.
if not os.path.exists(checkpoint_path):
    print("Saving df to checkpoint...")
    df.to_parquet(checkpoint_path)
else:
    print("Loading df from checkpoint...")
    df = pd.read_parquet(checkpoint_path)

Saving df to checkpoint...
