# Imports #

In [5]:
import os
import pandas as pd
from tqdm import tqdm
from collections import Counter

# Get unique ID's #

In [6]:
input_dir = "../data/twitter_proc/files"
user_counter = Counter()

for file_name in tqdm(os.listdir(input_dir), desc="Scanning files for user IDs"):
    if file_name.endswith(".csv"):
        file_path = os.path.join(input_dir, file_name)
        try:
            df = pd.read_csv(file_path, usecols=["userid"])
            user_counter.update(df["userid"].dropna())
        except Exception as e:
            print(f"Skipping {file_name}: {e}")
            continue

# Convert counter to DataFrame
df_unique = pd.DataFrame(list(user_counter.items()), columns=["userid", "count"])

print("Number of unique users:", df_unique.shape[0])
print(df_unique.head())

Scanning files for user IDs: 100%|██████████| 290/290 [02:29<00:00,  1.93it/s]


Number of unique users: 2389688
                userid  count
0            173212647     54
1            335041409     27
2  1512400441103032323   1410
3  1356632630662430722    261
4             20297125     16


# Delete labeled users #

In [7]:

# Load the intersection user IDs
intersection_df = pd.read_csv('../data/labeled_intersection.csv', usecols=['userid'])
intersection_userids = set(intersection_df['userid'])

# Filter df_unique to exclude intersecting userids
df_unique_filtered = df_unique[~df_unique['userid'].isin(intersection_userids)].copy()
del df_unique


Filtered unique users saved. Remaining rows: 2285391
                userid  count
0            173212647     54
1            335041409     27
2  1512400441103032323   1410
3  1356632630662430722    261
4             20297125     16


# totaltweets #

In [9]:

def scan_and_aggregate(input_dir, target_col, agg_func, userid_col="userid"):
    """
    Scans all CSVs in a directory and aggregates target_col per userid using agg_func.

    Returns:
        pd.DataFrame with columns: [userid, target_col]
    """
    aggregated = {}

    for file_name in tqdm(os.listdir(input_dir), desc=f"Scanning '{target_col}'"):
        if not file_name.endswith(".csv"):
            continue
        file_path = os.path.join(input_dir, file_name)

        try:
            df = pd.read_csv(file_path, usecols=[userid_col, target_col])
            df = df.dropna(subset=[userid_col, target_col])

            # Group by userid and aggregate
            grouped = df.groupby(userid_col)[target_col].agg(agg_func)

            for uid, value in grouped.items():
                if uid not in aggregated:
                    aggregated[uid] = value
                else:
                    aggregated[uid] = agg_func([aggregated[uid], value])

        except Exception as e:
            print(f"⚠️ Skipping {file_name}: {e}")
            continue

    return pd.DataFrame(list(aggregated.items()), columns=[userid_col, target_col])


df_totaltweets = scan_and_aggregate(input_dir, target_col="totaltweets", agg_func=max)
df_unique_filtered = df_unique_filtered.merge(df_totaltweets, on="userid", how="left")
del df_totaltweets



  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(u

In [12]:
print("Number of unique users:", df_unique_filtered.shape[0])
print(df_unique_filtered.head())


Number of unique users: 2285391
                userid  count  totaltweets
0            173212647     54         6808
1            335041409     27        79454
2  1512400441103032323   1410         2610
3  1356632630662430722    261          955
4             20297125     16        61432


# Get following and followers and acctdesc #

In [10]:
import os
from tqdm import tqdm

def extract_latest_follow_acctdesc_stats(input_dir, known_userids,
                                 userid_col="userid", ts_col="tweetcreatedts",
                                 followers_col="followers", following_col="following", acctdesc_col="acctdesc"):

    latest_info = {}  # userid → (timestamp, followers, following)
    known_userids_set = set(known_userids)

    for file in tqdm(os.listdir(input_dir), desc="Scanning files"):
        if not file.endswith(".csv"):
            continue
        path = os.path.join(input_dir, file)

        try:
            df = pd.read_csv(path, usecols=[userid_col, ts_col, followers_col, following_col, acctdesc_col])
        except Exception as e:
            print(f"⚠️ Skipping {file}: {e}")
            continue

        df = df[df[userid_col].isin(known_userids_set)]
        df = df.dropna(subset=[userid_col, ts_col, followers_col, following_col])

        # Parse timestamps
        df[ts_col] = pd.to_datetime(df[ts_col], errors='coerce')
        df = df.dropna(subset=[ts_col])

        for row in df.itertuples(index=False):
            uid = getattr(row, userid_col)
            ts = getattr(row, ts_col)
            followers = getattr(row, followers_col)
            following = getattr(row, following_col)
            acctdesc = getattr(row, acctdesc_col)

            if uid not in latest_info or ts > latest_info[uid][0]:
                latest_info[uid] = (ts, followers, following, acctdesc)

    # Create a dataframe from the final dict
    df_result = pd.DataFrame([
        (uid, data[1], data[2], data[3]) for uid, data in latest_info.items()
    ], columns=[userid_col, "followers", "following", "acctdesc"])

    return df_result

df_follow_stats = extract_latest_follow_acctdesc_stats(
    input_dir=input_dir,
    known_userids=df_unique_filtered["userid"]
)

# Merge the result into your main dataframe
df_unique_filtered = df_unique_filtered.merge(df_follow_stats, on="userid", how="left")


Scanning files: 100%|██████████| 290/290 [10:42<00:00,  2.22s/it]


# AVG retweet count #

In [20]:
df_retweet_sum = scan_and_aggregate(input_dir=input_dir,target_col="retweetcount", agg_func=sum)
df_unique_filtered = df_unique_filtered.merge(df_retweet_sum, on="userid", how="left")
df_unique_filtered["avg_retweetcount"] = (
    df_unique_filtered["retweetcount"] / df_unique_filtered["count"]
)
df_unique_filtered.drop(columns=["retweetcount"], inplace=True)

  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(userid_col)[target_col].agg(agg_func)
  grouped = df.groupby(u

KeyError: 'retweetcount_sum'

# Save to CSV #

In [22]:
df_unique_filtered["avg_retweetcount"] = (
    df_unique_filtered["retweetcount"] / df_unique_filtered["count"]
)
df_unique_filtered.drop(columns=["retweetcount"], inplace=True)

In [11]:

print("Filtered unique users saved. Remaining rows:", df_unique_filtered.shape[0])
print(df_unique_filtered.head())

Filtered unique users saved. Remaining rows: 2285391
                userid  count  totaltweets  avg_retweetcount  followers  \
0            173212647     54         6808          0.148148        247   
1            335041409     27        79454          0.111111       6552   
2  1512400441103032323   1410         2610          0.089362        165   
3  1356632630662430722    261          955          0.095785         40   
4             20297125     16        61432          0.062500      18173   

   following                                           acctdesc  
0        278  Yeshua Hamashiach is THE answer | Romans 10:9-...  
1       1493  Somos el periódico  #ExclusivasPuebla| Investi...  
2        881  Shelter for abandoned dogs and cats. 1400 dogs...  
3          5  Shelter for abandoned dogs and cats. 1400 dogs...  
4       2178  Las mejores noticias de los dos Laredos y el m...  


In [14]:
# Save the filtered DataFrame
df_unique_filtered.to_csv('../data/unique_users_no_intersection_unlabeled.csv', index=False)