In [None]:
import os

import pandas as pd
import torch
from tqdm import tqdm

In [None]:
# Root directory of the Twibot-22 dataset.
# Defaults to "/Twibot-22/" but can be overridden with the TWIBOT22_DIR
# environment variable, so the notebook runs on machines where the data lives
# elsewhere. os.path.join(..., "") guarantees a trailing separator, because the
# cells below build file names with plain concatenation (path + "label.csv").
path = os.path.join(os.environ.get("TWIBOT22_DIR", "/Twibot-22/"), "")

In [None]:
# Read the three raw dataset tables from the dataset directory.
# File names are appended to `path` by plain string concatenation, so `path`
# has to end with a path separator.
label_file = path + "label.csv"
user_file = path + "user.json"
split_file = path + "split.csv"

full_label = pd.read_csv(label_file)   # later cells read its 'id' and 'label' columns
user_data = pd.read_json(user_file)    # later cells read its 'id' column
split_df = pd.read_csv(split_file)     # later cells read its 'id' and 'split' columns

Downsample the majority class so that the dataset contains an equal number of human and bot accounts.

In [None]:
# Downsample so that both labels are equally represented.
# A fixed seed makes every draw below (and therefore the CSV written at the
# end) reproducible across "Restart Kernel -> Run All"; change the value to
# obtain a different balanced subset.
RANDOM_SEED = 42

human_samples = full_label[full_label['label'] == 'human']
bot_samples = full_label[full_label['label'] == 'bot']

# Size of the smaller class; each class is cut down to this many rows.
min_samples = min(len(human_samples), len(bot_samples))

# Draw the same number of rows from each class (without replacement, which is
# always possible because min_samples never exceeds either class size).
balanced_df = pd.concat([
    human_samples.sample(n=min_samples, random_state=RANDOM_SEED),
    bot_samples.sample(n=min_samples, random_state=RANDOM_SEED),
])

# Shuffle so the two classes are interleaved, then renumber the rows from 0.
balanced_df = balanced_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

print('Filtered Length: ', len(balanced_df))

# Persist the balanced label table (index column omitted).
balanced_df.to_csv("filtered_label.csv", index=False)

In [None]:
# Ids of every account kept by the class balancing step; reused by later cells.
user_ids = balanced_df['id'].tolist()

# Keep only the user records whose id is among the balanced accounts.
is_selected_user = user_data['id'].isin(user_ids)
filtered_user_data = user_data.loc[is_selected_user]

# Write the kept records as JSON Lines (one record object per line).
filtered_user_data.to_json("filtered_user.json", orient="records", lines=True)

In [None]:
# Seed for the per-split sampling so that balanced_split.csv is reproducible.
SPLIT_SEED = 42

# Keep only the split assignments of accounts that survived the downsampling.
filtered_split_df = split_df[split_df['id'].isin(user_ids)]

# Number of retained accounts in each original split.
split_counts = filtered_split_df['split'].value_counts()

# Target proportions: 70% train, 15% test, 15% val.
standard_split_ratios = {'train': 0.7, 'test': 0.15, 'val': 0.15}
balanced_split_dfs = []

for split_name, split_ratio in standard_split_ratios.items():
    # Target size is a fraction of all retained accounts; int() truncates, so
    # the three targets can add up to slightly fewer rows than the total.
    split_count = int(split_counts.sum() * split_ratio)
    split_pool = filtered_split_df[filtered_split_df['split'] == split_name]

    # Sampling from an empty pool fails with an opaque NumPy error; report the
    # real problem instead.
    if split_pool.empty and split_count > 0:
        raise ValueError(
            f"No rows with split == {split_name!r} remain after filtering; "
            f"cannot draw {split_count} samples."
        )

    # Draw with replacement only when the split is too small to reach its
    # target. Splits that have enough rows are sampled without replacement, so
    # they contain no duplicate ids and lose no more accounts than necessary.
    # NOTE: an upsampled split still contains repeated ids by construction.
    needs_upsampling = split_count > len(split_pool)
    split_df_subset = split_pool.sample(
        n=split_count,
        replace=needs_upsampling,
        random_state=SPLIT_SEED,
    )
    balanced_split_dfs.append(split_df_subset)

# Stack the per-split samples into a single table (train, test, val order).
balanced_split_df = pd.concat(balanced_split_dfs)

# Persist the rebalanced split assignments (index column omitted).
balanced_split_df.to_csv("balanced_split.csv", index=False)

In [None]:
# Index labels of the rows in the full label table that belong to a kept
# account, in the table's original row order.
kept_rows = full_label['id'].isin(user_ids)
selected_indices = full_label.index[kept_rows].tolist()

# Store them in a one-column table so they can be reloaded later.
selected_indices_df = pd.DataFrame({'original_index': selected_indices})
selected_indices_df.to_csv("selected_indices.csv", index=False)