In [1]:
import pandas as pd
import os



In [3]:
file_path = '../data/unique_users_after_labeling.csv'
labeled_df = pd.read_csv(file_path, usecols=['userid', 'label'])
# Define the directory containing the CSV files and the output file path
source_directory = "../data/twitter_proc/files"
output_file = '../data/bot_tweets_by_user.csv'
processed_files_report = 'processed_files_report.txt'

print(labeled_df.head())

               userid  label
0            22240612      0
1             6135622      0
2  848416437030985728      0
3  984429894829592576      0
4  807095565028917248      0


In [4]:
# Initialize an empty list to store data before appending
data_to_append = []
rows_processed_count = 0
append_threshold = 10000

# Keep track of processed files
processed_files = set()

# Load previously processed files if the report file exists
if os.path.exists(processed_files_report):
    with open(processed_files_report, 'r') as f:
        for line in f:
            processed_files.add(line.strip())

print("Starting file processing...")

# Iterate through files in the directory
for filename in os.listdir(source_directory):
    if filename.endswith(".csv"):
        file_path = os.path.join(source_directory, filename)

        # Skip if the file has already been processed
        if filename in processed_files:
            print(f"Skipping already processed file: {filename}")
            continue

        print(f"Processing file: {filename}")

        try:
            # Load the CSV file, only needing 'userid' and 'text'
            current_tweets_df = pd.read_csv(file_path, usecols=['userid', 'text'])
            merged_df = pd.merge(current_tweets_df, labeled_df[['userid', 'label']], on='userid', how='left')

            # Filter for bots (where predicted_label is 1)
            bot_tweets = merged_df[merged_df['label'] == 1][['userid', 'text']]

            # Append to the list
            data_to_append.append(bot_tweets)
            rows_processed_count += len(bot_tweets)

            # Check if we should append to the output file
            if rows_processed_count >= append_threshold:
                print(f"Appending {rows_processed_count} rows to output file...")
                combined_df = pd.concat(data_to_append, ignore_index=True)

                # Check if the output file already exists to handle headers
                if not os.path.exists(output_file):
                    combined_df.to_csv(output_file, index=False, mode='w')
                else:
                    combined_df.to_csv(output_file, index=False, mode='a', header=False)

                # Clear the list and reset count after appending
                data_to_append = []
                rows_processed_count = 0

                # Record processed files
                with open(processed_files_report, 'a') as f:
                    f.write(filename + '\n')
                processed_files.add(filename)
                print(f"Appended data and recorded {filename} as processed.")

        except KeyError as ke:
            print(f"Error processing {filename}: Missing required column. {ke}")
        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Append any remaining data in the list
if data_to_append:
    print(f"Appending remaining {rows_processed_count} rows to output file...")
    combined_df = pd.concat(data_to_append, ignore_index=True)
    if not os.path.exists(output_file):
        combined_df.to_csv(output_file, index=False, mode='w')
    else:
        combined_df.to_csv(output_file, index=False, mode='a', header=False)

    # Record remaining processed files
    with open(processed_files_report, 'a') as f:
        with open(processed_files_report, 'r') as report_check_f:
            processed_lines = set(line.strip() for line in report_check_f)
        for filename in processed_files:
            if filename not in processed_lines:
                 f.write(filename + '\n')
    print("Appended remaining data.")


# Final step: Load the complete file and sort by userid
if os.path.exists(output_file):
    print("Loading the complete output file for sorting...")
    final_df = pd.read_csv(output_file)
    print("Sorting by userid...")
    final_df_sorted = final_df.sort_values(by='userid')

    # Overwrite the output file with the sorted data
    print("Saving the final sorted file...")
    final_df_sorted.to_csv(output_file, index=False)
    print("Processing complete. Final sorted data saved.")
else:
    print("No data was processed.")

Starting file processing...
Appending 10309 rows to output file...
Appended data and recorded 20230421_UkraineCombinedTweetsDeduped.csv as processed.
Appending 11697 rows to output file...
Appended data and recorded 20230610_UkraineCombinedTweetsDeduped.csv as processed.
Appending 10746 rows to output file...
Appended data and recorded 20230407_UkraineCombinedTweetsDeduped.csv as processed.
Appending 11033 rows to output file...
Appended data and recorded 1008_UkraineCombinedTweetsDeduped.csv as processed.
Appending 10497 rows to output file...
Appended data and recorded 1106_UkraineCombinedTweetsDeduped.csv as processed.
Appending 11478 rows to output file...
Appended data and recorded 20230305_UkraineCombinedTweetsDeduped.csv as processed.
Appending 10094 rows to output file...
Appended data and recorded 1023_UkraineCombinedTweetsDeduped.csv as processed.
Appending 10210 rows to output file...
Appended data and recorded 20230213_UkraineCombinedTweetsDeduped.csv as processed.
Appendin

In [7]:
bot_text = pd.read_csv(output_file)
print(bot_text.shape)
print(bot_text.head())

(380119, 2)
   userid                                               text
0    1968  @VeritasVinnie21 @MrChuckD They traded her fre...
1    1968  @gloria_sin It's not the weapons, its avoiding...
2   59563  Finally!!! #Messi𓃵 ❤️❤️❤️ #WorldCupFinal #Arge...
3  647943  ++ Ecco i partigiani #Russia anti #Putin. Ora ...
4  647943  Sacrificio estremo degli ucraini, attacchi sui...
