## Program: preprocess tweet data

In [1]:
import glob
import pandas as pd
import polars as pl
from datetime import datetime

In [2]:
# Read and combine CSV files using Pandas
def read_and_combine_files(folder_path):
    """Load every CSV in *folder_path* into one DataFrame with day-level dates.

    Only the 'User', 'Tweet' and 'Date' columns are read from each file.
    The 'Date' text is cut down to its leading date part (everything
    before the first 'T' or whitespace), so any time-of-day and timezone
    suffix is ignored, and is then parsed as ``YYYY-MM-DD``.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        A pandas DataFrame with the rows of all files, re-indexed from 0.
        'Date' holds datetimes; values that do not parse as ``YYYY-MM-DD``
        become NaT.

    Raises:
        ValueError: If no ``.csv`` file is found in *folder_path*.
    """
    # Sort so the row order of the result does not depend on the order in
    # which the filesystem happens to list the directory.
    csv_files = sorted(glob.glob(f'{folder_path}/*.csv'))
    if not csv_files:
        # pd.concat would raise an opaque "No objects to concatenate";
        # keep the exception type but say which folder was empty.
        raise ValueError(f'No .csv files found in {folder_path!r}')

    dataframes = [
        pd.read_csv(file, usecols=['User', 'Tweet', 'Date'])
        for file in csv_files
    ]
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Cast to string first so the .str accessor also works when the column
    # was read as a non-text dtype (e.g. an entirely empty Date column).
    date_text = combined_df['Date'].astype('string')

    # Keep only the leading date token. Timestamps may separate date and
    # time with either 'T' or a space; both must be cut at the separator,
    # otherwise the strict format below turns the value into NaT.
    date_part = date_text.str.extract(r'^\s*([^T\s]+)', expand=False)

    combined_df['Date'] = pd.to_datetime(date_part, format='%Y-%m-%d', errors='coerce')

    return combined_df

In [None]:
# Combine every scraped CSV under "scraped_data" into a single table.
combined_raw_table = read_and_combine_files(folder_path="scraped_data")

# Load the contents of "processed_usernames.csv" into its own DataFrame.
all_processed_usernames = pd.read_csv(filepath_or_buffer="processed_usernames.csv")

In [4]:
# Write the combined table to disk as UTF-8 CSV; index=False keeps the
# DataFrame's row index out of the file. (The file name is a plain string:
# it has no placeholders, so no f-string prefix is needed.)
combined_raw_table.to_csv("combined_table_users_tweets.csv", index=False, encoding="utf-8")