In [1]:
import pandas as pd
import os
import glob

In [None]:
def process_user_data_files():
    base_path = "/data2/julina/scripts/tweets/2020/03/"
    all_users_dir = os.path.join(base_path, "user_csv/demo/")
    su_users_dir = os.path.join(base_path, "pred/dm/")
    output_dir = os.path.join(base_path, "SU_and_NON_SU_analysis/")

    print(f"Creating output directory (if needed): {output_dir}")
    os.makedirs(output_dir, exist_ok=True)

    all_users_filepaths = glob.glob(os.path.join(all_users_dir, "*.csv"))
    if not all_users_filepaths:
        print(f"Error: No CSV files found in {all_users_dir}. Please check the path.")
        return
    print(f"Found {len(all_users_filepaths)} files to process.")

    all_classified_dfs = []
    common_columns = ['id', 'gender', 'age', 'org', 'text', 'user_id', 'name', 'screen_name', 'description', 'lang', 'DrugAbuse']
    for all_users_file in all_users_filepaths:
        try:
            base_filename = os.path.basename(all_users_file)
            filename_without_ext = os.path.splitext(base_filename)[0]

            su_users_file = os.path.join(su_users_dir, f"{filename_without_ext}pred_dm.csv")

            print("-" * 50)
            print(f"Processing: {base_filename}")

            if not os.path.exists(su_users_file):
                print(f"  -> SKIPPING: Corresponding SU file not found at {su_users_file}")
                continue

            # --- Main Logic for each file pair ---
            all_users_df = pd.read_csv(all_users_file)
            su_users_df = pd.read_csv(su_users_file, index_col=0)
            su_user_ids = su_users_df['user_id'].unique()
            non_su_users_df = all_users_df[~all_users_df['user_id'].isin(su_user_ids)].copy()
            non_su_users_df['DrugAbuse'] = 0
            su_users_cleaned_df = su_users_df[common_columns].copy()

            daily_final_df = pd.concat([non_su_users_df, su_users_cleaned_df], ignore_index=True)
            all_classified_dfs.append(daily_final_df)
            print(f"  -> Processed and added to the collection.")

        except Exception as e:
            print(f"  -> ERROR processing {base_filename}: {e}")
            continue

    if not all_classified_dfs:
        print("No data was processed. Exiting.")
        return

    print("-" * 50)
    print("All files processed. Now combining into a single master file...")
    
    master_df = pd.concat(all_classified_dfs, ignore_index=True)
    output_file = os.path.join(output_dir, "all_users_classified_combined.csv")
    master_df.to_csv(output_file, index=False)
    print(f"\nSUCCESS: Saved master file to {output_file}")
    print("\nFinal data distribution in the master file:")
    print(master_df['DrugAbuse'].value_counts())
    print(f"\nTotal rows in master file: {len(master_df)}")
    print("\nBatch processing complete.")


In [3]:

process_user_data_files()

Creating output directory (if needed): /data2/julina/scripts/tweets/2020/03/SU_and_NON_SU_analysis/
Found 31 files to process.
--------------------------------------------------
Processing: 2020_03_31.csv
  -> SKIPPING: Corresponding SU file not found at /data2/julina/scripts/tweets/2020/03/pred/dm/2020_03_31pred_dm.csv
--------------------------------------------------
Processing: 2020_03_20.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_26.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_24.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_04.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_25.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_08.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_30.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_21.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_22.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_15.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_16.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_28.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_19.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_13.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_01.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_09.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_14.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_06.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_11.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_02.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_23.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_07.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_03.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_18.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_29.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_27.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_10.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_12.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_05.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
Processing: 2020_03_17.csv


  all_users_df = pd.read_csv(all_users_file)


  -> Processed and added to the collection.
--------------------------------------------------
All files processed. Now combining into a single master file...

SUCCESS: Saved master file to /data2/julina/scripts/tweets/2020/03/SU_and_NON_SU_analysis/all_users_classified_combined.csv

Final data distribution in the master file:
DrugAbuse
0    31901469
1      340683
Name: count, dtype: int64

Total rows in master file: 32242152

Batch processing complete.
