In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import os

def sample_and_combine_csvs(input_folder, output_file, sample_size=40000):
    """
    Randomly sample rows from multiple CSV files and combine them into one CSV.

    Every ``*.csv`` file directly inside ``input_folder`` is read with pandas.
    Files with at least ``sample_size`` rows are down-sampled to exactly
    ``sample_size`` rows (fixed ``random_state=42``); smaller files are used
    in full. A ``source_file`` column holding the originating file name is
    added to each piece, and the pieces are concatenated and written to
    ``output_file``.

    Args:
        input_folder: Directory containing the CSV files (str or Path).
        output_file: Destination path of the combined CSV (str or Path).
            Missing parent directories are created.
        sample_size: Maximum number of rows taken from each input file.

    Raises:
        ValueError: If ``input_folder`` does not exist, contains no CSV files
            to process, or none of the files could be processed.
    """
    # Convert to absolute path and resolve any symlinks
    input_path = Path(input_folder).resolve()
    output_path = Path(output_file)
    resolved_output = output_path.resolve()

    print(f"Looking for CSV files in: {input_path}")
    print(f"Current working directory: {os.getcwd()}")

    # Check if directory exists
    if not input_path.exists():
        raise ValueError(f"Directory does not exist: {input_path}")

    # Sort the matches so the row order of the combined file is reproducible
    # (glob order depends on the filesystem), and leave out the output file
    # itself: when it lives in the input folder, a previous run's result would
    # otherwise be sampled again as if it were a source dataset.
    csv_files = sorted(
        candidate
        for candidate in input_path.glob('*.csv')
        if candidate.resolve() != resolved_output
    )

    # Print all files in directory for debugging
    print("\nAll files in directory:")
    for file in input_path.iterdir():
        print(f"- {file.name}")

    if not csv_files:
        raise ValueError(f"No CSV files found in {input_path}")

    print(f"\nFound {len(csv_files)} CSV files")
    print("\nCSV files to be processed:")
    for file in csv_files:
        print(f"- {file.name}")

    # Sampled dataframes, one per successfully processed file
    combined_data = []

    # Process each CSV file
    for csv_file in tqdm(csv_files, desc="Processing files"):
        try:
            df = pd.read_csv(csv_file)

            print(f"\nProcessing {csv_file.name}")
            print(f"Original size: {len(df)} rows")

            # Randomly sample rows
            if len(df) >= sample_size:
                sampled_df = df.sample(n=sample_size, random_state=42)
            else:
                print(f"Warning: {csv_file.name} has fewer than {sample_size} rows. Using all available rows.")
                sampled_df = df

            # Add source file information; assign() builds a new frame
            # instead of writing into the sampled/original one in place.
            sampled_df = sampled_df.assign(source_file=csv_file.name)

            print(f"Sampled size: {len(sampled_df)} rows")

            combined_data.append(sampled_df)

        except Exception as e:
            # Best effort: a file that cannot be read or sampled is reported
            # and skipped so the remaining files are still combined.
            print(f"Error processing {csv_file.name}: {str(e)}")

    if not combined_data:
        raise ValueError("No data was successfully processed")

    # Combine all sampled dataframes
    final_df = pd.concat(combined_data, ignore_index=True)

    # Create output directory if it doesn't exist
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Save the combined dataset
    final_df.to_csv(output_path, index=False)

    print("\nProcessing complete!")
    print(f"Total rows in combined dataset: {len(final_df)}")
    print(f"Output saved to: {output_path}")

    # Print sample distribution
    print("\nSamples per source file:")
    print(final_df['source_file'].value_counts())

if __name__ == "__main__":
    # Script entry point: resolve the input/output paths relative to the
    # directory the script is launched from, then run the sampler.
    current_dir = Path.cwd()
    print(f"Script running from: {current_dir}")

    # Base directory for the paths below. This must be defined before it is
    # used; the previous definition was commented out, which made the script
    # fail with a NameError.
    project_dir = current_dir

    # Set up paths.
    # NOTE: "your-corresponding-directory" is a placeholder; replace it with
    # the folder (relative to project_dir) that holds the CSV files.
    input_folder = project_dir / "your-corresponding-directory"
    output_file = input_folder / "combined_sample.csv"

    print("\nUsing paths:")
    print(f"Input folder: {input_folder}")
    print(f"Output file: {output_file}")

    # Run the function; report any failure instead of showing a traceback
    try:
        sample_and_combine_csvs(input_folder, output_file)
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")

Script running from: /teamspace/studios/this_studio/PENS-Personalized-News-Headline-Generation/ZDATA

Using paths:
Input folder: /teamspace/studios/this_studio/PENS-Personalized-News-Headline-Generation/ZDATA
Output file: /teamspace/studios/this_studio/PENS-Personalized-News-Headline-Generation/ZDATA/combined_sample.csv
Looking for CSV files in: /teamspace/studios/this_studio/PENS-Personalized-News-Headline-Generation/ZDATA
Current working directory: /teamspace/studios/this_studio/PENS-Personalized-News-Headline-Generation/ZDATA

All files in directory:
- Combined_Sampled.ipynb
- synthetic-original_openai_300_10.csv
- synthetic-original_openai_300_20.csv
- synthetic-original_openai_300_30.csv
- synthetic-original_openai_300_40.csv
- synthetic-original_openai_300_50.csv
- synthetic-original_openai_50_100.csv
- synthetic-original_openai_50_150.csv
- synthetic-original_openai_50_200.csv
- synthetic-original_openai_50_350.csv
- synthetic-original_openai_50_500.csv

Found 10 CSV files

CSV 

Processing files:  20%|██        | 2/10 [00:00<00:00, 13.30it/s]


Processing synthetic-original_openai_300_10.csv
Original size: 1280 rows
Sampled size: 128 rows

Processing synthetic-original_openai_300_20.csv
Original size: 1280 rows
Sampled size: 128 rows

Processing synthetic-original_openai_300_30.csv
Original size: 1280 rows
Sampled size: 128 rows


Processing files:  40%|████      | 4/10 [00:00<00:00, 13.06it/s]


Processing synthetic-original_openai_300_40.csv
Original size: 1280 rows
Sampled size: 128 rows

Processing synthetic-original_openai_300_50.csv
Original size: 1280 rows
Sampled size: 128 rows

Processing synthetic-original_openai_50_100.csv
Original size: 1280 rows
Sampled size: 128 rows

Processing synthetic-original_openai_50_150.csv
Original size: 1280 rows
Sampled size: 128 rows


Processing files: 100%|██████████| 10/10 [00:00<00:00, 14.52it/s]


Processing synthetic-original_openai_50_200.csv
Original size: 1280 rows
Sampled size: 128 rows

Processing synthetic-original_openai_50_350.csv
Original size: 1280 rows
Sampled size: 128 rows

Processing synthetic-original_openai_50_500.csv
Original size: 1280 rows
Sampled size: 128 rows






Processing complete!
Total rows in combined dataset: 1280
Output saved to: /teamspace/studios/this_studio/PENS-Personalized-News-Headline-Generation/ZDATA/combined_sample.csv

Samples per source file:
source_file
synthetic-original_openai_300_10.csv    128
synthetic-original_openai_300_20.csv    128
synthetic-original_openai_300_30.csv    128
synthetic-original_openai_300_40.csv    128
synthetic-original_openai_300_50.csv    128
synthetic-original_openai_50_100.csv    128
synthetic-original_openai_50_150.csv    128
synthetic-original_openai_50_200.csv    128
synthetic-original_openai_50_350.csv    128
synthetic-original_openai_50_500.csv    128
Name: count, dtype: int64
