In [None]:
import pandas as pd
import random
import glob

# Function to merge multiple CSVs and select k random rows
def merge_and_sample(csv_folder, k, output_file):
    """
    Merge every CSV file in a folder, draw k random rows, and save them to a new CSV.

    Files are read in sorted filename order. ``glob`` returns matches in an
    arbitrary, filesystem-dependent order, and the fixed ``random_state`` only
    yields the same sample when the merged rows are in the same order, so
    sorting is what makes the output reproducible across machines.

    Only the columns "accession", "seq_definition", "seq_length" and
    "GBSeq_organism" are written to the output file.

    Args:
        csv_folder (str): Path to the folder containing CSV files.
        k (int): Number of random rows to retrieve. Capped at the total
            number of rows available in the merged data.
        output_file (str): Output CSV filename.

    Raises:
        ValueError: If no ``*.csv`` files are found in ``csv_folder``.
        KeyError: If the merged data lacks any of the exported columns.
    """
    # Sort so the concatenation order (and therefore the seeded sample) is stable.
    csv_files = sorted(glob.glob(f"{csv_folder}/*.csv"))

    # Fail early with a message naming the folder, instead of letting
    # pd.concat raise its generic "No objects to concatenate".
    if not csv_files:
        raise ValueError(f"No CSV files found in folder: {csv_folder!r}")

    df_list = [pd.read_csv(file) for file in csv_files]
    merged_df = pd.concat(df_list, ignore_index=True)

    # Never ask for more rows than exist; sample() would raise otherwise.
    k = min(k, len(merged_df))

    # Randomly sample k rows; the fixed seed keeps the selection repeatable.
    sampled_df = merged_df.sample(n=k, random_state=42)

    # Keep only the columns that belong in the exported file.
    selected_columns = ["accession", "seq_definition", "seq_length", "GBSeq_organism"]
    sampled_df = sampled_df[selected_columns]

    sampled_df.to_csv(output_file, index=False)
    print(f"Saved {k} random samples to {output_file}")

# Example usage
# Raw string: "\o" is not a recognised escape sequence, so the plain literal
# makes Python emit an invalid-escape warning on this line. The r-prefix keeps
# the exact same path value (backslash included) without the warning.
csv_folder_path = r".\old_genbank_pull"  # Replace with your actual folder path
k = 30000  # Number of rows requested; merge_and_sample caps it at the rows available
output_csv = "random_viral_samples.csv"

merge_and_sample(csv_folder_path, k, output_csv)


  csv_folder_path = ".\old_genbank_pull"  # Replace with your actual folder path


Saved 30000 random samples to random_viral_samples.csv
