In [1]:
import pandas as pd
import os

# --- Configuration -----------------------------------------------------------
# Input and output both live in the relative 'data' folder.
input_file = 'data/skills_preprocessed.csv'
output_file = 'data/sample_skills.csv'

# Upper bound on the size of the sample file on disk (500 MB, in bytes).
max_size = 500 * 1024 * 1024

# Number of rows read from the input per iteration.
chunk_size = 100000

# Fraction of each chunk kept in the sample.
sample_frac = 0.01

# Base seed so that Restart & Run All reproduces the same sample. Each chunk
# uses `random_seed + chunk_index`; reusing one seed for every chunk would pick
# the same row positions inside every chunk.
random_seed = 42

# --- Chunked sampling --------------------------------------------------------
# Sampled pieces are collected in a list and concatenated once at the end;
# growing a DataFrame with pd.concat inside the loop is quadratic.
sampled_chunks = []
total_sample = pd.DataFrame()

try:
    # The reader is opened first so that a missing input file is reported
    # before the output file is created. Both handles are closed by the
    # `with` block, including when the loop stops early.
    # newline='' is what pandas expects for a file object passed to to_csv.
    with pd.read_csv(input_file, chunksize=chunk_size) as reader, \
            open(output_file, 'w', newline='', encoding='utf-8') as out_handle:
        for chunk_index, chunk in enumerate(reader):
            sample_chunk = chunk.sample(
                frac=sample_frac,
                random_state=random_seed + chunk_index,
            )
            sampled_chunks.append(sample_chunk)

            # Append only the new rows (header once) instead of rewriting the
            # whole accumulated sample on every iteration.
            sample_chunk.to_csv(out_handle, index=False, header=(chunk_index == 0))

            # Flush so that the size on disk reflects everything written so far.
            out_handle.flush()
            if os.path.getsize(output_file) >= max_size:
                break

    # Keep `total_sample` available in memory for any later cells.
    if sampled_chunks:
        total_sample = pd.concat(sampled_chunks, ignore_index=True)

    # Report the real size of the file, not the configured cap.
    actual_mb = os.path.getsize(output_file) / (1024 * 1024)
    print(f"Sample saved to {output_file} (approximately {actual_mb:.1f} MB)")

except FileNotFoundError:
    print(f"Error: The file {input_file} does not exist. Please check the file path.")

except Exception as e:
    # Show the short message, then re-raise so the traceback is not lost and
    # the cell does not look like it succeeded.
    print(f"An unexpected error occurred: {e}")
    raise


Sample saved to data/sample_skills.csv (approximately 500.0 MB)
