In [1]:
import dask
import dask.dataframe as dd
import dask.array as da
import itertools as it
import numpy as np
import pandas as pd
import pyarrow
import numba
import cython
from time import time
import os

In [2]:
# Folder containing subfolders with the data files
data_folder = '/home/kamenpetkov23/CS546-Linux/datasets'

# Destination for concatenated datasets
output_folder = './datasets_thesios_io_traces'

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Dataset sizes in terms of the number of rows (data points)
dataset_row_limits = {
    "dataset-125m": 125_000_000,  # 125 million rows
    "dataset-25m": 25_000_000,    # 25 million rows
    "dataset-5m": 5_000_000,      # 5 million rows
    "dataset-1m": 1_000_000,      # 1 million rows
    "dataset-200k": 200_000       # 200,000 rows
}

def concatenate_files_by_rows(output_filename, row_limit, folder, chunksize=1000000):
    """Concatenates files in chunks until the total number of rows reaches row_limit and saves to output_filename."""
    total_rows = 0
    output_path = os.path.join(output_folder, output_filename)

    # Create the CSV file if it doesn't exist, or append data if it does
    with open(output_path, 'w') as f_output:
        # Iterate through subfolders in the main folder
        for subfolder in sorted(os.listdir(folder)):
            subfolder_path = os.path.join(folder, subfolder)
            if os.path.isdir(subfolder_path):  # Ensure it's a folder
                print(f"Processing folder: {subfolder_path}")
                
                # List and process files in the subfolder
                for file_name in sorted(os.listdir(subfolder_path)):
                    file_path = os.path.join(subfolder_path, file_name)
                    print(f"Processing file: {file_path}")

                    # Read in chunks to avoid loading entire file into memory
                    for chunk in pd.read_csv(file_path, chunksize=chunksize):
                        remaining_rows = row_limit - total_rows
                        
                        # Check how many rows are needed
                        if len(chunk) > remaining_rows:
                            # If chunk has more rows than needed, take only required rows
                            chunk = chunk.iloc[:remaining_rows]
                            total_rows += len(chunk)
                            chunk.to_csv(f_output, mode='a', header=f_output.tell()==0, index=False)
                            print(f'Added {len(chunk)} rows from {file_name}, total rows: {total_rows}')
                            return
                        else:
                            # Otherwise, add the entire chunk
                            total_rows += len(chunk)
                            chunk.to_csv(f_output, mode='a', header=f_output.tell()==0, index=False)
                            print(f'Added {len(chunk)} rows from {file_name}, total rows: {total_rows}')

                    # Stop if we've reached the row limit
                    if total_rows >= row_limit:
                        return

# Create each dataset based on the row limits
for dataset_name, row_limit in dataset_row_limits.items():
    concatenate_files_by_rows(f'{dataset_name}.csv', row_limit, data_folder, chunksize=1000000)

Processing folder: /home/kamenpetkov23/CS546-Linux/datasets/20240115
Processing file: /home/kamenpetkov23/CS546-Linux/datasets/20240115/data-00000-of-00100
Added 168590 rows from data-00000-of-00100, total rows: 168590
Processing file: /home/kamenpetkov23/CS546-Linux/datasets/20240115/data-00001-of-00100
Added 168591 rows from data-00001-of-00100, total rows: 337181
Processing file: /home/kamenpetkov23/CS546-Linux/datasets/20240115/data-00002-of-00100
Added 168591 rows from data-00002-of-00100, total rows: 505772
Processing file: /home/kamenpetkov23/CS546-Linux/datasets/20240115/data-00003-of-00100
Added 168591 rows from data-00003-of-00100, total rows: 674363
Processing file: /home/kamenpetkov23/CS546-Linux/datasets/20240115/data-00004-of-00100
Added 168591 rows from data-00004-of-00100, total rows: 842954
Processing file: /home/kamenpetkov23/CS546-Linux/datasets/20240115/data-00005-of-00100
Added 168591 rows from data-00005-of-00100, total rows: 1011545
Processing file: /home/kamenpe

In [3]:
# Now, we want to convert to parquet format because it's much faster and efficient
# even though it was suggested to us that this was not necessary as we want to focus on testing with one file type

# Folder containing the CSV datasets
datasets_folder = './datasets_thesios_io_traces'

# Map each CSV dataset name to its corresponding folder and target row limit
dataset_info = {
    "dataset-125m.csv": {"output_dir": './datasets_thesios_io_traces/dataset-125m', "row_limit": 125_000_000},  # 125 million rows
    "dataset-25m.csv": {"output_dir": './datasets_thesios_io_traces/dataset-25m', "row_limit": 25_000_000},    # 25 million rows
    "dataset-5m.csv": {"output_dir": './datasets_thesios_io_traces/dataset-5m', "row_limit": 5_000_000},       # 5 million rows
    "dataset-1m.csv": {"output_dir": './datasets_thesios_io_traces/dataset-1m', "row_limit": 1_000_000},       # 1 million rows
    "dataset-200k.csv": {"output_dir": './datasets_thesios_io_traces/dataset-200k', "row_limit": 200_000}      # 200,000 rows
}

chunk_size = 200000  # 200k rows at a time

# Ensure the output directories exist
for info in dataset_info.values():
    os.makedirs(info["output_dir"], exist_ok=True)

# Convert each CSV to Parquet in chunks for its corresponding dataset size
for csv_file, info in dataset_info.items():
    csv_path = os.path.join(datasets_folder, csv_file)
    
    if os.path.exists(csv_path):
        print(f"Processing {csv_file}")
        
        # Track total rows written for the current dataset
        total_rows = 0
        chunk_index = 0

        # Read the CSV in chunks
        for chunk in pd.read_csv(csv_path, chunksize=chunk_size):
            # Calculate the remaining rows needed to reach the limit
            remaining_rows = info["row_limit"] - total_rows
            
            if len(chunk) > remaining_rows:
                # If the chunk exceeds the remaining rows, slice the chunk
                chunk_to_write = chunk.iloc[:remaining_rows]
                chunk = chunk.iloc[remaining_rows:]
            else:
                # If the chunk can fit fully, write the whole chunk
                chunk_to_write = chunk
                chunk = pd.DataFrame()  # Empty the chunk

            # Save the chunk to the appropriate folder with the part number
            output_folder = info["output_dir"]
            parquet_path = os.path.join(output_folder, f"part.{chunk_index}.parquet")
            chunk_to_write.to_parquet(parquet_path, engine='pyarrow', index=False)

            # Update the total rows for this dataset
            total_rows += len(chunk_to_write)
            print(f"Processed part.{chunk_index} for {csv_file}, total rows: {total_rows}")

            # Increment the chunk index
            chunk_index += 1

            # If we've reached the limit for this dataset, stop processing
            if total_rows >= info["row_limit"]:
                break

        print(f"Completed processing for {csv_file}")
    else:
        print(f"CSV dataset {csv_file} not found at {csv_path}")

Processing dataset-125m.csv
Processed part.0 for dataset-125m.csv, total rows: 200000
Processed part.1 for dataset-125m.csv, total rows: 400000
Processed part.2 for dataset-125m.csv, total rows: 600000
Processed part.3 for dataset-125m.csv, total rows: 800000
Processed part.4 for dataset-125m.csv, total rows: 1000000
Processed part.5 for dataset-125m.csv, total rows: 1200000
Processed part.6 for dataset-125m.csv, total rows: 1400000
Processed part.7 for dataset-125m.csv, total rows: 1600000
Processed part.8 for dataset-125m.csv, total rows: 1800000
Processed part.9 for dataset-125m.csv, total rows: 2000000
Processed part.10 for dataset-125m.csv, total rows: 2200000
Processed part.11 for dataset-125m.csv, total rows: 2400000
Processed part.12 for dataset-125m.csv, total rows: 2600000
Processed part.13 for dataset-125m.csv, total rows: 2800000
Processed part.14 for dataset-125m.csv, total rows: 3000000
Processed part.15 for dataset-125m.csv, total rows: 3200000
Processed part.16 for data

In [4]:
# Now that we have the datasets ready, we can experiment on them

# First, analyze the data and check for inconsistencies
# The first inconsistency I will check is whether there are empty (a.k.a. no value) cells
# If I find such, I will remove the whole row

# Base path for the datasets
base_path = './datasets_thesios_io_traces'

# List of Parquet dataset folder names
# parquet_dataset_folders = ["dataset-125m", "dataset-25m", "dataset-5m", "dataset-1m", "dataset-200k"]

csv_dataset_folder = base_path

counter_missing = 0
chunk_size = 10**6

# Iterate over each csv dataset folder to analyze and clean the data
if os.path.exists(csv_dataset_folder):
    print(f"\nAnalyzing dataset from: {csv_dataset_folder}\n")
    
    # List all CSV files in the folder
    csv_files = sorted([f for f in os.listdir(csv_dataset_folder) if f.endswith('.csv')])
    
    for csv_file in csv_files:
        file_path = os.path.join(csv_dataset_folder, csv_file)
        
        print(f"Processing {csv_file}...")
        missing_values_in_file = 0
        
        # Process the CSV file in chunks
        for chunk in pd.read_csv(file_path, chunksize=chunk_size):
            # Check for missing values in the chunk
            missing_values = chunk.isnull().sum().sum()  # Total number of missing values in the chunk
                
        if missing_values_in_file > 0:
            print(f"Found {missing_values_in_file} missing values in {csv_file}")
            counter_missing += missing_values_in_file
        else:
            print(f"No missing values found in {csv_file}")
else:
    print(f"Folder {csv_dataset_folder} not found")

print(f"Total number of missing values (--- corrupted rows ---): {counter_missing}")


Analyzing dataset from: ./datasets_thesios_io_traces

Processing dataset-125m.csv...
No missing values found in dataset-125m.csv
Processing dataset-1m.csv...
No missing values found in dataset-1m.csv
Processing dataset-200k.csv...
No missing values found in dataset-200k.csv
Processing dataset-25m.csv...
No missing values found in dataset-25m.csv
Processing dataset-5m.csv...
No missing values found in dataset-5m.csv
Total number of missing values (--- corrupted rows ---): 0


In [5]:
# Base path for the datasets
base_path = './datasets_thesios_io_traces'

parquet_dataset_folders = ["dataset-125m", "dataset-25m", "dataset-5m", "dataset-1m", "dataset-200k"]

counter_missing = 0

for dataset_folder in parquet_dataset_folders:
    dataset_path = os.path.join(base_path, dataset_folder)
    
    if os.path.exists(dataset_path):
        print(f"\nAnalyzing dataset from: {dataset_folder}\n")
        
        # List all Parquet files in the folder
        parquet_files = sorted([f for f in os.listdir(dataset_path) if f.endswith('.parquet')])
        
        for parquet_file in parquet_files:
            file_path = os.path.join(dataset_path, parquet_file)
            
            # Load the Parquet file
            df = pd.read_parquet(file_path)
            
            # Check for missing values
            missing_values = df.isnull().sum().sum()  # Total number of missing values
            
            if missing_values > 0:
                print(f"Found {missing_values} missing values in {parquet_file}")
                counter_missing += 1
            else:
                print(f"No missing values found in {parquet_file}")
    else:
        print(f"Folder {dataset_folder} not found at {dataset_path}")

print(f"Number of missing values (--- corrupted rows ---): {counter_missing}")


Analyzing dataset from: dataset-125m

No missing values found in part.0.parquet
No missing values found in part.1.parquet
No missing values found in part.10.parquet
No missing values found in part.100.parquet
No missing values found in part.101.parquet
No missing values found in part.102.parquet
No missing values found in part.103.parquet
No missing values found in part.104.parquet
No missing values found in part.105.parquet
No missing values found in part.106.parquet
No missing values found in part.107.parquet
No missing values found in part.108.parquet
No missing values found in part.109.parquet
No missing values found in part.11.parquet
No missing values found in part.110.parquet
No missing values found in part.111.parquet
No missing values found in part.112.parquet
No missing values found in part.113.parquet
No missing values found in part.114.parquet
No missing values found in part.115.parquet
No missing values found in part.116.parquet
No missing values found in part.117.parquet