# Data Merging
Master's thesis of Nikolai Popov, MAE-2025

In [1]:
# Libraries import
import pandas as pd # dataframes
import glob # for reading several files one by one
from tqdm import tqdm # for progress bar
import gc # to delete a dataframe from the memory

Load the initial datasets and merge them into one. Note: exporting the merged file and then importing it again is not feasible, as it requires too much RAM.

In [2]:
# Merge all raw "Part_*" CSV exports into a single CSV file.
#
# Steps: find the part files, read each one into a DataFrame, concatenate
# them, write the result next to the inputs, then release the merged frame.

# Directory holding the raw part files; the merged file is written here too.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
RAW_DATA_DIR = "C:/Users/Popov/Documents/Research/Volchkova_thesis/Data/Raw"

# Define the file pattern to match all relevant CSV files
file_pattern = f"{RAW_DATA_DIR}/2025-02-06_Part_*.csv"

# Get all matching file names. glob does not guarantee any particular order,
# so sort the matches to make the row order of the merged file reproducible.
file_list = sorted(glob.glob(file_pattern))

# Check if any files were found
if not file_list:
    print("No CSV files found. Check the file path and extension.")
else:
    df_list = []

    # Read and append each file with tqdm progress bar
    for file in tqdm(file_list, desc="Loading CSV files", unit="file"):
        try:
            # NOTE(review): on_bad_lines="skip" silently drops malformed rows,
            # so the merged row count may be lower than the raw line count.
            df = pd.read_csv(file, sep=";", on_bad_lines="skip", low_memory=False)
            df_list.append(df)
        except Exception as e:
            # Best effort: report the unreadable file and continue with the rest
            print(f"Error reading {file}: {e}")

    # Print the number of files read successfully
    print(f"\n Successfully read {len(df_list)} files out of {len(file_list)} found.")

    # Concatenate all DataFrames into one
    if df_list:
        raw_dataset = pd.concat(df_list, ignore_index=True)

        # Save the merged dataset
        output_file = f"{RAW_DATA_DIR}/Merged_raw_data.csv"
        raw_dataset.to_csv(output_file, index=False, encoding="utf-8-sig")
        print(f"\n Merged dataset saved as: {output_file}")

        # Delete the merged DataFrame. This is done here, inside the branch
        # that created it, so the cell does not raise NameError when no file
        # was found or none could be read.
        del raw_dataset
    else:
        print(" No valid dataframes could be read. Please check file contents.")

# NOTE(review): df_list (and df) still reference the per-file frames, so only
# the merged copy is released here; delete them too if later cells do not
# need them.

# Run garbage collection to free memory
gc.collect()

Loading CSV files: 100%|██████████| 141/141 [00:32<00:00,  4.30file/s]



 Successfully read 141 files out of 141 found.

 Merged dataset saved as: C:/Users/Popov/Documents/Research/Volchkova_thesis/Data/Raw/Merged_raw_data.csv


0