## This code reads the .dat files generated by PANDAT simulations run through panpython. Because the files are read in parallel, it can handle a large number of them. The combined data is stored in pickle format, as such a large number of rows and columns cannot be accommodated in Excel.

### Importing all the necessary modules

In [None]:
import numpy as np
import pandas as pd
import concurrent.futures
import glob
import time
import pickle

# Collect the paths of all .dat files to read.
# glob.glob returns matches in arbitrary, OS-dependent order, so the list is
# sorted to make the row order of the combined dataframe reproducible.
files_path = sorted(glob.glob("output/dat_files/*.dat"))
print(f'Total number of dat files are {len(files_path)}')

### Defining a function that reads the dat files listed above and concatenates them into a single dataframe.

In [None]:
def read_csv_to_concatenated_dataframe(file_paths):
    """
    Read tab-separated files and concatenate them into a single DataFrame.

    The first data row of every file (the row directly below the header) is
    dropped before concatenation, as it holds the units rather than data.

    Args:
        file_paths: Iterable of paths to tab-separated files.

    Returns:
        A Pandas DataFrame containing the combined contents of all files.
        Each file's original row labels are kept, so labels may repeat
        across files. An empty DataFrame is returned when ``file_paths``
        is empty.
    """
    # Read each file and drop its units row in a single pass.
    # iloc[1:] keeps every row except the first.
    frames = [pd.read_csv(file_path, sep='\t').iloc[1:] for file_path in file_paths]

    # pd.concat raises ValueError("No objects to concatenate") on an empty
    # list, so return an empty frame instead of failing.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)

### Running the above function in parallel over all the files contained in files_path

In [None]:
# Start the timer. perf_counter() is monotonic, so the measured interval
# cannot be skewed by system clock adjustments.
ini = time.perf_counter()

# With no input files pd.concat fails with an opaque "No objects to concatenate";
# fail early with a message that points at the actual cause.
if not files_path:
    raise ValueError("No .dat files were found to read; check the path used to build files_path")

# Read the files on a pool of 16 worker threads, one task per file.
with concurrent.futures.ThreadPoolExecutor(16) as executor:
    # The reader expects a list of paths, so each path is wrapped in a one-element list
    futures = [executor.submit(read_csv_to_concatenated_dataframe, [file_path]) for file_path in files_path]
    # Collect results in submission order; result() re-raises any exception raised in a worker
    dfs = [future.result() for future in futures]

# Concatenate the per-file DataFrames into a single DataFrame
final_df = pd.concat(dfs)

# Keep only the rows that have a phase_name value.
# NOTE(review): such rows presumably come from blank or incomplete lines in the
# .dat files rather than from the concatenation itself; confirm against the data.
final_df = final_df.dropna(subset=['phase_name'], how = 'all')
fin = time.perf_counter()
time_taken = round(fin-ini, 2)

print(f"Time taken to execute function and get the final dataframe is: {time_taken} seconds")
print(f'Number of rows and columns for the final data are {final_df.shape[0]} and {final_df.shape[1]}')

### Getting the total memory of the final dataframe

In [None]:
# memory_usage(deep=True) returns a Series of byte counts, one entry for the
# index and one per column, labelled by name. Summing the Series directly
# avoids integer-position lookups such as a[i], which are deprecated on a
# label-indexed Series and pick the wrong entry when column labels are integers.
a = final_df.memory_usage(deep = True)

# 1073741824 bytes = 1024**3, so the result is in binary gigabytes
ar = round((a.sum()/1073741824),2)
print(f'Size of dataframe is {ar} GB')

### Exporting the final dataframe to a pickle file

In [None]:
# Start the timer for the serialisation step
ini = time.time()

# Serialise the combined dataframe to disk in binary pickle format
with open('final_data_test.pickle', mode='wb') as pickle_file:
    pickle.dump(final_df, pickle_file)

# Stop the timer and report the elapsed time, rounded to two decimals
fin = time.time()
time_taken = round(fin - ini, 2)
print(f"Time taken to write a picke file is: {time_taken} seconds")

## 