## This notebook reads the .dat files generated by PANDAT simulations run through panpython, one file at a time. The files are processed in parallel, so a large number of them can be read and post-processed to extract the required information. The results are stored in pickle format because such a large number of rows and columns cannot be accommodated in Excel. The notebook can also plot the Cp, density and phase diagram for the entries of the dataframe obtained after filtering the data.

### Importing all the necessary modules

In [None]:
import numpy as np
import pandas as pd
import concurrent.futures
import glob
import time
import pickle
import gc

# Force a garbage-collection pass before any data is loaded.
gc.collect()

# Gather the paths of all .dat files found in the intermediate output folder.
files_path = glob.glob('output/intermediate/*.dat')
print(f"Total number of dat files are {len(files_path)}")
# Uncomment the next line to restrict the run to the first 20000 files.
#files_path = files_path[0:20000]

### Defining a function that reads a given .dat file, processes it on the fly and extracts the required information from it.

In [None]:
def read_dat_file_post_process(file_path):
    """Summarise one tab-separated ``.dat`` result file as a one-row dataframe.

    The rows labelled 0 and ``len - 1`` (first and last row of the file) are
    discarded and the ``T`` column is renamed to ``Temp``. From the remaining
    rows the function extracts:

    * the lowest and the highest temperature at which every ``f(@B...)``
      column holds a value while every other ``f(@...)`` column is empty,
      plus the difference between the two;
    * the maximum and minimum of the ``Cp`` and ``density`` columns;
    * the values of the ``x(...)`` composition columns in the row labelled 2.

    Parameters
    ----------
    file_path : str or path-like
        Location of the ``.dat`` file to read.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) holding the composition columns followed by
        'onset of single phase', 'Solidus', 'range of single phase',
        'max cp', 'min Cp', 'max density', 'min density' and 'filename'.
        Float columns are rounded to two decimals.
    """
    table = pd.read_csv(file_path, sep='\t')
    # Discard the first and last row, then give the temperature column a clearer name.
    table = table.drop([0, len(table) - 1]).rename(columns={'T': 'Temp'})

    # Split the phase-fraction columns into the "f(@B..." ones and all the others.
    b_phase_cols = []
    other_phase_cols = []
    for name in table.columns:
        if not name.startswith('f(@'):
            continue
        if name.startswith('f(@B'):
            b_phase_cols.append(name)
        else:
            other_phase_cols.append(name)

    # Rows where every B-type phase has a value and no other phase does.
    b_present = table[b_phase_cols].notna().all(axis=1)
    others_absent = table[other_phase_cols].isna().all(axis=1)
    single_phase_temps = table.loc[b_present & others_absent, 'Temp'].astype(float)
    onset_of_B_phase = single_phase_temps.min()
    solidus = single_phase_temps.max()

    # Numeric views of Cp and density, reused for both extremes.
    cp_values = table['Cp'].astype(float)
    density_values = table['density'].astype(float)

    # Composition values come from the row labelled 2, reshaped into a one-row frame.
    composition_cols = [name for name in table.columns if name.startswith('x(')]
    composition_row = table.loc[2, composition_cols].to_frame().T.reset_index(drop=True)

    # Derived quantities plus the source file name, also as a one-row frame.
    summary = pd.DataFrame(
        {
            'onset of single phase': onset_of_B_phase,
            'Solidus': solidus,
            'range of single phase': solidus - onset_of_B_phase,
            'max cp': cp_values.max(),
            'min Cp': cp_values.min(),
            'max density': density_values.max(),
            'min density': density_values.min(),
            'filename': str(file_path),
        },
        index=[0],
    )

    # Composition first, summary second; only float columns get rounded.
    combined = pd.concat([composition_row, summary], axis=1)
    numeric_cols = combined.select_dtypes(include=['float']).columns
    combined[numeric_cols] = combined[numeric_cols].round(2)

    return combined


## Reading the files, processing them and storing the resulting dataframe in pickle format. The pickle file can be read back later as well.

In [None]:
# Start of the wall-clock measurement for the whole read/process/save step.
ini = time.time()


def process_file(file_path):
    """Post-process a single .dat file.

    Thin wrapper around ``read_dat_file_post_process`` that is submitted to
    the thread pool below.

    Parameters
    ----------
    file_path : str
        Path of the .dat file to process.

    Returns
    -------
    pandas.DataFrame
        The one-row summary produced by ``read_dat_file_post_process``.
    """
    return read_dat_file_post_process(file_path)


# pd.concat raises an opaque "No objects to concatenate" ValueError on an
# empty list; fail early with a message that names the actual cause.
if not files_path:
    raise ValueError('files_path is empty: there are no .dat files to process')

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Submit one task per file; the list keeps the futures in input order.
    futures = [executor.submit(process_file, file_path) for file_path in files_path]

    # Collect the results in submission order. Each per-file frame uses the
    # single index label 0, so ignore_index=True is needed to give the
    # combined frame a unique 0..n-1 index instead of n duplicate labels.
    result = pd.concat([f.result() for f in futures], ignore_index=True)

# Write dataframe to pickle file
with open('final_processed_data.pickle', 'wb') as f:
    pickle.dump(result, f)

fin = time.time()
total_time = round((fin - ini) / 60, 2)
print(f'Total time taken to read and process {len(files_path)} files is {total_time} minutes')

In [None]:
# Preview the first five rows of the combined dataframe
result.head(n=5)

### Getting the total memory of the final dataframe

In [None]:
# Per-column memory footprint in bytes; the index is reported as the first
# entry and deep=True also measures the contents of object columns.
a = result.memory_usage(deep=True)
# Plain list of the byte counts. Read positionally via tolist(): the Series is
# labelled by column name, so integer keys such as a[i] rely on a positional
# fallback that newer pandas versions no longer support.
memory = a.tolist()
# Total size converted from bytes to GiB (1073741824 = 1024**3).
ar = round(a.sum() / 1073741824, 2)
print(f'Size of dataframe is {ar} GB')

## Sorting/filtering the dataframe as per requirement

In [None]:
# Keep the five rows with the largest 'range of single phase'.
# NOTE(review): the variable is named top_30 but only 5 rows are kept -- confirm which is intended.
top_30 = result.sort_values(by='range of single phase', ascending=False).iloc[:5]

## Plotting the phase diagram, Cp and density for the sorted data

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib widget
def ploting_phase_Cp_density_plots(filename):
    """Plot phase fractions, density and Cp against temperature for one .dat file.

    The file is read as tab-separated text and the row labelled 0 is dropped.
    Three side-by-side panels are drawn: all ``f(@...)`` columns, the
    ``density`` column and the ``Cp`` column, each against the ``T`` column.
    The figure is saved as a JPEG named after the composition read from the
    ``x(...)`` columns.

    Parameters
    ----------
    filename : str or path-like
        Path of the .dat file to plot.

    Returns
    -------
    None
        The figure is written to ``str(comp_name) + '.jpeg'`` in the current
        working directory.
    """
    # Collecting all the data for plotting
    data = pd.read_csv(filename, sep='\t')
    # Drop the row labelled 0 (presumably the units line under the header --
    # confirm against the PANDAT output format).
    data = data.drop(index=0)
    phases_columns = [col for col in data.columns if col.startswith('f(@')]
    density = data['density'].astype(float)
    Cp = data['Cp'].astype(float)
    Temp = data['T'].astype(float)

    # Flat list [column, value, column, value, ...] built from the row
    # labelled 1; its string form is used as plot title and output file name.
    comp_columns = [col for col in data.columns if col.startswith('x(')]
    comp_name = []
    for col in comp_columns:
        comp_name.extend((col, data.loc[1, col]))
    title = str(comp_name)

    fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(13, 5))

    # Plotting phase diagram. The fractions are converted to float like the
    # other plotted columns; if they were read as text, matplotlib would
    # otherwise treat them as categorical values instead of numbers.
    for col in phases_columns:
        axs[0].plot(Temp, data[col].astype(float), '-', label=col)

    # Plotting density and Cp
    axs[1].plot(Temp, density, label='Density')
    axs[2].plot(Temp, Cp, label='Cp')

    # Common decoration for all three panels.
    for ax in axs:
        ax.set_xlabel('Temp (C)')
        ax.legend()
        ax.grid()
        ax.set_title(title)

    fig.tight_layout()
    fig.savefig(title + '.jpeg')


In [None]:
# Draw and save the three-panel figure for every file kept in top_30.
for dat_file in top_30['filename']:
    ploting_phase_Cp_density_plots(dat_file)

## 