# FNCT Data Read In and Transformation Postprocessing

This script adds manually entered FNCT data, collected using the previously created 'input mask' (see the [FNCT Data Transformation](https://github.com/MarkusSchilling/fnct-data-transformation/blob/main/FNCT_Data_Transformation.ipynb) script), to the FNCT data files. Once completed, the file stack may be used for data mapping in order to obtain RDF data using the [FNCT Data Mapping](https://github.com/MarkusSchilling/fnct-data-transformation/blob/main/FNCT_Data_Mapping.ipynb) script to create a comprehensive knowledge graph. This additional step is necessary because some measurements, such as the actual residual fracture surface, are only conducted in a microscopy process downstream of the FNCT process. Accordingly, this data becomes available later and must be added separately. 
Moreover, some calculations are performed using this script such as the determination of the actual tensile stress used which is calculated from the data on the actual residual fracture surface as is usual in FNCT experiments.

The working folder path has to be specified manually (see below, second cell): 
```python
folder_path = r'Path_to_your_folder'
```

In [1]:
import os
import sys
import pandas as pd
from Module import fnct_calculations as calc

In [None]:
# Folder that contains the Excel files to be processed. On Windows, either use a raw string as below
# or escape the backslashes, e.g. folder_path = "C:\\Path\\To\\Your\\Folder"
folder_path = r'Path_to_your_folder'
# To work in the current working folder instead, os.getcwd() can be used
# folder_path = os.getcwd()

# The "secondary_data" folder is mandatory: without it no additional data can be included, so the script stops
secondary_data_folder = os.path.join(folder_path, "secondary_data")
if not os.path.exists(secondary_data_folder):
    print(f"No '{secondary_data_folder}' folder found. No data processing possible.")
    sys.exit()
print(f"Accessing secondary data in '{secondary_data_folder}' folder.")

# The "input" folder is mandatory as well: without it there is no input data to include, so the script stops
input_folder = os.path.join(folder_path, "input")
if not os.path.exists(input_folder):
    print(f"No '{input_folder}' folder found. No data processing possible.")
    sys.exit()
print(f"Accessing input data in '{input_folder}' folder.")

# The "metadata" folder is optional: a missing folder is only reported (it is not created here)
metadata_folder = os.path.join(folder_path, "metadata")
if not os.path.exists(metadata_folder):
    print(f"No '{metadata_folder}' folder found. Metadata will not be updated.")
else:
    print(f"Accessing metadata in '{metadata_folder}' folder.")


# Load the two semicolon-separated CSV files that are merged; all columns are read as strings
read_options = {'sep': ';', 'dtype': str}
secondary_data_path = os.path.join(secondary_data_folder, 'FNCT_secondary_data.csv')
input_data_path = os.path.join(input_folder, 'FNCT_Input_Information.csv')

secondary_data = pd.read_csv(secondary_data_path, **read_options)
input_data = pd.read_csv(input_data_path, **read_options)

# Columns of secondary_data that have to be filled for a row to count as complete
columns_to_check = [
    'Specimen_ID',
    'Process_ID',
    'Material',
    'Medium',
    'Residual fracture surface measured AL1',
    'Residual fracture surface measured AL2',
    'Notch depth measured nm',
]

# Boolean mask: True for every row with at least one empty cell in the columns above
missing_rows = secondary_data[columns_to_check].isna().any(axis='columns')

# Columns available in both tables; 'Process_ID' is the common identifier and is therefore left untouched
shared_columns = [
    name for name in input_data.columns
    if name != 'Process_ID' and name in secondary_data.columns
]

# Copy the input values of all shared columns into the incomplete rows of secondary_data.
# NOTE(review): the same mask selects the rows of both tables, i.e. rows are paired by index label
# and not by 'Process_ID' - this assumes both files list the same rows in the same order; confirm.
for col in shared_columns:
    secondary_data.loc[missing_rows, col] = input_data.loc[missing_rows, col].to_numpy()

# Columns that hold calculated values
columns_to_calculate = ['Residual fracture surface measured AL', 'Stress measured sigma_L']
# Boolean mask: True for every row in which at least one calculated value is still empty
missing_calculation_rows = secondary_data[columns_to_calculate].isna().any(axis='columns')

# Visit only the rows that still need a calculation and fill both calculated columns
for index in missing_calculation_rows[missing_calculation_rows].index:
    row = secondary_data.loc[index]
    # Read all inputs first so that a missing column is reported before anything is written
    surface_1 = row['Residual fracture surface measured AL1']
    surface_2 = row['Residual fracture surface measured AL2']
    force = row['Force']

    # The residual fracture surface is derived from the two measured values AL1 and AL2 ...
    residual_fracture_surface_measured = calc.get_calculation_residual_fracture_surface_measured(
        surface_1, surface_2
    )
    secondary_data.at[index, 'Residual fracture surface measured AL'] = residual_fracture_surface_measured

    # ... and the measured stress from the force and the residual fracture surface just obtained
    sigma_L = calc.get_actual_stress_measured(force, residual_fracture_surface_measured)
    secondary_data.at[index, 'Stress measured sigma_L'] = sigma_L

# Write the updated secondary_data dataframe to 'FNCT_secondary_data.csv' in the secondary data folder
# (an existing file of that name is replaced)
output_file = os.path.join(secondary_data_folder, 'FNCT_secondary_data.csv')
secondary_data.to_csv(path_or_buf=output_file, sep=';', index=False, encoding='utf-8-sig')

In [None]:
# Complete the per-process metadata files ("<Process_ID>_metadata.csv" in the metadata folder)
# with the values of the matching input row(s) from 'FNCT_Input_Information.csv'.

# Read in input data
# The "input" folder is mandatory: without it there is no input data to include, so the script stops
input_folder = os.path.join(folder_path, "input")
if os.path.exists(input_folder):
    print(f"Accessing input data in '{input_folder}' folder.")
    input_data_path = os.path.join(input_folder, 'FNCT_Input_Information.csv')
    input_data = pd.read_csv(input_data_path, sep=';', dtype=str)
else:
    print(f"No '{input_folder}' folder found. No data processing possible.")
    sys.exit()

# Ensure 'Process_ID' is a column before extracting unique values
if 'Process_ID' not in input_data.columns:
    raise KeyError("'Process_ID' column not found in input_data")

# Unique, non-empty Process_IDs in order of first appearance in the input CSV
unique_process_ids = input_data['Process_ID'].dropna().unique()

# Set 'Process_ID' as the index for input_data so rows can be looked up by process
input_data.set_index('Process_ID', inplace=True)

# Columns whose empty cells mark a metadata row as incomplete
columns_to_check_metadata = ['Funding Party', 'Funding Party ID', 'Grant number']

# Process each Process_ID and its corresponding metadata file
for process_id in unique_process_ids:
    metadata_file = os.path.join(metadata_folder, f"{process_id}_metadata.csv")

    if not os.path.exists(metadata_file):
        print(f"Metadata file not found for Process_ID: {process_id}")
        continue

    # Read metadata file into a DataFrame (all columns as strings)
    df_metadata = pd.read_csv(metadata_file, sep=';', dtype=str)

    # All input rows of this process. Selecting with a list always yields a DataFrame, also when
    # several input rows share one Process_ID; a scalar lookup would then hand a misaligned
    # Series to fillna() and silently fill nothing. Every id originates from the 'Process_ID'
    # column itself, so this lookup cannot miss.
    input_rows = input_data.loc[[process_id]]

    # Only inspect funding columns that exist in this file, so that a file lacking one of them
    # is skipped or partially handled instead of aborting the whole run with a KeyError
    present_columns = [name for name in columns_to_check_metadata if name in df_metadata.columns]

    # Find rows with any missing values in these columns of the metadata file
    missing_rows_metadata = df_metadata[present_columns].isnull().any(axis=1)
    # NOTE(review): read_csv has already consumed the header line, so label 0 is the first data
    # row, which is therefore never updated - confirm that this row is meant to be skipped.
    missing_rows_metadata[0] = False

    if not missing_rows_metadata.any():
        print(f"Metadata file: {metadata_file} does not need to be updated.")
        continue

    # Fill the empty cells of the incomplete rows, column by column, for every column that the
    # input data and the metadata file have in common
    for col in input_data.columns:
        if col not in df_metadata.columns:
            continue
        # First non-empty input value of this process for the column; nothing to fill without one
        candidates = input_rows[col].dropna()
        if candidates.empty:
            continue
        df_metadata.loc[missing_rows_metadata, col] = (
            df_metadata.loc[missing_rows_metadata, col].fillna(candidates.iloc[0])
        )

    # Write updated metadata DataFrame back to CSV
    df_metadata.to_csv(metadata_file, sep=';', index=False, encoding='utf-8-sig')

    print(f"Updated metadata file: {metadata_file}")

## Insert standard Funding Party and Grant Number

FNCT data is often obtained within the framework of specific projects or (in-house) quality control processes, so funders / principals / customers and grant numbers / project references are frequently the same for a whole set of experiments. Accordingly, this data does not have to be entered by hand but can be inserted by the following code. 
To do so, the fixed values have to be edited in the following code.

The values may be edited in these lines of the code:

```python
# Define the fixed values for the metadata fields
fixed_values = {
    'Funding Party': '',
    'Funding Party ID': '',
    'Grant number': ''
}
```

In [None]:
# Insert fixed funding values into the empty funding cells of the per-process metadata files
# ("<Process_ID>_metadata.csv" in the metadata folder).

# Read in input data
# The "input" folder is mandatory: without it there is no input data to include, so the script stops
input_folder = os.path.join(folder_path, "input")
if os.path.exists(input_folder):
    print(f"Accessing input data in '{input_folder}' folder.")
    input_data_path = os.path.join(input_folder, 'FNCT_Input_Information.csv')
    input_data = pd.read_csv(input_data_path, sep=';', dtype=str)
else:
    print(f"No '{input_folder}' folder found. No data processing possible.")
    sys.exit()

# Ensure 'Process_ID' is a column before extracting unique values
if 'Process_ID' not in input_data.columns:
    raise KeyError("'Process_ID' column not found in input_data")

# Unique, non-empty Process_IDs in order of first appearance in the input CSV
unique_process_ids = input_data['Process_ID'].dropna().unique()

# Set 'Process_ID' as the index for input_data
input_data.set_index('Process_ID', inplace=True)

# Define the fixed values for the metadata fields
fixed_values = {
    'Funding Party': '',
    'Funding Party ID': '',
    'Grant number': ''
}

# Columns whose empty cells mark a metadata row as incomplete
columns_to_check_metadata = ['Funding Party', 'Funding Party ID', 'Grant number']

# Process each Process_ID and its corresponding metadata file
for process_id in unique_process_ids:
    metadata_file = os.path.join(metadata_folder, f"{process_id}_metadata.csv")

    if not os.path.exists(metadata_file):
        print(f"Metadata file not found for Process_ID: {process_id}")
        continue

    # Read metadata file into a DataFrame (all columns as strings)
    df_metadata = pd.read_csv(metadata_file, sep=';', dtype=str)

    # Only inspect funding columns that exist in this file, so that a file lacking one of them
    # is skipped or partially handled instead of aborting the whole run with a KeyError
    present_columns = [name for name in columns_to_check_metadata if name in df_metadata.columns]

    # Find rows with any missing values in these columns of the metadata file
    missing_rows_metadata = df_metadata[present_columns].isnull().any(axis=1)
    # NOTE(review): read_csv has already consumed the header line, so label 0 is the first data
    # row, which is therefore never updated - confirm that this row is meant to be skipped.
    missing_rows_metadata[0] = False

    if not missing_rows_metadata.any():
        print(f"Metadata file: {metadata_file} does not need to be updated.")
        continue

    # Insert the fixed value only into cells that are actually empty; values already present in
    # an incomplete row are kept instead of being overwritten
    for col in present_columns:
        cells_to_fill = missing_rows_metadata & df_metadata[col].isnull()
        df_metadata.loc[cells_to_fill, col] = fixed_values[col]

    # Write updated metadata DataFrame back to CSV
    df_metadata.to_csv(metadata_file, sep=';', index=False, encoding='utf-8-sig')

    print(f"Updated metadata file: {metadata_file}")
