In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import os

from src.Cyclic_volt import drug
from src.Cyclic_volt import activation, cyclic_voltammogram, plot_cyclic_voltammogram

# Generating Dataset (activation)

In [2]:
# Identifiers of the drugs that are combined pairwise in the simulation.
drugs = ["CP", "IFOS", "ETOP", "FLUR", "NAP", "DX"]

# Concentration grid for each drug of a pair: 20, 50, ..., 710 (24 values).
concentration_drug1 = list(range(20, 711, 30))
concentration_drug2 = list(concentration_drug1)

# Sweep settings forwarded to cyclic_voltammogram().
scan_rate = [0.0001, 0.001, 0.01]
capacitance = [4]
num_cycles = 3


In [3]:
# Function to save the data in chunks
def save_data_chunk(data_chunk, file_name="dataset.h5"):
    """Append a batch of simulated rows to the ``data`` table of an HDF5 file.

    Parameters
    ----------
    data_chunk : list
        Rows of the form
        ``[Drug1, Drug2, Conc1, Conc2, ScanRate, Capacitance, Signal]``.
    file_name : str
        HDF5 file to append to (created if it does not exist).

    Notes
    -----
    The string width of a table column is fixed when the table is created.
    Widths are therefore reserved explicitly for every string column, so
    that appending a later chunk never fails because its drug names are
    longer than the ones that happened to be in the first chunk.
    """
    column_names = ["Drug1", "Drug2", "Conc1", "Conc2", "ScanRate", "Capacitance", "Signal"]
    df_chunk = pd.DataFrame(data_chunk, columns=column_names)
    df_chunk.to_hdf(
        file_name,
        key="data",
        mode="a",
        append=True,
        index=False,
        complevel=9,
        complib="blosc",
        # Same reserved widths as the tables written later in this notebook.
        min_itemsize={"Signal": 140000, "Drug1": 20, "Drug2": 20},
    )

def save_potential_chunk(data_chunk, file_name="potential.h5"):
    """Append potential strings to the ``potential`` table of an HDF5 file.

    ``data_chunk`` is turned into a one-column frame (column ``Potential``)
    and appended to ``file_name`` with blosc compression at level 9.
    """
    potential_frame = pd.DataFrame(data_chunk, columns=["Potential"])
    hdf_options = dict(
        key="potential",
        mode="a",
        append=True,
        index=False,
        complevel=9,
        complib="blosc",
    )
    potential_frame.to_hdf(file_name, **hdf_options)

# Rows accumulated since the last flush to disk
data = []

# Total number of simulated voltammograms
counter = 0

# Number of simulations after which the buffered rows are appended to the HDF5 file
save_interval = 1000

# save_data_chunk() / save_potential_chunk() append to their target files, so
# remove the output of any previous run first; otherwise re-running this cell
# would silently duplicate every row of the dataset.
for stale_file in ("dataset.h5", "potential.h5"):
    if os.path.exists(stale_file):
        os.remove(stale_file)

for drug1 in tqdm(drugs, desc="Drugs", position=0, leave=False):
    for drug2 in drugs:
        # Only pairs of two different drugs are simulated.
        if drug1 == drug2:
            continue

        for conc1 in concentration_drug1:
            for conc2 in concentration_drug2:

                i_drug1_drug2_activ, potential = activation(
                    drug1,
                    drug2,
                    Cs=[conc1, conc2],
                    peak_amplitude=[1, 1],
                )

                # The faradaic current does not depend on the scan rate or
                # the capacitance, so it is set once per concentration pair.
                faradaic_current = i_drug1_drug2_activ

                for scan in scan_rate:
                    for cap in capacitance:
                        cyclic_potential_data, cyclic_current_data = cyclic_voltammogram(
                            potential,
                            faradaic_current,
                            scan_rate=scan,
                            capacitance=cap,
                            num_cycles=num_cycles,
                        )

                        # The whole current trace is stored as one space-separated string.
                        signal_str = " ".join(map(str, cyclic_current_data))
                        data.append([drug1, drug2, conc1, conc2, scan, cap, signal_str])

                        counter += 1

                        if counter % save_interval == 0:
                            save_data_chunk(data)
                            data.clear()

# Save the remaining data
if data:
    save_data_chunk(data)

# Save the potential data to use for plotting later.
# NOTE(review): only the potential axis of the last simulation is stored;
# presumably it is the same for every simulation - confirm.
potential_str = " ".join(map(str, cyclic_potential_data))
save_potential_chunk([potential_str])

print("Data length: ", counter)

                                                     

Data length:  51840


# Normalize data

In [13]:
# Files and table key used by the feature-augmentation stage.
input_file = "dataset.h5"
output_file = "processed_data.h5"
key = "data"

# Number of rows handled per read/write round trip; adjust as needed.
chunksize = 1000

# Read the row count of the input table from its storer.
with pd.HDFStore(input_file, mode="r") as hdf:
    storer = hdf.get_storer(key)
    num_rows = storer.nrows

print(f"Length of the dataset in {input_file}: {num_rows}")

Length of the dataset in dataset.h5: 51840


In [14]:
def add_features(chunk, drug_dict):
    """Attach the per-drug parameters of both drugs to every row of ``chunk``.

    For each property ``P`` in Sensitivity, Peak_pos, Peak_width, k_m and
    v_max, two float64 columns ``P1`` and ``P2`` are added. Their values are
    ``drug_dict[name][P]`` for the names found in the ``Drug1`` and ``Drug2``
    columns respectively.

    ``chunk`` is modified in place and also returned.
    """
    for prop in ("Sensitivity", "Peak_pos", "Peak_width", "k_m", "v_max"):
        for suffix in ("1", "2"):
            drug_names = chunk["Drug" + suffix]
            # Bind `prop` as a default so the lambda uses this iteration's value.
            looked_up = drug_names.apply(lambda name, prop=prop: drug_dict[name][prop])
            chunk[prop + suffix] = looked_up.astype("float64")
    return chunk

def normalize_chunk(chunk, feature, max_feature):
    """Scale one column of ``chunk`` by a precomputed maximum.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame holding the column to scale; it is modified in place.
    feature : str
        Name of the column to normalize.
    max_feature : number
        Value every entry of the column is divided by.

    Returns
    -------
    pandas.DataFrame
        The same ``chunk`` object, with ``chunk[feature]`` divided by
        ``max_feature``.
    """
    # A zero maximum would make the division raise or yield NaN/inf, so
    # fall back to a unit scale and leave the values as they are.
    scale = max_feature if max_feature != 0 else 1
    # Vectorized division instead of a per-element Python callback.
    chunk[feature] = chunk[feature] / scale
    return chunk

In [15]:
# Columns to be normalized: the simulation settings first, then every
# per-drug property for drug 1 and drug 2.
feature_names = ["Conc1", "Conc2", "ScanRate", "Capacitance"] + [
    f"{prop}{position}"
    for prop in ("Sensitivity", "Peak_pos", "Peak_width", "k_m", "v_max")
    for position in (1, 2)
]

### Add features to the dataset

In [16]:
# Chunk boundaries: every multiple of `chunksize` below `num_rows`, closed by
# `num_rows` itself. Consecutive entries are used as (start, stop) pairs when
# the table is read chunk by chunk.
chunk_start = list(range(0, num_rows, chunksize)) + [num_rows]

In [17]:
########### ADD FEATURE ###########

for i in tqdm(range(len(chunk_start) - 1), desc="Processing chunks"):
    first_chunk = i == 0

    # Read one slice of the raw dataset and attach the drug parameters.
    chunk_df = pd.read_hdf(
        "dataset.h5", key="data", start=chunk_start[i], stop=chunk_start[i + 1]
    )
    chunk_with_features = add_features(chunk_df, drug)

    # The first chunk creates (overwrites) the output file; every later
    # chunk is appended to the table it started.
    chunk_with_features.to_hdf(
        output_file,
        key="data",
        mode="w" if first_chunk else "a",
        append=not first_chunk,
        index=False,
        complevel=9,
        complib="blosc",
        format="table",
        min_itemsize={"Signal": 140000, "Drug1": 20, "Drug2": 20},
    )


Processing chunks: 100%|██████████| 52/52 [01:16<00:00,  1.48s/it]


### Search for the per-feature maxima and normalize chunk by chunk

In [18]:
# Largest absolute value seen so far for each feature, over all chunks
max_dict = {feature: -float("inf") for feature in feature_names}


for i in tqdm(range(len(chunk_start) - 1), desc="Searching max in chunks"):
    # Read the chunk from the file written by the feature-augmentation step
    chunk_df = pd.read_hdf(
        output_file, key="data", start=chunk_start[i], stop=chunk_start[i + 1]
    )

    # Update the global maximum values for each feature
    for feature in max_dict:
        if feature in chunk_df.columns:
            # Vectorized absolute value instead of a per-element Python call
            max_value = chunk_df[feature].abs().max()
            if max_value > max_dict[feature]:
                max_dict[feature] = max_value
        else:
            print(f"Column '{feature}' not found in the DataFrame.")

Searching max in chunks: 100%|██████████| 52/52 [00:19<00:00,  2.64it/s]


In [19]:
# APPLY NORMALIZATION TO ALL CHUNKS
####################################
normalized = "data_normalized.h5"

# Integer code stored in place of each drug name
drug_codes = {"CP": 0, "IFOS": 1, "ETOP": 2, "FLUR": 3, "NAP": 4, "DX": 5}

# Process chunks and write to a new output file
for i in tqdm(range(len(chunk_start) - 1), desc="Processing and normalizing chunks"):
    # Read the chunk from the file written by the feature-augmentation step
    chunk_df = pd.read_hdf(
        output_file, key="data", start=chunk_start[i], stop=chunk_start[i + 1]
    )

    # Normalize every feature by its global maximum absolute value
    for feature, max_value in max_dict.items():
        if feature in chunk_df.columns:
            chunk_df = normalize_chunk(chunk_df, feature, max_value)
        else:
            print(f"Column '{feature}' not found in the DataFrame.")

    # Encode the drug names as integers. The explicit cast gives a
    # well-defined int64 column and fails loudly (instead of leaving a
    # string behind) if a drug name has no code.
    for drug_column in ("Drug1", "Drug2"):
        chunk_df[drug_column] = chunk_df[drug_column].map(drug_codes).astype("int64")

    # Always write the chunk that was just read, even if none of the
    # features were present in it.
    chunk_normalized = chunk_df

    # The first chunk creates (overwrites) the output file; every later
    # chunk is appended to the table it started.
    chunk_normalized.to_hdf(
        normalized,
        key="data",
        mode="w" if i == 0 else "a",
        append=i > 0,
        index=False,
        complevel=9,
        complib="blosc",
        format="table",
        min_itemsize={"Signal": 140000, "Drug1": 20, "Drug2": 20},
    )



Processing and normalizing chunks: 100%|██████████| 52/52 [01:19<00:00,  1.53s/it]


# Load the full dataset and convert it to a .feather file to speed up loading

In [20]:
# Clear the long signal/potential strings left over from dataset generation.
# They only exist if the generation cell ran in this kernel session, so drop
# them without failing when they are absent.
for leftover in ("signal_str", "potential_str"):
    globals().pop(leftover, None)

# Delete the intermediate processed data file (skip if it is already gone)
if os.path.exists("processed_data.h5"):
    os.remove("processed_data.h5")

In [3]:
# Load the normalized data and renumber the rows with a fresh 0..n-1 index
df = pd.read_hdf("data_normalized.h5", key="data").reset_index(drop=True)

# Preview the first rows; as the last expression of the cell it is rendered
df.head()

In [4]:
# Write the normalized frame to a Feather file
df.to_feather(path="normalized_data.feather")

In [5]:
# Read the normalized data back from the Feather file and preview it
df = pd.read_feather(path="normalized_data.feather")
df.head()

Unnamed: 0,Drug1,Drug2,Conc1,Conc2,ScanRate,Capacitance,Signal,Sensitivity1,Sensitivity2,Peak_pos1,Peak_pos2,Peak_width1,Peak_width2,k_m1,k_m2,v_max1,v_max2
0,0,1,0.028169,0.028169,0.01,1.0,0.006850648514866037 33.39623902311649 66.7856...,0.069231,0.043956,-0.657778,-1.0,1.0,0.375,1.0,0.001157,0.006706,0.003499
1,0,1,0.028169,0.028169,0.1,1.0,0.006850648514866037 3.3461764536005996 6.6855...,0.069231,0.043956,-0.657778,-1.0,1.0,0.375,1.0,0.001157,0.006706,0.003499
2,0,1,0.028169,0.028169,1.0,1.0,0.006850648514866037 0.34117019664901005 0.675...,0.069231,0.043956,-0.657778,-1.0,1.0,0.375,1.0,0.001157,0.006706,0.003499
3,0,1,0.028169,0.070423,0.01,1.0,0.006858229126972536 33.396248401363984 66.785...,0.069231,0.043956,-0.657778,-1.0,1.0,0.375,1.0,0.001157,0.006706,0.003499
4,0,1,0.028169,0.070423,0.1,1.0,0.006858229126972536 3.3461858318480897 6.6855...,0.069231,0.043956,-0.657778,-1.0,1.0,0.375,1.0,0.001157,0.006706,0.003499
