# SDR data processing

## Library import

In [19]:
import pandas
import numpy
from pathlib import Path
from collections import defaultdict

## Data import

We first import the data. Put in the dataset path into `dataset_path` variable.

In [2]:
dataset_path = Path(r"C:\Users\GOIYF\OneDrive - Bayer\Personal Data\RiskHunt3R\SDR_HuSa_dashboard - hypertrophy.csv")

In [5]:
str(dataset_path)

'C:\\Users\\GOIYF\\OneDrive - Bayer\\Personal Data\\RiskHunt3R\\SDR_HuSa_dashboard - hypertrophy.csv'

In [7]:
data = pandas.read_csv(dataset_path, header=None)

## Extraction of observation metadata

This assumes the observation metadata is collected in the first three rows. It starts by taking the third row, in which the dose and compound data is stored. THen it adds the first row to it, as it assumes that contains the sex of the animal.

**Note:** we take here and below in the data section the columns 8::2 (so every second column from the 8th one on); this means we focus on only the percentage of the affected animals, and not the absolute numbers. If you wish to extract different values, change this value here and below.

In [8]:
metadata = pandas.DataFrame([[x[0], x[1], " ".join(x[2:])] for x in data.loc[2, 8::2].str.split(" ")], columns=("Dose", "Unit", "Compound"))

In a perfect world all the entries would have a general structure `dose unit compound`. This is not the case.

In the cases where we have weird formatting we follow up with the following assumptions:
- if we split by `;` the last entry will be the compound name
- the unit is included in the string and is either `mg/kg`, `ug/kg`, `PPM`, or `ppm`
- where the strings contain `;` this is also a delimiter for the dose, and is thus contained also in that column

In [9]:
indices = numpy.where(~metadata.loc[:,"Unit"].isin(("mg/kg", "PPM", "ppm")))[0]
for i in indices:
    cpd_txt = metadata.loc[i, "Compound"]
    cpd = cpd_txt.split(";")[-1]
    
    if "mg/kg" in cpd_txt:
        unit = 'mg/kg'
    elif "ug/kg" in cpd_txt:
        unit = "ug/kg"
    elif "PPM" in cpd_txt or "ppm" in cpd_txt:
        unit = "PPM"
    else:
        unit = ""
    
    metadata.at[i, "Compound"] = cpd
    if unit:
        metadata.at[i, "Unit"] = unit
    
    if metadata.loc[i, "Dose"]:
        metadata.at[i, "Dose"] = metadata.loc[i, "Dose"].split(";")[0]


As stated above, in the last step we add the information on the sex. Here the observed values are `M`, `F`, `Male` and `Female`. We convert the last two into their single letter counterpart.

In [10]:
metadata["Sex"] = data.loc[0, 8::2].tolist()
metadata["Sex"] = metadata["Sex"].str[0]

## Extraction of study metadata

Additional metadata on the study performed is collected in the first 5 columns of the data matrix. We take these into a `study_metadata` object.

In [11]:
study_metadata = data.loc[4:, :4]
study_metadata.columns = data.loc[3, :4].tolist()

## Extraction of data matrix

As above, we take only every second column from the 8th on. Next, we make sure that column names are the same as the indices in the `metadata` so we can have an easy referencing.

In [12]:
subdata = data.loc[4:, 8::2]
subdata.columns = metadata.index

## Processing the data
In this next cell we go through every row in the study metadata. In the data matrix we check then every column that has a value (as there can be multiple values available per study). We aggregate the values from all three DataFrames into a single list of lists, which is then transformed into a DataFrame.

In [13]:
final_data = [("Compound", "Study ID", "Phase Name", "Result Raw", "Dose", "Dose-unit", "Sex", "Severity", "Affected")]
for i in study_metadata.index:
    for j in subdata.loc[i, ~subdata.loc[i].isna()].index:
        final_data.append([
            metadata.loc[j, "Compound"],
            study_metadata.loc[i, "study_id"],
            study_metadata.loc[i, "phase_name"],
            study_metadata.loc[i, "result_raw"],
            metadata.loc[j, "Dose"],
            metadata.loc[j, "Unit"],
            metadata.loc[j, "Sex"],
            study_metadata.loc[i, "severity"],
            subdata.loc[i, j]
        ])

In [14]:
processed_data = pandas.DataFrame(final_data[1:], columns=final_data[0])

## Data cleanup

We strip the trailing whitespaces, regularize the compound and study names (by substituting spaces for `-` and using all capital letters).

In [70]:
processed_data["Compound"] = processed_data["Compound"].str.upper().str.replace(" ", "").str.replace(r'([A-Z]+)', r'\1-', regex=True)
processed_data["Study ID"] = processed_data["Study ID"].str.upper().str.replace(" ", "").str.replace(r'([A-Z]+)', r'\1-', regex=True)

In [88]:
processed_data["Phase Name"] = processed_data["Phase Name"].str.strip()
processed_data["Result Raw"] = processed_data["Result Raw"].str.strip()
processed_data["Dose"] = processed_data["Dose"].str.strip()

In [89]:
subset = processed_data.loc[(processed_data["Compound"] == "AE-0014793")&(processed_data["Study ID"] == "SA-00047")&(processed_data["Result Raw"] == "Hepatocellular hypertrophy, panlobular, diffuse")&(processed_data["Dose"] == "5000")]

In [90]:
subset

Unnamed: 0,Compound,Study ID,Phase Name,Result Raw,Dose,Dose-unit,Sex,Severity,Affected
0,AE-0014793,SA-00047,Dosing phase,"Hepatocellular hypertrophy, panlobular, diffuse",5000,PPM,F,Slight,30%


In [44]:
processed_data["Compound"].unique()

array([' AE 0014793', ' AE0014793', 'AE 0013604', 'RPA 420678',
       'AE C654097', 'AE 0327873', ' AE C638206', 'PHENMEDIPHAM',
       ' ACLONIFEN', ' AE 0317309', 'AE 0317670', 'AE 0000623',
       ' AE 0173473', 'AE 0172747', ' AE 0541177', 'AE F051327',
       ' OXADIAZON', ' ethiprole', ' AE0173473', 'AE C632847',
       ' Oxadiargyl', 'Control', 'BCS-AM71463', 'BCS-AI41561',
       'AE R808135', 'AE 1371906', 'BCS-AR83685', ' AE C656948',
       ' BYF 00587', 'BYF 14182', 'AE 1417268', 'AMSI 0334', 'BYI 09110',
       ' AE 1380970', ' OXADIARGYL', 'AE 0553834', 'AE 1379584',
       'BYI 02290', 'NNI-0001 RFA', 'AE 0852999', 'BYI 02960',
       'AE 1801454', 'BYI 04680', 'BCS-AA10091', ' AE1380970',
       'AE 1980675', 'BCS-AA10043', 'DICLOFOP', 'CLOFIBRIC',
       'BCS-AA10579', 'BCS-AA10233', ' AEC 656948', ' AE 1887196',
       'BCS-AA10476', 'BCS-AA10751', ' AE1887196', 'BCS AA10312',
       'BCS-BR22345', 'BCS-CN18250', 'BCS-CN45153', 'AE 1887196',
       ' FLUOPYRAM', 'met

In [41]:
subset

Unnamed: 0,Compound,Study ID,Phase Name,Result Raw,Dose,Dose-unit,Sex,Severity,Affected


In [21]:
processed_data_02 = defaultdict(list)
for compound in processed_data["Compound"].unique():
    subset = processed_data.loc[processed_data["Compound"] == compound]
    for study in processed_data.loc[processed_data["Compound"] == compound]["Study ID"].unique():
        results = subset["Result Raw"].unique()
        for result in processed_data.loc[(processed_data["Compound"] == compound)&(processed_data["Study ID"] == study)]["Result Raw"].unique():
            for dose in processed_data.loc[(processed_data["Compound"] == compound)&(processed_data["Study ID"] == study)&(processed_data["Result Raw"] == result)]["Dose"].unique():
                subset = processed_data.loc[(processed_data["Compound"] == compound)&(processed_data["Study ID"] == study)&(processed_data["Result Raw"] == result)&(processed_data["Dose"] == dose)]
                processed_data_02["Compound"].append(compound)
                processed_data_02["Study ID"].append(study)
                processed_data_02["Result Raw"].append(result)
                processed_data_02["Dose"].append(dose)
                processed_data_02["Dose-unit"].append()

NameError: name 'processe' is not defined

## Saving the output

In [11]:
processed_data.to_csv(dataset_path[:-4] + "_processed.csv", index=False)