# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Loading dataset

In [None]:
from pathlib import Path
import sys

# Locate the repository root: either the current working directory itself or
# a "predicting-cyclodextrin-bioavailability" folder directly inside it.
project_root = Path.cwd()
if (project_root / "predicting-cyclodextrin-bioavailability").is_dir():
    project_root = project_root / "predicting-cyclodextrin-bioavailability"

# Make the project-local `scripts` package importable. Guarded so that
# re-running this cell does not keep appending the same entry to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from scripts.utils import env_variables as env

# Location of the dataset CSV, taken from the project's env_variables module.
dfFileName = env.CD_ENRICHED_DATA
# The first CSV column is used as the row index.
df = pd.read_csv(dfFileName, index_col=0)

# Preview up to 10 random rows; capping at len(df) avoids the ValueError that
# DataFrame.sample raises when asked for more rows than exist.
df.sample(min(10, len(df)))


Subset: Host == 'beta-cyclodextrin' and pH == 7.0 and T == 298.0

In [None]:
# Keep only the rows measured with beta-cyclodextrin at pH 7.0 and T 298.0.
# NOTE(review): pH and T are matched by exact float equality, so a value such
# as 298.15 would be excluded — confirm this is intended.
df = df[
    (df["Host"] == "beta-cyclodextrin") &
    (df["pH"]   == 7.0) &
    (df["T"]    == 298.0)
]

# Preview up to 10 random rows. The filtered subset may be small, and
# DataFrame.sample raises ValueError when n exceeds the number of rows.
df.sample(min(10, len(df)))

df_with_vecs = DeltaG + molecular descriptor columns + iso2vec-0 … iso2vec-9 columns

In [None]:
# Target (DeltaG) followed by the molecular descriptor columns.
keep_cols_without_vec = [
    "DeltaG",
    "TPSA",
    "MolecularWeight",
    "Complexity",
    "Charge",
    "HBondDonorCount",
    "HBondAcceptorCount",
    "HeavyAtomCount",
    "MolLogP",
]

# Same columns plus the ten iso2vec columns "iso2vec-0" ... "iso2vec-9".
keep_cols_with_vec = keep_cols_without_vec + [f"iso2vec-{i}" for i in range(10)]

# Independent copy so later in-place edits do not touch `df`.
df_with_vecs = df.loc[:, keep_cols_with_vec].copy()

print(df_with_vecs.shape)
df_with_vecs.head()



# Data analysis

In [None]:
# Print a per-column summary of the frame; memory_usage="deep" requests an
# exact memory figure instead of an estimate.
df_with_vecs.info(memory_usage="deep")

## Descriptive statistics

In [None]:
# Summary statistics for all columns, including the 5th/25th/50th/75th/95th
# percentiles; transposed so that each original column becomes one row.
df_with_vecs.describe(include="all", percentiles=[.05, .25, .5, .75, .95]).T


In [None]:
# Per-column overview: dtype, non-null count, null count, null percentage and
# number of distinct values, ordered from most to least missing.
null_mask = df_with_vecs.isna()

overview = pd.DataFrame(
    {
        "dtype": df_with_vecs.dtypes,
        "non_null": df_with_vecs.count(),
        "nulls": null_mask.sum(),
        "null_%": null_mask.mean() * 100,
        "unique_cnt": df_with_vecs.nunique(),
    }
)
overview = overview.sort_values("null_%", ascending=False)

display(overview)


## Duplicates

In [None]:
# Flag every row that repeats an earlier row across all columns, then count
# the flagged rows.
duplicate_mask = df_with_vecs.duplicated()
dup = duplicate_mask.sum()
print(f"Duplicated rows: {dup}")

## Outliers

In [None]:
def outliers(name: str, data: "pd.DataFrame | None" = None) -> None:
    """Print the value range and the IQR-rule outlier count for one column.

    Two lines are printed: the minimum and maximum of the non-null values,
    then ``Outliers: <count>``.

    Args:
        name: Column to inspect.
        data: Frame holding the column. When omitted, the notebook-level
            ``df_with_vecs`` frame is used, as before.

    Raises:
        KeyError: If ``name`` is not a column of the frame.
    """
    frame = df_with_vecs if data is None else data
    values = frame[name].dropna()
    print(values.min(), values.max())

    # IQR rule: a value is an outlier when it lies more than 1.5 * IQR below
    # the first quartile or above the third quartile.
    q1, q3 = values.quantile([.25, .75])
    iqr = q3 - q1
    is_outlier = (values < q1 - 1.5 * iqr) | (values > q3 + 1.5 * iqr)
    print("Outliers:", int(is_outlier.sum()))

### DeltaG

In [None]:
# outliers('DeltaG')


### TPSA

In [None]:
# Print the min/max and the IQR-rule outlier count for the TPSA column.
outliers('TPSA')


### MolecularWeight

In [None]:
# Print the min/max and the IQR-rule outlier count for the MolecularWeight column.
outliers('MolecularWeight')

### Complexity

In [None]:
# Print the min/max and the IQR-rule outlier count for the Complexity column.
outliers('Complexity')


### HeavyAtomCount

In [None]:
# Print the min/max and the IQR-rule outlier count for the HeavyAtomCount column.
outliers('HeavyAtomCount')


### MolLogP

In [None]:
# Print the min/max and the IQR-rule outlier count for the MolLogP column.
outliers('MolLogP')


### iso2vec-0

In [None]:
# Print the min/max and the IQR-rule outlier count for the iso2vec-0 column.
outliers('iso2vec-0')

### iso2vec-1

In [None]:
# Print the min/max and the IQR-rule outlier count for the iso2vec-1 column.
outliers('iso2vec-1')

### iso2vec-2

In [None]:
# Print the min/max and the IQR-rule outlier count for the iso2vec-2 column.
outliers('iso2vec-2')

### iso2vec-3

In [None]:
# Print the min/max and the IQR-rule outlier count for the iso2vec-3 column.
outliers('iso2vec-3')

### iso2vec-4

In [None]:
# Print the min/max and the IQR-rule outlier count for the iso2vec-4 column.
outliers('iso2vec-4')

### iso2vec-5

In [None]:
# Print the min/max and the IQR-rule outlier count for the iso2vec-5 column.
outliers('iso2vec-5')

### iso2vec-6

In [None]:
# Print the min/max and the IQR-rule outlier count for the iso2vec-6 column.
outliers('iso2vec-6')

### iso2vec-7

In [None]:
# Print the min/max and the IQR-rule outlier count for the iso2vec-7 column.
outliers('iso2vec-7')

### iso2vec-8

In [None]:
# Print the min/max and the IQR-rule outlier count for the iso2vec-8 column.
outliers('iso2vec-8')

### iso2vec-9

In [None]:
# Print the min/max and the IQR-rule outlier count for the iso2vec-9 column.
outliers('iso2vec-9')

## Visualizations

In [None]:
# Names of all numeric columns; only these enter the correlation matrix.
num_cols = df_with_vecs.select_dtypes(include="number").columns

# Pairwise Pearson correlation between the numeric columns.
corr = df_with_vecs.loc[:, num_cols].corr(method="pearson")

# Matplotlib colormap name used by the heatmap cell.
visualisation_color = "inferno"

In [None]:

# Draw the correlation matrix as a heatmap with one tick per numeric column.
tick_positions = range(len(num_cols))

plt.figure(figsize=(10, 8))
plt.imshow(corr, cmap=visualisation_color, interpolation="nearest")
plt.colorbar()

# Column names on both axes; x labels are rotated so they do not overlap.
plt.xticks(tick_positions, num_cols, rotation=90)
plt.yticks(tick_positions, num_cols)

plt.title("Correlation matrix (Pearson)")
plt.tight_layout()
plt.show()

# Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Descriptor-only copy (no iso2vec columns), taken before scaling so that it
# is standardized by its own scaler below.
df_without_vecs = df_with_vecs[keep_cols_without_vec].copy()

scaler_with_vecs    = StandardScaler()
scaler_without_vecs = StandardScaler()

# Feature matrices: every column except the DeltaG target, which stays unscaled.
X_with_vecs    = df_with_vecs.drop(columns="DeltaG")
X_without_vecs = df_without_vecs.drop(columns="DeltaG")

# Replace the feature columns wholesale (df[cols] = ...) instead of writing
# through .loc[:, cols]. The .loc form sets values in place and, for any
# integer-typed column (presumably the count-like descriptors — TODO confirm),
# assigning float results is deprecated in recent pandas and will raise in
# future versions. Whole-column assignment lets each column become float.
df_with_vecs[list(X_with_vecs.columns)] = scaler_with_vecs.fit_transform(X_with_vecs)
df_without_vecs[list(X_without_vecs.columns)] = scaler_without_vecs.fit_transform(X_without_vecs)

In [None]:
# Preview up to 10 random rows of the standardized frame; capping at the row
# count avoids the ValueError DataFrame.sample raises when n exceeds it.
df_with_vecs.sample(min(10, len(df_with_vecs)))

# Saving the Dataset

In [None]:
import pathlib
import sys
from pathlib import Path

# Locate the repository root: either the current working directory itself or
# a "predicting-cyclodextrin-bioavailability" folder directly inside it.
project_root = Path.cwd()
if (project_root / "predicting-cyclodextrin-bioavailability").is_dir():
    project_root = project_root / "predicting-cyclodextrin-bioavailability"

# Only extend sys.path when the root is not already listed, so running this
# cell (or re-running it) does not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from scripts.utils import env_variables as env

# Output directory comes from the project's env_variables module; create it
# (and any missing parents) if it does not exist yet.
out_dir = pathlib.Path(env.CLEAN_DATA_DIR)
out_dir.mkdir(parents=True, exist_ok=True)

# Parquet (row index is not written)
df_with_vecs.to_parquet(out_dir / "dataset_with_vec.parquet",    index=False)
df_without_vecs.to_parquet(out_dir / "dataset_without_vec.parquet", index=False)

# CSV (row index is not written)
df_with_vecs.to_csv(out_dir / "dataset_with_vec.csv",    index=False)
df_without_vecs.to_csv(out_dir / "dataset_without_vec.csv", index=False)
