## Data analytics framework

In [None]:
import os

# Print the working directory so it can be checked by eye; nothing is
# validated programmatically here.
print("Current working directory:", os.getcwd())
# Absolute path of `path.py` located in the parent of the working directory.
path = os.path.abspath(os.path.join(os.getcwd(), '..', 'path.py'))
# IPython magic (not plain Python): run the script whose path is held in `path`.
# NOTE(review): presumably path.py makes the project modules importable — confirm.
%run $path

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Class balance of the leish10 data: pool the train/val/test partitions and
# draw one bar per class.
train = pd.read_csv('../data/leish10/train.csv')
val = pd.read_csv('../data/leish10/val.csv')
test = pd.read_csv('../data/leish10/test.csv')
df_all = pd.concat([train, val, test], ignore_index=True)

# Tally rows per value of the 'Class' column, ordered by class value, and
# turn the result into a two-column frame.
class_tally = df_all['Class'].value_counts().sort_index()
counts = class_tally.reset_index()
counts.columns = ['Class', 'Count']

# Flip the row order so the highest class value comes first, then attach the
# display name of each class.
counts = counts.iloc[::-1].reset_index(drop=True)
label_names = {1.0: 'Actives', 0.0: 'Inactives'}
counts['Label'] = counts['Class'].map(label_names)

# One fixed colour per display name, resolved in row order for the bars.
colors = {'Actives': '#2AAD0F', 'Inactives': '#A1C3CE'}
bar_colors = [colors[name] for name in counts['Label']]

fig, ax = plt.subplots(figsize=(2.5, 3))
width = 0.8
x = np.arange(counts.shape[0])
ax.bar(x, counts['Count'], width=width, color=bar_colors)

# Label each bar position with its class name and title both axes.
ax.set_xticks(x)
ax.set_xticklabels(counts['Label'])
ax.set_xlabel('Class', fontsize=16)
ax.set_ylabel('Count', fontsize=16)

# Keep the full box around the plot and use outward-pointing ticks on both axes.
for spine in ('top', 'right', 'bottom', 'left'):
    ax.spines[spine].set_visible(True)
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, direction='out', length=4)

plt.tight_layout()
plt.show()



##### Configure notebook

In [3]:
# Strategy name handed to the dataset splitter further down the notebook.
split = "random"

# Input and output locations: the CSV that is loaded and the directory passed
# to the splitter as its output path.
# NOTE(review): `input` shadows the Python builtin of the same name; it is kept
# because a later cell reads it.
input, output = "../data/dataset.csv", "../data/"

##### Load data

In [None]:
import pandas as pd

# Read the CSV file configured in `input` into a DataFrame.
dataset = pd.read_csv(input)
# Preview the first ten rows as a markdown table, leaving out the index column.
print(dataset.iloc[:10].to_markdown(index=False))

##### Early chemical curation

In [None]:
from sanitize import trustBTverify

# Pass the loaded dataset through the project's `trustBTverify` routine.
# NOTE(review): presumably this standardizes/filters the chemical structures
# and returns a DataFrame — confirm in sanitize.py.
sanitized = trustBTverify(dataset)
# Preview the first 10 rows of the result as a markdown table (no index column).
print(sanitized.head(10).to_markdown(index=False))

##### Data splitting using multiple techniques

In [None]:
from split import DataSpliter

# Run the project's splitter on the curated data with the configured strategy.
# NOTE(review): `out_path` presumably receives the generated split files —
# confirm in split.py.
DataSpliter(method=split, df=sanitized, out_path=output)

##### Chemical diversity analysis of full dataset

In [None]:
from space import SimilarityAnalysis

# Build the project's similarity analysis on the curated dataset and run it.
# NOTE(review): what `run()` computes or displays is defined in space.py and
# not visible here — confirm there.
similarity_analysis = SimilarityAnalysis(sanitized)
similarity_analysis.run()

##### Physicochemical diversity analysis of full dataset

In [None]:
from space import ScatterPlotMatrix

# Build the project's scatter-plot-matrix helper on the curated dataset.
scatter_plot_matrix = ScatterPlotMatrix(sanitized)
# NOTE(review): presumably computes the physicochemical properties that get
# plotted — confirm in space.py.
scatter_plot_matrix.add_properties()
# Draw the matrix; called after `add_properties()`, so keep this order.
scatter_plot_matrix.plot()