In [2]:
# notebooks/eda.ipynb

# 1. Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# 2. Set paths
DATA_PATH = Path("../data/antimalarial_activities_selected.csv")

# 3. Load dataset
df = pd.read_csv(DATA_PATH)

# 4. Quick look at data
print("Dataset shape:", df.shape)
df.head()

# 5. Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# 6. Check basic info
print("\nData types and non-null counts:")
print(df.info())

# 7. Class distribution
plt.figure(figsize=(6,4))
sns.countplot(x='Activity', data=df, palette='viridis')
plt.title('Distribution of Active vs Inactive Compounds')
plt.xlabel('Activity (0 = Inactive, 1 = Active)')
plt.ylabel('Count')
plt.show()

# 8. Preview some SMILES
print("\nSample SMILES strings:")
print(df['SMILES'].sample(5))

# 9. Basic statistics on IC50 values
plt.figure(figsize=(8,5))
sns.histplot(df['IC50'], bins=50, kde=True, color='purple')
plt.title('Distribution of IC50 Values')
plt.xlabel('IC50 (nM)')
plt.ylabel('Frequency')
plt.xlim(0, 10000)  # limit x-axis for better visualization
plt.show()

# 10. Save basic summary
summary_stats = df.describe()
summary_stats.to_csv("../data/raw/summary_statistics.csv")

print("\nSummary statistics saved to data/raw/summary_statistics.csv")


Matplotlib is building the font cache; this may take a moment.


Dataset shape: (63406, 5)

Missing values per column:
molecule_chembl_id        0
standard_value         2164
canonical_smiles        106
ligand_efficiency     61224
standard_units         2032
dtype: int64

Data types and non-null counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63406 entries, 0 to 63405
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   molecule_chembl_id  63406 non-null  object 
 1   standard_value      61242 non-null  float64
 2   canonical_smiles    63300 non-null  object 
 3   ligand_efficiency   2182 non-null   object 
 4   standard_units      61374 non-null  object 
dtypes: float64(1), object(4)
memory usage: 2.4+ MB
None


ValueError: Could not interpret value `Activity` for `x`. An entry with this name does not appear in `data`.

<Figure size 600x400 with 0 Axes>