# Comparison of the datasets constructed using LLM and calculation methods

In [None]:
# Datasets are compared by feature-extraction method using Euclidean distance, cosine similarity, mean difference and Pearson correlation. The code compares both per molecule and per feature. Features are scaled with Z-score normalization before comparison (the DS5 cell is the exception: it compares the unscaled values).

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean, cosine
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler

# === File paths ===
# Inputs are read from base_dir; everything this cell produces goes to output_dir.
base_dir = r"C:\PhD\Prompt_Engineering\Outputs2"
output_dir = os.path.join(base_dir, "DS3_Comparison_Results")
os.makedirs(output_dir, exist_ok=True)

calc_file = os.path.join(base_dir, "DS3_Calc_Tox_Clean.csv")
nih_file = os.path.join(base_dir, "DS3_NIH_Clean.csv")

# === Load datasets ===
ds3_calc = pd.read_csv(calc_file)
ds3_nih = pd.read_csv(nih_file)

# === Identify SMILES columns (assume first column) ===
smiles_calc = ds3_calc.columns[0]
smiles_nih = ds3_nih.columns[0]

# === Drop toxicity columns (assume last column) ===
# .copy() gives independent frames so the SMILES clean-up below does not
# write into a slice of the frame returned by read_csv.
ds3_calc = ds3_calc.iloc[:, :-1].copy()
ds3_nih = ds3_nih.iloc[:, :-1].copy()

# Strip surrounding whitespace only. SMILES is case-sensitive (lowercase
# letters denote aromatic atoms), so the strings must NOT be upper-cased:
# that would turn e.g. benzene "c1ccccc1" into cyclohexane "C1CCCCC1" and
# could join rows that belong to different molecules.
ds3_calc[smiles_calc] = ds3_calc[smiles_calc].str.strip()
ds3_nih[smiles_nih] = ds3_nih[smiles_nih].str.strip()

# How many SMILES overlap?
common_smiles = set(ds3_calc[smiles_calc]).intersection(ds3_nih[smiles_nih])
print(f"Common SMILES: {len(common_smiles)}")

# === Merge datasets on SMILES ===
# Inner join: only molecules present in both files are kept. Column names
# that occur in both files receive the '_calc' / '_nih' suffixes.
merged = pd.merge(
    ds3_calc,
    ds3_nih,
    left_on=smiles_calc,
    right_on=smiles_nih,
    suffixes=('_calc', '_nih')
)
if merged.empty:
    raise ValueError(
        f"No SMILES are shared between {calc_file} and {nih_file}; nothing to compare."
    )

# === Identify paired feature columns ===
# Both column lists are derived from ONE list of feature names, so position k
# in each list always refers to the same feature even when the two CSV files
# store their columns in a different order. Only the trailing suffix is
# removed, and a '_calc' column without a '_nih' partner is ignored.
calc_suffix, nih_suffix = '_calc', '_nih'
feature_names = [
    c[:-len(calc_suffix)]
    for c in merged.columns
    if c.endswith(calc_suffix) and c[:-len(calc_suffix)] + nih_suffix in merged.columns
]
if not feature_names:
    raise ValueError("The two input files share no feature columns to compare.")

numeric_cols_calc = [name + calc_suffix for name in feature_names]
numeric_cols_nih = [name + nih_suffix for name in feature_names]

# === Z-score normalize the numeric columns separately for calc and NIH ===
# fit_transform refits the scaler on each call, so each side is standardized
# with its own per-column mean and standard deviation.
scaler = StandardScaler()

merged[numeric_cols_calc] = scaler.fit_transform(merged[numeric_cols_calc])
merged[numeric_cols_nih] = scaler.fit_transform(merged[numeric_cols_nih])

print("✅ Z-score normalization complete.")

# === Per-molecule statistics ===
# For every merged row (one molecule) compare its calc feature vector with its
# NIH feature vector: Euclidean distance, cosine similarity and Pearson r.
per_molecule_results = []
for _, molecule in merged.iterrows():
    # Non-finite entries (NaN, +inf, -inf) are replaced by 0.0 before comparing.
    calc_vec = molecule[numeric_cols_calc].values.astype(float)
    calc_vec = np.nan_to_num(calc_vec, nan=0.0, posinf=0.0, neginf=0.0)
    nih_vec = molecule[numeric_cols_nih].values.astype(float)
    nih_vec = np.nan_to_num(nih_vec, nan=0.0, posinf=0.0, neginf=0.0)

    distance = euclidean(calc_vec, nih_vec)

    # Cosine similarity stays NaN when either vector is all zeros.
    if np.any(calc_vec) and np.any(nih_vec):
        similarity = 1 - cosine(calc_vec, nih_vec)
    else:
        similarity = np.nan

    # Pearson r stays NaN when either vector has zero spread.
    if np.std(calc_vec) > 0 and np.std(nih_vec) > 0:
        pearson_r = pearsonr(calc_vec, nih_vec)[0]
    else:
        pearson_r = np.nan

    per_molecule_results.append({
        "SMILES": molecule[smiles_calc],
        "euclidean": distance,
        "cosine_similarity": similarity,
        "correlation": pearson_r,
    })

# One row per molecule, written next to the other outputs of this cell.
df_per_molecule = pd.DataFrame(per_molecule_results)
per_molecule_csv = os.path.join(output_dir, "Per_Molecule_Comparison.csv")
df_per_molecule.to_csv(per_molecule_csv, index=False)
print("✅ Per-molecule comparison saved.")

# === Per-feature statistics ===
# For every feature compare the calc column with the NIH column across all
# merged molecules: mean difference, Euclidean distance and Pearson r.
# NOTE(review): both column sets were z-scored independently earlier in this
# cell, so each column already has mean ≈ 0 and "mean_difference" is ≈ 0 by
# construction — confirm whether the unscaled mean difference was intended.
per_feature_results = []
for fname, f_calc, f_nih in zip(feature_names, numeric_cols_calc, numeric_cols_nih):
    # Non-finite entries (NaN, +inf, -inf) are replaced by 0.0 before comparing.
    calc_column = merged[f_calc].values.astype(float)
    calc_column = np.nan_to_num(calc_column, nan=0.0, posinf=0.0, neginf=0.0)
    nih_column = merged[f_nih].values.astype(float)
    nih_column = np.nan_to_num(nih_column, nan=0.0, posinf=0.0, neginf=0.0)

    # Pearson r stays NaN when either column has zero spread.
    if np.std(calc_column) > 0 and np.std(nih_column) > 0:
        pearson_r = pearsonr(calc_column, nih_column)[0]
    else:
        pearson_r = np.nan

    per_feature_results.append({
        "feature": fname,
        "mean_difference": np.mean(calc_column - nih_column),
        "euclidean": euclidean(calc_column, nih_column),
        "correlation": pearson_r,
    })

# One row per feature, written next to the other outputs of this cell.
df_per_feature = pd.DataFrame(per_feature_results)
per_feature_csv = os.path.join(output_dir, "Per_Feature_Comparison.csv")
df_per_feature.to_csv(per_feature_csv, index=False)
print("✅ Per-feature comparison saved.")

# === Visualizations ===
sns.set(style="whitegrid")

# 1. Per-molecule histograms: one PNG per metric column of df_per_molecule.
histogram_specs = [
    ("euclidean", "Per-Molecule Euclidean Distance Distribution",
     "Per_Molecule_Euclidean_Histogram.png"),
    ("cosine_similarity", "Per-Molecule Cosine Similarity Distribution",
     "Per_Molecule_Cosine_Histogram.png"),
    ("correlation", "Per-Molecule Pearson Correlation Distribution",
     "Per_Molecule_Correlation_Histogram.png"),
]
for metric, title, filename in histogram_specs:
    plt.figure(figsize=(12, 4))
    sns.histplot(df_per_molecule[metric], bins=30, kde=True)
    plt.title(title)
    plt.savefig(os.path.join(output_dir, filename), dpi=300)
    plt.close()

# 2. Per-feature heatmaps: a single-row heatmap with one cell per feature.
heatmap_specs = [
    ("euclidean", "Feature Euclidean Distances", "Per_Feature_Euclidean_Heatmap.png",
     {"cmap": 'viridis', "cbar_kws": {'label': 'Euclidean Distance'}}),
    # The correlation colour scale is pinned to the full [-1, 1] range.
    ("correlation", "Feature Correlation", "Per_Feature_Correlation_Heatmap.png",
     {"cmap": 'coolwarm', "vmin": -1, "vmax": 1,
      "cbar_kws": {'label': 'Pearson Correlation'}}),
]
for metric, title, filename, heatmap_options in heatmap_specs:
    plt.figure(figsize=(12, 6))
    # Index by feature name, then transpose so features run along the x-axis.
    metric_row = df_per_feature[[metric]].set_index(df_per_feature['feature']).T
    sns.heatmap(metric_row, annot=True, **heatmap_options)
    plt.title(title)
    plt.savefig(os.path.join(output_dir, filename), dpi=300)
    plt.close()

print("✅ All visualizations saved.")
print(f"All outputs saved to: {output_dir}")


In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean, cosine
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler

# === File paths ===
# Inputs are read from base_dir; everything this cell produces goes to output_dir.
base_dir = r"C:\PhD\Prompt_Engineering\Outputs2"
output_dir = os.path.join(base_dir, "DS4_Comparison_Results")
os.makedirs(output_dir, exist_ok=True)

calc_file = os.path.join(base_dir, "DS4_Calc_Tox_Clean.csv")
nih_file = os.path.join(base_dir, "DS4_PhoP_Clean.csv")

# === Load datasets ===
# NOTE: the ds3_* / *_nih names are carried over from the DS3 cell; in this
# cell they hold the DS4 calc file and the DS4 PhoP file.
ds3_calc = pd.read_csv(calc_file)
ds3_nih = pd.read_csv(nih_file)

# === Identify SMILES columns (assume first column) ===
smiles_calc = ds3_calc.columns[0]
smiles_nih = ds3_nih.columns[0]

# === Drop toxicity columns (assume last column) ===
# .copy() gives independent frames so the SMILES clean-up below does not
# write into a slice of the frame returned by read_csv.
ds3_calc = ds3_calc.iloc[:, :-1].copy()
ds3_nih = ds3_nih.iloc[:, :-1].copy()

# Strip surrounding whitespace only. SMILES is case-sensitive (lowercase
# letters denote aromatic atoms), so the strings must NOT be upper-cased:
# that would turn e.g. benzene "c1ccccc1" into cyclohexane "C1CCCCC1" and
# could join rows that belong to different molecules.
ds3_calc[smiles_calc] = ds3_calc[smiles_calc].str.strip()
ds3_nih[smiles_nih] = ds3_nih[smiles_nih].str.strip()

# How many SMILES overlap?
common_smiles = set(ds3_calc[smiles_calc]).intersection(ds3_nih[smiles_nih])
print(f"Common SMILES: {len(common_smiles)}")

# === Merge datasets on SMILES ===
# Inner join: only molecules present in both files are kept. Column names
# that occur in both files receive the '_calc' / '_nih' suffixes.
merged = pd.merge(
    ds3_calc,
    ds3_nih,
    left_on=smiles_calc,
    right_on=smiles_nih,
    suffixes=('_calc', '_nih')
)
if merged.empty:
    raise ValueError(
        f"No SMILES are shared between {calc_file} and {nih_file}; nothing to compare."
    )

# === Identify paired feature columns ===
# Both column lists are derived from ONE list of feature names, so position k
# in each list always refers to the same feature even when the two CSV files
# store their columns in a different order. Only the trailing suffix is
# removed, and a '_calc' column without a '_nih' partner is ignored.
calc_suffix, nih_suffix = '_calc', '_nih'
feature_names = [
    c[:-len(calc_suffix)]
    for c in merged.columns
    if c.endswith(calc_suffix) and c[:-len(calc_suffix)] + nih_suffix in merged.columns
]
if not feature_names:
    raise ValueError("The two input files share no feature columns to compare.")

numeric_cols_calc = [name + calc_suffix for name in feature_names]
numeric_cols_nih = [name + nih_suffix for name in feature_names]

# === Z-score normalize the numeric columns separately for calc and NIH ===
# fit_transform refits the scaler on each call, so each side is standardized
# with its own per-column mean and standard deviation.
scaler = StandardScaler()

merged[numeric_cols_calc] = scaler.fit_transform(merged[numeric_cols_calc])
merged[numeric_cols_nih] = scaler.fit_transform(merged[numeric_cols_nih])

print("✅ Z-score normalization complete.")

# === Per-molecule statistics ===
# For every merged row (one molecule) compare its calc feature vector with its
# NIH-suffixed feature vector: Euclidean distance, cosine similarity, Pearson r.
per_molecule_results = []
for _, molecule in merged.iterrows():
    # Non-finite entries (NaN, +inf, -inf) are replaced by 0.0 before comparing.
    calc_vec = molecule[numeric_cols_calc].values.astype(float)
    calc_vec = np.nan_to_num(calc_vec, nan=0.0, posinf=0.0, neginf=0.0)
    nih_vec = molecule[numeric_cols_nih].values.astype(float)
    nih_vec = np.nan_to_num(nih_vec, nan=0.0, posinf=0.0, neginf=0.0)

    distance = euclidean(calc_vec, nih_vec)

    # Cosine similarity stays NaN when either vector is all zeros.
    if np.any(calc_vec) and np.any(nih_vec):
        similarity = 1 - cosine(calc_vec, nih_vec)
    else:
        similarity = np.nan

    # Pearson r stays NaN when either vector has zero spread.
    if np.std(calc_vec) > 0 and np.std(nih_vec) > 0:
        pearson_r = pearsonr(calc_vec, nih_vec)[0]
    else:
        pearson_r = np.nan

    per_molecule_results.append({
        "SMILES": molecule[smiles_calc],
        "euclidean": distance,
        "cosine_similarity": similarity,
        "correlation": pearson_r,
    })

# One row per molecule, written next to the other outputs of this cell.
df_per_molecule = pd.DataFrame(per_molecule_results)
per_molecule_csv = os.path.join(output_dir, "Per_Molecule_Comparison.csv")
df_per_molecule.to_csv(per_molecule_csv, index=False)
print("✅ Per-molecule comparison saved.")

# === Per-feature statistics ===
# For every feature compare the calc column with the NIH-suffixed column across
# all merged molecules: mean difference, Euclidean distance and Pearson r.
# NOTE(review): both column sets were z-scored independently earlier in this
# cell, so each column already has mean ≈ 0 and "mean_difference" is ≈ 0 by
# construction — confirm whether the unscaled mean difference was intended.
per_feature_results = []
for fname, f_calc, f_nih in zip(feature_names, numeric_cols_calc, numeric_cols_nih):
    # Non-finite entries (NaN, +inf, -inf) are replaced by 0.0 before comparing.
    calc_column = merged[f_calc].values.astype(float)
    calc_column = np.nan_to_num(calc_column, nan=0.0, posinf=0.0, neginf=0.0)
    nih_column = merged[f_nih].values.astype(float)
    nih_column = np.nan_to_num(nih_column, nan=0.0, posinf=0.0, neginf=0.0)

    # Pearson r stays NaN when either column has zero spread.
    if np.std(calc_column) > 0 and np.std(nih_column) > 0:
        pearson_r = pearsonr(calc_column, nih_column)[0]
    else:
        pearson_r = np.nan

    per_feature_results.append({
        "feature": fname,
        "mean_difference": np.mean(calc_column - nih_column),
        "euclidean": euclidean(calc_column, nih_column),
        "correlation": pearson_r,
    })

# One row per feature, written next to the other outputs of this cell.
df_per_feature = pd.DataFrame(per_feature_results)
per_feature_csv = os.path.join(output_dir, "Per_Feature_Comparison.csv")
df_per_feature.to_csv(per_feature_csv, index=False)
print("✅ Per-feature comparison saved.")

# === Visualizations ===
sns.set(style="whitegrid")

# 1. Per-molecule histograms: one PNG per metric column of df_per_molecule.
histogram_specs = [
    ("euclidean", "Per-Molecule Euclidean Distance Distribution",
     "Per_Molecule_Euclidean_Histogram.png"),
    ("cosine_similarity", "Per-Molecule Cosine Similarity Distribution",
     "Per_Molecule_Cosine_Histogram.png"),
    ("correlation", "Per-Molecule Pearson Correlation Distribution",
     "Per_Molecule_Correlation_Histogram.png"),
]
for metric, title, filename in histogram_specs:
    plt.figure(figsize=(12, 4))
    sns.histplot(df_per_molecule[metric], bins=30, kde=True)
    plt.title(title)
    plt.savefig(os.path.join(output_dir, filename), dpi=300)
    plt.close()

# 2. Per-feature heatmaps: a single-row heatmap with one cell per feature.
heatmap_specs = [
    ("euclidean", "Feature Euclidean Distances", "Per_Feature_Euclidean_Heatmap.png",
     {"cmap": 'viridis', "cbar_kws": {'label': 'Euclidean Distance'}}),
    # The correlation colour scale is pinned to the full [-1, 1] range.
    ("correlation", "Feature Correlation", "Per_Feature_Correlation_Heatmap.png",
     {"cmap": 'coolwarm', "vmin": -1, "vmax": 1,
      "cbar_kws": {'label': 'Pearson Correlation'}}),
]
for metric, title, filename, heatmap_options in heatmap_specs:
    plt.figure(figsize=(12, 6))
    # Index by feature name, then transpose so features run along the x-axis.
    metric_row = df_per_feature[[metric]].set_index(df_per_feature['feature']).T
    sns.heatmap(metric_row, annot=True, **heatmap_options)
    plt.title(title)
    plt.savefig(os.path.join(output_dir, filename), dpi=300)
    plt.close()

print("✅ All visualizations saved.")
print(f"All outputs saved to: {output_dir}")


In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean, cosine
from scipy.stats import pearsonr

# === File paths ===
# Inputs are read from base_dir; everything this cell produces goes to output_dir.
base_dir = r"C:\PhD\Prompt_Engineering\Outputs2"
output_dir = os.path.join(base_dir, "DS5_Comparison_Results")
os.makedirs(output_dir, exist_ok=True)

calc_file = os.path.join(base_dir, "DS5_Calc_Tox_Clean.csv")
nih_file = os.path.join(base_dir, "DS5_Vero_Clean.csv")

# === Load datasets ===
# NOTE: the ds3_* / *_nih names are carried over from the DS3 cell; in this
# cell they hold the DS5 calc file and the DS5 Vero file.
ds3_calc = pd.read_csv(calc_file)
ds3_nih = pd.read_csv(nih_file)

# === Identify SMILES columns (assume first column) ===
smiles_calc = ds3_calc.columns[0]
smiles_nih = ds3_nih.columns[0]

# === Drop toxicity columns (assume last column) ===
# .copy() gives independent frames so the SMILES clean-up below does not
# write into a slice of the frame returned by read_csv.
ds3_calc = ds3_calc.iloc[:, :-1].copy()
ds3_nih = ds3_nih.iloc[:, :-1].copy()

# Strip surrounding whitespace only. SMILES is case-sensitive (lowercase
# letters denote aromatic atoms), so the strings must NOT be upper-cased:
# that would turn e.g. benzene "c1ccccc1" into cyclohexane "C1CCCCC1" and
# could join rows that belong to different molecules.
ds3_calc[smiles_calc] = ds3_calc[smiles_calc].str.strip()
ds3_nih[smiles_nih] = ds3_nih[smiles_nih].str.strip()

# How many SMILES overlap?
common_smiles = set(ds3_calc[smiles_calc]).intersection(ds3_nih[smiles_nih])
print(f"Common SMILES: {len(common_smiles)}")

# === Merge datasets on SMILES ===
# Inner join: only molecules present in both files are kept. Column names
# that occur in both files receive the '_calc' / '_nih' suffixes.
merged = pd.merge(ds3_calc, ds3_nih, left_on=smiles_calc, right_on=smiles_nih,
                  suffixes=('_calc', '_nih'))
if merged.empty:
    raise ValueError(
        f"No SMILES are shared between {calc_file} and {nih_file}; nothing to compare."
    )

# === Keep only the paired feature columns for comparison ===
# Both column lists are derived from ONE list of feature names, so position k
# in each list always refers to the same feature even when the two CSV files
# store their columns in a different order. Only the trailing suffix is
# removed, and a '_calc' column without a '_nih' partner is ignored.
calc_suffix, nih_suffix = '_calc', '_nih'
feature_names = [
    c[:-len(calc_suffix)]
    for c in merged.columns
    if c.endswith(calc_suffix) and c[:-len(calc_suffix)] + nih_suffix in merged.columns
]
if not feature_names:
    raise ValueError("The two input files share no feature columns to compare.")

numeric_cols_calc = [name + calc_suffix for name in feature_names]
numeric_cols_nih = [name + nih_suffix for name in feature_names]

# NOTE(review): unlike the DS3/DS4/DS6/DS7/DS8 cells, this cell does not
# z-score the feature columns, so the statistics below are computed on the
# unscaled values — confirm that this difference is intentional.

# === Per-molecule statistics ===
# For every merged row (one molecule) compare its calc feature vector with its
# NIH-suffixed feature vector: Euclidean distance, cosine similarity, Pearson r.
per_molecule_results = []
for _, molecule in merged.iterrows():
    # Non-finite entries (NaN, +inf, -inf) are replaced by 0.0 before comparing.
    calc_vec = molecule[numeric_cols_calc].values.astype(float)
    calc_vec = np.nan_to_num(calc_vec, nan=0.0, posinf=0.0, neginf=0.0)
    nih_vec = molecule[numeric_cols_nih].values.astype(float)
    nih_vec = np.nan_to_num(nih_vec, nan=0.0, posinf=0.0, neginf=0.0)

    distance = euclidean(calc_vec, nih_vec)

    # Cosine similarity stays NaN when either vector is all zeros.
    if np.any(calc_vec) and np.any(nih_vec):
        similarity = 1 - cosine(calc_vec, nih_vec)
    else:
        similarity = np.nan

    # Pearson r stays NaN when either vector has zero spread.
    if np.std(calc_vec) > 0 and np.std(nih_vec) > 0:
        pearson_r = pearsonr(calc_vec, nih_vec)[0]
    else:
        pearson_r = np.nan

    per_molecule_results.append({
        "SMILES": molecule[smiles_calc],
        "euclidean": distance,
        "cosine_similarity": similarity,
        "correlation": pearson_r,
    })

# One row per molecule, written next to the other outputs of this cell.
df_per_molecule = pd.DataFrame(per_molecule_results)
per_molecule_csv = os.path.join(output_dir, "Per_Molecule_Comparison.csv")
df_per_molecule.to_csv(per_molecule_csv, index=False)
print("✅ Per-molecule comparison saved.")

# === Per-feature statistics ===
# For every feature compare the calc column with the NIH-suffixed column across
# all merged molecules: mean difference, Euclidean distance and Pearson r.
per_feature_results = []
for fname, f_calc, f_nih in zip(feature_names, numeric_cols_calc, numeric_cols_nih):
    # Non-finite entries (NaN, +inf, -inf) are replaced by 0.0 before comparing.
    calc_column = merged[f_calc].values.astype(float)
    calc_column = np.nan_to_num(calc_column, nan=0.0, posinf=0.0, neginf=0.0)
    nih_column = merged[f_nih].values.astype(float)
    nih_column = np.nan_to_num(nih_column, nan=0.0, posinf=0.0, neginf=0.0)

    # Pearson r stays NaN when either column has zero spread.
    if np.std(calc_column) > 0 and np.std(nih_column) > 0:
        pearson_r = pearsonr(calc_column, nih_column)[0]
    else:
        pearson_r = np.nan

    per_feature_results.append({
        "feature": fname,
        "mean_difference": np.mean(calc_column - nih_column),
        "euclidean": euclidean(calc_column, nih_column),
        "correlation": pearson_r,
    })

# One row per feature, written next to the other outputs of this cell.
df_per_feature = pd.DataFrame(per_feature_results)
per_feature_csv = os.path.join(output_dir, "Per_Feature_Comparison.csv")
df_per_feature.to_csv(per_feature_csv, index=False)
print("✅ Per-feature comparison saved.")

# === Visualizations ===
sns.set(style="whitegrid")

# 1. Per-molecule histograms: one PNG per metric column of df_per_molecule.
histogram_specs = [
    ("euclidean", "Per-Molecule Euclidean Distance Distribution",
     "Per_Molecule_Euclidean_Histogram.png"),
    ("cosine_similarity", "Per-Molecule Cosine Similarity Distribution",
     "Per_Molecule_Cosine_Histogram.png"),
    ("correlation", "Per-Molecule Pearson Correlation Distribution",
     "Per_Molecule_Correlation_Histogram.png"),
]
for metric, title, filename in histogram_specs:
    plt.figure(figsize=(12, 4))
    sns.histplot(df_per_molecule[metric], bins=30, kde=True)
    plt.title(title)
    plt.savefig(os.path.join(output_dir, filename), dpi=300)
    plt.close()

# 2. Per-feature heatmaps: a single-row heatmap with one cell per feature.
heatmap_specs = [
    ("euclidean", "Feature Euclidean Distances", "Per_Feature_Euclidean_Heatmap.png",
     {"cmap": 'viridis', "cbar_kws": {'label': 'Euclidean Distance'}}),
    # The correlation colour scale is pinned to the full [-1, 1] range.
    ("correlation", "Feature Correlation", "Per_Feature_Correlation_Heatmap.png",
     {"cmap": 'coolwarm', "vmin": -1, "vmax": 1,
      "cbar_kws": {'label': 'Pearson Correlation'}}),
]
for metric, title, filename, heatmap_options in heatmap_specs:
    plt.figure(figsize=(12, 6))
    # Index by feature name, then transpose so features run along the x-axis.
    metric_row = df_per_feature[[metric]].set_index(df_per_feature['feature']).T
    sns.heatmap(metric_row, annot=True, **heatmap_options)
    plt.title(title)
    plt.savefig(os.path.join(output_dir, filename), dpi=300)
    plt.close()

print("✅ All visualizations saved.")
print(f"All outputs saved to: {output_dir}")


In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean, cosine
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler

# === File paths ===
# Inputs are read from base_dir; everything this cell produces goes to output_dir.
base_dir = r"C:\PhD\Prompt_Engineering\Outputs2"
output_dir = os.path.join(base_dir, "DS6_Comparison_Results")
os.makedirs(output_dir, exist_ok=True)

calc_file = os.path.join(base_dir, "DS6_Calc_Tox_Clean.csv")
nih_file = os.path.join(base_dir, "DS6_THP1_Clean.csv")

# === Load datasets ===
# NOTE: the ds3_* / *_nih names are carried over from the DS3 cell; in this
# cell they hold the DS6 calc file and the DS6 THP1 file.
ds3_calc = pd.read_csv(calc_file)
ds3_nih = pd.read_csv(nih_file)

# === Identify SMILES columns (assume first column) ===
smiles_calc = ds3_calc.columns[0]
smiles_nih = ds3_nih.columns[0]

# === Drop toxicity columns (assume last column) ===
# .copy() gives independent frames so the SMILES clean-up below does not
# write into a slice of the frame returned by read_csv.
ds3_calc = ds3_calc.iloc[:, :-1].copy()
ds3_nih = ds3_nih.iloc[:, :-1].copy()

# Strip surrounding whitespace only. SMILES is case-sensitive (lowercase
# letters denote aromatic atoms), so the strings must NOT be upper-cased:
# that would turn e.g. benzene "c1ccccc1" into cyclohexane "C1CCCCC1" and
# could join rows that belong to different molecules.
ds3_calc[smiles_calc] = ds3_calc[smiles_calc].str.strip()
ds3_nih[smiles_nih] = ds3_nih[smiles_nih].str.strip()

# How many SMILES overlap?
common_smiles = set(ds3_calc[smiles_calc]).intersection(ds3_nih[smiles_nih])
print(f"Common SMILES: {len(common_smiles)}")

# === Merge datasets on SMILES ===
# Inner join: only molecules present in both files are kept. Column names
# that occur in both files receive the '_calc' / '_nih' suffixes.
merged = pd.merge(
    ds3_calc,
    ds3_nih,
    left_on=smiles_calc,
    right_on=smiles_nih,
    suffixes=('_calc', '_nih')
)
if merged.empty:
    raise ValueError(
        f"No SMILES are shared between {calc_file} and {nih_file}; nothing to compare."
    )

# === Identify paired feature columns ===
# Both column lists are derived from ONE list of feature names, so position k
# in each list always refers to the same feature even when the two CSV files
# store their columns in a different order. Only the trailing suffix is
# removed, and a '_calc' column without a '_nih' partner is ignored.
calc_suffix, nih_suffix = '_calc', '_nih'
feature_names = [
    c[:-len(calc_suffix)]
    for c in merged.columns
    if c.endswith(calc_suffix) and c[:-len(calc_suffix)] + nih_suffix in merged.columns
]
if not feature_names:
    raise ValueError("The two input files share no feature columns to compare.")

numeric_cols_calc = [name + calc_suffix for name in feature_names]
numeric_cols_nih = [name + nih_suffix for name in feature_names]

# === Z-score normalize the numeric columns separately for calc and NIH ===
# fit_transform refits the scaler on each call, so each side is standardized
# with its own per-column mean and standard deviation.
scaler = StandardScaler()

merged[numeric_cols_calc] = scaler.fit_transform(merged[numeric_cols_calc])
merged[numeric_cols_nih] = scaler.fit_transform(merged[numeric_cols_nih])

print("✅ Z-score normalization complete.")

# === Per-molecule statistics ===
# For every merged row (one molecule) compare its calc feature vector with its
# NIH-suffixed feature vector: Euclidean distance, cosine similarity, Pearson r.
per_molecule_results = []
for _, molecule in merged.iterrows():
    # Non-finite entries (NaN, +inf, -inf) are replaced by 0.0 before comparing.
    calc_vec = molecule[numeric_cols_calc].values.astype(float)
    calc_vec = np.nan_to_num(calc_vec, nan=0.0, posinf=0.0, neginf=0.0)
    nih_vec = molecule[numeric_cols_nih].values.astype(float)
    nih_vec = np.nan_to_num(nih_vec, nan=0.0, posinf=0.0, neginf=0.0)

    distance = euclidean(calc_vec, nih_vec)

    # Cosine similarity stays NaN when either vector is all zeros.
    if np.any(calc_vec) and np.any(nih_vec):
        similarity = 1 - cosine(calc_vec, nih_vec)
    else:
        similarity = np.nan

    # Pearson r stays NaN when either vector has zero spread.
    if np.std(calc_vec) > 0 and np.std(nih_vec) > 0:
        pearson_r = pearsonr(calc_vec, nih_vec)[0]
    else:
        pearson_r = np.nan

    per_molecule_results.append({
        "SMILES": molecule[smiles_calc],
        "euclidean": distance,
        "cosine_similarity": similarity,
        "correlation": pearson_r,
    })

# One row per molecule, written next to the other outputs of this cell.
df_per_molecule = pd.DataFrame(per_molecule_results)
per_molecule_csv = os.path.join(output_dir, "Per_Molecule_Comparison.csv")
df_per_molecule.to_csv(per_molecule_csv, index=False)
print("✅ Per-molecule comparison saved.")

# === Per-feature statistics ===
# For every feature compare the calc column with the NIH-suffixed column across
# all merged molecules: mean difference, Euclidean distance and Pearson r.
# NOTE(review): both column sets were z-scored independently earlier in this
# cell, so each column already has mean ≈ 0 and "mean_difference" is ≈ 0 by
# construction — confirm whether the unscaled mean difference was intended.
per_feature_results = []
for fname, f_calc, f_nih in zip(feature_names, numeric_cols_calc, numeric_cols_nih):
    # Non-finite entries (NaN, +inf, -inf) are replaced by 0.0 before comparing.
    calc_column = merged[f_calc].values.astype(float)
    calc_column = np.nan_to_num(calc_column, nan=0.0, posinf=0.0, neginf=0.0)
    nih_column = merged[f_nih].values.astype(float)
    nih_column = np.nan_to_num(nih_column, nan=0.0, posinf=0.0, neginf=0.0)

    # Pearson r stays NaN when either column has zero spread.
    if np.std(calc_column) > 0 and np.std(nih_column) > 0:
        pearson_r = pearsonr(calc_column, nih_column)[0]
    else:
        pearson_r = np.nan

    per_feature_results.append({
        "feature": fname,
        "mean_difference": np.mean(calc_column - nih_column),
        "euclidean": euclidean(calc_column, nih_column),
        "correlation": pearson_r,
    })

# One row per feature, written next to the other outputs of this cell.
df_per_feature = pd.DataFrame(per_feature_results)
per_feature_csv = os.path.join(output_dir, "Per_Feature_Comparison.csv")
df_per_feature.to_csv(per_feature_csv, index=False)
print("✅ Per-feature comparison saved.")

# === Visualizations ===
sns.set(style="whitegrid")

# 1. Per-molecule histograms: one PNG per metric column of df_per_molecule.
histogram_specs = [
    ("euclidean", "Per-Molecule Euclidean Distance Distribution",
     "Per_Molecule_Euclidean_Histogram.png"),
    ("cosine_similarity", "Per-Molecule Cosine Similarity Distribution",
     "Per_Molecule_Cosine_Histogram.png"),
    ("correlation", "Per-Molecule Pearson Correlation Distribution",
     "Per_Molecule_Correlation_Histogram.png"),
]
for metric, title, filename in histogram_specs:
    plt.figure(figsize=(12, 4))
    sns.histplot(df_per_molecule[metric], bins=30, kde=True)
    plt.title(title)
    plt.savefig(os.path.join(output_dir, filename), dpi=300)
    plt.close()

# 2. Per-feature heatmaps: a single-row heatmap with one cell per feature.
heatmap_specs = [
    ("euclidean", "Feature Euclidean Distances", "Per_Feature_Euclidean_Heatmap.png",
     {"cmap": 'viridis', "cbar_kws": {'label': 'Euclidean Distance'}}),
    # The correlation colour scale is pinned to the full [-1, 1] range.
    ("correlation", "Feature Correlation", "Per_Feature_Correlation_Heatmap.png",
     {"cmap": 'coolwarm', "vmin": -1, "vmax": 1,
      "cbar_kws": {'label': 'Pearson Correlation'}}),
]
for metric, title, filename, heatmap_options in heatmap_specs:
    plt.figure(figsize=(12, 6))
    # Index by feature name, then transpose so features run along the x-axis.
    metric_row = df_per_feature[[metric]].set_index(df_per_feature['feature']).T
    sns.heatmap(metric_row, annot=True, **heatmap_options)
    plt.title(title)
    plt.savefig(os.path.join(output_dir, filename), dpi=300)
    plt.close()

print("✅ All visualizations saved.")
print(f"All outputs saved to: {output_dir}")


In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean, cosine
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler

# === File paths ===
# Inputs are read from base_dir; everything this cell produces goes to output_dir.
base_dir = r"C:\PhD\Prompt_Engineering\Outputs2"
output_dir = os.path.join(base_dir, "DS7_Comparison_Results")
os.makedirs(output_dir, exist_ok=True)

calc_file = os.path.join(base_dir, "DS7_Calc_Tox_Clean.csv")
nih_file = os.path.join(base_dir, "DS7_VeroE6_Clean.csv")

# === Load datasets ===
# NOTE: the ds3_* / *_nih names are carried over from the DS3 cell; in this
# cell they hold the DS7 calc file and the DS7 VeroE6 file.
ds3_calc = pd.read_csv(calc_file)
ds3_nih = pd.read_csv(nih_file)

# === Identify SMILES columns (assume first column) ===
smiles_calc = ds3_calc.columns[0]
smiles_nih = ds3_nih.columns[0]

# === Drop toxicity columns (assume last column) ===
# .copy() gives independent frames so the SMILES clean-up below does not
# write into a slice of the frame returned by read_csv.
ds3_calc = ds3_calc.iloc[:, :-1].copy()
ds3_nih = ds3_nih.iloc[:, :-1].copy()

# Strip surrounding whitespace only. SMILES is case-sensitive (lowercase
# letters denote aromatic atoms), so the strings must NOT be upper-cased:
# that would turn e.g. benzene "c1ccccc1" into cyclohexane "C1CCCCC1" and
# could join rows that belong to different molecules.
ds3_calc[smiles_calc] = ds3_calc[smiles_calc].str.strip()
ds3_nih[smiles_nih] = ds3_nih[smiles_nih].str.strip()

# How many SMILES overlap?
common_smiles = set(ds3_calc[smiles_calc]).intersection(ds3_nih[smiles_nih])
print(f"Common SMILES: {len(common_smiles)}")

# === Merge datasets on SMILES ===
# Inner join: only molecules present in both files are kept. Column names
# that occur in both files receive the '_calc' / '_nih' suffixes.
merged = pd.merge(
    ds3_calc,
    ds3_nih,
    left_on=smiles_calc,
    right_on=smiles_nih,
    suffixes=('_calc', '_nih')
)
if merged.empty:
    raise ValueError(
        f"No SMILES are shared between {calc_file} and {nih_file}; nothing to compare."
    )

# === Identify paired feature columns ===
# Both column lists are derived from ONE list of feature names, so position k
# in each list always refers to the same feature even when the two CSV files
# store their columns in a different order. Only the trailing suffix is
# removed, and a '_calc' column without a '_nih' partner is ignored.
calc_suffix, nih_suffix = '_calc', '_nih'
feature_names = [
    c[:-len(calc_suffix)]
    for c in merged.columns
    if c.endswith(calc_suffix) and c[:-len(calc_suffix)] + nih_suffix in merged.columns
]
if not feature_names:
    raise ValueError("The two input files share no feature columns to compare.")

numeric_cols_calc = [name + calc_suffix for name in feature_names]
numeric_cols_nih = [name + nih_suffix for name in feature_names]

# === Z-score normalize the numeric columns separately for calc and NIH ===
# fit_transform refits the scaler on each call, so each side is standardized
# with its own per-column mean and standard deviation.
scaler = StandardScaler()

merged[numeric_cols_calc] = scaler.fit_transform(merged[numeric_cols_calc])
merged[numeric_cols_nih] = scaler.fit_transform(merged[numeric_cols_nih])

print("✅ Z-score normalization complete.")

# === Per-molecule statistics ===
# For every merged row (one molecule) compare its calc feature vector with its
# NIH-suffixed feature vector: Euclidean distance, cosine similarity, Pearson r.
per_molecule_results = []
for _, molecule in merged.iterrows():
    # Non-finite entries (NaN, +inf, -inf) are replaced by 0.0 before comparing.
    calc_vec = molecule[numeric_cols_calc].values.astype(float)
    calc_vec = np.nan_to_num(calc_vec, nan=0.0, posinf=0.0, neginf=0.0)
    nih_vec = molecule[numeric_cols_nih].values.astype(float)
    nih_vec = np.nan_to_num(nih_vec, nan=0.0, posinf=0.0, neginf=0.0)

    distance = euclidean(calc_vec, nih_vec)

    # Cosine similarity stays NaN when either vector is all zeros.
    if np.any(calc_vec) and np.any(nih_vec):
        similarity = 1 - cosine(calc_vec, nih_vec)
    else:
        similarity = np.nan

    # Pearson r stays NaN when either vector has zero spread.
    if np.std(calc_vec) > 0 and np.std(nih_vec) > 0:
        pearson_r = pearsonr(calc_vec, nih_vec)[0]
    else:
        pearson_r = np.nan

    per_molecule_results.append({
        "SMILES": molecule[smiles_calc],
        "euclidean": distance,
        "cosine_similarity": similarity,
        "correlation": pearson_r,
    })

# One row per molecule, written next to the other outputs of this cell.
df_per_molecule = pd.DataFrame(per_molecule_results)
per_molecule_csv = os.path.join(output_dir, "Per_Molecule_Comparison.csv")
df_per_molecule.to_csv(per_molecule_csv, index=False)
print("✅ Per-molecule comparison saved.")

# === Per-feature statistics ===
# For every feature compare the calc column with the NIH-suffixed column across
# all merged molecules: mean difference, Euclidean distance and Pearson r.
# NOTE(review): both column sets were z-scored independently earlier in this
# cell, so each column already has mean ≈ 0 and "mean_difference" is ≈ 0 by
# construction — confirm whether the unscaled mean difference was intended.
per_feature_results = []
for fname, f_calc, f_nih in zip(feature_names, numeric_cols_calc, numeric_cols_nih):
    # Non-finite entries (NaN, +inf, -inf) are replaced by 0.0 before comparing.
    calc_column = merged[f_calc].values.astype(float)
    calc_column = np.nan_to_num(calc_column, nan=0.0, posinf=0.0, neginf=0.0)
    nih_column = merged[f_nih].values.astype(float)
    nih_column = np.nan_to_num(nih_column, nan=0.0, posinf=0.0, neginf=0.0)

    # Pearson r stays NaN when either column has zero spread.
    if np.std(calc_column) > 0 and np.std(nih_column) > 0:
        pearson_r = pearsonr(calc_column, nih_column)[0]
    else:
        pearson_r = np.nan

    per_feature_results.append({
        "feature": fname,
        "mean_difference": np.mean(calc_column - nih_column),
        "euclidean": euclidean(calc_column, nih_column),
        "correlation": pearson_r,
    })

# One row per feature, written next to the other outputs of this cell.
df_per_feature = pd.DataFrame(per_feature_results)
per_feature_csv = os.path.join(output_dir, "Per_Feature_Comparison.csv")
df_per_feature.to_csv(per_feature_csv, index=False)
print("✅ Per-feature comparison saved.")

# === Visualizations ===
sns.set(style="whitegrid")

# 1. Per-molecule histograms: one PNG per metric column of df_per_molecule.
histogram_specs = [
    ("euclidean", "Per-Molecule Euclidean Distance Distribution",
     "Per_Molecule_Euclidean_Histogram.png"),
    ("cosine_similarity", "Per-Molecule Cosine Similarity Distribution",
     "Per_Molecule_Cosine_Histogram.png"),
    ("correlation", "Per-Molecule Pearson Correlation Distribution",
     "Per_Molecule_Correlation_Histogram.png"),
]
for metric, title, filename in histogram_specs:
    plt.figure(figsize=(12, 4))
    sns.histplot(df_per_molecule[metric], bins=30, kde=True)
    plt.title(title)
    plt.savefig(os.path.join(output_dir, filename), dpi=300)
    plt.close()

# 2. Per-feature heatmaps: a single-row heatmap with one cell per feature.
heatmap_specs = [
    ("euclidean", "Feature Euclidean Distances", "Per_Feature_Euclidean_Heatmap.png",
     {"cmap": 'viridis', "cbar_kws": {'label': 'Euclidean Distance'}}),
    # The correlation colour scale is pinned to the full [-1, 1] range.
    ("correlation", "Feature Correlation", "Per_Feature_Correlation_Heatmap.png",
     {"cmap": 'coolwarm', "vmin": -1, "vmax": 1,
      "cbar_kws": {'label': 'Pearson Correlation'}}),
]
for metric, title, filename, heatmap_options in heatmap_specs:
    plt.figure(figsize=(12, 6))
    # Index by feature name, then transpose so features run along the x-axis.
    metric_row = df_per_feature[[metric]].set_index(df_per_feature['feature']).T
    sns.heatmap(metric_row, annot=True, **heatmap_options)
    plt.title(title)
    plt.savefig(os.path.join(output_dir, filename), dpi=300)
    plt.close()

print("✅ All visualizations saved.")
print(f"All outputs saved to: {output_dir}")


In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean, cosine
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler

# === File paths ===
# Inputs are read from base_dir; everything this cell produces goes to output_dir.
base_dir = r"C:\PhD\Prompt_Engineering\Outputs2"
output_dir = os.path.join(base_dir, "DS8_Comparison_Results")
os.makedirs(output_dir, exist_ok=True)

calc_file = os.path.join(base_dir, "DS8_Calc_Tox_Clean.csv")
nih_file = os.path.join(base_dir, "DS8_SA9-PAX8_Clean.csv")

# === Load datasets ===
# NOTE: the ds3_* / *_nih names are carried over from the DS3 cell; in this
# cell they hold the DS8 calc file and the DS8 SA9-PAX8 file.
ds3_calc = pd.read_csv(calc_file)
ds3_nih = pd.read_csv(nih_file)

# === Identify SMILES columns (assume first column) ===
smiles_calc = ds3_calc.columns[0]
smiles_nih = ds3_nih.columns[0]

# === Drop toxicity columns (assume last column) ===
# .copy() gives independent frames so the SMILES clean-up below does not
# write into a slice of the frame returned by read_csv.
ds3_calc = ds3_calc.iloc[:, :-1].copy()
ds3_nih = ds3_nih.iloc[:, :-1].copy()

# Strip surrounding whitespace only. SMILES is case-sensitive (lowercase
# letters denote aromatic atoms), so the strings must NOT be upper-cased:
# that would turn e.g. benzene "c1ccccc1" into cyclohexane "C1CCCCC1" and
# could join rows that belong to different molecules.
ds3_calc[smiles_calc] = ds3_calc[smiles_calc].str.strip()
ds3_nih[smiles_nih] = ds3_nih[smiles_nih].str.strip()

# How many SMILES overlap?
common_smiles = set(ds3_calc[smiles_calc]).intersection(ds3_nih[smiles_nih])
print(f"Common SMILES: {len(common_smiles)}")

# === Merge datasets on SMILES ===
# Inner join: only molecules present in both files are kept. Column names
# that occur in both files receive the '_calc' / '_nih' suffixes.
merged = pd.merge(
    ds3_calc,
    ds3_nih,
    left_on=smiles_calc,
    right_on=smiles_nih,
    suffixes=('_calc', '_nih')
)
if merged.empty:
    raise ValueError(
        f"No SMILES are shared between {calc_file} and {nih_file}; nothing to compare."
    )

# === Identify paired feature columns ===
# Both column lists are derived from ONE list of feature names, so position k
# in each list always refers to the same feature even when the two CSV files
# store their columns in a different order. Only the trailing suffix is
# removed, and a '_calc' column without a '_nih' partner is ignored.
calc_suffix, nih_suffix = '_calc', '_nih'
feature_names = [
    c[:-len(calc_suffix)]
    for c in merged.columns
    if c.endswith(calc_suffix) and c[:-len(calc_suffix)] + nih_suffix in merged.columns
]
if not feature_names:
    raise ValueError("The two input files share no feature columns to compare.")

numeric_cols_calc = [name + calc_suffix for name in feature_names]
numeric_cols_nih = [name + nih_suffix for name in feature_names]

# === Z-score normalize the numeric columns separately for calc and NIH ===
# fit_transform refits the scaler on each call, so each side is standardized
# with its own per-column mean and standard deviation.
scaler = StandardScaler()

merged[numeric_cols_calc] = scaler.fit_transform(merged[numeric_cols_calc])
merged[numeric_cols_nih] = scaler.fit_transform(merged[numeric_cols_nih])

print("✅ Z-score normalization complete.")

# === Per-molecule statistics ===
# For every merged row (one molecule) compare its calc feature vector with its
# NIH-suffixed feature vector: Euclidean distance, cosine similarity, Pearson r.
per_molecule_results = []
for _, molecule in merged.iterrows():
    # Non-finite entries (NaN, +inf, -inf) are replaced by 0.0 before comparing.
    calc_vec = molecule[numeric_cols_calc].values.astype(float)
    calc_vec = np.nan_to_num(calc_vec, nan=0.0, posinf=0.0, neginf=0.0)
    nih_vec = molecule[numeric_cols_nih].values.astype(float)
    nih_vec = np.nan_to_num(nih_vec, nan=0.0, posinf=0.0, neginf=0.0)

    distance = euclidean(calc_vec, nih_vec)

    # Cosine similarity stays NaN when either vector is all zeros.
    if np.any(calc_vec) and np.any(nih_vec):
        similarity = 1 - cosine(calc_vec, nih_vec)
    else:
        similarity = np.nan

    # Pearson r stays NaN when either vector has zero spread.
    if np.std(calc_vec) > 0 and np.std(nih_vec) > 0:
        pearson_r = pearsonr(calc_vec, nih_vec)[0]
    else:
        pearson_r = np.nan

    per_molecule_results.append({
        "SMILES": molecule[smiles_calc],
        "euclidean": distance,
        "cosine_similarity": similarity,
        "correlation": pearson_r,
    })

# One row per molecule, written next to the other outputs of this cell.
df_per_molecule = pd.DataFrame(per_molecule_results)
per_molecule_csv = os.path.join(output_dir, "Per_Molecule_Comparison.csv")
df_per_molecule.to_csv(per_molecule_csv, index=False)
print("✅ Per-molecule comparison saved.")

# === Per-feature statistics ===
# For every feature compare the calc column with the NIH-suffixed column across
# all merged molecules: mean difference, Euclidean distance and Pearson r.
# NOTE(review): both column sets were z-scored independently earlier in this
# cell, so each column already has mean ≈ 0 and "mean_difference" is ≈ 0 by
# construction — confirm whether the unscaled mean difference was intended.
per_feature_results = []
for fname, f_calc, f_nih in zip(feature_names, numeric_cols_calc, numeric_cols_nih):
    # Non-finite entries (NaN, +inf, -inf) are replaced by 0.0 before comparing.
    calc_column = merged[f_calc].values.astype(float)
    calc_column = np.nan_to_num(calc_column, nan=0.0, posinf=0.0, neginf=0.0)
    nih_column = merged[f_nih].values.astype(float)
    nih_column = np.nan_to_num(nih_column, nan=0.0, posinf=0.0, neginf=0.0)

    # Pearson r stays NaN when either column has zero spread.
    if np.std(calc_column) > 0 and np.std(nih_column) > 0:
        pearson_r = pearsonr(calc_column, nih_column)[0]
    else:
        pearson_r = np.nan

    per_feature_results.append({
        "feature": fname,
        "mean_difference": np.mean(calc_column - nih_column),
        "euclidean": euclidean(calc_column, nih_column),
        "correlation": pearson_r,
    })

# One row per feature, written next to the other outputs of this cell.
df_per_feature = pd.DataFrame(per_feature_results)
per_feature_csv = os.path.join(output_dir, "Per_Feature_Comparison.csv")
df_per_feature.to_csv(per_feature_csv, index=False)
print("✅ Per-feature comparison saved.")

# === Visualizations ===
sns.set(style="whitegrid")

# 1. Per-molecule histograms: one PNG per metric column of df_per_molecule.
histogram_specs = [
    ("euclidean", "Per-Molecule Euclidean Distance Distribution",
     "Per_Molecule_Euclidean_Histogram.png"),
    ("cosine_similarity", "Per-Molecule Cosine Similarity Distribution",
     "Per_Molecule_Cosine_Histogram.png"),
    ("correlation", "Per-Molecule Pearson Correlation Distribution",
     "Per_Molecule_Correlation_Histogram.png"),
]
for metric, title, filename in histogram_specs:
    plt.figure(figsize=(12, 4))
    sns.histplot(df_per_molecule[metric], bins=30, kde=True)
    plt.title(title)
    plt.savefig(os.path.join(output_dir, filename), dpi=300)
    plt.close()

# 2. Per-feature heatmaps: a single-row heatmap with one cell per feature.
heatmap_specs = [
    ("euclidean", "Feature Euclidean Distances", "Per_Feature_Euclidean_Heatmap.png",
     {"cmap": 'viridis', "cbar_kws": {'label': 'Euclidean Distance'}}),
    # The correlation colour scale is pinned to the full [-1, 1] range.
    ("correlation", "Feature Correlation", "Per_Feature_Correlation_Heatmap.png",
     {"cmap": 'coolwarm', "vmin": -1, "vmax": 1,
      "cbar_kws": {'label': 'Pearson Correlation'}}),
]
for metric, title, filename, heatmap_options in heatmap_specs:
    plt.figure(figsize=(12, 6))
    # Index by feature name, then transpose so features run along the x-axis.
    metric_row = df_per_feature[[metric]].set_index(df_per_feature['feature']).T
    sns.heatmap(metric_row, annot=True, **heatmap_options)
    plt.title(title)
    plt.savefig(os.path.join(output_dir, filename), dpi=300)
    plt.close()

print("✅ All visualizations saved.")
print(f"All outputs saved to: {output_dir}")
