<a href="https://colab.research.google.com/github/MZiaAfzal71/Average_Weighted_Path_Vector/blob/main/Data%20Files/Data%20Statistics/Data_Statistics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the project repository into the current working directory (shell escape).
!git clone https://github.com/MZiaAfzal71/Average_Weighted_Path_Vector.git

In [None]:
# Switch the working directory to the repository's "Data Files" folder;
# the path is relative, so it resolves against the current directory.
%cd Average_Weighted_Path_Vector/Data\ Files

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
import os

# Workbook to analyse and the worksheets to read from it
file_path = "Excel Files/Zang_Data.xlsx"
sheets = [
    "LogP",
    "LogS",
    "LogBCF",
    "BP",
    "MP",
    "Log VP",
]

# Make sure the output folder exists (no error if it is already there)
os.makedirs("Data Statistics", exist_ok=True)

In [None]:
# Per-property summary statistics (saved as CSV) plus a 2x3 grid of
# train/test histograms with an annotation box (saved as PNG).

PROPERTY_COL = 5              # positional index of the property-value column
SPLIT_COL = "Training/Test"   # column holding each row's split label

# Passing the list of sheet names reads them all in one call and returns a
# {sheet name: DataFrame} dict, so the workbook is opened only once.
frames = pd.read_excel(file_path, sheet_name=sheets)

stats_list = []

# Create subplot grid
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
fig.subplots_adjust(hspace=0.4, wspace=0.3)

for ax, sheet in zip(axes.flat, sheets):
    df = frames[sheet]
    prop_col = df.columns[PROPERTY_COL]

    # Keep only rows that actually have a property value
    valid = df.dropna(subset=[prop_col])
    prop_values = valid[prop_col]

    # Split by the labels stored in the workbook rather than by an assumed
    # row order and fixed ratio, so "Train"/"Test" show the real subsets.
    train_values = valid.loc[valid[SPLIT_COL] == "Training", prop_col]
    test_values = valid.loc[valid[SPLIT_COL] == "Test", prop_col]

    # Stats over all non-missing values of the property
    mean_val = prop_values.mean()
    std_val = prop_values.std()
    skew_val = skew(prop_values)
    kurt_val = kurtosis(prop_values)

    stats_list.append({
        "Property": sheet,
        "Min": prop_values.min(),
        "Max": prop_values.max(),
        "Mean": mean_val,
        "Std": std_val,
        "Skewness": skew_val,
        "Kurtosis": kurt_val
    })

    # Overlaid histograms (with KDE curves) of the two subsets
    sns.histplot(train_values, bins=30, kde=True, color="skyblue", label="Train", ax=ax, alpha=0.6)
    sns.histplot(test_values, bins=30, kde=True, color="salmon", label="Test", ax=ax, alpha=0.6)

    # Annotation box, placed in axes coordinates at the top-right corner
    annotation_text = (
        f"Mean: {mean_val:.2f}\n"
        f"Std: {std_val:.2f}\n"
        f"Skew: {skew_val:.2f}\n"
        f"Kurt: {kurt_val:.2f}"
    )
    ax.text(0.98, 0.97, annotation_text, transform=ax.transAxes,
            ha="right", va="top", fontsize=8,
            bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.6))

    ax.set_title(sheet)
    ax.legend(loc="upper left")

# Save stats table
stats_df = pd.DataFrame(stats_list)
stats_df.to_csv("Data Statistics/property_statistics.csv", index=False)
print(stats_df)

# Save figure (explicitly this figure, not whichever one is "current")
fig.savefig("Data Statistics/property_histograms_annotations.png", dpi=300)
plt.show()


In [None]:
# Train/test histograms per property with a fixed color pair per property
# and per-split summary statistics annotated on each panel (saved as PNG).

# Panel titles, paired position-by-position with `sheets`
property_names = ["LogP", "LogS", "LogBCF", "BP", "MP", "LogVP"]

# Fixed colors for consistency with table ordering:
# first color is used for the Train histogram, second for Test
color_map = {
    "LogP": ("#1f77b4", "#aec7e8"),
    "LogS": ("#ff7f0e", "#ffbb78"),
    "LogBCF": ("#2ca02c", "#98df8a"),
    "BP": ("#d62728", "#ff9896"),
    "MP": ("#9467bd", "#c5b0d5"),
    "LogVP": ("#8c564b", "#c49c94")
}

# Passing the list of sheet names reads them all in one call and returns a
# {sheet name: DataFrame} dict, so the workbook is opened only once.
frames = pd.read_excel(file_path, sheet_name=sheets)

fig, axes = plt.subplots(2, 3, figsize=(16, 8))
axes = axes.flatten()

for ax, sheet, prop in zip(axes, sheets, property_names):
    df = frames[sheet]
    value_col = df.columns[5]       # property values, selected by position
    split = df["Training/Test"]     # per-row split label

    # Single .loc selection instead of chained indexing. Missing values are
    # dropped because scipy's skew/kurtosis propagate NaN by default, which
    # would otherwise turn the annotation into "nan".
    train_vals = df.loc[split == "Training", value_col].dropna()
    test_vals = df.loc[split == "Test", value_col].dropna()

    train_color, test_color = color_map[prop]

    # Plot histograms with KDE overlay
    sns.histplot(train_vals, kde=True, color=train_color, label="Train", ax=ax, alpha=0.6)
    sns.histplot(test_vals, kde=True, color=test_color, label="Test", ax=ax, alpha=0.6)

    # Add annotations (mean, std, skewness, kurtosis) for each split
    stats_text = (
        f"Train μ={train_vals.mean():.2f}, σ={train_vals.std():.2f}\n"
        f"Skew={skew(train_vals):.2f}, Kurt={kurtosis(train_vals):.2f}\n"
        f"Test μ={test_vals.mean():.2f}, σ={test_vals.std():.2f}\n"
        f"Skew={skew(test_vals):.2f}, Kurt={kurtosis(test_vals):.2f}"
    )
    ax.text(0.98, 0.97, stats_text, transform=ax.transAxes,
            fontsize=8, va="top", ha="right", bbox=dict(boxstyle="round", facecolor="white", alpha=0.8))

    ax.set_title(prop)
    ax.legend(loc="upper left")

# Lay out and save this figure explicitly (not whichever one is "current")
fig.tight_layout()
fig.savefig("Data Statistics/property_histograms_annotations_colored.png", dpi=300)
plt.show()
