In [None]:
# Import Basis
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Palette: two 8-digit hex color strings wrapped as a seaborn palette
palette = ["#2D2926FF", "#E94B3CFF"]
color_palette = sns.color_palette(palette)

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# plotting/data libraries — confirm that is intended.
warnings.filterwarnings("ignore")
# Show all columns when a DataFrame is displayed (no column truncation)
pd.set_option("display.max_columns", None)

In [None]:
# Load the grouped dataset from a path relative to the working directory.
# (Previously tried path, kept for reference: "data/group_datagroups_new/0.1/grouped_data.csv")
df = pd.read_csv("data/groups/0.1/grouped_data.csv")
# Preview the first rows
df.head()


In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Rows that have no group are assigned group id 8
# NOTE(review): 8 is a hard-coded placeholder — confirm it does not collide with a real group id.
missing_group = df["group_id"].isna()
df.loc[missing_group, "group_id"] = 8

# Missing values per column after the fill
df.isna().sum()

In [None]:
# (rows, columns) of the loaded DataFrame
df.shape

In [None]:
# Descriptor columns used as model features throughout the notebook
features = ["pol", "psa", "n_donors", "nrotb", "n_acceptors", "logP"]
# Feature matrix, target column and per-row group labels taken from df
X = df[features]
y = df["dG_exp"]
groups = df["group_id"]
# Name of the identifier column (only the name is stored; nothing is selected from df here)
id_column = "mobleyID"

In [None]:
# Summary statistics for the feature columns plus the target
summary_columns = [*features, "dG_exp"]
print(df[summary_columns].describe())

In [None]:
import os

import seaborn as sns


def plot_distributions(df, features, figsize=(8, 6), output_dir=None, custom_bins=None):
    """
    Draw one density histogram per feature with a KDE curve, rug marks and a summary-statistics box.

    Parameters:
    df (pandas.DataFrame): DataFrame containing the features
    features (list): Column names to plot; one figure is created per name
    figsize (tuple): Figure size (width, height) for each plot
    output_dir (str, optional): Directory to save plots into as
        "<feature>_distribution.png" at 300 dpi. When None or empty, nothing is
        written and the open figures are returned instead.
    custom_bins (dict, optional): Maps a feature name to the ``bins`` value passed
        to ``sns.histplot``; features without an entry use "auto".

    Returns:
    list or None: The generated figure objects when nothing is saved; None when
    the plots were written to ``output_dir`` (those figures are closed).

    Side effects:
    Sets the global seaborn style ("whitegrid") and palette ("dark"), which stay
    in effect after this call.
    """
    # Global seaborn settings (kept as-is: later cells inherit them)
    sns.set_style("whitegrid")
    sns.set_palette("dark")

    # Decide once whether we save or collect, and use the same test for the
    # return value, so an empty output_dir no longer builds figures and then
    # throws them away by returning None.
    save_to_disk = bool(output_dir)
    if save_to_disk:
        # Create the target directory once instead of on every loop iteration
        os.makedirs(output_dir, exist_ok=True)

    bin_overrides = custom_bins or {}
    figures = []

    for col in features:
        # One independent figure per feature
        fig, ax = plt.subplots(figsize=figsize)

        # Per-feature bin override, otherwise let the histogram choose
        bins = bin_overrides.get(col, "auto")

        # Density histogram with a KDE overlay
        sns.histplot(
            data=df,
            x=col,
            stat="density",
            kde=True,
            ax=ax,
            color="#1f77b4",
            alpha=0.6,
            bins=bins,
            line_kws={"linewidth": 2.5},  # Thicker KDE line
        )

        # Summary statistics shown in the text box
        series = df[col]
        mean = series.mean()
        std = series.std()
        median = series.median()
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)

        stats_text = (
            f"Mean: {mean:.2f}\n"
            f"Std: {std:.2f}\n"
            f"Median: {median:.2f}\n"
            f"Q1: {q1:.2f}\n"
            f"Q3: {q3:.2f}"
        )

        # Statistics box anchored to the top-right corner in axes coordinates
        ax.text(
            0.95,
            0.95,
            stats_text,
            transform=ax.transAxes,
            verticalalignment="top",
            horizontalalignment="right",
            fontsize=10,
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.9),
        )

        # Titles and axis labels
        ax.set_title(f"Distribution of {col}", fontsize=16, pad=10)
        ax.set_xlabel(col, fontsize=14)
        ax.set_ylabel("Density", fontsize=14)

        # Heavier frame on all four sides
        for side in ("top", "right", "left", "bottom"):
            ax.spines[side].set_linewidth(1.5)

        # Larger tick labels
        ax.tick_params(axis="both", which="major", labelsize=12)

        # Rug marks show the individual observations along the x axis
        sns.rugplot(data=df, x=col, ax=ax, color="gray", alpha=0.5)

        # Operate on this figure explicitly rather than on pyplot's "current" figure
        fig.tight_layout()

        if save_to_disk:
            fig.savefig(os.path.join(output_dir, f"{col}_distribution.png"), dpi=300)
            plt.close(fig)  # Close the figure to free up memory
        else:
            figures.append(fig)

    # Saved figures are closed, so there is nothing useful to hand back
    return None if save_to_disk else figures


In [None]:
# Fixed bin counts for selected features; features not listed here fall back to "auto" binning.
# NOTE(review): n_acceptors and nrotb are given a single bin — confirm that is intended.
custom_bins = dict(psa=10, pol=8, n_donors=7, n_acceptors=1, nrotb=1)

# Render one distribution plot per feature and write the images under plots/eda/
figures = plot_distributions(df, features, output_dir="plots/eda/", custom_bins=custom_bins)

plt.show()

In [None]:
# Pairwise correlations between the features and the target
corr_columns = [*features, "dG_exp"]
correlation_matrix = df[corr_columns].corr()
print(correlation_matrix)

# Annotated heatmap of the same matrix on an explicit axes object
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Number of rows in each group
group_counts = df["group_id"].value_counts()
print("Group distribution:", group_counts, sep="\n")

# Bar chart of the group sizes
fig, ax = plt.subplots(figsize=(10, 6))
group_counts.plot.bar(ax=ax)
ax.set_title("Distribution of Groups")
ax.set_xlabel("Group ID")
ax.set_ylabel("Count")
plt.show()

In [None]:
# 2x3 grid: one scatter panel per feature, each plotted against dG_exp
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axs = axs.ravel()

for i, feature in enumerate(features):
    panel = axs[i]
    panel.scatter(df[feature], df["dG_exp"])
    panel.set(xlabel=feature, ylabel="dG_exp", title=f"{feature} vs dG_exp")

fig.tight_layout()
plt.show()

In [None]:
# Mean of every feature and of the target within each group
group_means = df.groupby("group_id")[[*features, "dG_exp"]].mean()
print(group_means)

In [None]:
import matplotlib.pyplot as plt

# Columns inspected for outliers
variables = ["dG_exp", "pol", "psa", "n_donors"]

# One boxplot panel per variable. The panel count follows the list length, and
# squeeze=False keeps the result indexable even for a single variable.
fig, axs = plt.subplots(1, len(variables), figsize=(15, 5), squeeze=False)
axs = axs[0]

for i, var in enumerate(variables):
    data = df[var]

    # Interquartile range
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1

    # 1.5 * IQR fences
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Values outside the fences
    outliers = data[(data < lower_bound) | (data > upper_bound)]

    # Boxplot with the flagged values overlaid in red at x = 1 (the box position)
    axs[i].boxplot(data)
    axs[i].scatter(np.ones(len(outliers)), outliers, color="red", s=20)
    axs[i].set_title(var)

    print(f"{var}:")
    print(f"Number of outliers: {len(outliers)}")
    print(f"Percentage of outliers: {len(outliers) / len(data) * 100:.2f}%")
    if outliers.empty:
        # min()/max() of an empty selection are NaN; report the absence explicitly
        print("Range of outliers: none")
    else:
        print(f"Range of outliers: {outliers.min()} to {outliers.max()}")
    print("\n")

fig.tight_layout()
plt.show()

In [None]:
from scipy.stats import skew

# Columns whose distribution asymmetry is inspected
variables = ["pol", "psa", "n_donors", "nrotb", "n_acceptors", "dG_exp"]

# Sample skewness of each column, computed with scipy.stats.skew
skewness = df[variables].apply(skew)

print("Skewness for each variable:", skewness, sep="\n")

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler


class OutlierRemover(BaseEstimator, TransformerMixin):
    """Drop rows whose value in ``column`` lies outside the 1.5 * IQR fences.

    The fences are learned in ``fit`` and reused by ``transform``, so data
    transformed later is filtered with the thresholds of the fitting data
    instead of thresholds recomputed from whatever is passed in.

    Parameters
    ----------
    column : str, default="dG_exp"
        Name of the DataFrame column the IQR rule is applied to.

    Attributes
    ----------
    lower_bound_, upper_bound_ : float
        Fences computed from the data seen in ``fit``.
    """

    def __init__(self, column="dG_exp"):
        self.column = column

    def _iqr_bounds(self, X):
        """Return (lower, upper) 1.5 * IQR fences of ``self.column`` in ``X``."""
        values = X[self.column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        return q1 - 1.5 * spread, q3 + 1.5 * spread

    def fit(self, X, y=None):
        """Learn the fences from ``X`` and return ``self``."""
        self.lower_bound_, self.upper_bound_ = self._iqr_bounds(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` containing only the rows inside the fences."""
        if hasattr(self, "lower_bound_"):
            lower, upper = self.lower_bound_, self.upper_bound_
        else:
            # Never fitted: fall back to fences derived from X itself, which is
            # what this class did before the fences were stored in fit().
            lower, upper = self._iqr_bounds(X)

        values = X[self.column]
        inside = (values >= lower) & (values <= upper)
        return X.loc[inside].copy()


class CustomTransformer(BaseEstimator, TransformerMixin):
    """Apply a wrapped transformer to a fixed set of descriptor columns.

    Despite the parameter name, the wrapped transformer is fitted on and applied
    to all columns listed in ``self.columns``, not only to ``pol``.

    Parameters
    ----------
    pol_transformer : transformer with ``fit``/``transform``, default=None
        Transformer applied to ``self.columns``. When None, a Yeo-Johnson
        ``PowerTransformer`` is used, so the default constructor is usable
        instead of failing inside ``fit``.

    Attributes
    ----------
    pol_transformer_ : transformer
        The transformer actually fitted: the object passed in (fitted in place,
        not cloned) or the default created in ``fit``.
    """

    def __init__(self, pol_transformer=None):
        self.pol_transformer = pol_transformer
        # Columns handed to the wrapped transformer; all other columns pass through untouched
        self.columns = [
            "pol",
            "n_acceptors",
            "n_donors",
            "nrotb",
            "psa",
        ]

    def fit(self, X, y=None):
        """Fit the wrapped transformer on ``X[self.columns]`` and return ``self``."""
        if self.pol_transformer is None:
            self.pol_transformer_ = PowerTransformer(method="yeo-johnson")
        else:
            self.pol_transformer_ = self.pol_transformer
        self.pol_transformer_.fit(X[self.columns])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.columns`` replaced by their transformed values."""
        # Fall back to the constructor argument so an already-fitted transformer
        # passed in by the caller still works without calling fit() here.
        transformer = getattr(self, "pol_transformer_", self.pol_transformer)
        X_ = X.copy()
        X_[self.columns] = transformer.transform(X_[self.columns])
        return X_


class CustomStandardScaler(BaseEstimator, TransformerMixin):
    """Standardise a subset of DataFrame columns with ``StandardScaler``.

    Parameters
    ----------
    columns : list of str, default=None
        Columns to scale. None selects the six descriptor columns in
        ``DEFAULT_COLUMNS``. A fresh list is built per instance, so instances
        no longer share one mutable default list.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler, fitted by ``fit``.
    """

    # Immutable source for the default column selection
    DEFAULT_COLUMNS = (
        "pol",
        "psa",
        "logP",
        "n_acceptors",
        "n_donors",
        "nrotb",
    )

    def __init__(self, columns=None):
        # A list (not the tuple) is stored because X[tuple] would be read as a single key
        self.columns = list(self.DEFAULT_COLUMNS) if columns is None else columns
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the scaler on ``X[self.columns]`` and return ``self``."""
        self.scaler.fit(X[self.columns])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.columns`` replaced by their scaled values."""
        X_ = X.copy()
        X_[self.columns] = self.scaler.transform(X_[self.columns])
        return X_


# Build the preprocessing pipeline: IQR outlier filter (default column dG_exp)
# -> Yeo-Johnson power transform -> standardisation of the descriptor columns
preprocessing_pipeline = Pipeline(
    [
        ("outlier_remover", OutlierRemover()),
        (
            "custom_transformer",
            CustomTransformer(pol_transformer=PowerTransformer(method="yeo-johnson")),
        ),
        ("standard_scaler", CustomStandardScaler()),
    ]
)


# Fit every step on df and keep the transformed result
df_processed = preprocessing_pipeline.fit_transform(df)

print("Original shape:", df.shape)
print("Processed shape:", df_processed.shape)

# Check the first few rows of the processed data
print(df_processed.head())

# Verify the scaling on exactly the columns the scaler touched. dG_exp is not
# one of them, so it is left out of this check.
scaled_columns = preprocessing_pipeline.named_steps["standard_scaler"].columns
print("\nMean of scaled features:")
print(df_processed[scaled_columns].mean())
print("\nStandard deviation of scaled features:")
# ddof=0 is the population standard deviation, the quantity StandardScaler divides by
print(df_processed[scaled_columns].std(ddof=0))

# The power transform configured above is Yeo-Johnson, not a log transform
print("\nSkewness of Yeo-Johnson-transformed 'pol':", skew(df_processed["pol"]))

In [None]:
# Display the processed DataFrame as the cell output
df_processed

In [None]:
# 3x3 grid of panels for the processed features and the target
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))

# 1-D copy of the grid for positional access; the 2-D `axs` is still used for [row, col] lookups
axs_flat = axs.flatten()

for i, col in enumerate([*features, "dG_exp"]):
    # The first six columns fill the top two rows; anything after that is drawn
    # on the centre panel of the bottom row
    ax = axs_flat[i] if i < 6 else axs[2, 1]

    ax.hist(df_processed[col], bins=30)
    ax.set(title=col, xlabel=col, ylabel="Frequency")

# Drop the two bottom-row panels that are never drawn on
for unused in (axs[2, 0], axs[2, 2]):
    fig.delaxes(unused)

fig.tight_layout()
plt.show()

In [None]:
from scipy.stats import skew

# Columns whose distribution asymmetry is inspected after preprocessing
variables = ["pol", "psa", "n_donors", "nrotb", "n_acceptors", "dG_exp"]

# Sample skewness of each processed column, computed with scipy.stats.skew
skewness = df_processed[variables].apply(skew)

print("Skewness for each variable:", skewness, sep="\n")

In [None]:
# Save the outlier-filtered, transformed data as CSV.
# The target directory is created first so the write does not fail when it is missing.
# NOTE(review): the input was read from data/groups/0.1/ but this writes to
# groups_new/0.1/ — confirm the output location is intended.
output_path = "groups_new/0.1/grouped_data_without_outliers.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_processed.to_csv(output_path, index=False)

In [None]:
# Rebuild the pipeline without the outlier filter: Yeo-Johnson power transform
# followed by standardisation of the descriptor columns
preprocessing_pipeline = Pipeline(
    [
        (
            "custom_transformer",
            CustomTransformer(pol_transformer=PowerTransformer(method="yeo-johnson")),
        ),
        ("standard_scaler", CustomStandardScaler()),
    ]
)


# Fit every step on df and keep the transformed result (no rows are removed here)
df_processed_with_outliers = preprocessing_pipeline.fit_transform(df)

print("Original shape:", df.shape)
# Report the frame produced in this cell, not the outlier-filtered one from the earlier cell
print("Processed shape:", df_processed_with_outliers.shape)

# Check the first few rows of the processed data
print(df_processed_with_outliers.head())

# Verify the scaling on exactly the columns the scaler touched. dG_exp is not
# one of them, so it is left out of this check.
scaled_columns = preprocessing_pipeline.named_steps["standard_scaler"].columns
print("\nMean of scaled features:")
print(df_processed_with_outliers[scaled_columns].mean())
print("\nStandard deviation of scaled features:")
# ddof=0 is the population standard deviation, the quantity StandardScaler divides by
print(df_processed_with_outliers[scaled_columns].std(ddof=0))

# The power transform configured above is Yeo-Johnson, not a log transform
print(
    "\nSkewness of Yeo-Johnson-transformed 'pol':",
    skew(df_processed_with_outliers["pol"]),
)

In [None]:
# Save the transformed data that still contains the outlier rows as CSV.
# The target directory is created first so the write does not fail when it is missing.
# NOTE(review): the input was read from data/groups/0.1/ but this writes to
# groups_new/0.1/ — confirm the output location is intended.
output_path = "groups_new/0.1/grouped_data_with_outliers.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_processed_with_outliers.to_csv(output_path, index=False)