In [None]:
# # Data Visualization & Analysis for Regression (AIM-VEE / CM + EV)
# This notebook:
# - Loads CM features + target (EV / excitation energy)
# - Checks basic stats and ranges
# - Visualizes distributions
# - Computes correlations with the target
# - Runs PCA to inspect low-dimensional structure in the features


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Make plots look a bit nicer
plt.style.use("default")
sns.set_theme(context="notebook", style="whitegrid")


# Load data
#
# The data directory defaults to the original author's local path but can be
# overridden with the AIM_VEE_DATA_DIR environment variable, so the notebook
# can run on other machines without editing this cell.
DATA_DIR = os.environ.get(
    "AIM_VEE_DATA_DIR",
    "/Users/lohitakshbadarala/Desktop/AIM-VEE/data/vee_predictor/Data",
)

X_path = os.path.join(DATA_DIR, "CM_train.npy")  # features
y_path = os.path.join(DATA_DIR, "EV_train.npy")  # target

X = np.load(X_path)
y = np.load(y_path)

print("X shape:", X.shape)
print("y shape:", y.shape)

# Flatten the target to 1-D so it can be used as a DataFrame column and as a
# per-point color array later on.
y = y.reshape(-1)
print("y shape after reshape:", y.shape)

# Fail early with a clear message instead of an obscure pandas error below.
if X.ndim != 2:
    raise ValueError(
        f"Expected a 2-D feature matrix (n_samples, n_features), got shape {X.shape}"
    )
if X.shape[0] != y.shape[0]:
    raise ValueError(
        f"Feature/target length mismatch: X has {X.shape[0]} rows, y has {y.shape[0]} values"
    )

# Only the first (up to) 100 feature columns go into the DataFrame, to keep
# describe()/corr() output manageable; the full matrix X is still used for PCA.
n_features_for_df = min(100, X.shape[1])
df = pd.DataFrame(X[:, :n_features_for_df],
                  columns=[f"f_{i}" for i in range(n_features_for_df)])
df["target"] = y

df.head()



In [None]:

# Summary statistics for the feature columns held in `df` (one row per column)
feature_summary = df.describe().transpose()
print("\n Basic Feature Stats (first", n_features_for_df, "features)")
display(feature_summary)

# Summary statistics for the target vector
print("\n Target Stats")
for stat_label, stat_value in (
    ("min:", y.min()),
    ("max:", y.max()),
    ("mean:", y.mean()),
    ("std:", y.std()),
):
    print(stat_label, float(stat_value))


# Histogram (with KDE overlay) of the target values
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(y, bins=40, kde=True, ax=ax)
ax.set_title("Target Distribution (EV / excitation energy)")
ax.set_xlabel("Target value")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

# NOTE: A minimum of -24900.45 was observed for the target; this looks like an
# outlier and is not a plausible value. Investigate further, or remove it if
# necessary.


In [None]:
# Pearson correlations between each feature and the target, plus a heatmap

# Correlate only the feature columns with the target. Compared with
# df.corr()["target"], this
#   - leaves the target's own self-correlation (always 1.0) out of the ranking,
#   - avoids computing the full feature-by-feature correlation matrix.
feature_cols = [col for col in df.columns if col != "target"]
corr_all = df[feature_cols].corrwith(df["target"])

# Constant columns have an undefined (NaN) correlation. sort_values() places
# NaNs last, so without dropping them the "most negative" tail could be made
# up of NaNs rather than real correlations.
corr_with_target = corr_all.dropna().sort_values(ascending=False)

n_undefined = int(corr_all.isna().sum())
if n_undefined:
    print(f"Skipped {n_undefined} feature(s) with undefined correlation (e.g. constant columns).")

print("\nTop 20 features most positively correlated with target:")
display(corr_with_target.head(20))

print("\nTop 20 features most negatively correlated with target:")
display(corr_with_target.tail(20))

# Correlation heatmap for a small subset of features (at most the first 20)
subset_feats = [f"f_{i}" for i in range(min(20, n_features_for_df))] + ["target"]
corr_matrix = df[subset_feats].corr()

plt.figure(figsize=(10,8))
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", center=0)
plt.title("Correlation Heatmap (first 20 features + target)")
plt.tight_layout()
plt.show()


In [None]:
# PCA Analysis

# Fixed seed so the PCA solver and the plotted subsample are identical on
# every Restart & Run All.
RANDOM_SEED = 42

# Standardize every feature before PCA so no single feature dominates purely
# because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA to see intrinsic dimensionality/structure.
# n_components cannot exceed min(n_samples, n_features); clamp the requested 20
# so small datasets do not raise.
n_components = min(20, X_scaled.shape[0], X_scaled.shape[1])
pca = PCA(n_components=n_components, random_state=RANDOM_SEED)
X_pca = pca.fit_transform(X_scaled)

print("Explained variance ratio (first 10 components):")
print(pca.explained_variance_ratio_[:10])
print("Cumulative explained variance (first 10):")
print(np.cumsum(pca.explained_variance_ratio_[:10]))

# 2D PCA scatter colored by target.
# Plot at most `max_points` randomly chosen samples to keep the figure readable.
max_points = 3000
rng = np.random.default_rng(RANDOM_SEED)
idx = rng.choice(len(X_pca), size=min(max_points, len(X_pca)), replace=False)

if X_pca.shape[1] >= 2:
    plt.figure(figsize=(7,6))
    sc = plt.scatter(X_pca[idx, 0], X_pca[idx, 1], c=y[idx], s=15, alpha=0.8)
    plt.colorbar(sc, label="Target")
    plt.title("PCA (2D) projection colored by target")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.tight_layout()
    plt.show()
else:
    # A 2-D projection needs at least two principal components.
    print("Skipping 2D PCA scatter: fewer than two principal components available.")
