### Model 1

In [1]:
import math, pickle, warnings
from pathlib import Path
import os, random, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, precision_recall_fscore_support,
                             mean_absolute_error, mean_squared_error, r2_score, precision_score, recall_score,f1_score)
import re
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from pytorch_tabnet.tab_model import TabNetRegressor
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
# Single seed shared by every random number generator used in this notebook.
SEED = 1

# Exported for child processes; the running interpreter's own hash seed was
# already fixed when it started.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Seed Python's `random`, NumPy, and PyTorch (CPU and every CUDA device).
for _seed_fn in (random.seed, np.random.seed,
                 torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Ask cuDNN for deterministic kernels and turn off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# --- Run configuration -------------------------------------------------------
BATCH = 64                # mini-batch size handed to every DataLoader
PATIENCE = 15             # epochs without validation improvement before early stopping
MIN_ROWS_PER_CLASS = 3    # classes with fewer rows than this are dropped
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): this silences *every* warning for the rest of the session,
# including scikit-learn's undefined-metric warnings during evaluation.
warnings.filterwarnings("ignore")

# --- Load the pre-processed table -------------------------------------------
DATA_PATH = Path("./data/processed_csv.csv")
assert DATA_PATH.exists(), (
    f"processed_csv.csv not found! (looked in {DATA_PATH.resolve()})"
)

df = pd.read_csv(DATA_PATH)

# Prefer the "sg_encoded" column as the label; fall back to "structure_label".
label_col = "sg_encoded" if "sg_encoded" in df.columns else "structure_label"

# Columns the rest of the notebook reads: six cell parameters, the volume
# target, and the label column.
need = {"a", "b", "c", "alpha", "beta", "gamma", "vol", label_col}
missing = need - set(df.columns)
# Report exactly which columns are absent instead of only echoing the full set.
assert not missing, f"CSV must contain {need}; missing: {sorted(missing)}"

def add_features(d):
    """Return a copy of ``d`` extended with engineered columns.

    Added columns, in order:
      * ``cos_alpha``, ``cos_beta``, ``cos_gamma`` - cosine of each angle
        column, with the angle interpreted in degrees.
      * ``abc_prod`` - element-wise product of columns ``a``, ``b`` and ``c``.

    The input frame is not modified.
    """
    angle_cosines = {
        f"cos_{ang}": np.cos(np.deg2rad(d[ang]))
        for ang in ("alpha", "beta", "gamma")
    }
    return d.assign(**angle_cosines, abc_prod=d["a"] * d["b"] * d["c"])

# Extend the loaded table with the engineered columns.
df = add_features(df)

# Ordered model input columns: raw cell parameters, then the angle cosines,
# then the a*b*c product.
NUMERIC = (
    ["a", "b", "c", "alpha", "beta", "gamma"]
    + [f"cos_{ang}" for ang in ("alpha", "beta", "gamma")]
    + ["abc_prod"]
)

In [5]:
# --- Drop under-represented classes ------------------------------------------
# A throw-away encoder turns labels into integer codes so rows per class can be
# counted; rows whose class has fewer than MIN_ROWS_PER_CLASS rows are removed.
tmp_le = LabelEncoder().fit(df[label_col])
codes = tmp_le.transform(df[label_col])
counts = np.bincount(codes)
keep_mask = counts[codes] >= MIN_ROWS_PER_CLASS
df = df.loc[keep_mask].reset_index(drop=True)

# --- Targets and features ----------------------------------------------------
# The final encoder is fitted only on the classes that survived the filter.
le = LabelEncoder()
y_cls = le.fit_transform(df[label_col]).astype(np.int64)
y_reg = df["vol"].astype(np.float32).to_numpy()
X = df[NUMERIC].astype(np.float32).to_numpy()

# NOTE(review): the scaler is fitted on all rows, before any train/val/test
# split is made, so held-out rows contribute to the mean/std statistics.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Both tasks use the same two-stage recipe: hold out 20 % of the rows, then cut
# that hold-out in half to obtain validation and test sets (80/10/10 overall).
_holdout_cut = dict(test_size=0.2, random_state=SEED)
_val_test_cut = dict(test_size=0.5, random_state=SEED)

# Classification: only the first cut is stratified by class.
# NOTE(review): the val/test cut below is not stratified.
X_tr_c, X_tmp_c, y_tr_c, y_tmp_c = train_test_split(
    X, y_cls, stratify=y_cls, **_holdout_cut)
X_val_c, X_te_c, y_val_c, y_te_c = train_test_split(
    X_tmp_c, y_tmp_c, **_val_test_cut)

# Regression: plain random cuts, made independently of the classification ones.
X_tr_r, X_tmp_r, y_tr_r, y_tmp_r = train_test_split(
    X, y_reg, **_holdout_cut)
X_val_r, X_te_r, y_val_r, y_te_r = train_test_split(
    X_tmp_r, y_tmp_r, **_val_test_cut)

class TabDS(Dataset):
    """In-memory dataset pairing a feature matrix with its targets.

    ``X`` is copied into a float32 tensor. ``y`` is copied into a tensor whose
    dtype is inferred from the data, so integer labels stay integer and float
    targets stay float.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y)

    def __len__(self):
        """Number of rows in the feature tensor."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``(features, target)`` pair stored at position ``i``."""
        features, target = self.X[i], self.y[i]
        return features, target

# Arrays behind each loader, keyed "<split>_<task>" (cls = classification,
# reg = regression). Insertion order fixes the key order of `dls`.
_loader_arrays = {
    "train_cls": (X_tr_c, y_tr_c),
    "val_cls": (X_val_c, y_val_c),
    "test_cls": (X_te_c, y_te_c),
    "train_reg": (X_tr_r, y_tr_r),
    "val_reg": (X_val_r, y_val_r),
    "test_reg": (X_te_r, y_te_r),
}

# Only the training loaders shuffle; validation and test loaders keep row order.
dls = {
    name: DataLoader(TabDS(*arrays), BATCH, shuffle=name.startswith("train"))
    for name, arrays in _loader_arrays.items()
}

In [7]:
class MLPCls(nn.Module):
    """Feed-forward classifier: ``inp`` features -> 64 -> 32 -> ``nc`` outputs.

    ReLU follows each hidden layer, and dropout (p=0.3) is applied after the
    first hidden activation. The final layer has no activation, so ``forward``
    returns one raw score per class.
    """

    def __init__(self, inp, nc):
        super().__init__()
        # Layer order is kept fixed so the ``net.<index>`` state-dict keys stay
        # stable for saved checkpoints.
        layers = [
            nn.Linear(inp, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, nc),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the ``nc`` class scores for each row of ``x``."""
        scores = self.net(x)
        return scores

class MLPReg(nn.Module):
    """Feed-forward regressor: ``inp`` features -> 64 -> 32 -> 1 output.

    ReLU follows each hidden layer; no dropout is used. ``forward`` squeezes
    dimension 1 so a batch of N rows yields a tensor of shape ``(N,)``.
    """

    def __init__(self, inp):
        super().__init__()
        # Layer order is kept fixed so the ``net.<index>`` state-dict keys stay
        # stable for saved checkpoints.
        layers = [
            nn.Linear(inp, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return one scalar prediction per row of ``x``."""
        column = self.net(x)
        return column.squeeze(1)

# Both networks read the same feature columns; the classifier gets one output
# per class known to the fitted label encoder.
n_features = len(NUMERIC)
n_classes = len(le.classes_)

model_cls = MLPCls(n_features, n_classes).to(DEVICE)
model_reg = MLPReg(n_features).to(DEVICE)

def train(model, loaders, loss_fn, opt, epochs=200, patience=None, min_delta=1e-4):
    """Train ``model`` with early stopping and restore its best weights.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; modified in place.
    loaders : dict
        Must provide ``"train"`` and ``"val"`` iterables yielding ``(xb, yb)``
        tensor batches.
    loss_fn : callable
        Called as ``loss_fn(model(xb), yb)``; must return a scalar tensor.
    opt : torch.optim.Optimizer
        Optimiser already bound to ``model``'s parameters.
    epochs : int, default 200
        Maximum number of passes over the training loader.
    patience : int or None, default None
        Consecutive non-improving epochs tolerated before stopping. ``None``
        falls back to the notebook-level ``PATIENCE`` constant.
    min_delta : float, default 1e-4
        Minimum decrease of the mean validation loss that counts as an
        improvement.

    Returns
    -------
    dict
        ``{"tr": [...], "val": [...]}`` holding the mean training and
        validation loss of every epoch that was run.

    Notes
    -----
    If no epoch ever improves (for example when the validation loss is NaN
    from the first epoch), no snapshot exists. The model then keeps its final
    weights instead of failing in ``load_state_dict``.
    """
    if patience is None:
        patience = PATIENCE

    # Send batches to wherever the model lives; fall back to the notebook-level
    # DEVICE only for a parameter-free model.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device(DEVICE)

    hist = {"tr": [], "val": []}
    best, wait, best_sd = float("inf"), 0, None

    for ep in range(1, epochs + 1):
        # ---- optimisation pass ----
        model.train()
        tloss = []
        for xb, yb in loaders["train"]:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = loss_fn(model(xb), yb)
            loss.backward()
            opt.step()
            tloss.append(loss.item())
        hist["tr"].append(float(np.mean(tloss)))

        # ---- validation pass (no gradients, dropout disabled) ----
        model.eval()
        vloss = []
        with torch.no_grad():
            for xb, yb in loaders["val"]:
                xb, yb = xb.to(device), yb.to(device)
                vloss.append(loss_fn(model(xb), yb).item())
        val_loss = float(np.mean(vloss))
        hist["val"].append(val_loss)
        print(f"Epoch {ep:03d} | Val Loss = {val_loss:.4f}")

        # ---- early-stopping bookkeeping ----
        if val_loss < best - min_delta:
            best, wait = val_loss, 0
            # Snapshot on CPU so the copy is independent of later updates.
            best_sd = {name: tensor.cpu().clone()
                       for name, tensor in model.state_dict().items()}
        else:
            wait += 1
            if wait >= patience:
                print("Early stopping")
                break

    if best_sd is not None:
        model.load_state_dict(best_sd)
    else:
        # A NaN comparison is always False, so a NaN loss never sets best_sd.
        print("Warning: validation loss never improved; keeping final weights.")
    return hist

In [8]:
# ---- Space-group classifier: cross-entropy loss, Adam at lr=1e-3 ----
print("\n--- Training classifier ---")
cls_loaders = {"train": dls["train_cls"], "val": dls["val_cls"]}
cls_optimizer = torch.optim.Adam(model_cls.parameters(), lr=1e-3)
hist_cls = train(model_cls, cls_loaders, nn.CrossEntropyLoss(), cls_optimizer)

# ---- Volume regressor: mean-squared-error loss, Adam at lr=1e-3 ----
print("\n--- Training regressor ---")
reg_loaders = {"train": dls["train_reg"], "val": dls["val_reg"]}
reg_optimizer = torch.optim.Adam(model_reg.parameters(), lr=1e-3)
hist_reg = train(model_reg, reg_loaders, nn.MSELoss(), reg_optimizer)



--- Training classifier ---
Epoch 001 | Val Loss = 2.1446
Epoch 002 | Val Loss = 1.8782
Epoch 003 | Val Loss = 1.7457
Epoch 004 | Val Loss = 1.6541
Epoch 005 | Val Loss = 1.5897
Epoch 006 | Val Loss = 1.5470
Epoch 007 | Val Loss = 1.5152
Epoch 008 | Val Loss = 1.4883
Epoch 009 | Val Loss = 1.4641
Epoch 010 | Val Loss = 1.4464
Epoch 011 | Val Loss = 1.4297
Epoch 012 | Val Loss = 1.4148
Epoch 013 | Val Loss = 1.4131
Epoch 014 | Val Loss = 1.3992
Epoch 015 | Val Loss = 1.3866
Epoch 016 | Val Loss = 1.3881
Epoch 017 | Val Loss = 1.3784
Epoch 018 | Val Loss = 1.3729
Epoch 019 | Val Loss = 1.3740
Epoch 020 | Val Loss = 1.3600
Epoch 021 | Val Loss = 1.3560
Epoch 022 | Val Loss = 1.3525
Epoch 023 | Val Loss = 1.3424
Epoch 024 | Val Loss = 1.3404
Epoch 025 | Val Loss = 1.3413
Epoch 026 | Val Loss = 1.3373
Epoch 027 | Val Loss = 1.3268
Epoch 028 | Val Loss = 1.3262
Epoch 029 | Val Loss = 1.3253
Epoch 030 | Val Loss = 1.3316
Epoch 031 | Val Loss = 1.3161
Epoch 032 | Val Loss = 1.3223
Epoch 033 |

In [9]:
# Collect predicted and true class ids over the classification test loader.
model_cls.eval()
yp, yt = [], []
with torch.no_grad():
    for xb, yb in dls["test_cls"]:
        scores = model_cls(xb.to(DEVICE))
        yp.append(scores.argmax(dim=1).cpu().numpy())
        yt.append(yb.numpy())
y_pred_cls = np.concatenate(yp)
y_true_cls = np.concatenate(yt)
acc = accuracy_score(y_true_cls, y_pred_cls)

# Restrict the report to classes that occur in the test labels. np.unique
# already returns them sorted; display names are the original labels as text.
labels_used = np.unique(y_true_cls)
names_used = [str(name) for name in le.inverse_transform(labels_used)]

print("\n=== CLASSIFICATION RESULTS ===")
print(f"Test accuracy : {acc:.4f}")
report = classification_report(
    y_true_cls,
    y_pred_cls,
    labels=labels_used,
    target_names=names_used,
    digits=3,
)
print(report)
cm = confusion_matrix(y_true_cls, y_pred_cls, labels=labels_used)


=== CLASSIFICATION RESULTS ===
Test accuracy : 0.5979
              precision    recall  f1-score   support

           0      0.000     0.000     0.000         2
           3      0.000     0.000     0.000         6
           4      0.573     0.580     0.577        81
           5      0.000     0.000     0.000         8
           6      0.000     0.000     0.000         9
           9      0.000     0.000     0.000         2
          13      0.000     0.000     0.000         1
          14      0.000     0.000     0.000         1
          15      0.000     0.000     0.000         1
          16      0.000     0.000     0.000         1
          22      0.333     1.000     0.500         1
          23      1.000     1.000     1.000         2
          24      0.667     0.667     0.667         3
          27      0.500     1.000     0.667         3
          28      0.000     0.000     0.000         1
          29      0.000     0.000     0.000         1
          35      0.000   

In [10]:
# Collect predictions and targets over the regression test loader.
model_reg.eval()
yp, yt = [], []
with torch.no_grad():
    for xb, yb in dls["test_reg"]:
        batch_pred = model_reg(xb.to(DEVICE))
        yp.append(batch_pred.cpu().numpy())
        yt.append(yb.numpy())
y_pred_reg = np.concatenate(yp)
y_true_reg = np.concatenate(yt)

# Standard regression metrics on the held-out rows.
mse = mean_squared_error(y_true_reg, y_pred_reg)
rmse = math.sqrt(mse)
mae = mean_absolute_error(y_true_reg, y_pred_reg)
r2 = r2_score(y_true_reg, y_pred_reg)

print("\n=== REGRESSION RESULTS ===")
print(f"RMSE: {rmse:.3f}")
print(f"MAE : {mae :.3f}")
print(f"R²  : {r2  :.4f}")


=== REGRESSION RESULTS ===
RMSE: 0.061
MAE : 0.015
R²  : 0.9973


In [11]:
def plot_loss(hist, title):
    """Plot per-epoch training and validation loss on one figure.

    ``hist`` is a dict with equally long ``"tr"`` and ``"val"`` sequences, one
    value per epoch; ``title`` becomes the figure title. Epochs are numbered
    from 1 on the x-axis. The figure is shown immediately.
    """
    n_epochs = len(hist["tr"])
    epochs = range(1, n_epochs + 1)

    fig, ax = plt.subplots(figsize=(12, 8))
    for key, label in (("tr", "Train"), ("val", "Validation")):
        ax.plot(epochs, hist[key], label=label)
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.set_title(title)
    ax.legend()
    fig.tight_layout()
    plt.show()

# Learning curves of the classifier.
plot_loss(hist_cls, "Classifier – Cross‑Entropy Loss")

# Confusion matrix heat-map (rows: true class, columns: predicted class).
fig, ax = plt.subplots(figsize=(12, 8))
heatmap = ax.imshow(cm, interpolation="nearest")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
fig.colorbar(heatmap, ax=ax)
fig.tight_layout()
plt.show()

# Error skew bar – false-negative vs false-positive balance
# FP for a class: rows wrongly *predicted* as it. FN: rows of that class that
# were predicted as something else.
misclassified = y_pred_cls != y_true_cls
fp_mask = misclassified & np.isin(y_pred_cls, labels_used)
fn_mask = misclassified & np.isin(y_true_cls, labels_used)

fp_count = pd.Series(y_pred_cls[fp_mask]).value_counts()
fn_count = pd.Series(y_true_cls[fn_mask]).value_counts()

# The two Series are aligned on the union of their indexes, so a class that
# occurs in only one of them gets NaN in the other column. reindex's
# fill_value only fills labels that are new to the index, so those NaNs need
# an explicit fillna. Otherwise FP + FN is NaN and the class is silently
# dropped by the threshold filter below.
err_df = (
    pd.DataFrame({"FP": fp_count, "FN": fn_count})
    .reindex(labels_used)
    .fillna(0)
    .astype(int)
)

# Only plot classes with at least this many combined errors.
threshold = 15
sig_err_df = err_df[(err_df["FP"] + err_df["FN"]) >= threshold]

if sig_err_df.empty:
    # Nothing to draw; skip the chart rather than plotting an empty frame.
    print(f"No class has combined errors ≥ {threshold}; skipping bar chart.")
else:
    plt.figure(figsize=(10, 4))
    sig_err_df.plot(kind="bar", width=0.8,
                    color={"FP": "tab:red", "FN": "tab:blue"}, ax=plt.gca())

    plt.title(f"False Positives vs False Negatives (combined errors ≥ {threshold})")
    plt.xlabel("Class ID")
    plt.ylabel("Count")
    plt.legend(title="Error type")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

# Learning curves of the regressor.
plot_loss(hist_reg, "Regressor – MSE Loss")

# True vs Predicted scatter with the y = x reference line.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(y_true_reg, y_pred_reg, s=15, alpha=0.4)

# The diagonal spans the joint range of targets and predictions.
lo = min(y_true_reg.min(), y_pred_reg.min())
hi = max(y_true_reg.max(), y_pred_reg.max())
lims = [lo, hi]
ax.plot(lims, lims, color='red', linestyle='--', linewidth=1)

# The visible window is fixed to [0, 8] on both axes regardless of data range.
ax.set_xlim(0, 8)
ax.set_ylim(0, 8)
ax.set_xlabel("True Volume")
ax.set_ylabel("Predicted Volume")
ax.set_title("Volume – True vs Predicted")
fig.tight_layout()
plt.show()


In [12]:
# Where the trained artifacts are written: <parent_dir>/<model_name>/.
parent_dir = "Models"
model_name = "Model 1"
model_dir = os.path.join(parent_dir, model_name)

# Create both directory levels. A genuine failure (permission denied, a regular
# file in the way, ...) is allowed to propagate: swallowing it would only
# postpone the crash to the first save call below, with a less helpful message.
for directory in (parent_dir, model_dir):
    already_present = os.path.isdir(directory)
    os.makedirs(directory, exist_ok=True)
    if already_present:
        print(f"Directory '{directory}' already exists.")
    else:
        print(f"Directory '{directory}' created successfully.")

# File names are built once so the save calls and the summary cannot drift apart.
cls_file = f"{model_name} spacegroup.pt"
reg_file = f"{model_name} volume_regressor.pt"
prep_file = f"{model_name} preprocessing.pkl"

# Network weights are stored as state dicts.
torch.save(model_cls.state_dict(), os.path.join(model_dir, cls_file))
torch.save(model_reg.state_dict(), os.path.join(model_dir, reg_file))

# The fitted scaler and label encoder are pickled together so inference can
# reproduce the same preprocessing and map class ids back to labels.
# NOTE: only unpickle this file from a trusted source.
with open(os.path.join(model_dir, prep_file), "wb") as f:
    pickle.dump({"scaler": scaler, "label_encoder": le}, f)

print(f"\nSaved files to '{model_dir}':")
print(f"  - {cls_file}")
print(f"  - {reg_file}")
print(f"  - {prep_file}")

Directory 'Models' created successfully.
Directory 'Models\Model 1' created successfully.

Saved files to 'Models\Model 1':
  - Model 1 spacegroup.pt
  - Model 1 volume_regressor.pt
  - Model 1 preprocessing.pkl
