#### **Import Libraries**

In [1]:
import warnings
from scipy import stats
warnings.filterwarnings('ignore')

# Data Manipulation
import numpy as np
import pandas as pd
from scipy.stats import zscore

# Saving the Model Pipeline
import os, json, pickle, joblib
from scipy.sparse import hstack
from sklearn.base import BaseEstimator, TransformerMixin

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns; sns.set();
%matplotlib inline

# Model Training
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_curve, roc_auc_score, classification_report, precision_score, recall_score

#### **Load Datasets**

In [2]:
# Read the three catalogue exports (TOI, KOI, K2) in that order.
# `skiprows` drops the leading lines of each file before parsing
# (presumably the archive's comment preamble — confirm against the raw files).
df_toi, df_koi, df_k2 = (
    pd.read_csv(csv_name, skiprows=leading_lines)
    for csv_name, leading_lines in (
        ("toi_data.csv", 30),
        ("koi_data.csv", 86),
        ("k2_data.csv", 132),
    )
)

In [3]:
# Report (rows, columns) of the raw TOI, KOI and K2 frames, one per line.
print(*(frame.shape for frame in (df_toi, df_koi, df_k2)), sep="\n")

(7668, 27)
(9564, 83)
(3992, 129)


## **Data Preprocessing**

### Find & Rename Common Columns

In [4]:
# Canonical feature set shared by the three catalogues
# ----------------------------------------------------
#   orbper       orbital period                [days]
#   trandep      transit depth                 [ppm]
#   trandur      transit duration              [hours]
#   rade         planet radius                 [Earth radii]
#   insol        insolation flux               [Earth flux]
#   eqt          equilibrium temperature       [K]
#   teff         stellar effective temperature [K]
#   logg         stellar log(g)                [cm/s²]
#   rad          stellar radius                [Solar radii]
#   disposition  classification target
#
# Each canonical name maps to the source-column aliases it may appear under.
feature_map = dict(
    orbper=['pl_orbper', 'koi_period'],
    trandep=['pl_trandep', 'koi_depth'],
    trandur=['pl_trandurh', 'pl_trandur', 'koi_duration'],
    rade=['pl_rade', 'koi_prad'],
    insol=['pl_insol', 'koi_insol'],
    eqt=['pl_eqt', 'koi_teq'],
    teff=['st_teff', 'koi_steff'],
    logg=['st_logg', 'koi_slogg'],
    rad=['st_rad', 'koi_srad'],
    disposition=['disposition', 'koi_disposition', 'tfopwg_disp'],
)

In [5]:
# Reduce every catalogue to the same ten canonically named columns.
_canonical_cols = ['orbper', 'trandep', 'trandur', 'rade', 'insol',
                   'eqt', 'teff', 'logg', 'rad', 'disposition']

# Source columns that TOI and K2 name identically.
_pl_st_renames = {
    'pl_orbper': 'orbper',
    'pl_trandep': 'trandep',
    'pl_rade': 'rade',
    'pl_insol': 'insol',
    'pl_eqt': 'eqt',
    'st_teff': 'teff',
    'st_logg': 'logg',
    'st_rad': 'rad',
}

# TOI: duration lives in `pl_trandurh`, the target in `tfopwg_disp`.
_toi_renames = {**_pl_st_renames, 'pl_trandurh': 'trandur', 'tfopwg_disp': 'disposition'}
df_toi = df_toi.rename(columns=_toi_renames)[_canonical_cols]

# K2: duration lives in `pl_trandur`; the target column is already called `disposition`.
_k2_renames = {**_pl_st_renames, 'pl_trandur': 'trandur'}
df_k2 = df_k2.rename(columns=_k2_renames)[_canonical_cols]

# KOI: every column carries a `koi_` prefix.
_koi_renames = {
    'koi_period': 'orbper',
    'koi_depth': 'trandep',
    'koi_duration': 'trandur',
    'koi_prad': 'rade',
    'koi_insol': 'insol',
    'koi_teq': 'eqt',
    'koi_steff': 'teff',
    'koi_slogg': 'logg',
    'koi_srad': 'rad',
    'koi_disposition': 'disposition',
}
df_koi = df_koi.rename(columns=_koi_renames)[_canonical_cols]

In [6]:
df_toi.describe()

Unnamed: 0,orbper,trandep,trandur,rade,insol,eqt,teff,logg,rad
count,7562.0,7668.0,7668.0,7164.0,7492.0,7357.0,7507.0,6812.0,7163.0
mean,17.806572,8248.851101,3.058509,10.337217,2250.634173,1282.899836,5793.004878,4.305006,1.404429
std,97.956222,17533.04344,1.874578,8.59532,10933.78239,686.928756,1482.240723,0.304745,1.600841
min,0.152076,24.583493,0.101,0.552507,0.000342,37.0,2808.0,0.1,0.114827
25%,2.485505,1415.0,1.844832,4.49245,85.241118,813.037862,5211.53,4.12,0.89
50%,4.094161,4740.5,2.732,10.5416,363.603895,1183.027465,5801.93,4.33,1.23975
75%,7.943241,10323.5,3.79725,14.02085,1161.6975,1589.0,6297.1,4.5,1.66
max,1837.889731,767910.313098,30.015619,297.111726,280833.0,6413.0,50000.0,5.96065,102.03


In [7]:
df_koi.describe()

Unnamed: 0,orbper,trandep,trandur,rade,insol,eqt,teff,logg,rad
count,9564.0,9201.0,9564.0,9201.0,9243.0,9201.0,9201.0,9201.0,9201.0
mean,75.671358,23791.34,5.621606,102.891778,7745.737,1085.385828,5706.82328,4.310157,1.728712
std,1334.744046,82242.68,6.471554,3077.639126,159204.7,856.351161,796.857947,0.432606,6.127185
min,0.241843,0.0,0.052,0.08,0.0,25.0,2661.0,0.047,0.109
25%,2.733684,159.9,2.43775,1.4,20.15,539.0,5310.0,4.218,0.829
50%,9.752831,421.1,3.7926,2.39,141.6,878.0,5767.0,4.438,1.0
75%,40.715178,1473.4,6.2765,14.93,870.29,1379.0,6112.0,4.543,1.345
max,129995.7784,1541400.0,138.54,200346.0,10947550.0,14667.0,15896.0,5.364,229.908


In [8]:
df_k2.describe()

Unnamed: 0,orbper,trandep,trandur,rade,insol,eqt,teff,logg,rad
count,3948.0,2088.0,2772.0,3163.0,619.0,843.0,2881.0,2345.0,3864.0
mean,40.365301,2.455346,3.223611,8.452848,382.07499,905.551673,5132.383728,4.436366,1.175579
std,1344.791902,8.501406,2.302638,30.064126,996.629025,446.545316,1233.053433,0.320243,2.406781
min,0.17566,0.00562,-0.068,0.4064,0.0279,82.0,2520.0,1.773,0.11
25%,2.851753,0.06046,1.889,1.8,20.21965,605.45,4471.38,4.29,0.676999
50%,6.771121,0.1348,2.784,2.6,77.7,805.0,5289.0,4.49249,0.863039
75%,14.070344,0.83045,3.924,5.95,254.03685,1117.205,5748.0,4.626,1.17025
max,83830.0,135.5446,53.6,1080.0,9667.9228,2529.02,46696.0,5.27596,85.0


### Normalize Target 'Disposition' Rows

In [9]:
# 1- TOI
# Collapse the TFOPWG codes onto the three shared labels. Any code that is not
# listed in the mapping becomes NaN and the row is dropped.
df_toi = (
    df_toi
    .assign(disposition=df_toi['disposition'].map(
        {"CP": "CONFIRMED", "KP": "CONFIRMED", "FP": "FALSE POSITIVE", "PC": "CANDIDATE"}
    ))
    .dropna(subset=['disposition'])
)

# 2- K2
# Built as one chain so the column update is never applied to a filtered slice
# of the original frame (the chained-assignment pattern pandas warns about).
df_k2 = (
    df_k2
    .loc[df_k2['disposition'] != 'REFUTED']
    # Rescale transit depth by 1e4 (presumably percent -> ppm to match the
    # other catalogues — TODO confirm against the K2 column definition).
    .assign(trandep=lambda frame: frame['trandep'] * 10000)
    .drop_duplicates()
)

### Clean The Data

In [10]:
# Numeric model features, in the order they are fed to the imputer and the model.
numeric_cols = [
    'orbper',
    'trandep',
    'trandur',
    'rade',
    'insol',
    'eqt',
    'teff',
    'logg',
    'rad',
]

#### Convert Datatypes

In [11]:
# Clean datatypes
# Coerce each feature column to a numeric dtype; values that cannot be parsed
# become NaN and are handled by the imputation step. `pd.to_numeric` is applied
# to the whole column at once instead of once per element via `Series.apply`.
for frame in (df_toi, df_koi, df_k2):
    for col in numeric_cols:
        frame[col] = pd.to_numeric(frame[col], errors='coerce')

#### Handle Negatives

In [13]:
# Handle Negatives
# Policy per feature:
#   insol / eqt / teff                    -> negatives are set to 0
#   orbper / trandep / trandur / rade / rad -> negatives become NaN (imputed later)
#   logg                                  -> left untouched
for frame in (df_toi, df_koi, df_k2):
    available = [name for name in numeric_cols if name in frame.columns]
    for feature in available:
        if feature == "logg":
            continue

        if feature in ("insol", "eqt", "teff"):
            below_zero = frame[feature] < 0
            frame.loc[below_zero, feature] = 0
        elif feature in ("orbper", "trandep", "trandur", "rade", "rad"):
            below_zero = frame[feature] < 0
            frame.loc[below_zero, feature] = np.nan

#### Handle Missing

In [14]:
# Handle missing
# A single median imputer object is re-fitted on each catalogue in turn, so
# every catalogue is filled with its own per-column medians.
imputer = SimpleImputer(strategy='median')
for frame in (df_toi, df_koi, df_k2):
    filled_values = imputer.fit_transform(frame[numeric_cols])
    frame[numeric_cols] = filled_values

#### Handle Outliers

In [15]:
# Shapes of the three frames before outlier removal, one per line.
print(*(frame.shape for frame in (df_toi, df_koi, df_k2)), sep="\n")

(7111, 10)
(9564, 10)
(3961, 10)


In [16]:
# Handle outliers: remove rows with |z| > 3 in numeric cols
# Columns are filtered one after another and the frame is shrunk in place, so
# each column's z-scores are computed on the rows that survived the previous ones.
for frame in (df_toi, df_koi, df_k2):
    for feature in numeric_cols:
        abs_z = np.abs(zscore(frame[feature]))
        outlier_labels = frame[abs_z > 3].index
        frame.drop(index=outlier_labels, inplace=True)

In [17]:
# Shapes of the three frames after outlier removal, one per line.
print(*(frame.shape for frame in (df_toi, df_koi, df_k2)), sep="\n")

(6475, 10)
(8421, 10)
(3540, 10)


### Merge Datasets

In [18]:
# Concat
# Stack the three cleaned catalogues. `ignore_index=True` gives the merged
# frame a fresh 0..n-1 index; without it each catalogue keeps its own row
# labels and the result contains duplicate index values.
df = pd.concat([df_toi, df_koi, df_k2], ignore_index=True)

In [19]:
df.shape

(18436, 10)

In [20]:
df.describe()

Unnamed: 0,orbper,trandep,trandur,rade,insol,eqt,teff,logg,rad
count,18436.0,18436.0,18436.0,18436.0,18436.0,18436.0,18436.0,18436.0,18436.0
mean,32.9115,8792.993087,3.870721,12.372583,843.978647,1026.873401,5574.903301,4.390677,1.114394
std,93.420243,26911.070484,3.097626,109.239015,2909.436411,525.661191,825.139385,0.231279,0.477575
min,0.163821,0.0,0.0,0.08,0.014762,97.0,2828.0,3.47,0.16
25%,2.845733,386.9,2.16085,1.870895,57.6125,716.0,5262.0,4.275,0.81
50%,5.98276,1343.0,2.977,3.01,118.376,874.0,5680.0,4.44,1.0
75%,16.067803,5905.725,4.44425,11.667525,581.164,1288.0,6060.0,4.53,1.314805
max,2790.0,263860.0,25.01,7233.87,234049.39,3295.0,9088.0,5.14315,5.86835


In [21]:
# Box-plot style fences for every numeric feature, stored as [upper, lower].
# The lower fence is never allowed to fall below the observed minimum.
whisker_map = {}
for feature in numeric_cols:
    column = df[feature]
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    spread = q3 - q1
    upper_fence = q3 + 1.5 * spread
    lower_fence = max(q1 - 1.5 * spread, column.min())
    whisker_map[feature] = [upper_fence, lower_fence]

In [22]:
whisker_map

{'orbper': [np.float64(35.90090904899999), 0.1638211],
 'trandep': [np.float64(14183.962500000001), 0.0],
 'trandur': [np.float64(7.869350000000001), 0.0],
 'rade': [np.float64(26.362470318750006), 0.08],
 'insol': [np.float64(1366.49125), 0.0147618],
 'eqt': [np.float64(2146.0), 97.0],
 'teff': [np.float64(7257.0), np.float64(4065.0)],
 'logg': [np.float64(4.9125), np.float64(3.8925000000000005)],
 'rad': [np.float64(2.0720123125), 0.16]}

In [23]:
# Median imputer fitted on the merged frame; this is the instance that is
# later saved as an artifact and reused at prediction time.
imputer = SimpleImputer(strategy='median').fit(df[numeric_cols])
imputer

In [None]:
# Persist the cleaned, merged dataset first so the file exists in every environment.
df.to_csv("clean_data.csv", index=False)

# The browser download only exists on Google Colab. Elsewhere the import fails,
# and the CSV written above is already available on local disk.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; clean_data.csv was written to the working directory.")
else:
    files.download("clean_data.csv")

## **Load Cleaned Data**

In [None]:
# Reload the cleaned dataset from disk; this replaces the in-memory `df`.
# NOTE(review): the describe() output recorded for this section reports 20,644 rows,
# while the frame exported earlier in the notebook had 18,436 — confirm which
# clean_data.csv this cell is expected to read.
df = pd.read_csv('clean_data.csv')

In [None]:
df.describe()

Unnamed: 0,orbper,trandep,trandur,rade,insol,eqt,teff,logg,rad
count,20644.0,20644.0,20644.0,20644.0,20644.0,20644.0,20644.0,20644.0,20644.0
mean,48.927313,13405.62,4.26762,50.809775,4234.289,1096.186589,5630.406645,4.341396,1.489391
std,1084.024342,56485.21,4.786061,2055.161109,106764.2,718.192542,1100.614761,0.356652,4.328269
min,0.163821,0.0,0.0,0.08,0.0,25.0,2520.0,0.047,0.109
25%,2.685181,86.575,2.184,1.980477,58.3425,718.0,5231.0,4.23,0.82
50%,5.886654,570.0,3.0358,3.412906,124.248,878.0,5686.0,4.438,1.02
75%,16.573001,4920.0,4.63491,12.622546,680.8313,1350.0,6087.0,4.523,1.43
max,129995.7784,1541400.0,138.54,200346.0,10947550.0,14667.0,50000.0,5.96065,229.908


## **Encode & Split Data**

In [24]:
# Work on a copy so encoding/splitting never modifies the cleaned frame `df`.
df_enc = df.copy()

In [25]:
# Split
# Features are every column except the target.
X = df_enc.drop(columns=['disposition'])

# Integer-encode the target labels; `le` is kept so predictions can be decoded later.
le = LabelEncoder()
y = le.fit_transform(df_enc['disposition'])

# 80/20 hold-out, stratified on the encoded target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [26]:
X_train.shape, y_train.shape

((14748, 9), (14748,))

In [27]:
X_test.shape, y_test.shape

((3688, 9), (3688,))

## **Model Training**

### XGBoost Grid Search

In [None]:
# XGBoost tuned with an exhaustive grid search
print("\n--- XGBOOST MODEL WITH GRID SEARCH ---")

# Base estimator; the grid below overrides depth, learning rate and tree count.
xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)

# 3 x 3 x 3 = 27 hyperparameter combinations
xgb_params = dict(
    max_depth=[5, 7, 9],
    learning_rate=[0.05, 0.1, 0.2],
    n_estimators=[300, 500, 700],
)

# 5-fold cross-validation, scored by weighted F1, using all available cores
xgb_grid = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_params,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
)
xgb_grid.fit(X_train, y_train)

# Estimator refitted with the best-scoring combination
best_xgb = xgb_grid.best_estimator_

print("\n--- TRAINING COMPLETE ---")

In [None]:
# Score the tuned model on the hold-out split.
xgb_pred = best_xgb.predict(X_test)

print("\nBest XGBoost Params:", xgb_grid.best_params_) # best parameters : { max_depth=7, learning_rate=0.1, n_estimators=300 }

xgb_accuracy = accuracy_score(y_test, xgb_pred)
print("\nXGBoost Accuracy (full precision):", xgb_accuracy)

xgb_report = classification_report(y_test, xgb_pred, digits=4)
print("\nXGBoost Detailed Classification Report:\n", xgb_report)

# Confusion Matrix over the three encoded classes
labels = [0, 1, 2]
cm_xgb = confusion_matrix(y_test, xgb_pred, labels=labels)
print("\nXGBoost Confusion Matrix (rows=true, cols=pred):\n")
cm_table = pd.DataFrame(cm_xgb, index=labels, columns=labels)
display(cm_table)

## **Save & Predict Functions**

### Save Function

#### - Function

In [None]:
def save_artifacts(model, label_encoder, imputer, whisker_map, feature_map, numeric_cols, artifacts_dir="artifacts"):
    """
    Save all training artifacts for later inference.

    Files written into `artifacts_dir` (created if missing):
    model.pkl, label_encoder.pkl (only when an encoder is given),
    imputer.joblib, whisker_map.json and metadata.json.

    Parameters
    ----------
    model : trained model (sklearn/XGBoost etc.), must be picklable
    label_encoder : fitted LabelEncoder (or None to skip saving it)
    imputer : fitted SimpleImputer (median)
    whisker_map : dict mapping col -> [upper, lower] (from training);
        NumPy scalars/arrays inside it are converted to plain Python values
    feature_map : dict mapping canonical names -> list of input column aliases
    numeric_cols : sequence of numeric feature names used for training
    artifacts_dir : str, directory to save artifacts

    Raises
    ------
    TypeError
        If a value in the JSON payloads is neither JSON-native nor a NumPy
        scalar/array.
    """
    def _to_builtin(value):
        # json.dump fallback: NumPy scalars that do not subclass Python
        # float/int (e.g. float32, int64) and arrays are not serializable as-is.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    os.makedirs(artifacts_dir, exist_ok=True)

    # 1. Save model
    with open(os.path.join(artifacts_dir, "model.pkl"), "wb") as f:
        pickle.dump(model, f)

    # 2. Save label encoder (optional)
    if label_encoder is not None:
        with open(os.path.join(artifacts_dir, "label_encoder.pkl"), "wb") as f:
            pickle.dump(label_encoder, f)

    # 3. Save imputer
    joblib.dump(imputer, os.path.join(artifacts_dir, "imputer.joblib"))

    # 4. Save whisker map
    with open(os.path.join(artifacts_dir, "whisker_map.json"), "w") as f:
        json.dump(whisker_map, f, indent=2, default=_to_builtin)

    # 5. Save metadata (feature map + numeric cols + model info)
    metadata = {
        "feature_map": feature_map,
        # list() so an Index/tuple/array of names serializes as a JSON array
        "numeric_cols": list(numeric_cols),
        "model_type": model.__class__.__name__,
        "imputer_filename": "imputer.joblib",
        "whisker_filename": "whisker_map.json",
        "model_filename": "model.pkl",
        "label_encoder_filename": "label_encoder.pkl"
    }
    with open(os.path.join(artifacts_dir, "metadata.json"), "w") as f:
        json.dump(metadata, f, indent=2, default=_to_builtin)

    print(f"Artifacts saved to {artifacts_dir}")

#### - Call Example

In [None]:
# Save everything: tuned model, label encoder, final imputer, outlier fences,
# column alias map and the feature list, all into ./artifacts
save_artifacts(best_xgb, le, imputer, whisker_map, feature_map, numeric_cols, artifacts_dir="artifacts")

Artifacts saved to artifacts


### Predict Function

#### - Function

In [None]:
def load_artifacts(
    artifacts_dir="artifacts",
    model_filename_candidates=("model.pkl", "model.joblib", "model.sav"),
    imputer_filename="imputer.joblib",
    whisker_filename="whisker_map.json",
    metadata_filename="metadata.json",
    label_encoder_filename="label_encoder.pkl"
):
    """
    Load everything needed for inference from `artifacts_dir`.

    Parameters
    ----------
    artifacts_dir : str, directory holding the saved artifacts
    model_filename_candidates : filenames tried in order; the first that exists is loaded
    imputer_filename, whisker_filename, metadata_filename : required files
    label_encoder_filename : optional file; missing -> label_encoder is None

    Returns
    -------
    tuple
        (meta, feature_map, numeric_cols, imputer, whisker_map, model, label_encoder)

    Raises
    ------
    FileNotFoundError
        If the metadata, imputer or whisker file is missing, or no model
        candidate exists.
    ValueError
        If the metadata has no "numeric_cols" entry.

    Notes
    -----
    pickle/joblib loading can execute arbitrary code; only point this at
    artifact directories you trust.
    """
    def _read_json(path):
        # Context manager so the handle is closed even if parsing fails.
        with open(path, "r") as fh:
            return json.load(fh)

    # --------------------------- #
    #  Load artifacts & metadata  #
    # --------------------------- #
    meta_path = os.path.join(artifacts_dir, metadata_filename)
    if not os.path.exists(meta_path):
        raise FileNotFoundError(f"Missing metadata file: {meta_path}")
    meta = _read_json(meta_path)

    feature_map = meta.get("feature_map")
    numeric_cols = meta.get("numeric_cols")
    if numeric_cols is None:
        raise ValueError("metadata.json must contain numeric_cols (list of numeric features).")

    # imputer
    imputer_path = os.path.join(artifacts_dir, imputer_filename)
    if not os.path.exists(imputer_path):
        raise FileNotFoundError(f"Missing imputer file: {imputer_path}")
    imputer = joblib.load(imputer_path)

    # whisker map
    whisker_path = os.path.join(artifacts_dir, whisker_filename)
    if not os.path.exists(whisker_path):
        raise FileNotFoundError(f"Missing whisker_map file: {whisker_path}")
    whisker_map = _read_json(whisker_path)

    # model: first existing candidate wins
    model = None
    for fname in model_filename_candidates:
        cand = os.path.join(artifacts_dir, fname)
        if os.path.exists(cand):
            try:
                model = joblib.load(cand)
            except Exception:
                # Deliberate best-effort fallback to plain pickle if joblib cannot read the file.
                with open(cand, "rb") as f:
                    model = pickle.load(f)
            break
    if model is None:
        raise FileNotFoundError(f"No model found in {artifacts_dir}")

    # label encoder (optional)
    label_encoder = None
    le_path = os.path.join(artifacts_dir, label_encoder_filename)
    if os.path.exists(le_path):
        with open(le_path, "rb") as f:
            label_encoder = pickle.load(f)

    return meta, feature_map, numeric_cols, imputer, whisker_map, model, label_encoder


# Example : meta, feature_map, numeric_cols, imputer, whisker_map, model, label_encoder = load_artifacts()

In [None]:
from datetime import datetime

def predict_with_model(
    data, meta, feature_map, numeric_cols, imputer,
    whisker_map, model, label_encoder,
    return_proba=False,
    output_csv_path=None,
    artifacts_dir="artifacts"
):
    """
    Predict using the saved artifacts and preprocessing policy.

    Pipeline: normalize input to a DataFrame -> rename alias columns to
    canonical names -> coerce to numeric -> handle negatives -> impute ->
    clip to the whisker bounds -> predict -> decode labels.

    Parameters
    ----------
    data : str | pandas.DataFrame | dict | list
        Path to an existing CSV file, a DataFrame, one record (dict) or a
        list of records.
    meta : dict
        Loaded metadata. An optional "model_feature_order" entry overrides
        `numeric_cols` as the column order passed to the model.
    feature_map : dict or None
        canonical name -> list of aliases; inverted to rename input columns.
    numeric_cols : list of str
        Numeric feature names; missing ones are created as NaN and imputed.
    imputer : fitted imputer exposing transform()
    whisker_map : dict
        col -> [upper, lower] (or {"upper": ..., "lower": ...}) clipping bounds.
    model : fitted estimator exposing predict() and optionally predict_proba()
    label_encoder : fitted LabelEncoder or None
    return_proba : bool
        Also return per-class probabilities when the model supports them.
    output_csv_path : str or None
        Destination for multi-row results; defaults to a UTC-timestamped file
        inside `artifacts_dir`.
    artifacts_dir : str
        Directory used for the default output path.

    Returns
    -------
    dict
        For exactly one input row: prediction_index, prediction, probabilities.
    str
        Otherwise: path of the CSV holding the input plus prediction columns.

    Raises
    ------
    ValueError
        If `data` is of an unsupported type or is a path that does not exist.
    """
    # Local import: only needed for the timezone-aware default file name below.
    from datetime import timezone

    # ---------------------------
    # 1. Normalize input to DataFrame
    # ---------------------------
    if isinstance(data, str):
        if not os.path.exists(data):
            # Same exception type as before, but the message names the real problem.
            raise ValueError(f"data must be DataFrame, dict, list of dicts, or path to CSV (path not found: {data})")
        df = pd.read_csv(data)
    elif isinstance(data, pd.DataFrame):
        df = data.copy()
    elif isinstance(data, dict):
        df = pd.DataFrame([data])
    elif isinstance(data, list):
        df = pd.DataFrame(data)
    else:
        raise ValueError("data must be DataFrame, dict, list of dicts, or path to CSV")

    # Untouched copy of the input; the multi-row output is built from it.
    df_original = df.copy(deep=True)

    # ---------------------------
    # 2. Column rename (invert feature_map)
    # ---------------------------
    if feature_map:
        rename_dict = {old: new for new, olds in feature_map.items() for old in olds}
        df = df.rename(columns=rename_dict)

    # ---------------------------
    # 3. Ensure numeric cols exist & convert dtypes
    # ---------------------------
    for col in numeric_cols:
        if col not in df.columns:
            df[col] = np.nan
        else:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # ---------------------------
    # 4. Handle negatives
    # ---------------------------
    for col in numeric_cols:
        if col in ("insol", "eqt", "teff"):
            # Clip negatives to 0
            df[col] = df[col].clip(lower=0)
        elif col in ("orbper", "trandep", "trandur", "rade", "rad"):
            # Replace negatives with NaN (imputed in the next step)
            df.loc[df[col] < 0, col] = np.nan
        # "logg" and any other column: kept as-is

    # ---------------------------
    # 5. Impute missing
    # ---------------------------
    imputed_arr = imputer.transform(df[numeric_cols])
    df[numeric_cols] = pd.DataFrame(imputed_arr, columns=numeric_cols, index=df.index)

    # ---------------------------
    # 6. Clip outliers
    # ---------------------------
    for col in numeric_cols:
        val = whisker_map.get(col)
        if val is None:
            continue
        if isinstance(val, dict):
            upper = float(val.get("upper", np.inf))
            lower = float(val.get("lower", -np.inf))
        elif isinstance(val, (list, tuple)) and len(val) >= 2:
            # Stored order is [upper, lower]
            upper = float(val[0])
            lower = float(val[1])
        else:
            continue

        high_mask = df[col] > upper
        low_mask = df[col] < lower
        if high_mask.any():
            df.loc[high_mask, col] = upper
        if low_mask.any():
            df.loc[low_mask, col] = lower

    # ---------------------------
    # 7. Model features
    # ---------------------------
    model_feature_order = meta.get("model_feature_order") or numeric_cols
    for col in model_feature_order:
        if col not in df.columns:
            df[col] = 0.0
    X = df[model_feature_order].values

    # ---------------------------
    # 8. Predict
    # ---------------------------
    preds_idx = np.array(model.predict(X))

    proba_list = None
    if return_proba and hasattr(model, "predict_proba"):
        proba_raw = model.predict_proba(X)
        if label_encoder is not None and hasattr(model, "classes_"):
            try:
                class_labels = label_encoder.inverse_transform(model.classes_)
            except Exception:
                # Best-effort: fall back to the raw class ids if decoding fails.
                class_labels = [str(c) for c in model.classes_]
        else:
            class_labels = [str(c) for c in getattr(model, "classes_", range(proba_raw.shape[1]))]
        proba_list = [
            {str(class_labels[c_i]): float(proba_raw[row_i, c_i])
             for c_i in range(proba_raw.shape[1])}
            for row_i in range(proba_raw.shape[0])
        ]

    # ---------------------------
    # 9. Decode labels
    # ---------------------------
    if label_encoder is not None:
        try:
            preds_decoded = label_encoder.inverse_transform(preds_idx)
        except Exception:
            # Best-effort: return the encoded ids if decoding fails.
            preds_decoded = preds_idx
    else:
        preds_decoded = preds_idx

    # ---------------------------
    # 10. Build output
    # ---------------------------
    n_rows = len(df)

    if n_rows == 1:
        return {
            "prediction_index": int(preds_idx[0]),
            "prediction": str(preds_decoded[0]),
            "probabilities": proba_list[0] if proba_list is not None else None
        }

    df_out = df_original.reset_index(drop=True)
    df_out["prediction"] = preds_decoded
    df_out["prediction_index"] = preds_idx.astype(int)
    if proba_list is not None:
        class_labels = list(proba_list[0].keys())
        for cl in class_labels:
            df_out[f"prob_{cl}"] = [p.get(cl, None) for p in proba_list]

    if not output_csv_path:
        # Timezone-aware replacement for the deprecated datetime.utcnow(); same file-name format.
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
        output_csv_path = os.path.join(artifacts_dir, f"predictions_{timestamp}.csv")
    df_out.to_csv(output_csv_path, index=False)
    return output_csv_path

#### - Call

In [None]:
# Load once at server startup
meta, feature_map, numeric_cols, imputer, whisker_map, model, label_encoder = load_artifacts()

# One record using TOI-style column names; they are renamed inside predict_with_model.
sample = dict(
    pl_orbper=12.3,
    pl_trandep=150,
    pl_trandurh=3.2,
    pl_rade=1.8,
    pl_insol=300,
    pl_eqt=1200,
    st_teff=5800,
    st_logg=4.5,
    st_rad=1.0,
)

result = predict_with_model(
    data=sample,
    meta=meta,
    feature_map=feature_map,
    numeric_cols=numeric_cols,
    imputer=imputer,
    whisker_map=whisker_map,
    model=model,
    label_encoder=label_encoder,
)
print(result)
# shape : {"prediction_index": 1, "prediction": "CONFIRMED", "probabilities": {...}}
# ("probabilities" is None here because return_proba is left at its default of False)

{'prediction_index': 1, 'prediction': 'CONFIRMED', 'probabilities': None}


## **Model Predict Class**

In [None]:
import os
import json
import pickle
import joblib
import numpy as np
import pandas as pd
from datetime import datetime
from typing import Optional, Union, List, Dict, Any

class PredictionModel:
    """
    Inference service: loads the saved artifacts once and applies the training
    preprocessing policy (rename, numeric coercion, negative handling, median
    imputation, whisker clipping) before predicting.

    pickle/joblib loading can execute arbitrary code; only point `artifacts_dir`
    at a directory you trust.
    """

    def __init__(
        self,
        artifacts_dir: str = "artifacts",
        model_filename_candidates: tuple = ("model.pkl", "model.joblib", "model.sav"),
        imputer_filename: str = "imputer.joblib",
        whisker_filename: str = "whisker_map.json",
        metadata_filename: str = "metadata.json",
        label_encoder_filename: str = "label_encoder.pkl",
    ):
        """
        Store the artifact locations and load everything immediately.

        Raises FileNotFoundError / ValueError from `_load_artifacts` when a
        required artifact is missing or the metadata is incomplete.
        """
        self.artifacts_dir = artifacts_dir
        self.model_filename_candidates = model_filename_candidates
        self.imputer_filename = imputer_filename
        self.whisker_filename = whisker_filename
        self.metadata_filename = metadata_filename
        self.label_encoder_filename = label_encoder_filename

        # loaded artifacts placeholders
        self.meta = None
        self.feature_map = None            # mapping canonical -> list of aliases
        self.rename_map = None             # inverted alias -> canonical
        self.numeric_cols = None
        self.imputer = None
        self.whisker_map = None
        self.model = None
        self.label_encoder = None

        # load everything on init
        self._load_artifacts()

    @staticmethod
    def _read_json(path: str):
        """Parse a JSON file, closing the handle even if parsing fails."""
        with open(path, "r") as fh:
            return json.load(fh)

    def _load_artifacts(self):
        """
        Populate meta, feature/rename maps, numeric_cols, imputer, whisker_map,
        model and (optionally) label_encoder from `self.artifacts_dir`.

        Raises
        ------
        FileNotFoundError
            Metadata, imputer or whisker file missing, or no model candidate found.
        ValueError
            Metadata has no "numeric_cols" entry.
        """
        meta_path = os.path.join(self.artifacts_dir, self.metadata_filename)
        if not os.path.exists(meta_path):
            raise FileNotFoundError(f"Missing metadata file: {meta_path}")
        self.meta = self._read_json(meta_path)

        # feature_map expected as canonical -> [aliases...]
        self.feature_map = self.meta.get("feature_map", {}) or {}
        # invert to alias -> canonical rename map for pandas.rename
        self.rename_map = {old: new for new, olds in self.feature_map.items() for old in olds}

        self.numeric_cols = self.meta.get("numeric_cols")
        if self.numeric_cols is None:
            raise ValueError("metadata.json must contain numeric_cols (list of numeric features).")

        # imputer
        imputer_path = os.path.join(self.artifacts_dir, self.imputer_filename)
        if not os.path.exists(imputer_path):
            raise FileNotFoundError(f"Missing imputer file: {imputer_path}")
        self.imputer = joblib.load(imputer_path)

        # whisker map
        whisker_path = os.path.join(self.artifacts_dir, self.whisker_filename)
        if not os.path.exists(whisker_path):
            raise FileNotFoundError(f"Missing whisker_map file: {whisker_path}")
        self.whisker_map = self._read_json(whisker_path)

        # model: try candidate filenames, first existing one wins
        self.model = None
        for fname in self.model_filename_candidates:
            cand = os.path.join(self.artifacts_dir, fname)
            if os.path.exists(cand):
                try:
                    self.model = joblib.load(cand)
                except Exception:
                    # Deliberate best-effort fallback to plain pickle.
                    with open(cand, "rb") as f:
                        self.model = pickle.load(f)
                break
        if self.model is None:
            raise FileNotFoundError(f"No model found in {self.artifacts_dir} matching {self.model_filename_candidates}")

        # label encoder (optional)
        le_path = os.path.join(self.artifacts_dir, self.label_encoder_filename)
        if os.path.exists(le_path):
            with open(le_path, "rb") as f:
                self.label_encoder = pickle.load(f)

    def _preprocess_df(self, df: pd.DataFrame):
        """
        Apply column rename, dtype coercion, negative handling, imputation, and whisker clipping.
        Returns a processed copy; the caller's frame is not modified.
        """
        df = df.copy()

        # 1) Rename columns using inverted map (alias -> canonical)
        if self.rename_map:
            df = df.rename(columns=self.rename_map)

        # 2) Ensure numeric cols exist & coerce
        for col in self.numeric_cols:
            if col not in df.columns:
                df[col] = np.nan
            else:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        # 3) Handle negatives by rules
        for col in self.numeric_cols:
            if col in ("insol", "eqt", "teff"):
                # negatives -> 0
                neg_mask = df[col] < 0
                if neg_mask.any():
                    df.loc[neg_mask, col] = 0.0
            elif col in ("orbper", "trandep", "trandur", "rade", "rad"):
                # negatives -> NaN (imputed in the next step)
                neg_mask = df[col] < 0
                if neg_mask.any():
                    df.loc[neg_mask, col] = np.nan
            # "logg" and any other column: keep as-is

        # 4) Impute using saved imputer and convert back to DataFrame safely
        imputed_arr = self.imputer.transform(df[self.numeric_cols])
        df[self.numeric_cols] = pd.DataFrame(imputed_arr, columns=self.numeric_cols, index=df.index)

        # 5) Clip outliers using whisker_map
        for col in self.numeric_cols:
            val = self.whisker_map.get(col)
            if val is None:
                continue
            if isinstance(val, dict):
                upper = float(val.get("upper", np.inf))
                lower = float(val.get("lower", -np.inf))
            elif isinstance(val, (list, tuple)) and len(val) >= 2:
                # Stored order is [upper, lower]
                upper = float(val[0])
                lower = float(val[1])
            else:
                continue

            high_mask = df[col] > upper
            low_mask = df[col] < lower
            if high_mask.any():
                df.loc[high_mask, col] = upper
            if low_mask.any():
                df.loc[low_mask, col] = lower

        return df

    def predict(
        self,
        data: Union[str, pd.DataFrame, dict, List[dict]],
        return_proba: bool = False,
        output_csv_path: Optional[str] = None
    ) -> Union[Dict[str, Any], str]:
        """
        Predict on the given data.
        - Single-row input -> returns dict (JSON-ready) with prediction_index,
          prediction and probabilities (None unless return_proba is set and
          the model supports predict_proba)
        - Multi-row / CSV -> saves CSV (input columns + "prediction", plus
          "confidence_level" when probabilities are available) and returns
          the path to that CSV; defaults to a UTC-timestamped file in
          `self.artifacts_dir`

        Raises ValueError if `data` is of an unsupported type or is a path
        that does not exist.
        """
        # Local import: only needed for the timezone-aware default file name below.
        from datetime import timezone

        # normalize to DataFrame
        if isinstance(data, str):
            if not os.path.exists(data):
                # Same exception type as before, but the message names the real problem.
                raise ValueError(f"data must be DataFrame, dict, list of dicts, or path to CSV (path not found: {data})")
            df = pd.read_csv(data)
        elif isinstance(data, pd.DataFrame):
            df = data.copy()
        elif isinstance(data, dict):
            df = pd.DataFrame([data])
        elif isinstance(data, list):
            df = pd.DataFrame(data)
        else:
            raise ValueError("data must be DataFrame, dict, list of dicts, or path to CSV")

        # Untouched copy of the input; the multi-row output is built from it.
        df_original = df.copy(deep=True)

        # preprocess
        df_proc = self._preprocess_df(df)

        # prepare feature matrix
        model_feature_order = self.meta.get("model_feature_order") or self.numeric_cols
        for col in model_feature_order:
            if col not in df_proc.columns:
                df_proc[col] = 0.0
        X = df_proc[model_feature_order].values

        # predict
        preds_idx = np.array(self.model.predict(X))

        # probabilities (if requested and supported)
        proba_list = None
        if return_proba and hasattr(self.model, "predict_proba"):
            proba_raw = self.model.predict_proba(X)
            if self.label_encoder is not None and hasattr(self.model, "classes_"):
                try:
                    class_labels = self.label_encoder.inverse_transform(self.model.classes_)
                except Exception:
                    # Best-effort: fall back to the raw class ids if decoding fails.
                    class_labels = [str(c) for c in self.model.classes_]
            else:
                class_labels = [str(c) for c in getattr(self.model, "classes_", range(proba_raw.shape[1]))]
            proba_list = [
                {str(class_labels[c_i]): float(proba_raw[row_i, c_i])
                 for c_i in range(proba_raw.shape[1])}
                for row_i in range(proba_raw.shape[0])
            ]

        # decode labels if label encoder exists
        if self.label_encoder is not None:
            try:
                preds_decoded = self.label_encoder.inverse_transform(preds_idx)
            except Exception:
                # Best-effort: return the encoded ids if decoding fails.
                preds_decoded = preds_idx
        else:
            preds_decoded = preds_idx

        n_rows = len(df_proc)
        # single-row -> return JSON/dict
        if n_rows == 1:
            out = {
                "prediction_index": int(preds_idx[0]) if np.issubdtype(preds_idx.dtype, np.integer) else preds_idx[0],
                "prediction": str(preds_decoded[0]),
                "probabilities": proba_list[0] if proba_list is not None else None
            }
            return out

        # multi-row -> append columns to original and save CSV
        df_out = df_original.reset_index(drop=True)
        df_out["prediction"] = preds_decoded

        if proba_list is not None:
            # Probability assigned to the predicted class of each row.
            df_out["confidence_level"] = [
                proba_list[i].get(str(pred), None)
                for i, pred in enumerate(preds_decoded)
            ]

        if not output_csv_path:
            # Timezone-aware replacement for the deprecated datetime.utcnow(); same file-name format.
            timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
            output_csv_path = os.path.join(self.artifacts_dir, f"predictions_{timestamp}.csv")
        df_out.to_csv(output_csv_path, index=False)
        return output_csv_path

In [None]:
# One record using TOI-style column names; PredictionModel renames them internally.
sample = {
    "pl_orbper": 12.3,
    "pl_trandep": 150,
    "pl_trandurh": 3.2,
    "pl_rade": 1.8,
    "pl_insol": 300,
    "pl_eqt": 1200,
    "st_teff": 5800,
    "st_logg": 4.5,
    "st_rad": 1.0
}

# instantiate once at server startup
service = PredictionModel(artifacts_dir="artifacts")

# Pass the single record, not the multi-row X_test: only single-row input
# returns a dict, while multi-row input returns the path of a CSV file.
result = service.predict(sample, return_proba=True)
# result is a dict -> return as JSON to chatbot

print(result['prediction'])
# Probability the model assigned to the predicted class
print(result['probabilities'][result['prediction']])

In [None]:
# Batch mode: a multi-row frame is scored and written to CSV, and predict()
# returns the path of that file rather than a dict.
service = PredictionModel(artifacts_dir="artifacts")

result = service.predict(
    data=X_test,
    output_csv_path='predict_file.csv',
    return_proba=True,
)

In [None]:
# Read back the batch predictions and show the first five rows.
predict_df = pd.read_csv(filepath_or_buffer='predict_file.csv')
predict_df.head(n=5)

Unnamed: 0,orbper,trandep,trandur,rade,insol,eqt,teff,logg,rad,prediction,confidence_level
0,4.830142,192.0,0.957,1.3,275.02,1039.0,5693.0,4.467,0.939,CANDIDATE,0.591718
1,4.210006,7990.0,2.653,13.6161,427.599,1266.0,6160.0,4.2,1.42,CANDIDATE,0.739604
2,4.958322,5121.0,5.223,16.3815,862.417,1509.0,5801.0,3.64,2.55,CANDIDATE,0.665858
3,6.392869,451.956323,3.712893,3.317165,435.391459,1165.037021,5712.0,4.12,1.44969,CONFIRMED,0.732415
4,4.371313,2960.0,4.113,12.9868,847.146,1502.0,6029.0,3.71,2.45,CANDIDATE,0.889646


## **Model Trainer Class**

### - Class

In [None]:
import os
import json
import pickle
import joblib
from typing import Optional, Dict, Any, Union, Tuple
from scipy.stats import zscore
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier

# -------------------------
# Utility: Save artifacts
# -------------------------
def save_artifacts(
    model,
    label_encoder,
    imputer,
    whisker_map,
    feature_map,
    numeric_cols,
    artifacts_dir="artifacts",
):
    """
    Persist a trained model and its preprocessing state to ``artifacts_dir``.

    Files written (the directory is created if needed):
      - model.pkl          : pickled ``model``
      - label_encoder.pkl  : pickled ``label_encoder`` (skipped when it is None)
      - imputer.joblib     : ``imputer`` dumped with joblib
      - whisker_map.json   : ``whisker_map`` as JSON
      - metadata.json      : ``feature_map``, ``numeric_cols``, the model class name,
                             the artifact file names and a UTC ``saved_at`` timestamp

    Parameters:
      - model: trained estimator to pickle
      - label_encoder: fitted label encoder, or None to skip writing it
      - imputer: fitted imputer
      - whisker_map: JSON-serialisable mapping written to whisker_map.json
      - feature_map: JSON-serialisable mapping stored in the metadata
      - numeric_cols: list of feature column names stored in the metadata
      - artifacts_dir: target directory

    Note: metadata.json always records "label_encoder.pkl" as the encoder file name,
    even when ``label_encoder`` is None and no such file is written by this call.
    """
    # Local import: the file-level import only brings `datetime` into scope.
    from datetime import timezone

    os.makedirs(artifacts_dir, exist_ok=True)
    # Save model
    with open(os.path.join(artifacts_dir, "model.pkl"), "wb") as f:
        pickle.dump(model, f)
    # Save label encoder
    if label_encoder is not None:
        with open(os.path.join(artifacts_dir, "label_encoder.pkl"), "wb") as f:
            pickle.dump(label_encoder, f)
    # Save imputer
    joblib.dump(imputer, os.path.join(artifacts_dir, "imputer.joblib"))
    # Save whisker_map
    with open(os.path.join(artifacts_dir, "whisker_map.json"), "w") as f:
        json.dump(whisker_map, f, indent=2)
    # Save metadata
    metadata = {
        "feature_map": feature_map,
        "numeric_cols": numeric_cols,
        "model_type": model.__class__.__name__,
        "imputer_filename": "imputer.joblib",
        "whisker_filename": "whisker_map.json",
        "model_filename": "model.pkl",
        "label_encoder_filename": "label_encoder.pkl",
        # datetime.utcnow() is deprecated; an aware UTC "now" formats to the same string.
        "saved_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    }
    with open(os.path.join(artifacts_dir, "metadata.json"), "w") as f:
        json.dump(metadata, f, indent=2)
    print(f"Artifacts saved to {artifacts_dir}")


# -------------------------
# Trainer class
# -------------------------
class Trainer:
    """
    Retrains the XGBoost disposition classifier on user-supplied data.

    The trainer reads ``feature_map`` and ``numeric_cols`` from
    ``<artifacts_dir>/metadata.json``, preprocesses the user's table, optionally merges
    it with a cleaned NASA CSV, fits an ``XGBClassifier`` and (by default) writes the
    refreshed artifacts through ``save_artifacts``.
    """

    # Short / alternative disposition spellings (after upper-casing and stripping)
    # mapped to the canonical labels.
    _DISPOSITION_ALIASES = {
        "CP": "CONFIRMED",
        "KP": "CONFIRMED",
        "FP": "FALSE POSITIVE",
        "PC": "CANDIDATE",
        "FALSE_POSITIVE": "FALSE POSITIVE",
        "FALSEPOSITIVE": "FALSE POSITIVE",
    }
    # Only rows carrying one of these labels are kept for training.
    _VALID_DISPOSITIONS = ["CONFIRMED", "CANDIDATE", "FALSE POSITIVE"]
    # Columns whose negative values are clipped to 0.
    _CLIP_NEGATIVE_COLS = ("insol", "eqt", "teff")
    # Columns whose negative values are treated as missing.
    _NAN_NEGATIVE_COLS = ("orbper", "trandep", "trandur", "rade", "rad")

    def __init__(
        self,
        artifacts_dir: str = "artifacts",
        nasa_clean_csv_path: str = "nasa_clean_data.csv",
        zscore_thresh: float = 3.0,
    ):
        """
        Loads existing metadata (feature_map, numeric_cols) from artifacts_dir/metadata.json.
        The cleaned NASA CSV (``nasa_clean_csv_path``) is loaded only when merging is requested.

        Parameters:
         - artifacts_dir: directory holding metadata.json (and receiving new artifacts)
         - nasa_clean_csv_path: default path of the cleaned NASA CSV used by ``train``
         - zscore_thresh: rows with |z| above this value in any numeric column are dropped

        Raises:
         - FileNotFoundError: metadata.json is missing from ``artifacts_dir``
         - ValueError: ``feature_map`` is not a dict or ``numeric_cols`` is not a list
        """
        self.artifacts_dir = artifacts_dir
        self.nasa_clean_csv_path = nasa_clean_csv_path
        self.zscore_thresh = zscore_thresh

        # load metadata (feature_map and numeric_cols must exist)
        meta_path = os.path.join(self.artifacts_dir, "metadata.json")
        if not os.path.exists(meta_path):
            raise FileNotFoundError(f"metadata.json not found in {self.artifacts_dir}")

        # context manager so the file handle is closed deterministically
        with open(meta_path, "r") as f:
            meta = json.load(f)

        self.feature_map = meta.get("feature_map", {})  # canonical -> [aliases...]
        if not isinstance(self.feature_map, dict):
            raise ValueError("feature_map in metadata.json must be a dict")

        self.numeric_cols = meta.get("numeric_cols")
        if not isinstance(self.numeric_cols, list):
            raise ValueError("numeric_cols must be present as a list in metadata.json")

        # create inverted rename map (alias -> canonical)
        self.rename_map = {old: new for new, olds in self.feature_map.items() for old in olds}

        # default XGBoost params (used if user didn't pass hyperparams)
        self.default_params = {
            "learning_rate": 0.1,
            "max_depth": 6,
            "n_estimators": 300,
            "random_state": 42,
            "eval_metric": "mlogloss",
        }

    # -------------------------
    # IQR whiskers (shared by preprocessing and training)
    # -------------------------
    def _compute_whisker_map(self, df: pd.DataFrame) -> Dict[str, list]:
        """
        Compute ``{column: [upper_whisker, lower_whisker]}`` for every numeric column.

        Whiskers are Q3 + 1.5*IQR and Q1 - 1.5*IQR over the non-missing values, with the
        lower whisker floored at the column minimum. A column with no observed values
        gets ``[inf, -inf]``.
        """
        whisker_map = {}
        for col in self.numeric_cols:
            valid = df[col].dropna()
            if len(valid) == 0:
                whisker_map[col] = [float('inf'), float('-inf')]
                continue
            q1 = valid.quantile(0.25)
            q3 = valid.quantile(0.75)
            iqr = q3 - q1
            lower = max(q1 - 1.5 * iqr, float(valid.min()))
            upper = q3 + 1.5 * iqr
            # plain floats keep the map JSON-serialisable
            whisker_map[col] = [float(upper), float(lower)]
        return whisker_map

    # -------------------------
    # Preprocess user DataFrame
    # -------------------------
    def _preprocess_user_df(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, list]]:
        """
        Applies the preprocessing pipeline to a copy of the user's dataframe.
        Returns (preprocessed_df, whisker_map_computed_from_this_df)

        Steps (in execution order):
         1. rename columns (aliases -> canonical)
         2. normalize disposition (map known codes -> CONFIRMED/CANDIDATE/FALSE POSITIVE,
            then drop rows with other dispositions)
         3. drop duplicates
         4. coerce datatypes for numeric_cols (absent columns are created as NaN)
         5. handle negatives: clip to 0 for insol/eqt/teff, set to NaN for
            orbper/trandep/trandur/rade/rad, leave logg untouched
         6. impute missing values with the per-column median of this dataframe;
            a column with no observed value at all is left as NaN
         7. outlier removal using z-score (|z| > zscore_thresh), column by column
         8. compute whisker_map using IQR
        """
        df = df.copy()

        # 1) rename columns
        if self.rename_map:
            df = df.rename(columns=self.rename_map)

        # 2) normalize disposition (if absent, nothing is dropped here; `train`
        #    enforces that the target exists)
        if "disposition" in df.columns:
            df['disposition'] = (
                df['disposition'].astype(str).str.upper().str.strip()
                .replace(self._DISPOSITION_ALIASES)
            )
            # keep only desired disposition values
            df = df[df['disposition'].isin(self._VALID_DISPOSITIONS)]

        # 3) remove duplicates (keep first)
        df = df.drop_duplicates().reset_index(drop=True)

        # 4) coerce datatypes for numeric columns (create missing columns as NaN)
        for c in self.numeric_cols:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors='coerce')
            else:
                df[c] = np.nan

        # 5) handle negatives
        for col in self.numeric_cols:
            if col in self._CLIP_NEGATIVE_COLS:
                df[col] = df[col].clip(lower=0)
            elif col in self._NAN_NEGATIVE_COLS:
                # treated as missing; filled by the median imputation in step 6
                df.loc[df[col] < 0, col] = np.nan
            # any other column (e.g. logg) is kept as-is

        # 6) median imputation fitted on this df. Done with pandas rather than
        #    SimpleImputer: SimpleImputer drops columns that have no observed value,
        #    which breaks the column assignment whenever step 4 had to create a column.
        #    Here such a column simply stays NaN.
        numeric = df[self.numeric_cols].astype(float)
        df[self.numeric_cols] = numeric.fillna(numeric.median())

        # 7) remove outliers: each column's z-scores are computed on the rows that
        #    survived the previous columns. NaN z-scores (empty or constant column)
        #    never exceed the threshold, so they remove nothing.
        for col in self.numeric_cols:
            z = np.abs(zscore(df[col].to_numpy(dtype=float)))
            df = df.loc[~(z > self.zscore_thresh)]
        df = df.reset_index(drop=True)

        # 8) compute whisker_map (IQR) using the preprocessed df
        whisker_map = self._compute_whisker_map(df)

        return df, whisker_map

    # -------------------------
    # Train method
    # -------------------------
    def train(
        self,
        user_data: Union[str, pd.DataFrame, dict, list],
        merge_with_nasa: bool = False,
        nasa_clean_csv_path: Optional[str] = None,
        hyperparams: Optional[Dict[str, Any]] = None,
        test_size: float = 0.2,
        artifacts_dir: Optional[str] = None,
        save_artifacts_flag: bool = True,
    ) -> Dict[str, Any]:
        """
        Train an XGBoost model on user data or merged with the cleaned NASA CSV.

        Parameters:
         - user_data: path to CSV, pandas DataFrame, dict (one row) or list of dicts
           (must contain disposition/target)
         - merge_with_nasa: if True, will concatenate user's preprocessed df with the cleaned NASA CSV
         - nasa_clean_csv_path: override default clean csv path (if provided)
         - hyperparams: dict of scalar hyperparameters for XGB (no grid search). Example keys:
               'learning_rate', 'max_depth', 'n_estimators', 'subsample',
               'colsample_bytree', 'gamma', 'reg_alpha', 'reg_lambda'
         - test_size: fraction of rows held out for validation (stratified split, seed 42)
         - artifacts_dir: override the directory artifacts are saved to
         - save_artifacts_flag: if True, persist model, label encoder, imputer,
           whisker_map and metadata via ``save_artifacts``

        Returns:
         - summary dict with keys: model, params_used, val_accuracy, val_f1_weighted,
           n_train_samples, n_val_samples

        Raises:
         - ValueError: unsupported/missing user data, missing 'disposition' column,
           no usable rows, a feature column without any observed value, or
           list/tuple hyperparameter values
         - FileNotFoundError: ``merge_with_nasa`` is True and the NASA CSV is missing
        """
        artifacts_dir = artifacts_dir or self.artifacts_dir
        nasa_clean_csv_path = nasa_clean_csv_path or self.nasa_clean_csv_path

        # 1) load user data
        if isinstance(user_data, pd.DataFrame):
            df_user = user_data.copy()
        elif isinstance(user_data, str):
            if not os.path.exists(user_data):
                # a missing file gets its own message instead of the generic type error
                raise ValueError(f"user data CSV not found at {user_data}")
            df_user = pd.read_csv(user_data)
        elif isinstance(user_data, dict):
            df_user = pd.DataFrame([user_data])
        elif isinstance(user_data, list):
            df_user = pd.DataFrame(user_data)
        else:
            raise ValueError("user data must be DataFrame, dict, list of dicts, or path to CSV")

        # 2) preprocess user dataset (its own whisker map is not needed here; the
        #    whiskers saved with the model are recomputed on the final training table)
        df_user_pre, _ = self._preprocess_user_df(df_user)

        # 3) optionally merge with cleaned nasa csv
        if merge_with_nasa:
            if not os.path.exists(nasa_clean_csv_path):
                raise FileNotFoundError(f"cleaned NASA CSV not found at {nasa_clean_csv_path}")
            # rename columns in NASA cleaned df too (just in case)
            df_nasa = pd.read_csv(nasa_clean_csv_path).rename(columns=self.rename_map)
            # combine and drop duplicates
            working_df = (
                pd.concat([df_user_pre, df_nasa], ignore_index=True, sort=False)
                .drop_duplicates()
                .reset_index(drop=True)
            )
        else:
            working_df = df_user_pre.copy()

        # 4) ensure target exists and drop rows without it
        if 'disposition' not in working_df.columns:
            raise ValueError("Target column 'disposition' is missing after preprocessing.")
        working_df = working_df.dropna(subset=['disposition']).reset_index(drop=True)
        if working_df.empty:
            raise ValueError("No training rows remain after preprocessing.")

        # The imputer saved with the model must cover every numeric column, and
        # SimpleImputer discards columns that have no observed value, so fail early
        # with an explicit message instead of a shape-mismatch error further down.
        empty_cols = [c for c in self.numeric_cols if working_df[c].isna().all()]
        if empty_cols:
            raise ValueError(
                f"No observed values for feature column(s) {empty_cols}; cannot fit the imputer."
            )

        # 5) compute whisker_map on the training set using IQR (before imputation)
        whisker_map = self._compute_whisker_map(working_df)

        # 6) Fit imputer on training working_df numeric cols
        imputer = SimpleImputer(strategy='median')
        imputer.fit(working_df[self.numeric_cols])

        # 7) Impute training set
        working_df[self.numeric_cols] = pd.DataFrame(imputer.transform(working_df[self.numeric_cols]),
                                                    columns=self.numeric_cols,
                                                    index=working_df.index)

        # 8) Prepare X and y
        X = working_df[self.numeric_cols].copy()
        y = working_df['disposition'].astype(str).copy()

        # 9) encode y
        le = LabelEncoder()
        y_enc = le.fit_transform(y)

        # 10) train/test split
        X_train, X_val, y_train, y_val = train_test_split(
            X, y_enc, test_size=test_size, random_state=42, stratify=y_enc)

        # 11) train model with hyperparams or defaults
        params = self.default_params.copy()
        if hyperparams:
            # only take scalar values (not lists) — this class does not support grid search
            for k, v in hyperparams.items():
                if isinstance(v, (list, tuple)):
                    raise ValueError("hyperparams must be scalar values (no lists) — grid search not supported here.")
                params[k] = v
        model = XGBClassifier(**params)
        model.fit(X_train, y_train)

        # 12) evaluate (best effort: a metric failure must not discard the trained model)
        y_pred_val = model.predict(X_val)
        try:
            acc = float(accuracy_score(y_val, y_pred_val))
            f1 = float(f1_score(y_val, y_pred_val, average='weighted'))
        except Exception:
            acc, f1 = None, None

        # 13) Save artifacts (imputer fitted on training working_df, whisker_map from training, label encoder, model)
        if save_artifacts_flag:
            save_artifacts(model, le, imputer, whisker_map, self.feature_map, self.numeric_cols, artifacts_dir)

        # 14) return summary
        summary = {
            "model": model,
            "params_used": params,
            "val_accuracy": acc,
            "val_f1_weighted": f1,
            "n_train_samples": len(X_train),
            "n_val_samples": len(X_val)
        }
        return summary

### - Call

In [None]:
from sklearn.model_selection import train_test_split

# Split the cleaned dataset 70/30 with a fixed seed. The next cell saves the 70% part
# as "nasa_data.csv" and the 30% part as "user_upload.csv".
nasa_data = pd.read_csv('clean_data.csv')
clean_data70, clean_data30 = train_test_split(nasa_data, test_size=0.3, random_state=42, shuffle=True)

In [None]:
# NOTE: `google.colab` is only importable when running inside Google Colab.
from google.colab import files
# Write the 70% split to disk and trigger a browser download of it.
clean_data70.to_csv("nasa_data.csv", index=False)
files.download("nasa_data.csv")

# Same for the 30% split, which the training cells below load as 'user_upload.csv'.
clean_data30.to_csv("user_upload.csv", index=False)
files.download("user_upload.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Train on the user upload alone (no merge with the NASA table).
trainer = Trainer(artifacts_dir="artifacts", nasa_clean_csv_path="nasa_data.csv")
summary = trainer.train('user_upload.csv', merge_with_nasa=False)
# Report each metric under its own label (the weighted F1 was previously printed as "val acc").
print("val acc:", summary["val_accuracy"])
print("val f1 (weighted):", summary["val_f1_weighted"])

In [None]:
# Retrain with custom scalar hyperparameters, this time merging the user upload with
# the NASA CSV configured on `trainer`; the overrides are applied on top of the
# trainer's default XGBoost parameters.
hp = {"learning_rate": 0.05, "max_depth": 5, "n_estimators": 500}
summary = trainer.train("user_upload.csv", merge_with_nasa=True, hyperparams=hp)
print("params used:", summary["params_used"])

In [None]:
# Report each metric under its own label (the weighted F1 was previously printed as "val acc").
print("val acc:", summary["val_accuracy"])
print("val f1 (weighted):", summary["val_f1_weighted"])