In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer

In [None]:
#show more columns in outputs
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 200)

In [None]:
#loading the Summit DBE telemetry dataset

data_path = "datasets/points_with_jobs_tele_ult_original.csv"
df = pd.read_csv(data_path, low_memory=False)

In [None]:
#understanding the dataset structure
print("Number of records:", df.shape[0])
print("Number of columns:", df.shape[1])
print("Columns:", list(df.columns))

Number of records: 50259
Number of columns: 150
Columns: ['timestamp', 'hostname', 'GPU', 'serial', 'is_failure', 'prev_xid48', 'next_xid48', 'hours_in_job', 'allocation_id', 'user_name', 'account', 'gpu_energy', 'gpu_usage', 'flag_gpumps', 'flag_gpudefault', 'flag_smt1', 'flag_smt2', 'flag_smt4', 'flag_nvme', 'flag_cpublink', 'flag_disableautonuma', 'flag_isolategpfs', 'flag_maximizegpfs', 'n_jobs_1h', 'jobhours_1h', 'n_jobs_6h', 'jobhours_6h', 'n_jobs_24h', 'jobhours_24h', 'n_jobs_120h', 'jobhours_120h', 'job_max_power', 'job_range_power', 'job_min_power', 'n_no_tele_6h', 'n_no_tele_1h', 'n_no_tele_15min', 'n_no_tele_5min', 'n_no_tele_1min', 'n_NA_power_6h', 'n_NA_core_temp_6h', 'n_NA_mem_temp_6h', 'n_NA_power_1h', 'n_NA_core_temp_1h', 'n_NA_mem_temp_1h', 'n_NA_power_15min', 'n_NA_core_temp_15min', 'n_NA_mem_temp_15min', 'n_NA_power_5min', 'n_NA_core_temp_5min', 'n_NA_mem_temp_5min', 'n_NA_power_1min', 'n_NA_core_temp_1min', 'n_NA_mem_temp_1min', 'power_fluct_6h', 'core_temp_fluct_6h

In [5]:
print(df["is_failure"].value_counts())
print(df["is_failure"].value_counts(normalize=True))

is_failure
0    49964
1      295
Name: count, dtype: int64
is_failure
0    0.99413
1    0.00587
Name: proportion, dtype: float64


<b style="color:blue;">comment on output above:</b><br />
the output above shows that the *is_failure* is extremely imbalanced. only 295 failures out of 10k rows. This is normal for DBEs and totally expected because DBEs are rare.

In [6]:
print(df.dtypes.head(20))

timestamp           object
hostname            object
GPU                  int64
serial               int64
is_failure           int64
prev_xid48         float64
next_xid48         float64
hours_in_job       float64
allocation_id      float64
user_name           object
account             object
gpu_energy         float64
gpu_usage          float64
flag_gpumps         object
flag_gpudefault     object
flag_smt1           object
flag_smt2           object
flag_smt4           object
flag_nvme           object
flag_cpublink       object
dtype: object


In [7]:
#checking missing values
missing = df.isnull().sum()
print("Columns with missing values:")
print(missing[missing > 0].sort_values(ascending=False).head(30))

Columns with missing values:
last_zero_power         50240
prev_xid48              49982
next_xid48              49977
last_zero_core_temp     49848
last_zero_mem_temp      49833
flag_gpumps             20913
flag_nvme               20913
flag_gpudefault         20913
flag_disableautonuma    20913
flag_cpublink           20913
flag_isolategpfs        20913
flag_maximizegpfs       20913
flag_smt2               20913
flag_smt1               20913
flag_smt4               20913
mem_temp_fluct_1min     17450
mem_temp_fluct_5min     17353
mem_temp_range_1min     17344
mem_temp_min_1min       17344
mem_temp_max_1min       17344
mem_temp_mean_1min      17344
mem_temp_range_5min     17334
mem_temp_min_5min       17334
mem_temp_max_5min       17334
mem_temp_mean_5min      17334
mem_temp_fluct_15min    17322
mem_temp_range_15min    17316
mem_temp_max_15min      17316
mem_temp_min_15min      17316
mem_temp_mean_15min     17316
dtype: int64


<b style="color:blue;">comment on output above:</b><br />
based on output above many features have more than 95% missing values so they need to be removed

<b>Important insight:</b> Double bit errors (DBEs) are rare, which matches the low proportion of failures in this dataset. This supports the motivation for proactive ML prediction.

In [None]:
df_clean = df.copy()

# Instead of dropping all ID columns, drop only irrelevant ones
to_drop = [
    "allocation_id",
    "user_name",
    "account"
]

df_clean = df_clean.drop(columns=to_drop)
print("Shape after dropping irrelevant IDs:", df_clean.shape)

Shape after dropping ID columns: (50259, 143)


<b style="color:blue;">comment on output above:</b><br />
The dataset went from 150 to 143 columns after removing identifier fields. These columns describe users, nodes, timestamps, or GPU IDs rather than actual hardware behavior. Removing them prevents the model from learning irrelevant patterns. These do not describe GPU health or failure conditions. They would cause overfitting to node/user identity so we removed them

In [None]:
#dropping features with extreme missing values moew than 95% missing
#dropped features below appear only in rare cases and would not help a generalizable model. For example prev_xid48 and next_xid48 exist in fewer than 1 percent of rows. 
high_missing = ["prev_xid48", "next_xid48",
                "last_zero_power", "last_zero_core_temp", "last_zero_mem_temp"]

df_clean = df_clean.drop(columns=high_missing)
print(df_clean.shape)

Dropped: ['prev_xid48', 'next_xid48', 'last_zero_power', 'last_zero_core_temp', 'last_zero_mem_temp']
Shape after dropping high-missing columns (50259, 138)


<b style="color:blue;">comment on output above:</b><br />
Only five features had more than 95% missing entries. These fields rarely appear and provide almost no usable signal. Dropping them reduces noise and avoids unreliable imputations.

In [10]:
#listing remaining object-type columns
obj_cols = df_clean.select_dtypes(include=["object"]).columns.tolist()
print("Object columns:", obj_cols)

Object columns: ['flag_gpumps', 'flag_gpudefault', 'flag_smt1', 'flag_smt2', 'flag_smt4', 'flag_nvme', 'flag_cpublink', 'flag_disableautonuma', 'flag_isolategpfs', 'flag_maximizegpfs']


<b style="color:blue;">comment on output above:</b><br />
based on above output all of the object columns are flag columns that represent job-submission performance flags. they should be binary features, but in this dataset they appear as strings.

In [None]:
flag_cols = [
'flag_gpumps', 'flag_gpudefault', 'flag_smt1', 'flag_smt2', 'flag_smt4',
'flag_nvme', 'flag_cpublink', 'flag_disableautonuma', 'flag_isolategpfs', 'flag_maximizegpfs'
]

for col in flag_cols:
    print(col)
    print(df_clean[col].unique()[:20]) # print first 20 unique values
    print("Total unique:", df_clean[col].nunique())
    print("-" * 40)

flag_gpumps
[True False nan]
Total unique: 2
----------------------------------------
flag_gpudefault
[False nan True]
Total unique: 2
----------------------------------------
flag_smt1
[False nan True]
Total unique: 2
----------------------------------------
flag_smt2
[False nan True]
Total unique: 2
----------------------------------------
flag_smt4
[False nan True]
Total unique: 2
----------------------------------------
flag_nvme
[False nan True]
Total unique: 2
----------------------------------------
flag_cpublink
[False nan True]
Total unique: 2
----------------------------------------
flag_disableautonuma
[False nan True]
Total unique: 2
----------------------------------------
flag_isolategpfs
[False nan True]
Total unique: 2
----------------------------------------
flag_maximizegpfs
[False nan True]
Total unique: 2
----------------------------------------


In [None]:
#binary conversion for flags
#Converting True → 1, False → 0, NaN → 0

df_clean[flag_cols] = df_clean[flag_cols].fillna(False).astype(int)

print(df_clean[flag_cols].head())

   flag_gpumps  flag_gpudefault  flag_smt1  flag_smt2  flag_smt4  flag_nvme  flag_cpublink  flag_disableautonuma  flag_isolategpfs  flag_maximizegpfs
0            1                0          0          0          0          0              0                     0                 0                  0
1            0                0          0          0          0          0              0                     0                 0                  0
2            0                0          0          0          0          0              0                     0                 0                  0
3            0                0          1          0          0          0              0                     0                 0                  0
4            0                0          0          0          0          0              0                     0                 0                  0


  df_clean[flag_cols] = df_clean[flag_cols].fillna(False).astype(int)


In [13]:
obj_cols = df_clean.select_dtypes(include=["object"]).columns.tolist()
print("Remaining object columns:", obj_cols)

Remaining object columns: []


remaining object columns is empty meaning:<br />
• All features are numeric<br />
• Dataset is ready for imputation<br />
• No categorical encoding needed<br />
• Model training pipeline is clean

<b style="color:blue;">comment on output above:</b><br />
based on output above there is no string text or categorical columns left. our dataset now only has numeric features. which is great for tree-based models like XGBoost because they work best with numeric features. soo based on that, we dont need to do any encoding steps. the dataset is fairly clean from a datatype perspective. so to summary, no object columns remain. All features left are numeric telemetry metrics. This simplifies preprocessing and avoids encoding steps.

In [None]:
#Impute missing numeric values, thermal and power aggregates still have missing values
numeric_cols = df_clean.select_dtypes(include=["float64", "int64"]).columns
imputer = SimpleImputer(strategy="median")

df_clean[numeric_cols] = imputer.fit_transform(df_clean[numeric_cols])

print("Missing numeric:", df_clean[numeric_cols].isnull().sum().sum())

Any missing left: False


In [None]:
#SAVE CLEANED FILE

df_clean.to_parquet("datasets/summit_dbe_processed_with_timenew.parquet", index=False)
print("Saved corrected file.")

Cleaned dataset saved to datasets/summit_dbe_processed.parquet
