In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [21]:
df = pd.read_csv("/content/drive/MyDrive/GPUlympics/synthetic_gpu_training.csv")
df.head()

Unnamed: 0,gpu_type,model_size,batch_size,learning_rate,seq_length,run_id,training_time_hrs,energy_kwh,efficiency_tok_per_watt
0,A100,7B,256,1e-05,512,1,2.0247,0.6767,1041.8987
1,A100,7B,256,1e-05,512,2,2.0691,0.7339,981.6916
2,A100,7B,256,1e-05,512,3,1.7997,0.5966,1050.4273
3,A100,7B,256,1e-05,1024,1,2.8383,1.0121,690.5203
4,A100,7B,256,1e-05,1024,2,2.818,0.9885,701.9325


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2592 entries, 0 to 2591
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   gpu_type                 2592 non-null   object 
 1   model_size               2592 non-null   object 
 2   batch_size               2592 non-null   int64  
 3   learning_rate            2592 non-null   float64
 4   seq_length               2592 non-null   int64  
 5   run_id                   2592 non-null   int64  
 6   training_time_hrs        2592 non-null   float64
 7   energy_kwh               2592 non-null   float64
 8   efficiency_tok_per_watt  2592 non-null   float64
dtypes: float64(4), int64(3), object(2)
memory usage: 182.4+ KB


# EDA

In [23]:
# gpu_type vs training_time_hrs

for y_val in ['training_time_hrs', 'energy_kwh', 'efficiency_tok_per_watt']:
    fig = px.scatter(
        df,
        x='gpu_type',
        y=y_val,
        color='model_size',
        size='batch_size',
        facet_col='seq_length',
        hover_data=['batch_size', 'seq_length', 'learning_rate', 'run_id'],
        height=500,
        width=1000,
        title=f'{y_val.replace("_", " ").title()} by GPU Type and Model Size',
        color_discrete_sequence=px.colors.qualitative.Set2,
        opacity=0.5
    )
    fig.update_yaxes(type='log' if y_val != 'efficiency_tok_per_watt' else 'linear')
    fig.update_layout(
        xaxis_title="GPU Type",
        yaxis_title=y_val.replace("_", " ").title(),
        showlegend=True,
        font=dict(size=12),
    )
    fig.show()

1. In Training time Vs GPU type and Model size for varying Sequence lengths
- For each seq_length plots are almost same relatively but as Sequence length increases, Training_time increased, for all the models of varying sizes.
- Usually as model_size increased, training_time increased.
- For A100, training time is more and for GB200, training time is less.

2. Now for EnergyKWH
- Energy consumption for various GPU's varies as seq_length varies
- Mostly there is more difference in energy consumption among gpu_types but as seq_length increases energy consumption increased
- 7B less energy and 405 B more energy
- Energy consumption is almost same for all the GPU's not great difference

3. For Efficiency tok per watt
- Bigger the size of the model, less the efficient it is, most of them are almost closer to zero when the model size is 405B
- As model size decreased, efficiency increased
- 7B is considered the most efficient model in all seq_lengths
- As seq_length increased efficiency decreased
- Efficiency is more for GB200 next H100 next A100

So if you want a faster model,
choose GB200 with 7B and seq_length = 512
if you want a greener model,
Choose GB200 with 7B and seq_length = 512
and if you want most efficient model,
Choose GB200 again with 7B and seql_length = 512

In [24]:
df_avg = df.groupby(['gpu_type', 'model_size', 'batch_size', 'seq_length', 'learning_rate'])[['training_time_hrs', 'energy_kwh', 'efficiency_tok_per_watt']].mean().reset_index()

In [25]:
df_avg.head()

Unnamed: 0,gpu_type,model_size,batch_size,seq_length,learning_rate,training_time_hrs,energy_kwh,efficiency_tok_per_watt
0,A100,405B,256,512,1e-05,1013.284267,353.066167,49.990267
1,A100,405B,256,512,5e-05,806.8853,282.274767,49.766267
2,A100,405B,256,512,0.0001,777.0615,264.738567,51.117733
3,A100,405B,256,512,0.0005,920.148533,318.961833,50.224467
4,A100,405B,256,1024,1e-05,1318.675067,459.118667,35.3703


In [26]:
for y_val in ['training_time_hrs', 'energy_kwh', 'efficiency_tok_per_watt']:
    fig = px.scatter(
        df_avg,
        x='gpu_type',
        y=y_val,
        color='model_size',
        size='batch_size',
        facet_col='seq_length',
        hover_data=['batch_size', 'seq_length', 'learning_rate'],
        height=500,
        width=1000,
        title=f'{y_val.replace("_", " ").title()} by GPU Type and Model Size',
        color_discrete_sequence=px.colors.qualitative.Set2,
        opacity=0.5
    )
    fig.update_yaxes(type='log' if y_val != 'efficiency_tok_per_watt' else 'linear')
    fig.update_layout(
        xaxis_title="GPU Type",
        yaxis_title=y_val.replace("_", " ").title(),
        showlegend=True,
        font=dict(size=12),
    )
    fig.show()

## Findings

### 1. GPU Type Sets the Pace for Training Speec

Latest GPUs like GB200 cut training time significantly compared to A100 due to higher throughput, making advanced hardware a must for speed.

In [27]:
fig = px.box(
    df_avg,
    x='gpu_type',
    y='training_time_hrs',
    color='model_size',
    facet_col='seq_length',
    height=500,
    width=1500,
    title="Training Time by GPU Type and Model Size Across Sequence Lengths",
    color_discrete_sequence=px.colors.qualitative.Set2,
    hover_data=['batch_size', 'learning_rate']
)
fig.update_yaxes(type='log', title="Training Time (Hours)")
fig.update_layout(
    xaxis_title="GPU Type",
    showlegend=True,
    font=dict(size=14),
    template='plotly_white'
)

fig.show()

### 2. Batch Size is a Double Win for Time and Energy

Smaller batch sizes spike both training time and energy consumption; cranking up batch size (e.g., to 8192) reduces both, highlighting its key role in optimization.

In [28]:
fig = px.scatter(
    df_avg,
    x='batch_size',
    y='training_time_hrs',
    size='energy_kwh',
    color='model_size',
    height=500,
    width=800,
    title="Training Time vs. Batch Size (Sized by Energy Consumption)",
    color_discrete_sequence=px.colors.qualitative.Set2,
    hover_data=['gpu_type']
)
fig.update_xaxes(type='log', title="Batch Size")
fig.update_yaxes(type='log', title="Training Time (Hours)")
fig.update_layout(
    showlegend=True,
    font=dict(size=14),
    template='plotly_white'
)
fig.show()

### 3. Long Sequences Drive Up Time and Energy Costs

As seq_length grows (from 512 to 4096), training time and energy consumption rise sharply, worsened by non-optimal learning_rate.

In [29]:
fig = px.scatter(
    df_avg,
    x='seq_length',
    y='training_time_hrs',
    size='energy_kwh',
    color='learning_rate',
    height=500,
    width=800,
    title="Training Time vs. Sequence Length (Sized by Energy Consumption)",
    color_discrete_sequence=px.colors.qualitative.Set1,
    hover_data=['gpu_type', 'model_size', 'batch_size']
)
fig.update_xaxes(type='log', title="Sequence Length")
fig.update_yaxes(type='log', title="Training Time (Hours)")
fig.update_layout(
    showlegend=True,
    font=dict(size=14),
    template='plotly_white'
)
fig.show()

### 4. Energy Consumption Stays Consistent Across GPUs

For a given seq_length, energy use is roughly similar among A100, H100, and GB200, with minimal differences—focusing the "green" choice on other factors like model size and learning_rate

In [30]:
fig = px.box(
    df_avg,
    x='gpu_type',
    y='energy_kwh',
    color='model_size',
    facet_col='seq_length',
    height=500,
    width=1500,
    title="Energy Consumption by GPU Type Across Sequence Lengths",
    color_discrete_sequence=px.colors.qualitative.Set2,
    hover_data=['batch_size', 'learning_rate']
)
fig.update_yaxes(type='log', title="Energy Consumption (kWh)")
fig.update_layout(
    xaxis_title="GPU Type",
    showlegend=True,
    font=dict(size=14),
    template='plotly_white'
)

### 5. Efficiency Peaks at Low Seq_Length

The shortest seq_length (512) delivers the highest efficiency (tokens/watt) for all GPUs, especially for 7B models on GB200, dropping as seq_length increases.

In [31]:
fig = px.scatter(
    df_avg,
    x='seq_length',
    y='efficiency_tok_per_watt',
    color='model_size',
    height=500,
    width=800,
    title="Efficiency (Tokens/Watt) vs. Sequence Length",
    color_discrete_sequence=px.colors.qualitative.Set2,
    hover_data=['gpu_type', 'batch_size', 'learning_rate']
)
fig.update_xaxes(type='log', title="Sequence Length")
fig.update_yaxes(title="Efficiency (Tokens/Watt)")
fig.update_layout(
    showlegend=True,
    font=dict(size=14),
    template='plotly_white'
)
fig.show()

## Top picks: Fastest, Greenest and Efficient GPU Configurations

### Fastest Config

GB200 with 7B model, sequence length 512, batch size 8192, and learning rate 0.0001 is the fastest configuration for AI training.

In [32]:
df_fastest = df_avg[
    (df_avg['model_size'] == '7B') &
    (df_avg['seq_length'] == 512) &
    (df_avg['batch_size'] == 8192) &
    (df_avg['learning_rate'] == 0.0001)
]

In [33]:
fig = px.bar(
    df_fastest,
    x='gpu_type',
    y='training_time_hrs',
    color='gpu_type',
    height=500,
    width=1000,
    title="Training Time for 7B Model, seq_length=512, batch_size=8192, learning_rate=0.0001",
    color_discrete_sequence=px.colors.qualitative.Set1,
    hover_data=['model_size', 'seq_length', 'batch_size', 'learning_rate']
)
fig.update_yaxes(type='log', title="Training Time (Hours)")
fig.update_layout(
    xaxis_title="GPU Type",
    showlegend=True,
    font=dict(size=14),
    template='plotly_white'
)

### Greenest Config

GB200 with 7B model, sequence length 512, batch size 8192, and learning rate 0.0001 is the greenest configuration for AI training.

In [34]:
df_greenest = df_avg[
    (df_avg['model_size'] == '7B') &
    (df_avg['seq_length'] == 512) &
    (df_avg['batch_size'] == 8192) &
    (df_avg['learning_rate'] == 0.0001)
]

In [35]:
fig = px.bar(
    df_greenest,
    x='gpu_type',
    y='energy_kwh',
    color='gpu_type',
    height=500,
    width=1000,
    title="Energy Consumption for 7B Model, seq_length=512, batch_size=8192, learning_rate=0.0001",
    color_discrete_sequence=px.colors.qualitative.Set1,
    hover_data=['model_size', 'seq_length', 'batch_size', 'learning_rate']
)
fig.update_yaxes(type='log', title="Energy Consumption")
fig.update_layout(
    xaxis_title="GPU Type",
    showlegend=True,
    font=dict(size=14),
    template='plotly_white'
)

### Efficient Config

In [36]:
df_eff = df_avg[
    (df_avg['model_size'] == '7B') &
    (df_avg['seq_length'] == 512) &
    (df_avg['batch_size'] == 8192) &
    (df_avg['learning_rate'] == 0.0001)
]

In [37]:
fig = px.bar(
    df_eff,
    x='gpu_type',
    y='efficiency_tok_per_watt',
    color='gpu_type',
    height=500,
    width=1000,
    title="Efficiency for 7B Model, seq_length=512, batch_size=8192, learning_rate=0.0001",
    color_discrete_sequence=px.colors.qualitative.Set1,
    hover_data=['model_size', 'seq_length', 'batch_size', 'learning_rate']
)
fig.update_yaxes(type='log', title="Efficient GPU")
fig.update_layout(
    xaxis_title="GPU Type",
    showlegend=True,
    font=dict(size=14),
    template='plotly_white'
)

In [38]:
import plotly.graph_objects as go

# Filter for the exact configuration
best_config = df_avg[
    (df_avg['seq_length'] == 512) &
    (df_avg['model_size'] == '7B') &
    (df_avg['gpu_type'] == 'GB200') &
    (df_avg['batch_size'] == 8192) &
    (df_avg['learning_rate'] == 0.0001)
]


df_filtered = df_avg[df_avg['seq_length'] == 512]

fig = px.scatter(
    df_filtered,
    x='training_time_hrs',
    y='energy_kwh',
    size='efficiency_tok_per_watt',
    color='model_size',
    height=500,
    width=1000,
    title="Fastest, Greenest, Most Efficient: Training Time vs. Energy (Sized by Efficiency) at seq_length=512",
    color_discrete_sequence=px.colors.qualitative.Set2,
    hover_data=['gpu_type', 'batch_size', 'learning_rate']
)

fig.update_xaxes(type='log', title="Training Time (Hours)")
fig.update_yaxes(type='log', title="Energy Consumption (kWh)")
fig.update_layout(
    showlegend=True,
    font=dict(size=14),
    template='plotly_white'
)

if not best_config.empty:
    fig.add_trace(go.Scatter(
        x=best_config['training_time_hrs'],
        y=best_config['energy_kwh'],
        mode='markers',
        marker=dict(
            color='red',
            size=16,
            symbol='star'
        ),
        name='Best Config',
        text="GB200 + 7B + 8192 + 0.0001",
        hoverinfo='text'
    ))


fig.show()



# Model Building

In [39]:
df.head()

Unnamed: 0,gpu_type,model_size,batch_size,learning_rate,seq_length,run_id,training_time_hrs,energy_kwh,efficiency_tok_per_watt
0,A100,7B,256,1e-05,512,1,2.0247,0.6767,1041.8987
1,A100,7B,256,1e-05,512,2,2.0691,0.7339,981.6916
2,A100,7B,256,1e-05,512,3,1.7997,0.5966,1050.4273
3,A100,7B,256,1e-05,1024,1,2.8383,1.0121,690.5203
4,A100,7B,256,1e-05,1024,2,2.818,0.9885,701.9325


In [40]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [41]:
X = df[['gpu_type','model_size','batch_size','learning_rate','seq_length','run_id']]
Y = df[['training_time_hrs', 'energy_kwh', 'efficiency_tok_per_watt']]

In [None]:
X

In [None]:
df['training_time_norm'] = df['training_time_hrs'] - df.groupby('gpu_type')['training_time_hrs'].transform('mean')
df['energy_norm'] = df['energy_kwh'] - df.groupby('gpu_type')['energy_kwh'].transform('mean')
df['efficiency_norm'] = df['efficiency_tok_per_watt'] - df.groupby('gpu_type')['efficiency_tok_per_watt'].transform('mean')
Y_transformed = df[['training_time_norm', 'energy_norm', 'efficiency_norm']]

In [None]:
def map_model_size(X):
    mapping = {'7B':0, '70B':1, '405B':2}
    X = X.copy()
    X['model_size'] = X['model_size'].map(mapping)
    return X

ordinal_transformer = FunctionTransformer(map_model_size)

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('gpu', OneHotEncoder(), ['gpu_type'])
    ],
    remainder='passthrough'  # keep other numeric columns including model_size after mapping
)


In [None]:
full_preprocessing = Pipeline([
    ('model_mapping', ordinal_transformer),
    ('one_hot', preprocessor)
])


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
X_train

Unnamed: 0,batch_size,learning_rate,seq_length,model_size,gpu_type,run_id
1047,2048,0.00050,1024,7B,H100,1
1264,1024,0.00005,1024,70B,H100,2
1097,4096,0.00050,1024,7B,H100,3
25,256,0.00010,512,7B,A100,2
617,256,0.00050,1024,405B,A100,3
...,...,...,...,...,...,...
1638,4096,0.00001,2048,405B,H100,1
1095,4096,0.00050,1024,7B,H100,1
1130,8192,0.00010,512,7B,H100,3
1294,1024,0.00050,4096,70B,H100,2


In [None]:
X_train_transformed = full_preprocessing.fit_transform(X_train)
X_test_transformed = full_preprocessing.transform(X_test)

In [None]:
X_train_transformed.shape

(2073, 8)

In [None]:
lin_model = MultiOutputRegressor(LinearRegression())
lin_model.fit(X_train_transformed, y_train)
y_pred = lin_model.predict(X_test_transformed)
print(f"MSE: {mean_squared_error(y_test,y_pred)}")

MSE: 1171121.5558323201


In [None]:
rad_model = MultiOutputRegressor(RandomForestRegressor(random_state=42))
rad_model.fit(X_train_transformed, y_train)
y_pred = rad_model.predict(X_test_transformed)
print(f"MSE: {mean_squared_error(y_test,y_pred)}")

MSE: 5371.531035614082


In [None]:
gb_model = MultiOutputRegressor(GradientBoostingRegressor(random_state=42))
gb_model.fit(X_train_transformed,y_train)
y_pred = gb_model.predict(X_test_transformed)
print(f"MSE: {mean_squared_error(y_test,y_pred)}")

MSE: 11552.138782991757


In [None]:
gpu_types = ['A100', 'H100', 'GB200']
user_input = {
    'batch_size': 82,
    'learning_rate': 0.01,
    'seq_length': 256,
    'model_size': '405B',
    'run_id': 3
}

results = []

for gpu in gpu_types:
    # Prepare input dataframe
    df_input = pd.DataFrame([user_input])
    df_input['gpu_type'] = gpu

    # Apply preprocessing (this handles ordinal + one-hot encoding correctly)
    X_transformed = full_preprocessing.transform(df_input)

    # Predict metrics
    pred = rad_model.predict(X_transformed)

    results.append({
        'gpu_type': gpu,
        'training_time_hrs': pred[0][0],
        'energy_kwh': pred[0][1],
        'efficiency_tok_per_watt': pred[0][2]
    })

# Compare results
fastest_gpu = min(results, key=lambda x: x['training_time_hrs'])
greenest_gpu = min(results, key=lambda x: x['energy_kwh'])
efficient_gpu = max(results, key=lambda x: x['efficiency_tok_per_watt'])

print("Results for each GPU:")
for r in results:
    print(r)

print("\nBest GPUs:")
print(f"Fastest: {fastest_gpu['gpu_type']}")
print(f"Greenest: {greenest_gpu['gpu_type']}")
print(f"Most Efficient: {efficient_gpu['gpu_type']}")

In [None]:
param_dist = {
    "estimator__n_estimators": [100, 200],
    "estimator__max_depth": [None, 10, 20],
    "estimator__min_samples_split": [2, 5, 10],
    "estimator__min_samples_leaf": [1, 2, 4],
    "estimator__max_features": ["sqrt", "log2", None]
}

search = RandomizedSearchCV(
    estimator=rad_model,
    param_distributions=param_dist,
    n_iter=20,              # number of parameter settings to try
    cv=3,                   # 5-fold cross validation
    verbose=2,
    n_jobs=-1,
    scoring="neg_mean_squared_error",
    random_state=42
)

search.fit(X_train_transformed, y_train)

print("Best parameters found:", search.best_params_)
print("Best CV MSE:", -search.best_score_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best parameters found: {'estimator__n_estimators': 100, 'estimator__min_samples_split': 5, 'estimator__min_samples_leaf': 2, 'estimator__max_features': None, 'estimator__max_depth': 10}
Best CV MSE: 5502.359865010184


In [None]:
best_rad_model = search.best_estimator_

In [None]:
gpu_types = ['A100', 'H100', 'GB200']
user_input = {
    'batch_size': 8192,
    'learning_rate': 0.0001,
    'seq_length': 512,
    'model_size': '7B',
    'run_id': 1
}

results = []

for gpu in gpu_types:
    # Prepare input dataframe
    df_input = pd.DataFrame([user_input])
    df_input['gpu_type'] = gpu

    # Apply preprocessing (this handles ordinal + one-hot encoding correctly)
    X_transformed = full_preprocessing.transform(df_input)

    # Predict metrics
    pred = best_rad_model.predict(X_transformed)

    results.append({
        'gpu_type': gpu,
        'training_time_hrs': pred[0][0],
        'energy_kwh': pred[0][1],
        'efficiency_tok_per_watt': pred[0][2]
    })

# Compare results
fastest_gpu = min(results, key=lambda x: x['training_time_hrs'])
greenest_gpu = min(results, key=lambda x: x['energy_kwh'])
efficient_gpu = max(results, key=lambda x: x['efficiency_tok_per_watt'])

print("Results for each GPU:")
for r in results:
    print(r)

print("\nBest GPUs:")
print(f"Fastest: {fastest_gpu['gpu_type']}")
print(f"Greenest: {greenest_gpu['gpu_type']}")
print(f"Most Efficient: {efficient_gpu['gpu_type']}")

Results for each GPU:
{'gpu_type': 'A100', 'training_time_hrs': np.float64(-204.0569044695985), 'energy_kwh': np.float64(-78.81815386930421), 'efficiency_tok_per_watt': np.float64(14507.902777587624)}
{'gpu_type': 'H100', 'training_time_hrs': np.float64(-113.77590327959399), 'energy_kwh': np.float64(-77.24743527593382), 'efficiency_tok_per_watt': np.float64(15195.050849151257)}
{'gpu_type': 'GB200', 'training_time_hrs': np.float64(-67.84638842216782), 'energy_kwh': np.float64(-65.35705178358147), 'efficiency_tok_per_watt': np.float64(17778.01930539815)}

Best GPUs:
Fastest: A100
Greenest: A100
Most Efficient: GB200


In [None]:
import joblib

joblib.dump(best_rad_model,'model.pkl')
joblib.dump(full_preprocessing,'preprocessor.pkl')

['preprocessor.pkl']

In [None]:
GPU_MEANS = df.groupby('gpu_type')[['training_time_hrs', 'energy_kwh', 'efficiency_tok_per_watt']].mean().to_dict(orient='index')
GPU_MEANS

{'A100': {'training_time_hrs': 204.14758506944443,
  'energy_kwh': 78.85746284722221,
  'efficiency_tok_per_watt': 1617.360736111111},
 'GB200': {'training_time_hrs': 67.88016145833333,
  'energy_kwh': 65.38791550925926,
  'efficiency_tok_per_watt': 1940.2053952546296},
 'H100': {'training_time_hrs': 113.83264594907408,
  'energy_kwh': 77.28707407407408,
  'efficiency_tok_per_watt': 1665.0073725694444}}