In [6]:
from src.utility import get_root
import pandas as pd
import os
from typing import Tuple, List, Optional
import numpy as np
from datetime import timedelta


In [7]:
def estimate_flops(model: str, input_size: Tuple[int, int], training_strategy: str, sample_count: int, estimated_epochs: int, flops_df) -> float:
    """Estimate the total number of FLOPs needed for a training run.

    Args:
        model: Value to look up in the ``Model`` column of ``flops_df``.
        input_size: Size of the inputs that will actually be used. For
            ``Vision`` models both elements are multiplied together; for every
            other model type only ``input_size[0]`` is used.
        training_strategy: ``"Full Training"``, ``"Fine-tuning the whole model"``
            or ``"Last Layer Learning"``.
        sample_count: Number of samples processed per epoch.
        estimated_epochs: Number of epochs.
        flops_df: DataFrame with the columns ``Model``, ``Type``,
            ``Input Size``, ``FLOPs`` and ``Last Layer FLOPs``.

    Returns:
        The estimated FLOP count for the whole run.

    Raises:
        ValueError: If the model is not listed in ``flops_df`` or the training
            strategy is not supported.
    """
    model_info = flops_df[flops_df["Model"] == model]

    if model_info.empty:
        raise ValueError(f"Model {model} not found in the flops database")

    model_type = model_info['Type'].iloc[0]
    # Only the first whitespace-separated token of the cell carries the size.
    original_input_size = model_info['Input Size'].iloc[0].split()[0]

    # The reference FLOPs are scaled linearly with the ratio between the
    # requested input size and the size the reference was recorded for.
    if model_type == 'Vision':
        width, height = map(int, original_input_size.split('x'))
        scaling = (input_size[0] * input_size[1]) / (width * height)
    else:
        scaling = input_size[0] / int(original_input_size)

    passes = estimated_epochs * sample_count
    forward_flops = model_info['FLOPs'].iloc[0] * scaling

    if training_strategy in ["Fine-tuning the whole model", "Full Training"]:
        # Whole-model training is costed at three times the reference FLOPs
        # per sample.
        return estimated_epochs * sample_count * model_info['FLOPs'].iloc[0] * scaling * 3
    elif training_strategy == "Last Layer Learning":
        # Every sample still needs a full forward pass in every epoch; the
        # last-layer term is added on top of it. The forward pass used to be
        # added only once for the whole run because of operator precedence.
        last_layer_flops = 2 * model_info['Last Layer FLOPs'].iloc[0]
        return passes * (last_layer_flops + forward_flops)
    else:
        raise ValueError(f"Unsupported training strategy: {training_strategy}")


In [8]:
def estimate_time(flops: float, gpu: str, training_strategy: str, tflops: str, gpu_df) -> float:
    """Estimate how many seconds a GPU needs to execute ``flops`` operations.

    Args:
        flops: Total number of floating point operations to execute.
        gpu: Value to look up in the ``name`` column of ``gpu_df``.
        training_strategy: ``"Full Training"`` and
            ``"Fine-tuning the whole model"`` use the raw estimate; any other
            value divides the estimate by 3.
        tflops: Name of the ``gpu_df`` column holding the throughput figure
            (divided by 1e12 here, so presumably expressed in TFLOPS).
        gpu_df: DataFrame of GPU specifications. When ``None`` the table is
            loaded from ``data/gpus.csv`` under the project root.

    Returns:
        Estimated duration in seconds.

    Raises:
        ValueError: If ``gpu`` is not listed in ``gpu_df``.
    """
    # Use the table supplied by the caller; it used to be discarded and the
    # CSV re-read from disk on every call.
    if gpu_df is None:
        gpu_df = pd.read_csv(os.path.join(get_root(), "data", "gpus.csv"))

    gpu_info = gpu_df[gpu_df["name"] == gpu]

    if gpu_info.empty:
        raise ValueError(f"GPU {gpu} not found in the GPU database")

    seconds = flops / gpu_info[tflops].iloc[0] / 1e+12

    if training_strategy in ["Full Training", "Fine-tuning the whole model"]:
        return seconds
    # NOTE(review): the rationale for the factor 3 is not visible in this
    # file; it is kept exactly as it was.
    return seconds / 3

In [9]:
def calculate_kwh_consumption(gpu_name, time_seconds, gpu_df):
    """Return the energy in kWh used by ``gpu_name`` over ``time_seconds``.

    The power draw is taken from the ``tdp_watts`` column of the first row of
    ``gpu_df`` whose ``name`` equals ``gpu_name``.
    """
    matching_tdp = gpu_df.loc[gpu_df['name'] == gpu_name, 'tdp_watts']

    # watts -> kilowatts
    kilowatts = matching_tdp.values[0] / 1000
    # seconds -> hours
    hours = time_seconds / 3600

    return kilowatts * hours


In [11]:
def normalize_data(data):
    """Min-max scale ``data`` so its smallest value maps to 0 and its largest to 1.

    Args:
        data: Object exposing ``min()`` and ``max()`` and supporting
            element-wise arithmetic (e.g. a pandas Series or a NumPy array).

    Returns:
        The rescaled data. When all values are identical the range is zero;
        zeros are returned in that case instead of NaN from a division by zero.
    """
    lowest = data.min()
    span = data.max() - lowest

    # Only a scalar range can be compared to zero directly; frame-like inputs
    # (one range per column) keep the plain element-wise division.
    if np.ndim(span) == 0 and span == 0:
        return (data - lowest).astype(float)

    return (data - lowest) / span

def get_tflops_value(perf_data, tflops_type):
    """Return the first non-missing TFLOPS figure for a GPU record.

    The requested ``tflops_type`` key is tried first, then ``'TFLOPS32'``,
    then ``'TFLOPS16'``. A ``ValueError`` is raised when none of them holds a
    usable value.
    """
    for candidate in (tflops_type, 'TFLOPS32', 'TFLOPS16'):
        # A missing key is treated the same way as a stored NaN.
        if pd.notna(perf_data.get(candidate, np.nan)):
            return perf_data[candidate]

    raise ValueError(f"No valid TFLOPS value found for the GPU: {perf_data['name']}")

def recommend_gpu(scores, tflops_type):
    """Recommend the GPU with the best weighted price/throughput/efficiency score.

    Prices are read from ``data/pricing/GCP gpus pricing.xlsx`` and GPU
    specifications from ``data/gpus.csv`` (both under the project root). Only
    GPUs that appear in both files are considered.

    Args:
        scores: Mapping with the weights ``'price'``, ``'time'`` and ``'co2'``.
        tflops_type: Preferred throughput column; ``get_tflops_value`` falls
            back to another column when that one is missing for a GPU.

    Returns:
        The ``gpus.csv`` name of the highest scoring GPU.

    Raises:
        ValueError: If no priced GPU could be matched to a ``gpus.csv`` entry.
    """
    pricing_df = pd.read_excel(os.path.join(get_root(), 'data', 'pricing', 'GCP gpus pricing.xlsx'))
    pricing_df.columns = ['Region', 'GPU', 'Price']
    # One price per GPU: the mean over all regions.
    gpu_prices = pricing_df.groupby('GPU')['Price'].mean().reset_index()
    gpu_prices['Normalized_Price'] = normalize_data(gpu_prices['Price'])

    performance_df = pd.read_csv(os.path.join(get_root(), 'data', 'gpus.csv'))

    # Pricing-sheet names -> names used in gpus.csv; unmapped names pass through.
    manual_map = {
        'T4': 'T4',
        'V100': 'Tesla V100-PCIE-16GB',
        'P100': 'Tesla P100',
        'K80': 'Tesla K80',
    }

    gpu_prices['Mapped_GPU'] = gpu_prices['GPU'].apply(lambda x: manual_map.get(x, x))
    performance_data = {row['name']: row for _, row in performance_df.iterrows()}

    merged_data = []
    for _, row in gpu_prices.iterrows():
        pricing_gpu = row['Mapped_GPU']
        if pricing_gpu not in performance_data:
            continue

        perf_data = performance_data[pricing_gpu]
        tflops_value = get_tflops_value(perf_data, tflops_type)
        merged_data.append({
            'GPU': row['GPU'],
            'Mapped_GPU': pricing_gpu,
            'Price': row['Price'],
            'Normalized_Price': row['Normalized_Price'],
            'TDP_Watts': perf_data['tdp_watts'],
            tflops_type: tflops_value,
            'TFLOPS_per_Watt': tflops_value / perf_data['tdp_watts']
        })

    if not merged_data:
        # Without this guard the column lookups below fail with an opaque
        # KeyError on an empty frame.
        raise ValueError("No GPU from the pricing sheet was found in the GPU performance data")

    merged_df = pd.DataFrame(merged_data)

    merged_df['Normalized_TFLOPS'] = normalize_data(merged_df[tflops_type])
    merged_df['Normalized_TDP'] = normalize_data(merged_df['TDP_Watts'])
    merged_df['Normalized_TFLOPS_per_Watt'] = normalize_data(merged_df['TFLOPS_per_Watt'])

    # Cheaper is better, hence the inverted price term.
    merged_df['Price_Score'] = (1 - merged_df['Normalized_Price']) * scores['price']
    merged_df['TFLOPS_Score'] = merged_df['Normalized_TFLOPS'] * scores['time']
    # The 'co2' weight is applied to throughput per watt, not to raw TDP.
    merged_df['TDP_Score'] = merged_df['Normalized_TFLOPS_per_Watt'] * scores['co2']

    merged_df['Total_Score'] = merged_df['Price_Score'] + merged_df['TFLOPS_Score'] + merged_df['TDP_Score']

    best_gpu = merged_df.sort_values('Total_Score', ascending=False).iloc[0]

    return best_gpu['Mapped_GPU']


In [12]:
def calculate_emissions(kwh: float, region: str, emissions_df) -> float:
    """Return ``kwh`` multiplied by the ``impact`` factor of ``region``.

    The factor is read from the first row of ``emissions_df`` whose ``region``
    column equals ``region``.
    """
    impact_factor = emissions_df.loc[emissions_df['region'] == region, 'impact'].iloc[0]
    return kwh * impact_factor

In [13]:
def calculate_price(gpu: str, region: str, time: float, pricing_df) -> float:
    """Return the ``price`` listed for ``gpu`` in ``region`` multiplied by ``time``.

    The first row of ``pricing_df`` matching both the ``region`` and ``gpu``
    columns supplies the unit price.
    """
    in_region = pricing_df['region'] == region
    is_gpu = pricing_df['gpu'] == gpu
    unit_price = pricing_df.loc[in_region & is_gpu, 'price'].iloc[0]
    return unit_price * time

In [15]:
# NOTE(review): stray expression. `b` is not defined anywhere in the visible
# cells, so this cell raises NameError when the notebook is run top to bottom;
# presumably a leftover that can be deleted.
b

NameError: name 'b' is not defined

In [28]:
def format_time(seconds):
    """Render a duration in seconds as text such as ``"29d 22h 39min 7s"``.

    Units whose value is not positive are left out. Months are approximated as
    30 days and use the suffix ``m``; minutes use ``min``. When every larger
    unit is omitted the seconds are always shown, so zero becomes ``"0s"``.
    """
    span = timedelta(seconds=seconds)
    month_count, day_count = divmod(span.days, 30)  # a month is taken as 30 days
    hour_count, leftover = divmod(span.seconds, 3600)
    minute_count, second_count = divmod(leftover, 60)

    labelled = (
        (month_count, "m"),
        (day_count, "d"),
        (hour_count, "h"),
        (minute_count, "min"),
    )
    pieces = [f"{amount}{suffix}" for amount, suffix in labelled if amount > 0]

    # Seconds are the fallback unit when nothing else was emitted.
    if second_count > 0 or not pieces:
        pieces.append(f"{second_count}s")

    return " ".join(pieces)

def recommend_gpu_configuration(model, input_size, training_strategy, sample_count, estimated_epochs, 
                                time_coeff, cost_coeff, co2_coeff, tflops_type,
                                max_time=None, max_cost=None, max_co2=None):
    """Rank every priced (GPU, region) pair for a training job.

    The pricing sheet, GPU specifications, model FLOPs table and regional
    impact table are loaded from the ``data`` directory under the project
    root. For each pricing row whose GPU has performance data, the training
    time, cost and CO2 are estimated, min-max normalized across all candidates
    and converted to 0-5 scores (lower raw value gives a higher score).

    Args:
        model, input_size, training_strategy, sample_count, estimated_epochs:
            Forwarded to ``estimate_flops``.
        time_coeff, cost_coeff, co2_coeff: Weights applied to the time, cost
            and CO2 scores when computing ``Ranking``.
        tflops_type: Name of the ``gpus.csv`` throughput column to use.
        max_time: Optional upper bound on ``Time`` (seconds).
        max_cost: Optional upper bound on ``Cost ($)``.
        max_co2: Optional upper bound on ``CO2 (kg)``.

    Returns:
        DataFrame sorted by ``Ranking`` (best first) with a fresh index. Rows
        containing NaN are dropped. An empty DataFrame is returned when no
        pricing row has matching performance data.
    """
    pricing_df = pd.read_excel(os.path.join(get_root(), 'data', 'pricing', 'GCP gpus pricing.xlsx'))
    gpu_df = pd.read_csv(os.path.join(get_root(), 'data', 'gpus.csv'))
    flops_df = pd.read_excel(os.path.join(get_root(), 'data', 'model_flops', 'model_flops.xlsx'))
    emissions_df = pd.read_csv(os.path.join(get_root(), 'data', 'impact.csv'))

    # Pricing-sheet names -> names used in gpus.csv; unmapped names pass through.
    manual_map = {
        'T4': 'T4',
        'V100': 'Tesla V100-PCIE-16GB',
        'P100': 'Tesla P100',
        'K80': 'Tesla K80',
    }

    pricing_df['Mapped_GPU'] = pricing_df['gpu'].map(manual_map).fillna(pricing_df['gpu'])

    total_flops = estimate_flops(model, input_size, training_strategy, sample_count, estimated_epochs, flops_df)

    results = []

    for _, price_row in pricing_df.iterrows():
        gpu_pricing = price_row['gpu']
        gpu_model_name = price_row['Mapped_GPU']
        region = price_row['region']

        # Skip pricing rows for which no performance data exists.
        perf_data = gpu_df[gpu_df['name'] == gpu_model_name]

        if perf_data.empty:
            print(f"Warning: No performance data found for GPU {gpu_model_name}")
            continue

        time_seconds = estimate_time(total_flops, gpu_model_name, training_strategy, tflops_type, gpu_df)

        price = calculate_price(gpu_pricing, region, time_seconds / 3600, pricing_df)  # convert seconds to hours

        kwh = calculate_kwh_consumption(gpu_model_name, time_seconds, gpu_df)
        # The emissions figure is divided by 1000 before being reported as kg.
        co2 = calculate_emissions(kwh, region, emissions_df) / 1000

        results.append({
            'GPU': gpu_pricing,
            'Mapped_GPU': gpu_model_name,
            'Region': region,
            'Time': time_seconds,
            'Time (formatted)': format_time(time_seconds),
            'Cost ($)': price,
            'CO2 (kg)': co2
        })

    df = pd.DataFrame(results)

    if df.empty:
        # Nothing to score; the column lookups below would raise KeyError.
        return df

    for col in ['Time', 'Cost ($)', 'CO2 (kg)']:
        df[f'Normalized_{col}'] = normalize_data(df[col])
        df[f'{col}_Score'] = (1 - df[f'Normalized_{col}']) * 5 

    df['Ranking'] = (
        df['Time_Score'] * time_coeff + 
        df['Cost ($)_Score'] * cost_coeff + 
        df['CO2 (kg)_Score'] * co2_coeff
    )

    # Apply constraints. Compare against None so that a limit of 0 is
    # honoured instead of being treated as "no limit".
    if max_time is not None:
        df = df[df['Time'] <= max_time]
    if max_cost is not None:
        df = df[df['Cost ($)'] <= max_cost]
    if max_co2 is not None:
        df = df[df['CO2 (kg)'] <= max_co2]

    # Non-inplace operations: df may be a filtered slice at this point.
    df = df.dropna()
    df = df.sort_values('Ranking', ascending=False)
    df = df.reset_index(drop=True)

    return df

In [29]:
recommend_gpu_configuration("fasterrcnn_resnet50_fpn", (256, 256), "Fine-tuning the whole model", 1000000, 40, 0.5, 0.3, 0.2, "TFLOPS32")

Unnamed: 0,GPU,Mapped_GPU,Region,Time,Time (formatted),Cost ($),CO2 (kg),Normalized_Time,Time_Score,Normalized_Cost ($),Cost ($)_Score,Normalized_CO2 (kg),CO2 (kg)_Score,Ranking
0,T4,T4,northamerica-northeast1,2587147.0,29d 22h 39min 7s,251.528189,1.735545,0.302063,3.489685,0.0,5.0,0.0,5.0,4.244842
1,T4,T4,europe-west1,2587147.0,29d 22h 39min 7s,251.528189,13.431605,0.302063,3.489685,0.0,5.0,0.029924,4.850381,4.214919
2,T4,T4,us-west1,2587147.0,29d 22h 39min 7s,251.528189,14.970958,0.302063,3.489685,0.0,5.0,0.033862,4.830689,4.21098
3,T4,T4,us-east1,2587147.0,29d 22h 39min 7s,251.528189,18.502414,0.302063,3.489685,0.0,5.0,0.042897,4.785514,4.201945
4,T4,T4,us-east4,2587147.0,29d 22h 39min 7s,251.528189,18.502414,0.302063,3.489685,0.0,5.0,0.042897,4.785514,4.201945
5,T4,T4,asia-east1,2587147.0,29d 22h 39min 7s,251.528189,28.02024,0.302063,3.489685,0.0,5.0,0.067248,4.663759,4.177594
6,T4,T4,us-central1,2587147.0,29d 22h 39min 7s,251.528189,28.488083,0.302063,3.489685,0.0,5.0,0.068445,4.657774,4.176397
7,T4,T4,europe-west4,2587147.0,29d 22h 39min 7s,251.528189,28.623908,0.302063,3.489685,0.0,5.0,0.068793,4.656037,4.17605
8,T4,T4,asia-southeast1,2587147.0,29d 22h 39min 7s,265.901229,21.078062,0.302063,3.489685,0.01787,4.910649,0.049487,4.752565,4.16855
9,T4,T4,asia-northeast1,2587147.0,29d 22h 39min 7s,265.901229,25.957709,0.302063,3.489685,0.01787,4.910649,0.061971,4.690144,4.156066
