# imports

In [None]:
import os
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer
import nltk
from nltk.corpus import stopwords

# Tokenizer & Stop Words Setup for NLP

In [None]:
# Fetch the NLTK stopword corpus (quiet=True suppresses the download log)
# and keep the English stopwords in a set for membership tests.
nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("english"))

# Load the pretrained tokenizer for the "distilbert-base-uncased" checkpoint;
# compute_feature uses it to count tokens.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Load and clean raw data

In [None]:
def clean_text(text):
    """Normalize whitespace in ``text`` and discard very short results.

    Non-string input is converted with ``str()`` first. Every run of
    whitespace is collapsed to a single space and the ends are stripped.

    Returns:
        The cleaned string, or ``None`` when it has five characters or fewer.
    """
    as_str = text if isinstance(text, str) else str(text)
    normalized = re.sub(r'\s+', ' ', as_str).strip()
    # Anything of length <= 5 is treated as too short to keep.
    if len(normalized) <= 5:
        return None
    return normalized

def load_clean_raw(csv_path="data/raw/raw_prompts.csv"):
    """Read the prompt CSV at ``csv_path`` and return only rows with a usable prompt.

    The ``prompt`` column is passed through ``clean_text``; rows whose prompt
    comes back missing are dropped and the index is renumbered from zero.
    """
    frame = pd.read_csv(csv_path)
    cleaned_prompts = frame["prompt"].apply(clean_text)
    frame["prompt"] = cleaned_prompts
    kept = frame.dropna(subset=["prompt"])
    return kept.reset_index(drop=True)

# Load and clean the prompts from the default CSV path, then report how many
# rows survived cleaning.
df_raw = load_clean_raw()
print(f"Raw data loaded:{len(df_raw)} rows")

# Feature Engineering Functions

In [None]:

def compute_feature(prompt, num_layers, training_hours, flops_per_hour):
    """Build the feature record for one prompt and one set of model parameters.

    Text features are derived from ``prompt``: the length of
    ``tokenizer.encode(prompt)``, the character count, the share of
    characters that are one of ``.,!?;:``, the mean length of the
    whitespace-separated words, and the share of words found in
    ``stop_words`` (compared in lower case). The three model parameters are
    passed through unchanged, together with two values obtained by dividing
    ``flops_per_hour`` and ``training_hours`` by ``num_layers``.

    Returns:
        A dict mapping feature names to values, including the prompt itself.
    """
    tokens = prompt.split()
    n_chars = len(prompt)

    # Token count as reported by the module-level tokenizer.
    n_tokens = len(tokenizer.encode(prompt))

    # Share of characters that are punctuation; the denominator is clamped
    # to 1 so an empty prompt yields 0 instead of dividing by zero.
    n_punct = 0
    for ch in prompt:
        if ch in ".,!?;:":
            n_punct += 1
    punctuation_share = n_punct / max(n_chars, 1)

    # Word-level statistics share one clamped denominator.
    word_denominator = max(len(tokens), 1)
    mean_word_length = sum(map(len, tokens)) / word_denominator
    stopword_share = sum(w.lower() in stop_words for w in tokens) / word_denominator

    # Per-layer values; the layer count is clamped to at least 1.
    layer_denominator = max(num_layers, 1)
    per_layer_flops = flops_per_hour / layer_denominator
    hours_per_layer = training_hours / layer_denominator

    record = {}
    record["prompt"] = prompt
    record["token_count"] = n_tokens
    record["char_count"] = n_chars
    record["punct_ratio"] = punctuation_share
    record["avg_word_length"] = mean_word_length
    record["stopword_ratio"] = stopword_share
    record["num_layers"] = num_layers
    record["training_hours"] = training_hours
    record["flops_per_hour"] = flops_per_hour
    record["flops_per_layer"] = per_layer_flops
    record["training_efficiency"] = hours_per_layer
    return record

def create_feature_pipeline(df, num_layers_list, training_hours_list, flops_per_hour_list):
    """Compute one feature record per prompt in ``df`` and return them as a DataFrame.

    Each prompt is paired with the model parameters for its own row.

    Args:
        df: DataFrame with a ``prompt`` column.
        num_layers_list: Per-row layer counts (length ``len(df)``), or a
            single scalar applied to every row.
        training_hours_list: Per-row training hours, or a scalar.
        flops_per_hour_list: Per-row FLOPs per hour, or a scalar.

    Returns:
        A DataFrame with one row per prompt, built from the dicts returned
        by ``compute_feature``.

    Raises:
        KeyError: If ``df`` has no ``prompt`` column.
        ValueError: If a parameter sequence cannot be broadcast to
            ``len(df)`` entries.
    """
    prompts = df["prompt"]
    n_rows = len(prompts)

    # broadcast_to accepts either a scalar or a length-n sequence and raises
    # ValueError on a length mismatch, so zip() below can never silently
    # truncate the data.
    per_row_params = [
        np.broadcast_to(np.asarray(values), (n_rows,))
        for values in (num_layers_list, training_hours_list, flops_per_hour_list)
    ]

    # Iterate the prompt column directly; the other columns are not needed,
    # so the per-row Series that iterrows() would build is avoided.
    rows = [
        compute_feature(prompt, layers, hours, flops)
        for prompt, layers, hours, flops in zip(prompts, *per_row_params)
    ]
    return pd.DataFrame(rows)

# Generate Random Model Params and Compute Features


In [None]:

# Draw one random set of model parameters per cleaned prompt.
# NOTE: no random seed is set in the visible code, so these values (and
# everything derived from them) presumably differ between runs.
n = len(df_raw)
layers = np.random.randint(4,48, size=n)  # integers in [4, 48)
hours = np.random.uniform(0.5, 20, size=n)  # floats in [0.5, 20)
flops = np.random.uniform(1e9, 1e12, size=n)  # floats in [1e9, 1e12)

# Combine the prompts with the random parameters into a feature table.
features_df = create_feature_pipeline(df_raw, layers, hours, flops)

# Save the processed dataset, creating the output directory if needed.
os.makedirs("data/processed", exist_ok=True)
features_df.to_csv("data/processed/features_df.csv", index=False)
print("Processed dataset saved to data/processed/features_df.csv")


# Generate Synthetic Energy Dataset

In [None]:


def generate_energy_data(df, output_path="data/synthetic/df_energy.csv"):
    """Attach a synthetic ``energy_estimate`` column to ``df`` and save it as CSV.

    The estimate is a fixed linear combination of feature columns and is
    intended only as a placeholder label for testing:

        0.5 * training_hours + 1e-12 * flops_per_hour + 0.2 * num_layers
        + 0.003 * token_count + 0.10 * avg_word_length

    Args:
        df: Feature table containing the five columns named above.
        output_path: Destination CSV file; its parent directory is created
            when missing.

    Returns:
        A copy of ``df`` with the extra ``energy_estimate`` column. The
        input frame is not modified.

    Raises:
        KeyError: If any required column is absent from ``df``.
    """
    energy = (
        df["training_hours"] * 0.5
        # FLOPs are scaled down by 1e-12 so this term is comparable in size
        # to the others instead of swamping them.
        + df["flops_per_hour"] * 1e-12
        + df["num_layers"] * 0.2
        + df["token_count"] * 0.003
        # The feature table names this column "avg_word_length".
        + df["avg_word_length"] * 0.10
    )

    df_energy = df.copy()
    df_energy["energy_estimate"] = energy

    # A bare filename has no directory component; makedirs("") would raise.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_energy.to_csv(output_path, index=False)
    return df_energy

# Build the synthetic energy labels from the feature table.
energy_df = generate_energy_data(features_df)
# The message names the file generate_energy_data actually writes.
print("Synthetic energy dataset saved to data/synthetic/df_energy.csv")