Notebook by: [Juliana Gómez Consuegra](https://www.linkedin.com/in/julianagomezconsuegra/)
Code taken from: https://huggingface.co/spaces/frugal-ai-challenge/submission-template/blob/main/tasks/text.py



In [None]:
!pip install -q datasets

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for utils (setup.py) ... [?25l[?25hdone


# Sample code from the Frugal AI Challenge submission template

In [None]:
from fastapi import APIRouter
from datetime import datetime
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import random

from utils.evaluation import TextEvaluationRequest
from utils.emissions import tracker, clean_emissions_data, get_space_info

# Router on which the text-task endpoint defined below is registered.
router = APIRouter()

# Used as the endpoint's `description` and echoed back as "model_description" in the results.
DESCRIPTION = "Random Baseline"
# Path the endpoint is mounted on; also reported as "api_route" in the results.
ROUTE = "/text"

@router.post(ROUTE, tags=["Text Task"], description=DESCRIPTION)
async def evaluate_text(request: TextEvaluationRequest):
    """
    Score the random baseline on the climate-disinformation text task.

    The dataset named in ``request`` is loaded, its string labels are mapped
    to the integers 0-7, and a test split is carved out of the "train" split
    using the requested size and seed. Predictions are drawn uniformly at
    random from 0-7 while the shared emissions tracker records the
    "inference" task.

    Returns a dict with the space identity, a timestamp, the accuracy, the
    tracked energy/emissions figures and the dataset configuration used.
    """
    # Identify the submitting space.
    username, space_url = get_space_info()

    # Class names in index order: the position of each name is its integer id.
    class_names = (
        "0_not_relevant",
        "1_not_happening",
        "2_not_human",
        "3_not_bad",
        "4_solutions_harmful_unnecessary",
        "5_science_unreliable",
        "6_proponents_biased",
        "7_fossil_fuels_needed",
    )
    label_to_id = {name: idx for idx, name in enumerate(class_names)}

    def encode_label(example):
        # Replace the string label of one example with its integer id.
        return {"label": label_to_id[example["label"]]}

    # Load the data, encode the labels, and hold out the evaluation split.
    encoded = load_dataset(request.dataset_name).map(encode_label)
    splits = encoded["train"].train_test_split(
        test_size=request.test_size,
        seed=request.test_seed,
    )
    test_dataset = splits["test"]

    # Everything between start_task and stop_task is attributed to inference.
    tracker.start()
    tracker.start_task("inference")

    #--------------------------------------------------------------------------------------------
    # YOUR MODEL INFERENCE CODE HERE
    # Replace the random baseline below with real model inference; keep it inside
    # the tracked region so its energy consumption and emissions are measured.
    #--------------------------------------------------------------------------------------------

    # Placeholder model: one uniformly random class id per test example.
    true_labels = test_dataset["label"]
    n_examples = len(true_labels)
    predictions = [random.randint(0, 7) for _ in range(n_examples)]

    #--------------------------------------------------------------------------------------------
    # YOUR MODEL INFERENCE STOPS HERE
    #--------------------------------------------------------------------------------------------

    # Close the tracked task and collect its measurements.
    emissions_data = tracker.stop_task()

    accuracy = accuracy_score(true_labels, predictions)

    # Echo back the split parameters so the run can be reproduced.
    dataset_config = {
        "dataset_name": request.dataset_name,
        "test_size": request.test_size,
        "test_seed": request.test_seed,
    }

    # The tracker's energy/emissions values are multiplied by 1000 for the
    # "_wh" / "_gco2eq" fields.
    return {
        "username": username,
        "space_url": space_url,
        "submission_timestamp": datetime.now().isoformat(),
        "model_description": DESCRIPTION,
        "accuracy": float(accuracy),
        "energy_consumed_wh": emissions_data.energy_consumed * 1000,
        "emissions_gco2eq": emissions_data.emissions * 1000,
        "emissions_data": clean_emissions_data(emissions_data),
        "api_route": ROUTE,
        "dataset_config": dataset_config,
    }

ModuleNotFoundError: No module named 'utils.evaluation'

# Using codecarbon

In [None]:
!pip install -q codecarbon

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/516.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m512.0/516.7 kB[0m [31m15.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m516.7/516.7 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_dataset

# common data-analysis and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from codecarbon import EmissionsTracker
import random
from sklearn.metrics import accuracy_score
from datetime import datetime

In [None]:
# Fetch the challenge's text dataset; evaluate_text() below reads its "train" split.
dataset = load_dataset("quotaclimat/frugalaichallenge-text-train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
def evaluate_text(dataset, test_size=0.2, test_seed=42):
    """
    Evaluate the random baseline on the text task while tracking emissions.

    Args:
        dataset: Dataset dict with a "train" split whose "label" column holds
            the string class names listed in the mapping below.
        test_size: Fraction (or count) of the "train" split held out for
            evaluation, forwarded to ``train_test_split``.
        test_seed: Seed forwarded to ``train_test_split`` so the held-out
            split is reproducible. The random predictions are NOT seeded.

    Returns:
        dict with keys:
            "accuracy": accuracy of the random predictions (float).
            "energy_consumed_wh": energy measured by CodeCarbon, in Wh, or
                None when the tracker produced no measurement.
            "emissions_gco2eq": emissions measured by CodeCarbon, in g CO2eq,
                or None when the tracker produced no measurement.
            "emissions_data": raw value returned by ``tracker.stop()``.
            "dataset_config": dataset name and split parameters.
    """
    # Initialize CodeCarbon tracker
    tracker = EmissionsTracker(project_name="text_classification_baseline")

    # Map the string class names to integer ids 0-7.
    label_mapping = {
        "0_not_relevant": 0,
        "1_not_happening": 1,
        "2_not_human": 2,
        "3_not_bad": 3,
        "4_solutions_harmful_unnecessary": 4,
        "5_science_unreliable": 5,
        "6_proponents_biased": 6,
        "7_fossil_fuels_needed": 7,
    }

    # Convert string labels to integers
    dataset = dataset.map(lambda x: {"label": label_mapping[x["label"]]})

    # Split dataset
    train_test = dataset["train"].train_test_split(test_size=test_size, seed=test_seed)
    test_dataset = train_test["test"]

    # Start tracking emissions; the finally clause guarantees the tracker is
    # stopped even when inference raises.
    tracker.start()
    try:
        ####################################################################
        # Random Baseline
        true_labels = test_dataset["label"]
        predictions = [random.randint(0, 7) for _ in range(len(true_labels))]
        ####################################################################
    finally:
        # stop() returns the emissions in kg CO2eq as a float, or None when
        # the tracker could not produce a measurement.
        emissions = tracker.stop()

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)

    # Energy and emissions are distinct quantities. The float returned by
    # stop() is emissions only; the energy (kWh) has to be read from the full
    # record that the tracker keeps after stopping.
    if isinstance(emissions, float):
        emissions_kg = emissions
        final_data = getattr(tracker, "final_emissions_data", None)
        energy_kwh = getattr(final_data, "energy_consumed", None)
    elif emissions is not None:
        # Kept from the original: accept a data object exposing both fields.
        emissions_kg = emissions.emissions
        energy_kwh = emissions.energy_consumed
    else:
        emissions_kg = None
        energy_kwh = None

    # Prepare results dictionary (kWh -> Wh, kg -> g).
    results = {
        "accuracy": float(accuracy),
        "energy_consumed_wh": energy_kwh * 1000 if energy_kwh is not None else None,
        "emissions_gco2eq": emissions_kg * 1000 if emissions_kg is not None else None,
        "emissions_data": emissions,
        "dataset_config": {
            "dataset_name": "quotaclimat/frugalaichallenge-text-train",
            "test_size": test_size,
            "test_seed": test_seed,
        },
    }

    return results


In [None]:
# Run the random baseline with the default split settings (test_size=0.2, test_seed=42).
results = evaluate_text(dataset)

# Print the headline metrics from the returned results dict.
print(f"Accuracy: {results['accuracy']}")
print(f"Energy consumed (Wh): {results['energy_consumed_wh']}")
print(f"Emissions (gCO2eq): {results['emissions_gco2eq']}")



[codecarbon INFO @ 00:14:00] [setup] RAM Tracking...
[codecarbon INFO @ 00:14:00] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at \sys\class\powercap\intel-rapl to measure CPU

[codecarbon INFO @ 00:14:01] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.20GHz
[codecarbon INFO @ 00:14:01] [setup] GPU Tracking...
[codecarbon INFO @ 00:14:01] No GPU found.
[codecarbon INFO @ 00:14:01] >>> Tracker's metadata:
[codecarbon INFO @ 00:14:01]   Platform system: Linux-6.1.85+-x86_64-with-glibc2.35
[codecarbon INFO @ 00:14:01]   Python version: 3.11.11
[codecarbon INFO @ 00:14:01]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 00:14:01]   Available RAM : 12.675 GB
[codecarbon INFO @ 00:14:01]   CPU count: 2
[codecarbon INFO @ 00:14:01]   CPU model: Intel(R) Xeon(R) CPU @ 2.20GHz
[codecarbon INFO @ 00:14:01]   GPU count: None
[codecarbon INFO @ 00:14:01]   GPU model: None
[codecarbon INFO @ 00:14:02] Saving emissions data to file /content/emissions.cs

Accuracy: 0.14109926168990977
Energy consumed (Wh): 6.365404314835473e-05
Emissions (gCO2eq): 6.365404314835473e-05
