In [7]:
import pandas as pd
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import torch
# Print the installed torch version so the environment is recorded in the cell output.
print(torch.__version__)

2.7.1+cpu


In [8]:
# Load the raw data-center dataset.
df = pd.read_csv('data/datacenter_dataset.csv')

# Columns kept for the rest of the notebook; every other CSV column is dropped.
cols = ['country', 'total_data_centers', 'hyperscale_data_centers',
        'colocation_data_centers', 'floor_space_sqft_total',
        'power_capacity_MW_total', 'average_renewable_energy_usage_percent', 
        'cloud_provider', 'internet_penetration_percent', 
        'avg_latency_to_global_hubs_ms',
        'number_of_fiber_connections',
        'growth_rate_of_data_centers_percent_per_year',
        'cooling_technologies_common', 'regulatory_challenges_or_limits']
# Columns converted to the nullable integer dtype.
int_cols = [
    'total_data_centers', 
    'hyperscale_data_centers',
    'colocation_data_centers', 
    'floor_space_sqft_total',
    'power_capacity_MW_total',
    'growth_rate_of_data_centers_percent_per_year'
]
# Columns converted to the nullable float dtype.
float_cols = [
    'average_renewable_energy_usage_percent',
    'internet_penetration_percent', 
    'avg_latency_to_global_hubs_ms',
    'number_of_fiber_connections',
]


def _clean_numeric(frame, strip_pattern, dtype):
    """Turn free-text numeric columns into a nullable numeric dtype.

    Steps, applied to every column of ``frame``:
      1. remove the characters ``~``, ``,`` and ``+``;
      2. remove any parenthesised text;
      3. for object columns, split on a hyphen / en dash / em dash and keep
         only the last piece (so a range keeps its final value);
      4. replace every run of characters matching ``strip_pattern`` with a
         single space;
      5. parse with ``pd.to_numeric(errors='coerce')`` - anything that still
         is not a number becomes missing;
      6. cast to ``dtype``.

    Args:
        frame: DataFrame holding only the columns to clean.
        strip_pattern: regex of characters that are not part of a number.
        dtype: target pandas dtype name, e.g. ``'Int64'`` or ``'Float64'``.

    Returns:
        A new DataFrame with the same columns, cast to ``dtype``.
    """
    return (
        frame.replace(r'[~,+]', '', regex=True)
        .replace(r'\([^)]*\)', '', regex=True)
        .apply(lambda col: col.str.split('[-–—]').str[-1] if col.dtype == 'object' else col)
        .replace(strip_pattern, ' ', regex=True)
        .apply(pd.to_numeric, errors='coerce')
        .astype(dtype)
    )


# Take an explicit copy: assigning columns into a bare column-subset selection
# can raise SettingWithCopyWarning because pandas tracks it as a derived frame.
df = df[cols].copy()

# Integer columns drop every non-digit; float columns also keep the decimal point.
df[int_cols] = _clean_numeric(df[int_cols], r'[^\d]+', 'Int64')
df[float_cols] = _clean_numeric(df[float_cols], r'[^\d.]+', 'Float64')

df.info()
df.to_csv('test.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191 entries, 0 to 190
Data columns (total 14 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   country                                       191 non-null    object 
 1   total_data_centers                            191 non-null    Int64  
 2   hyperscale_data_centers                       191 non-null    Int64  
 3   colocation_data_centers                       191 non-null    Int64  
 4   floor_space_sqft_total                        184 non-null    Int64  
 5   power_capacity_MW_total                       165 non-null    Int64  
 6   average_renewable_energy_usage_percent        148 non-null    Float64
 7   cloud_provider                                149 non-null    object 
 8   internet_penetration_percent                  190 non-null    Float64
 9   avg_latency_to_global_hubs_ms                 145 non-null    Flo

In [10]:
# Load model and tokenizer
print("Loading model...")
# NOTE(review): trust_remote_code=True lets Python code shipped with the
# checkpoint run at load time - confirm this checkpoint actually requires it.
model = AutoModelForSequenceClassification.from_pretrained("ModernBERT-large-zeroshot-v2.0", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("ModernBERT-large-zeroshot-v2.0", trust_remote_code=True)

# Zero-shot pipeline scoring each candidate label independently (multi_label=True).
# Ask for GPU 0 only when CUDA is really available; -1 selects the CPU, so the
# requested device matches the one that is actually used on CPU-only installs.
classifier = pipeline(
    "zero-shot-classification", 
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1, 
    multi_label=True
)

# Define your alias map
# Each key becomes one output column; its list holds the candidate labels
# (aliases) whose best score decides that column's value.
alias_map = {
    "GCP": ["GCP", "Google", "Google Cloud"],
    "AWS": ["AWS", "Amazon", "Amazon Web Services"],
    "Azure": ["Azure", "Microsoft", "Microsoft Azure"],
    "Oracle": ["Oracle"]
}

# Classification function with logging
def classify_with_aliases(text, alias_map, threshold=0.2, row_num=None, total_rows=None):
    """Score ``text`` against every alias and reduce to one 0/1 flag per group.

    Uses the module-level ``classifier`` callable, invoked as
    ``classifier(text, labels)``; its result is read through the ``"labels"``
    and ``"scores"`` entries. Progress and intermediate scores are printed.

    Args:
        text: value to classify. Anything that is not a non-blank string is
            skipped and yields all zeros.
        alias_map: mapping of group key -> iterable of candidate labels.
        threshold: a group is flagged 1 when its best alias score is strictly
            greater than this value.
        row_num: position of this row, used only for the progress line.
        total_rows: total number of rows, used only for the progress line.

    Returns:
        dict with the same keys as ``alias_map`` and values 0 or 1.
    """
    show_progress = row_num is not None and total_rows is not None
    if show_progress:
        print(f"Row {row_num}/{total_rows}")
    print(f"\nProcessing text: {text}")

    # Guard: only non-blank strings are sent to the model.
    has_text = isinstance(text, str) and bool(text.strip())
    if not has_text:
        print(" - Skipped: Empty or invalid input")
        return dict.fromkeys(alias_map, 0)

    # Every alias of every group is offered to the model as a candidate label.
    candidate_labels = []
    for group_aliases in alias_map.values():
        candidate_labels.extend(group_aliases)
    print(f" - Candidate labels: {candidate_labels}")

    prediction = classifier(text, candidate_labels)

    # Keep, per group, the highest score reached by any of its aliases.
    best_by_group = dict.fromkeys(alias_map, 0)
    for label, label_score in zip(prediction["labels"], prediction["scores"]):
        owners = [group for group, group_aliases in alias_map.items() if label in group_aliases]
        for group in owners:
            print(f" - Matched alias: '{label}' for key: '{group}' with score: {label_score:.4f}")
            if label_score > best_by_group[group]:
                best_by_group[group] = label_score

    # Threshold the best scores into 0/1 flags.
    binary_result = {}
    for group, best in best_by_group.items():
        binary_result[group] = int(best > threshold)
    print(f" - Final binary classification: {binary_result}")
    return binary_result

# New section: apply classification with progress tracking
results = []  # new line
total_rows = len(df)  # new line
for idx, val in enumerate(df['cloud_provider'], 1):  # new line
    result = classify_with_aliases(val, alias_map, row_num=idx, total_rows=total_rows)  # new line
    results.append(result)  # new line

# Convert classification results into separate columns
binary_df = pd.DataFrame(results)  # new line
df = pd.concat([df, binary_df], axis=1)  # new line

# Preview results
df.head(5)


Loading model...


Device set to use cpu


Row 1/191

Processing text: AWS:Yes, GCP:Yes, Azure:Yes, Oracle:Yes
 - Candidate labels: ['GCP', 'Google', 'Google Cloud', 'AWS', 'Amazon', 'Amazon Web Services', 'Azure', 'Microsoft', 'Microsoft Azure', 'Oracle']
 - Matched alias: 'Azure' for key: 'Azure' with score: 0.4683
 - Matched alias: 'AWS' for key: 'AWS' with score: 0.3229
 - Matched alias: 'Oracle' for key: 'Oracle' with score: 0.2637
 - Matched alias: 'GCP' for key: 'GCP' with score: 0.2171
 - Matched alias: 'Google Cloud' for key: 'GCP' with score: 0.0891
 - Matched alias: 'Microsoft Azure' for key: 'Azure' with score: 0.0332
 - Matched alias: 'Microsoft' for key: 'Azure' with score: 0.0007
 - Matched alias: 'Google' for key: 'GCP' with score: 0.0004
 - Matched alias: 'Amazon Web Services' for key: 'AWS' with score: 0.0004
 - Matched alias: 'Amazon' for key: 'AWS' with score: 0.0001
 - Final binary classification: {'GCP': 1, 'AWS': 1, 'Azure': 1, 'Oracle': 1}
Row 2/191

Processing text: AWS:Yes, GCP:Yes, Azure:Yes, Oracle:Y

Unnamed: 0,country,total_data_centers,hyperscale_data_centers,colocation_data_centers,floor_space_sqft_total,power_capacity_MW_total,average_renewable_energy_usage_percent,cloud_provider,internet_penetration_percent,avg_latency_to_global_hubs_ms,number_of_fiber_connections,growth_rate_of_data_centers_percent_per_year,cooling_technologies_common,regulatory_challenges_or_limits,GCP,AWS,Azure,Oracle
0,United States,5426,300,2000,3552000000,12000,27.0,"AWS:Yes, GCP:Yes, Azure:Yes, Oracle:Yes",92.0,40.0,60.0,12,"Air, Liquid, Immersion","Power grid strain in VA, moratoriums in some s...",1,1,1,1
1,Germany,529,50,200,350000000,2000,40.0,"AWS:Yes, GCP:Yes, Azure:Yes, Oracle:Yes",92.0,50.0,30.0,8,"Air, Indirect Evap.","Power caps in Frankfurt, strict emissions",1,1,1,1
2,United Kingdom,523,30,150,200000000,1000,0.45,"AWS:Yes, GCP:Yes, Azure:Yes, Oracle:Yes",96.0,40.0,25.0,7,"Air, Free Cooling",Power grid constraints in London,1,1,1,1
3,China,449,100,1000,500000000,8000,0.25,"AWS:Yes, GCP:No, Azure:Yes, Oracle:Yes",76.0,75.0,40.0,15,"Air, Liquid",Power rationing in some provinces,0,1,1,1
4,France,322,20,110,120000000,700,40.0,"AWS:Yes, GCP:Yes, Azure:Yes, Oracle:Yes",91.0,50.0,20.0,6,"Air, Liquid",Power grid constraints in Paris,1,1,1,1


In [11]:
# Load model and tokenizer
# NOTE(review): this repeats the model/pipeline setup from the previous
# classification cell, so the checkpoint is loaded a second time.
print("Loading model...")
# NOTE(review): trust_remote_code=True lets Python code shipped with the
# checkpoint run at load time - confirm this checkpoint actually requires it.
model = AutoModelForSequenceClassification.from_pretrained("ModernBERT-large-zeroshot-v2.0", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("ModernBERT-large-zeroshot-v2.0", trust_remote_code=True)

# Zero-shot pipeline scoring each candidate label independently (multi_label=True).
# Ask for GPU 0 only when CUDA is really available; -1 selects the CPU, so the
# requested device matches the one that is actually used on CPU-only installs.
classifier = pipeline(
    "zero-shot-classification", 
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1, 
    multi_label=True
)

# Define your alias map
# Each key becomes one output column; its list holds the candidate labels
# (aliases) whose best score decides that column's value.
alias_map = {
    "air_cooling": ["Air cooling", "Air", "AC"],
    "economizers": ["Economizers", "Economy chiller", "Free cooling"],
    "liquid_immersion": ["Liquid immersion", "Liquid"]
}

# Classification function with logging
def classify_with_aliases(text, alias_map, threshold=0.2, row_num=None, total_rows=None):
    """Score ``text`` against every alias and reduce to one 0/1 flag per group.

    Uses the module-level ``classifier`` callable, invoked as
    ``classifier(text, labels)``; its result is read through the ``"labels"``
    and ``"scores"`` entries. Progress and intermediate scores are printed.

    Args:
        text: value to classify. Anything that is not a non-blank string is
            skipped and yields all zeros.
        alias_map: mapping of group key -> iterable of candidate labels.
        threshold: a group is flagged 1 when its best alias score is strictly
            greater than this value.
        row_num: position of this row, used only for the progress line.
        total_rows: total number of rows, used only for the progress line.

    Returns:
        dict with the same keys as ``alias_map`` and values 0 or 1.
    """
    show_progress = row_num is not None and total_rows is not None
    if show_progress:
        print(f"Row {row_num}/{total_rows}")
    print(f"\nProcessing text: {text}")

    # Guard: only non-blank strings are sent to the model.
    has_text = isinstance(text, str) and bool(text.strip())
    if not has_text:
        print(" - Skipped: Empty or invalid input")
        return dict.fromkeys(alias_map, 0)

    # Every alias of every group is offered to the model as a candidate label.
    candidate_labels = []
    for group_aliases in alias_map.values():
        candidate_labels.extend(group_aliases)
    print(f" - Candidate labels: {candidate_labels}")

    prediction = classifier(text, candidate_labels)

    # Keep, per group, the highest score reached by any of its aliases.
    best_by_group = dict.fromkeys(alias_map, 0)
    for label, label_score in zip(prediction["labels"], prediction["scores"]):
        owners = [group for group, group_aliases in alias_map.items() if label in group_aliases]
        for group in owners:
            print(f" - Matched alias: '{label}' for key: '{group}' with score: {label_score:.4f}")
            if label_score > best_by_group[group]:
                best_by_group[group] = label_score

    # Threshold the best scores into 0/1 flags.
    binary_result = {}
    for group, best in best_by_group.items():
        binary_result[group] = int(best > threshold)
    print(f" - Final binary classification: {binary_result}")
    return binary_result

# New section: apply classification with progress tracking
results = []  # new line
total_rows = len(df)  # new line
for idx, val in enumerate(df['cooling_technologies_common'], 1):  # new line
    result = classify_with_aliases(val, alias_map, row_num=idx, total_rows=total_rows)  # new line
    results.append(result)  # new line

# Convert classification results into separate columns
binary_df = pd.DataFrame(results)  # new line
df = pd.concat([df, binary_df], axis=1)  # new line

# Preview results
df.head(5)


Loading model...


Device set to use cpu


Row 1/191

Processing text: Air, Liquid, Immersion
 - Candidate labels: ['Air cooling', 'Air', 'AC', 'Economizers', 'Economy chiller', 'Free cooling', 'Liquid immersion', 'Liquid']
 - Matched alias: 'Liquid immersion' for key: 'liquid_immersion' with score: 0.8687
 - Matched alias: 'AC' for key: 'air_cooling' with score: 0.8476
 - Matched alias: 'Air' for key: 'air_cooling' with score: 0.8075
 - Matched alias: 'Liquid' for key: 'liquid_immersion' with score: 0.6075
 - Matched alias: 'Air cooling' for key: 'air_cooling' with score: 0.0164
 - Matched alias: 'Free cooling' for key: 'economizers' with score: 0.0158
 - Matched alias: 'Economy chiller' for key: 'economizers' with score: 0.0053
 - Matched alias: 'Economizers' for key: 'economizers' with score: 0.0029
 - Final binary classification: {'air_cooling': 1, 'economizers': 0, 'liquid_immersion': 1}
Row 2/191

Processing text: Air, Indirect Evap.
 - Candidate labels: ['Air cooling', 'Air', 'AC', 'Economizers', 'Economy chiller', 'Free

Unnamed: 0,country,total_data_centers,hyperscale_data_centers,colocation_data_centers,floor_space_sqft_total,power_capacity_MW_total,average_renewable_energy_usage_percent,cloud_provider,internet_penetration_percent,avg_latency_to_global_hubs_ms,...,growth_rate_of_data_centers_percent_per_year,cooling_technologies_common,regulatory_challenges_or_limits,GCP,AWS,Azure,Oracle,air_cooling,economizers,liquid_immersion
0,United States,5426,300,2000,3552000000,12000,27.0,"AWS:Yes, GCP:Yes, Azure:Yes, Oracle:Yes",92.0,40.0,...,12,"Air, Liquid, Immersion","Power grid strain in VA, moratoriums in some s...",1,1,1,1,1,0,1
1,Germany,529,50,200,350000000,2000,40.0,"AWS:Yes, GCP:Yes, Azure:Yes, Oracle:Yes",92.0,50.0,...,8,"Air, Indirect Evap.","Power caps in Frankfurt, strict emissions",1,1,1,1,1,1,0
2,United Kingdom,523,30,150,200000000,1000,0.45,"AWS:Yes, GCP:Yes, Azure:Yes, Oracle:Yes",96.0,40.0,...,7,"Air, Free Cooling",Power grid constraints in London,1,1,1,1,1,1,0
3,China,449,100,1000,500000000,8000,0.25,"AWS:Yes, GCP:No, Azure:Yes, Oracle:Yes",76.0,75.0,...,15,"Air, Liquid",Power rationing in some provinces,0,1,1,1,1,0,1
4,France,322,20,110,120000000,700,40.0,"AWS:Yes, GCP:Yes, Azure:Yes, Oracle:Yes",91.0,50.0,...,6,"Air, Liquid",Power grid constraints in Paris,1,1,1,1,1,0,1


In [12]:
# Persist the frame, including the classification flag columns, without the index column.
df.to_csv(
    path_or_buf='cleaned_data.csv',
    index=False,
)