In [7]:
import pandas as pd
from gliclass import GLiClassModel, ZeroShotClassificationPipeline
from transformers import AutoTokenizer
import torch
# Record the installed torch version string in the notebook output so the
# environment this notebook ran against is visible to readers.
print(torch.__version__)

2.7.1+cpu


In [8]:
# Load the raw dataset, keep the columns of interest, and coerce the
# free-text numeric columns to pandas nullable numeric dtypes.
df = pd.read_csv('data/datacenter_dataset.csv')

cols = ['country', 'total_data_centers', 'hyperscale_data_centers',
        'colocation_data_centers', 'floor_space_sqft_total',
        'power_capacity_MW_total', 'average_renewable_energy_usage_percent',
        'cloud_provider', 'internet_penetration_percent',
        'avg_latency_to_global_hubs_ms',
        'number_of_fiber_connections',
        'growth_rate_of_data_centers_percent_per_year',
        'cooling_technologies_common', 'regulatory_challenges_or_limits']
# Columns cast to nullable Int64 after cleaning.
# NOTE(review): the integer cleaner turns every non-digit run (including a
# decimal point) into a space, so a value such as "12.5" becomes "12 5" and is
# coerced to missing. Confirm that is intended for the growth-rate percentage.
int_cols = [
    'total_data_centers',
    'hyperscale_data_centers',
    'colocation_data_centers',
    'floor_space_sqft_total',
    'power_capacity_MW_total',
    'growth_rate_of_data_centers_percent_per_year'
]
# Columns cast to nullable Float64 after cleaning.
float_cols = [
    'average_renewable_energy_usage_percent',
    'internet_penetration_percent',
    'avg_latency_to_global_hubs_ms',
    'number_of_fiber_connections',
]


def _clean_numeric(frame, junk_pattern, dtype):
    """Coerce free-text numeric columns of ``frame`` to a nullable dtype.

    Steps, applied to every column of ``frame``:
      1. remove the characters ``~``, ``,`` and ``+``;
      2. remove any parenthesised text;
      3. for object-dtype columns, split on hyphen / en dash / em dash and
         keep the last piece (so the upper end of a "low-high" range is
         kept; a leading minus sign is also discarded by this step);
      4. replace each run matching ``junk_pattern`` with a single space;
      5. parse with ``pd.to_numeric(errors='coerce')`` so anything that
         still is not a single number becomes missing;
      6. cast to ``dtype``.

    Args:
        frame: DataFrame holding only the columns to clean.
        junk_pattern: regex matching the characters that are not allowed to
            remain in a numeric string.
        dtype: target pandas dtype name, e.g. ``'Int64'`` or ``'Float64'``.

    Returns:
        A new DataFrame with the same columns, cast to ``dtype``.
    """
    return (
        frame
        .replace(r'[~,+]', '', regex=True)
        .replace(r'\([^)]*\)', '', regex=True)
        .apply(lambda col: col.str.split('[-–—]').str[-1] if col.dtype == 'object' else col)
        .replace(junk_pattern, ' ', regex=True)
        .apply(pd.to_numeric, errors='coerce')
        .astype(dtype)
    )


# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the frame returned by read_csv.
df = df[cols].copy()

df[int_cols] = _clean_numeric(df[int_cols], r'[^\d]+', 'Int64')
df[float_cols] = _clean_numeric(df[float_cols], r'[^\d.]+', 'Float64')

df.info()
df.to_csv('test.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191 entries, 0 to 190
Data columns (total 14 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   country                                       191 non-null    object 
 1   total_data_centers                            191 non-null    Int64  
 2   hyperscale_data_centers                       191 non-null    Int64  
 3   colocation_data_centers                       191 non-null    Int64  
 4   floor_space_sqft_total                        184 non-null    Int64  
 5   power_capacity_MW_total                       165 non-null    Int64  
 6   average_renewable_energy_usage_percent        148 non-null    Float64
 7   cloud_provider                                149 non-null    object 
 8   internet_penetration_percent                  190 non-null    Float64
 9   avg_latency_to_global_hubs_ms                 145 non-null    Flo

In [None]:
# Load model and tokenizer
MODEL_NAME = "knowledgator/gliclass-modern-base-v2.0-init"
# Pick the GPU only when this torch build can actually use one; a hard-coded
# 'cuda:0' fails on a CPU-only installation.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

print("Loading model...")
model = GLiClassModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)

# Initialize GLiClass zero-shot classification pipeline
print("Loading classifier pipeline...")
classifier = ZeroShotClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    classification_type='multi-label',  # important for binary-style output
    device=DEVICE,
)

# Define your alias map: canonical provider name -> candidate labels that
# should all count as a mention of that provider.
alias_map = {
    "GCP": ["GCP", "Google", "Google Cloud"],
    "AWS": ["AWS", "Amazon", "Amazon Web Services"],
    "Azure": ["Azure", "Microsoft", "Microsoft Azure"],
    "Oracle": ["Oracle"]
}

# Classification function with logging
def _label_score_pairs(result):
    """Normalise a classifier result into a list of ``(label, score)`` pairs.

    Accepts either a dict with parallel ``"labels"`` / ``"scores"`` lists, or
    a list of ``{"label": ..., "score": ...}`` dicts, optionally wrapped in an
    outer list holding one entry per input text (only the first entry is
    used, because a single text is classified at a time).
    """
    if isinstance(result, dict):
        return list(zip(result["labels"], result["scores"]))
    if result and isinstance(result[0], (list, tuple)):
        result = result[0]
    return [(entry["label"], entry["score"]) for entry in result]


def classify_with_aliases(text, alias_map, threshold=0.5, pipeline=None):
    """Flag which alias groups are mentioned in ``text``.

    Every alias in ``alias_map`` is offered to the classifier as a candidate
    label. Each group's score is the highest score among its aliases, and the
    group is flagged when that score is strictly greater than ``threshold``.

    Args:
        text: the text to classify. Anything that is not a non-blank string
            (e.g. ``None`` or NaN) is skipped.
        alias_map: dict mapping a group key to the list of aliases for it.
        threshold: score a group must exceed to be flagged. It is also passed
            to the classifier, so the classifier's own default cut-off does
            not hide candidates when a lower threshold is requested.
        pipeline: optional callable ``pipeline(text, labels, threshold=...)``.
            Defaults to the module-level ``classifier``.

    Returns:
        dict mapping every key of ``alias_map`` to ``1`` (flagged) or ``0``.
    """
    print(f"\nProcessing text: {text}")

    if not isinstance(text, str) or not text.strip():
        print(" - Skipped: Empty or invalid input")
        return {k: 0 for k in alias_map}

    # Flatten all aliases, dropping repeats while keeping first-seen order.
    all_aliases = list(dict.fromkeys(
        alias for aliases in alias_map.values() for alias in aliases
    ))
    print(f" - Candidate labels: {all_aliases}")

    # Run classification; fall back to the notebook-level pipeline.
    run = classifier if pipeline is None else pipeline
    result = run(text, all_aliases, threshold=threshold)

    # Initialize score dict
    label_scores = {key: 0 for key in alias_map}

    # Assign scores based on best match per group
    for alias, score in _label_score_pairs(result):
        score = float(score)
        for key, aliases in alias_map.items():
            if alias in aliases:
                print(f" - Matched alias: '{alias}' for key: '{key}' with score: {score:.4f}")
                label_scores[key] = max(label_scores[key], score)

    # Convert to binary using threshold
    binary_result = {k: int(v > threshold) for k, v in label_scores.items()}
    print(f" - Final binary classification: {binary_result}")
    return binary_result

# Classify every row's provider text. The column is named 'cloud_provider'
# (lower case) in the selected columns, so that exact name is used here.
provider_flags = [
    classify_with_aliases(provider_text, alias_map)
    for provider_text in df['cloud_provider']
]
# Build the indicator frame on df's own index so rows line up on join.
binary_df = pd.DataFrame(provider_flags, index=df.index)
# Drop indicator columns left over from a previous run of this cell before
# joining, so re-running does not create duplicate columns.
df = df.drop(columns=binary_df.columns, errors='ignore').join(binary_df)

# Preview results
df.head(5)


Loading model...
