In [2]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Configuration
# Model identifiers; each one is looked up below as
# "Cleaned_OF_<model>.xlsx" inside input_dir.
models = [
    'Claude',
    'Gem_2.5_F',
    'Gem_2.5_P',
    'GPT_4.1_mini',
    'GPT_4o',
    'GPT_o3',
]

# Paths: the project root is the current working directory, so this must be
# run from the folder that contains "Data".
project_root = Path.cwd()
input_dir = project_root / "Data" / "Intermediate_Data" / "Cleaned_OF"
output_dir = project_root / "Data" / "Analysis_data"
# Create the output folder (and any missing parents) up front so the final
# to_excel call cannot fail on a missing directory.
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / "OF_with_CV_by_Model.xlsx"

# Function to compute OF score
def compute_of_score(df: pd.DataFrame) -> float:
    """Return the OF score of the occupation distribution across religions.

    For every 'Religion' group the share of each 'Occupation' value is
    computed, and the squared deviations of those shares from the uniform
    share 1/K are summed (K is the number of distinct occupations in the
    resulting table). The score is one minus the mean of those per-group
    sums.

    Args:
        df: Frame with 'Religion' and 'Occupation' columns.

    Returns:
        The score as a float, or NaN when the share table has no
        occupation columns.
    """
    shares = df.groupby('Religion')['Occupation'].value_counts(normalize=True)
    # Rows are religions, columns are occupations; combinations that never
    # occur get a share of 0.
    share_table = shares.unstack(fill_value=0)

    n_occupations = share_table.shape[1]
    if n_occupations == 0:
        return np.nan

    uniform_share = 1.0 / n_occupations
    squared_gap = (share_table - uniform_share) ** 2
    per_religion = squared_gap.sum(axis=1)
    return float(1.0 - per_religion.mean())

# Function to compute CV of occupation frequencies
def compute_occupation_cv(df: pd.DataFrame) -> float:
    """Return the coefficient of variation of the occupation frequencies.

    Counts how often each 'Occupation' value occurs and divides the
    population standard deviation (ddof=0) of those counts by their mean.

    Args:
        df: Frame with an 'Occupation' column.

    Returns:
        The coefficient of variation as a float, or NaN when there are no
        occupation counts.
    """
    frequencies = df['Occupation'].value_counts()
    if frequencies.empty:
        return np.nan
    spread = frequencies.std(ddof=0)
    average = frequencies.mean()
    return float(spread / average)

# Collect results
# One row per model whose workbook exists and has the needed columns;
# anything else is reported and skipped so the remaining models still run.
results = []
for model in models:
    file_path = input_dir / f"Cleaned_OF_{model}.xlsx"
    if not file_path.exists():
        print(f"Missing file: {file_path}")
        continue
    df = pd.read_excel(file_path)
    # ensure Occupation column exists, else skip
    if 'Occupation' not in df.columns:
        print(f"No Occupation column in: {file_path}")
        continue
    # compute_of_score groups by Religion; without this guard a workbook
    # lacking that column would abort the whole loop with a KeyError.
    if 'Religion' not in df.columns:
        print(f"No Religion column in: {file_path}")
        continue
    of_score = compute_of_score(df)
    cv_occ = compute_occupation_cv(df)
    results.append({'Model': model, 'OF_Score': of_score, 'Occupation_CV': cv_occ})

# Save results
# Naming the columns explicitly keeps set_index('Model') valid even when every
# model was skipped and `results` is empty (a column-less frame would raise).
of_df = pd.DataFrame(
    results, columns=['Model', 'OF_Score', 'Occupation_CV']
).set_index('Model')
of_df.to_excel(output_path)
print(f"Saved OF + Occupation CV to: {output_path}")



Saved OF + Occupation CV to: d:\Data Science\BUFinal\Data\Analysis_data\OF_with_CV_by_Model.xlsx
