In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import spearmanr, chi2_contingency

def evaluate_single_categorical_relationship(df, cat_var, continuous_var, dichotomous_var):
    """Measure how one categorical column relates to a continuous and a binary column.

    Three statistics are computed and returned as a single-row table:

    * one-way ANOVA of ``continuous_var`` on ``cat_var`` (F, p-value, eta squared);
    * Spearman rank correlation between the integer category codes of
      ``cat_var`` and ``continuous_var``;
    * chi-square test and Cramér's V between ``cat_var`` and ``dichotomous_var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the three columns. It is only read; no column is added.
    cat_var : str
        Name of the categorical column. It is interpolated into the model
        formula as-is, so it must be a name the formula parser accepts.
    continuous_var : str
        Name of the continuous column (same formula constraint as ``cat_var``).
    dichotomous_var : str
        Name of the column cross-tabulated against ``cat_var``.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``cat_var`` with the columns ``ANOVA F``,
        ``ANOVA p-value``, ``Eta squared``, ``Spearman Correlation``,
        ``Spearman p-value``, ``Cramér's V`` and ``Cramér's p-value``.
    """
    results = {}
    print(f"Evaluating {cat_var}...")

    # ANOVA
    model = ols(f'{continuous_var} ~ C({cat_var})', data=df).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    results['ANOVA F'] = model.fvalue
    results['ANOVA p-value'] = model.f_pvalue
    # First row of the table is the C(cat_var) term; the total is effect + residual.
    results['Eta squared'] = anova_table['sum_sq'].iloc[0] / anova_table['sum_sq'].sum()

    # Spearman's Rank Correlation
    # The codes are kept in a local Series instead of being written back as a
    # new column, so the caller's DataFrame is left untouched.
    # NOTE(review): the codes follow whatever category order pandas assigns, so
    # the coefficient depends on that ordering — interpret with care for
    # variables that have no natural order.
    category_codes = df[cat_var].astype('category').cat.codes
    spearman_corr, spearman_p_value = spearmanr(category_codes, df[continuous_var])
    results['Spearman Correlation'] = spearman_corr
    results['Spearman p-value'] = spearman_p_value

    # Cramér's V
    contingency_table = pd.crosstab(df[cat_var], df[dichotomous_var])
    chi2, p, _, _ = chi2_contingency(contingency_table)
    n = contingency_table.sum().sum()
    phi2 = chi2 / n
    # With a single row or a single column the denominator is 0 and V is
    # undefined; report NaN explicitly rather than dividing by zero.
    min_dim = min(contingency_table.shape) - 1
    cramers_v = np.sqrt(phi2 / min_dim) if min_dim > 0 else np.nan
    results['Cramér\'s V'] = cramers_v
    results['Cramér\'s p-value'] = p

    return pd.DataFrame([results], index=[cat_var])

# Data Preparation
categorical_vars = ['Constructor', 'Veh_type', 'Brand', 'Veh_Model', 'Veh_Category', 'Energy', 'Fuel_mode']
continuous_var = 'CO2_wltp'
dichotomous_var = 'Em_on_target'
df_Ptfinal = pd.read_csv('PT_Cleaned.csv')

# Run the analysis for each categorical variable. The one-row result frames
# are collected in a list and concatenated once, instead of re-copying a
# growing DataFrame on every iteration.
per_variable_results = []
for cat_var in categorical_vars:
    cat_results = evaluate_single_categorical_relationship(df_Ptfinal, cat_var, continuous_var, dichotomous_var)
    per_variable_results.append(cat_results)
all_results = pd.concat(per_variable_results)

# Output results (index=True keeps the variable names as the first CSV column)
print(all_results)
all_results.to_csv('categorical_relationships.csv', index=True)

# 'Version' is deliberately left out of categorical_vars above.

Evaluating Constructor...
Evaluating Veh_type...
Evaluating Brand...
Evaluating Veh_Model...
Evaluating Veh_Category...
Evaluating Energy...
Evaluating Fuel_mode...
                    ANOVA F  ANOVA p-value  Eta squared  Spearman Correlation  \
Constructor     4117.307273   0.000000e+00     0.110060             -0.116772   
Veh_type        2688.237575   0.000000e+00     0.571356             -0.046350   
Brand           4017.359065   0.000000e+00     0.136293             -0.115080   
Veh_Model       2482.147618   0.000000e+00     0.756992             -0.002146   
Veh_Category     802.575937  1.790400e-176     0.000926             -0.016045   
Energy        755251.974012   0.000000e+00     0.813519              0.013200   
Fuel_mode     894122.193236   0.000000e+00     0.805133              0.132799   

              Spearman p-value  Cramér's V  Cramér's p-value  
Constructor       0.000000e+00    0.365968               0.0  
Veh_type          0.000000e+00    0.729359               0.0

In [4]:
# Render the summary table (one row per categorical variable) with the
# notebook's rich display instead of the plain-text print used above.
display(all_results)

Unnamed: 0,ANOVA F,ANOVA p-value,Eta squared,Spearman Correlation,Spearman p-value,Cramér's V,Cramér's p-value
Constructor,4117.307273,0.0,0.11006,-0.116772,0.0,0.365968,0.0
Veh_type,2688.237575,0.0,0.571356,-0.04635,0.0,0.729359,0.0
Brand,4017.359065,0.0,0.136293,-0.11508,0.0,0.424392,0.0
Veh_Model,2482.147618,0.0,0.756992,-0.002146,0.04582619,0.864676,0.0
Veh_Category,802.575937,1.7904e-176,0.000926,-0.016045,2.144339e-50,0.178304,0.0
Energy,755251.974012,0.0,0.813519,0.0132,1.130101e-34,0.985408,0.0
Fuel_mode,894122.193236,0.0,0.805133,0.132799,0.0,0.969261,0.0


In [None]:
## Evaluate 'Version' separately because of its high number of modalities (levels).
## The helper already returns a one-row DataFrame indexed by the variable name, so it
## is concatenated directly; ignore_index=True would discard the row labels.

#version_result = evaluate_single_categorical_relationship(df_Ptfinal, 'Version', continuous_var, dichotomous_var)
#all_results = pd.concat([all_results, version_result])

In [5]:
# NOTE(review): a bare `all` only echoes Python's builtin `all` function (see the
# output below). This looks like a truncated `all_results` — confirm, then
# complete or delete this cell.
all

<function all(iterable, /)>

In [6]:
# Sanity check on the summary table: row labels, column dtypes and non-null counts.
all_results.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Constructor to Fuel_mode
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ANOVA F               7 non-null      float64
 1   ANOVA p-value         7 non-null      float64
 2   Eta squared           7 non-null      float64
 3   Spearman Correlation  7 non-null      float64
 4   Spearman p-value      7 non-null      float64
 5   Cramér's V            7 non-null      float64
 6   Cramér's p-value      7 non-null      float64
dtypes: float64(7)
memory usage: 448.0+ bytes
