In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import spearmanr, chi2_contingency

def evaluate_single_categorical_relationship(df, cat_var, continuous_var, dichotomous_var):
    """Measure how one categorical column relates to two target columns.

    Three association measures are computed and returned together:

    * one-way ANOVA of ``continuous_var`` across the levels of ``cat_var``
      (F statistic, p-value and eta squared);
    * Spearman rank correlation between the integer category codes of
      ``cat_var`` and ``continuous_var``;
    * Cramér's V, with the chi-square p-value, between ``cat_var`` and
      ``dichotomous_var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three columns. It is left unmodified.
    cat_var : str
        Name of the categorical column.
    continuous_var : str
        Name of the numeric column used for the ANOVA and Spearman tests.
    dichotomous_var : str
        Name of the column cross-tabulated against ``cat_var``.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``cat_var`` with the columns ``ANOVA F``,
        ``ANOVA p-value``, ``Eta squared``, ``Spearman Correlation``,
        ``Spearman p-value``, ``Cramér's V`` and ``Cramér's p-value``.
    """
    results = {}
    print(f"Evaluating {cat_var}...")

    # ANOVA
    # Q("...") quotes the column names so that names which are not valid
    # Python identifiers (spaces, dots, ...) can still be used in the formula.
    model = ols(f'Q("{continuous_var}") ~ C(Q("{cat_var}"))', data=df).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    results['ANOVA F'] = model.fvalue
    results['ANOVA p-value'] = model.f_pvalue
    # First row of the table is the categorical term; the sum over all rows
    # (term + residual) is the total sum of squares.
    results['Eta squared'] = anova_table['sum_sq'].iloc[0] / anova_table['sum_sq'].sum()

    # Spearman's Rank Correlation
    # Work on a local, NaN-free copy: the caller's frame is not given an extra
    # "<cat_var>_code" column, missing categories are not ranked as code -1,
    # and a NaN in the continuous column no longer turns the result into NaN.
    # NOTE(review): the codes follow the sorted order of the category labels,
    # so this is only meaningful if that order is meaningful — confirm.
    paired = df[[cat_var, continuous_var]].dropna()
    category_codes = paired[cat_var].astype('category').cat.codes
    spearman_corr, spearman_p_value = spearmanr(category_codes, paired[continuous_var])
    results['Spearman Correlation'] = spearman_corr
    results['Spearman p-value'] = spearman_p_value

    # Cramér's V
    contingency_table = pd.crosstab(df[cat_var], df[dichotomous_var])
    chi2, p, *_ = chi2_contingency(contingency_table)
    n = contingency_table.sum().sum()
    phi2 = chi2 / n
    smallest_dim = min(contingency_table.shape) - 1
    # A table with a single row or column has no association to measure;
    # report NaN instead of dividing by zero.
    cramers_v = np.sqrt(phi2 / smallest_dim) if smallest_dim > 0 else np.nan
    results['Cramér\'s V'] = cramers_v
    results['Cramér\'s p-value'] = p

    return pd.DataFrame([results], index=[cat_var])

# Data preparation: the columns to analyse and the input file.
categorical_vars = ['Veh_type', 'Veh_Model', 'Energy', 'Fuel_mode']
continuous_var = 'CO2_wltp'
dichotomous_var = 'Em_on_target'
# NOTE(review): absolute, machine-specific path — the notebook only runs on the
# original author's machine. Consider a path relative to a configurable data
# directory. The CSV must be properly formatted and present at this location.
data_path = '/Users/livalacaisse/Documents/DataScience/CO2/000-C02 First Delivery/Cleaned_countries/FR_Cleaned.csv'
df = pd.read_csv(data_path, low_memory=False)

# Run the analysis for each categorical variable. The one-row result frames are
# collected in a list and concatenated once, rather than growing a DataFrame
# inside the loop (quadratic copying, and concatenating onto an empty frame is
# deprecated in recent pandas).
per_variable_results = []
for cat_var in categorical_vars:
    cat_results = evaluate_single_categorical_relationship(df, cat_var, continuous_var, dichotomous_var)
    per_variable_results.append(cat_results)
all_results = pd.concat(per_variable_results)

# Output results: echo the table and persist it, keeping the variable names as the index column.
print(all_results)
all_results.to_csv('categorical_relationships_fr.csv', index=True)

# The 'Version' column is deliberately excluded from `categorical_vars` above.

Evaluating Veh_type...


In [None]:
# Render the accumulated per-variable results table in the notebook.
display(all_results)

In [None]:
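## 'Version' is evaluated separately because of its high number of modalities (levels).
## NOTE(review): the call below is disabled. `df_Ptfinal` is not defined in the cells shown
## here, and the function already returns a one-row DataFrame, so wrapping its result in
## pd.DataFrame([...]) again would need revisiting before re-enabling.

#version_result = evaluate_single_categorical_relationship(df_Ptfinal, 'Version', continuous_var, dichotomous_var)
#all_results = pd.concat([all_results, pd.DataFrame([version_result])], ignore_index=True)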

In [None]:
# Show the accumulated results table. The bare name `all` only echoed Python's
# builtin function; `all_results` is presumably what was intended.
all_results

In [None]:
# Print a structural summary of the results frame (index, columns, dtypes, non-null counts).
all_results.info()