# 2024 Digital Cultural & Sports Consumption — Exploratory Analysis

This notebook explores the **2024 ARCOM barometer** on digital cultural and sports consumption in France.
It covers data loading, dictionary parsing, cleaning, KPIs, demographic breakdowns, and **advanced insights**.

**Files expected in the same folder:**
- `2024-barometre-consommation.xlsx` — main microdata
- `2024-datamap.xlsx` — dictionary (3 sheets: Level, VAR, TEXT)

All visuals are produced with seaborn and matplotlib.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings

# Keep cell output readable by hiding FutureWarning messages.
# NOTE: this filter is global, so it also hides pandas/seaborn deprecation
# notices (e.g. about changing defaults) — lift it when upgrading libraries.
warnings.simplefilter('ignore', category=FutureWarning)

# set_theme() is the preferred seaborn entry point; sns.set() is only an alias of it.
sns.set_theme(style='whitegrid', palette='muted')

# Show long text cells (dictionary labels) without truncating them too early.
pd.set_option('display.max_colwidth', 120)


## 1) Load data & dictionary

In [None]:
# Input workbooks — by default both are looked up in the notebook's folder
# (edit the two constants below if they live elsewhere).
DATA_PATH = '2024-barometre-consommation.xlsx'
DICT_PATH = '2024-datamap.xlsx'

# Read the survey microdata into a DataFrame and report its size.
df = pd.read_excel(DATA_PATH)
n_rows, n_cols = df.shape
print(f'Dataset loaded: {n_rows} rows, {n_cols} columns')

# Read the first three sheets of the dictionary workbook, addressed by position.
dict_sheets = pd.read_excel(DICT_PATH, sheet_name=[0, 1, 2])
dm_level = dict_sheets[0]   # general info (not used)
dm_var   = dict_sheets[1]   # variable metadata
dm_text  = dict_sheets[2]   # code labels

# Trim and upper-case the headers of the two sheets used downstream, so that
# later column lookups do not depend on the workbook's spacing or casing.
dm_var  = dm_var.rename(columns=lambda header: header.strip().upper())
dm_text = dm_text.rename(columns=lambda header: header.strip().upper())

# Build dictionary (VAR + TEXT)

# Variable-level metadata: keep the four columns used here, expose the
# sheet's 'LABELS' column under the name 'LABEL', and drop rows without a
# variable name (they cannot be joined on).
dict_vars = (
    dm_var[['NAME', 'TYPE', 'LABELS', 'ANSWER']]
    .rename(columns={'LABELS': 'LABEL'})
    .dropna(subset=['NAME'])
)

# Code-level labels (one row per variable/code pair in the TEXT sheet).
dict_text = dm_text[['NAME', 'CODE', 'VALUE', 'FR:L']].copy()

# Left join on NAME: every variable is kept, including those that have no
# code rows in the TEXT sheet.
merged_dict = dict_vars.merge(dict_text, on='NAME', how='left')

# The count is computed outside the f-string: reusing the enclosing quote
# character inside an f-string expression is a SyntaxError before Python 3.12.
n_dict_vars = merged_dict['NAME'].nunique()
print(f'Dictionary built: {n_dict_vars} unique variables')


## 2) Helper functions

In [None]:
def describe_var(var_name):
    """Display the rows of ``merged_dict`` that document ``var_name``.

    The lookup is case-insensitive and runs in two passes:

    1. exact match on the dictionary ``NAME``;
    2. if nothing matched, every entry whose ``NAME`` is a prefix of
       ``var_name`` (the variable's "family").

    Matching rows are shown de-duplicated with ``display``; when both passes
    find nothing, a "not found" message is printed instead. Returns ``None``.
    """
    wanted = var_name.upper()

    hits = merged_dict[merged_dict['NAME'].str.upper() == wanted]
    if hits.empty:
        # Fallback: the dictionary name only has to be the start of var_name.
        name_is_prefix = merged_dict['NAME'].apply(
            lambda dict_name: wanted.startswith(str(dict_name).upper())
        )
        hits = merged_dict[name_is_prefix]

    if hits.empty:
        print(f'Variable {var_name} not found in dictionary.')
        return

    shown_cols = ['NAME', 'TYPE', 'LABEL', 'CODE', 'VALUE', 'FR:L']
    display(hits[shown_cols].drop_duplicates())

def find_prefix(col):
    """Return the leading question code of a column name.

    The code is the longest run at the start of ``col`` made of an upper-case
    letter followed by upper-case letters/digits (e.g. ``QBU7_r1_c3`` ->
    ``QBU7``). When the name does not start with such a run, ``col`` is
    returned unchanged.
    """
    hit = re.match(r'([A-Z]+[0-9A-Z]+)', col)
    if hit is None:
        return col
    return hit.group(1)


## 3) Identify thematic blocks (prefixes)

In [None]:
# Reduce every column name to its question prefix, then show the 20 prefixes
# that cover the most columns.
prefixes = pd.Series(df.columns).map(find_prefix)
prefix_counts = prefixes.value_counts().iloc[:20]

print('Top prefixes:')
display(prefix_counts.rename('count').to_frame())


In [None]:
# Thematic column groups, selected by the start of the column name.
# NOTE(review): 'Q2' is a plain prefix test, so it would also pick up names
# such as Q20 or Q21 if the dataset has any — confirm against the codebook.
theme_prefixes = {
    'culture': ('QBU', 'QBOL'),
    'sport': ('RS',),
    'law': ('Q2',),
}
cols_by_theme = {
    theme: [name for name in df.columns if name.startswith(stems)]
    for theme, stems in theme_prefixes.items()
}
culture_cols = cols_by_theme['culture']
sports_cols  = cols_by_theme['sport']
law_cols     = cols_by_theme['law']

# Demographic columns, listed by exact name.
demo_cols = ['SEXE', 'AGE', 'CSP', 'AGGLO', 'REG']

print(f'Culture cols: {len(culture_cols)} | Sport cols: {len(sports_cols)} | Law cols: {len(law_cols)}')


## 4) Clean the data & rebuild the thematic column lists

In [None]:
# Keep only the columns that take more than one distinct value. NaN counts as
# a value here (dropna=False), so a column that is entirely missing is dropped
# as well. The .copy() gives an independent frame, which avoids
# SettingWithCopyWarning when columns are added later.
has_variation = df.nunique(dropna=False).gt(1)
df = df.loc[:, has_variation].copy()
print(f'After cleaning: {len(df.columns)} columns remain')

def safe_cols(prefixes, frame=None):
    """Return the column labels that start with any of the given prefixes.

    Parameters
    ----------
    prefixes : str or iterable of str
        Prefix(es) to look for. A bare string is treated as ONE prefix
        (it is not iterated character by character).
    frame : pandas.DataFrame, optional
        Frame whose columns are scanned. When omitted, the notebook-level
        ``df`` is used, looked up at call time so the list always reflects
        the current (cleaned) frame.

    Returns
    -------
    list
        Matching labels, in the frame's column order. Empty when nothing
        matches or when ``prefixes`` is empty.
    """
    if isinstance(prefixes, str):
        prefixes = (prefixes,)
    wanted = tuple(prefixes)
    source = df if frame is None else frame
    # str() lets non-string labels (e.g. numeric headers) be tested without raising.
    return [label for label in source.columns if str(label).startswith(wanted)]

# Rebuild the three thematic lists from the cleaned frame, so they only name
# columns that are still present.
culture_cols, sports_cols, law_cols = (
    safe_cols(stems) for stems in (['QBU', 'QBOL'], ['RS'], ['Q2'])
)

print(f'Culture cols: {len(culture_cols)} | Sport cols: {len(sports_cols)} | Law cols: {len(law_cols)}')


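## 5) Build indicators (binary)

Each indicator equals 1 when a respondent has at least one non-missing value in the corresponding block of columns, and 0 otherwise.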

In [None]:
# One 0/1 flag per theme: 1 when the respondent has at least one non-missing
# value among the theme's columns, 0 otherwise. A theme with no columns yields
# a flag that is 0 for everyone.
# NOTE(review): "non-missing" is not necessarily "answered yes" — confirm how
# these questions are coded before reading the flags as actual consumption.
for flag_name, member_cols in (
    ('consume_culture', culture_cols),
    ('consume_sport', sports_cols),
    ('illicit_access', law_cols),
):
    if not member_cols:
        df.loc[:, flag_name] = 0
        continue
    df.loc[:, flag_name] = df[member_cols].notna().any(axis=1).astype(int)

print('Indicators created (consume_culture, consume_sport, illicit_access)')


## 6) Decode demographics (SEXE, AGE)

In [None]:
# Decode SEXE into a text column SEXE_LABEL using the FR:L labels of the
# dictionary. Three outcomes:
#   - SEXE missing from the data      -> SEXE_LABEL is all <NA>
#   - SEXE present, no dictionary row -> SEXE_LABEL is the raw value as text
#   - otherwise                       -> SEXE_LABEL is the mapped label
if 'SEXE' not in df.columns:
    df.loc[:, 'SEXE_LABEL'] = pd.Series([pd.NA]*len(df), dtype='string')
    print("Column 'SEXE' not found.")
else:
    sex_dict = dict_text[dict_text['NAME'].str.upper() == 'SEXE'].copy()
    if sex_dict.empty:
        df.loc[:, 'SEXE_LABEL'] = df['SEXE'].astype('string')
        print('No mapping found for SEXE — kept raw values.')
    else:
        # Only rows carrying both a code and a label are usable; codes are
        # cast to int so they can be looked up from the SEXE column.
        sex_dict = (
            sex_dict
            .dropna(subset=['CODE', 'FR:L'])
            .assign(CODE=lambda rows: rows['CODE'].astype(int))
        )
        sex_map = sex_dict.set_index('CODE')['FR:L'].to_dict()
        df.loc[:, 'SEXE_LABEL'] = df['SEXE'].map(sex_map).astype('string')
        print(f'Mapped SEXE: {sex_map}')

# Age bands
# Edges and labels are declared side by side so they cannot drift apart.
# Intervals are closed on the right and the first one also includes its lower
# edge, i.e. [15, 24], (24, 34], ..., (64, +inf):
#   - the first label is '15–24' because age 15 falls in that bin
#     (it used to be labelled '16–24' while still containing 15);
#   - the last edge is open-ended so that '65+' really covers every age
#     above 64 instead of turning ages over 100 into NaN.
# NOTE(review): if the sample actually starts at 16, move the first edge and
# label accordingly — confirm against the survey documentation.
age_edges  = [15, 24, 34, 44, 54, 64, np.inf]
age_labels = ['15–24', '25–34', '35–44', '45–54', '55–64', '65+']

if 'AGE' in df.columns:
    df.loc[:, 'AGE_BIN'] = pd.cut(
        df['AGE'],
        bins=age_edges,
        labels=age_labels,
        include_lowest=True, right=True
    )
    print('AGE binned successfully.')
else:
    # Keep the column present (all missing) so downstream group-bys still run.
    df.loc[:, 'AGE_BIN'] = pd.Series([pd.NA]*len(df), dtype='string')
    print("Column 'AGE' not found.")


## 7) KPIs & demographic summaries

In [None]:
# Headline KPIs: share of respondents (in %) with each indicator equal to 1.
kpi_sources = {
    'Culture consumers (%)': 'consume_culture',
    'Sport viewers (%)': 'consume_sport',
    'Illicit users (%)': 'illicit_access',
}
kpi = {label: df[column].mean() * 100 for label, column in kpi_sources.items()}
display(pd.Series(kpi).round(1).to_frame('value'))

# The same three rates, broken down by gender label and by age band.
indicator_cols = ['consume_culture', 'consume_sport', 'illicit_access']

def _pct_by(key, **groupby_kwargs):
    """Mean of each indicator per group of ``key``, as a percentage rounded to 1 decimal."""
    grouped = df.groupby(key, **groupby_kwargs)[indicator_cols]
    return grouped.mean().mul(100).round(1)

demo_summary = _pct_by('SEXE_LABEL')
age_summary = _pct_by('AGE_BIN', observed=True)

display(demo_summary)
display(age_summary)


## 8) Visualizations — Demographic profiles

In [None]:
def plot_indicator_bars(summary, indicator, title, xlabel='', figsize=(6, 4)):
    """Draw one bar chart of an indicator across the groups of a summary table.

    Parameters
    ----------
    summary : pandas.DataFrame
        Table indexed by group (e.g. ``demo_summary`` or ``age_summary``)
        with one column per indicator, already expressed in percent.
    indicator : str
        Column of ``summary`` to plot on the y axis.
    title : str
        Figure title.
    xlabel : str, optional
        Label of the x axis (empty by default).
    figsize : tuple, optional
        Figure size passed to ``plt.subplots``.
    """
    # Turn the group index into a regular column so x and y both come from
    # the same frame (instead of mixing an external Index with data=).
    plot_data = summary.reset_index()
    group_col = plot_data.columns[0]

    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(data=plot_data, x=group_col, y=indicator, ax=ax)
    ax.set_title(title)
    ax.set_ylabel('% of respondents')
    ax.set_xlabel(xlabel)
    fig.tight_layout()
    plt.show()


# Gender bars
plot_indicator_bars(demo_summary, 'consume_culture', 'Cultural Consumption by Gender (2024)')
plot_indicator_bars(demo_summary, 'consume_sport', 'Sport Consumption by Gender (2024)')

# Age bars
plot_indicator_bars(age_summary, 'consume_culture', 'Cultural Consumption by Age Group (2024)',
                    xlabel='Age group', figsize=(8, 4))
plot_indicator_bars(age_summary, 'consume_sport', 'Sport Consumption by Age Group (2024)',
                    xlabel='Age group', figsize=(8, 4))


## 9) Advanced insights

In [None]:
# 9.1 Correlation between key indicators
# Pairwise correlation of the three 0/1 flags, drawn on a fixed [-1, 1]
# colour scale.
corr = df[['consume_culture', 'consume_sport', 'illicit_access']].corr()

fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(corr, annot=True, fmt='.2f', vmin=-1, vmax=1, square=True, cbar=True, ax=ax)
ax.set_title('Correlation — Culture / Sport / Illicit')
fig.tight_layout()
plt.show()

# 9.2 Heatmap by Age x Gender for Culture and Sport
def _pct_pivot(indicator):
    """Age band x gender table of the mean of ``indicator``, in percent."""
    table = df.pivot_table(index='AGE_BIN', columns='SEXE_LABEL', values=indicator, aggfunc='mean')
    return table * 100

pivot_culture = _pct_pivot('consume_culture')
pivot_sport   = _pct_pivot('consume_sport')

# Two panels side by side; only the left one carries the y-axis label.
fig, axes = plt.subplots(1, 2, figsize=(10,4))
panels = (
    (pivot_culture, 'Culture — % consumers', 'Age group'),
    (pivot_sport, 'Sport — % consumers', ''),
)
for ax, (table, panel_title, y_label) in zip(axes, panels):
    sns.heatmap(table, annot=True, fmt='.1f', ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Gender')
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()

# 9.3 Combined profile: Culture & Sport (4-quadrant classification)
# Every respondent gets exactly one label, derived from the two 0/1 flags.
df.loc[:, 'profile_cs'] = np.select(
    [
        (df['consume_culture']==1) & (df['consume_sport']==1),
        (df['consume_culture']==1) & (df['consume_sport']==0),
        (df['consume_culture']==0) & (df['consume_sport']==1),
    ],
    ['Both', 'Culture only', 'Sport only'],
    default='Neither'
)

# Share of each profile inside every (age band, gender) cell.
# observed=True keeps only age bands that actually occur, consistently with
# the age summary computed earlier in the notebook.
profile_share = (
    df.groupby(['AGE_BIN', 'SEXE_LABEL'], observed=True)['profile_cs']
      .value_counts(normalize=True)
      .rename('share')
      .reset_index()
)

# One panel per gender. Plotting all rows in a single barplot (x = age,
# hue = profile) would make seaborn average the per-gender shares and draw
# error bars, i.e. the gender breakdown announced in the title would be lost.
profile_order = ['Both', 'Culture only', 'Sport only', 'Neither']
genders = list(profile_share['SEXE_LABEL'].dropna().unique())

if not genders:
    print('No age/gender combination available — profile mix plot skipped.')
else:
    fig, axes = plt.subplots(
        1, len(genders), figsize=(6 * len(genders), 4.5), sharey=True, squeeze=False
    )
    panel_axes = axes[0]
    for position, (ax, gender) in enumerate(zip(panel_axes, genders)):
        sns.barplot(
            data=profile_share[profile_share['SEXE_LABEL'] == gender],
            x='AGE_BIN', y='share', hue='profile_cs',
            hue_order=profile_order,   # same colours/order in every panel
            ax=ax,
        )
        ax.set_title(str(gender))
        ax.set_xlabel('Age group')
        ax.set_ylabel('Share of respondents' if position == 0 else '')

        # Keep a single legend, on the last panel.
        panel_legend = ax.get_legend()
        if panel_legend is not None:
            if position == len(genders) - 1:
                panel_legend.set_title('Profile')
            else:
                panel_legend.remove()

    fig.suptitle('Profile mix by Age group and Gender')
    fig.tight_layout()
    plt.show()

# 9.4 Illicit access by demographics
# Mean of the illicit_access flag (in %) per age band and gender.
# observed=True is passed explicitly, like in the age summary, so the result
# does not depend on the pandas default for categorical groupers (the
# matching FutureWarning is silenced at the top of the notebook).
illicit_demo = (
    df.groupby(['AGE_BIN', 'SEXE_LABEL'], observed=True)['illicit_access']
      .mean().mul(100).rename('% illicit').reset_index()
)

fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(data=illicit_demo, x='AGE_BIN', y='% illicit', hue='SEXE_LABEL', ax=ax)
ax.set_title('Illicit Access by Age and Gender')
ax.set_ylabel('% of respondents')
ax.set_xlabel('Age group')

# Replace the raw column name shown as legend title by a readable one.
gender_legend = ax.get_legend()
if gender_legend is not None:
    gender_legend.set_title('Gender')

fig.tight_layout()
plt.show()


## 10) Storytelling summary

**Key takeaways (illustrative):**
- Digital cultural and sports consumption is widespread, with notable age gradients.
- Younger cohorts tend to have higher sport engagement; cultural consumption is more evenly distributed.
- A subset of respondents is flagged for illicit access; differences between groups are more visible among younger respondents.
- Combined profiles show clear clusters: Both, Culture only, Sport only, and Neither — useful for audience targeting.

Next steps: refine indicators using the detailed questionnaire (e.g., content types, platforms), apply weights if provided, and validate assumptions with the official codebook.