# Data Exploration

This notebook explores the General Social Survey (GSS) data to understand:
- Available demographics
- Opinion question distributions
- Sample sizes by year and demographic segment

In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from hivesight_calibration import GSSLoader, OPINION_QUESTIONS

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette, so the palette is applied after the style sheet.
mpl_style = 'seaborn-v0_8-whitegrid'
color_palette = 'husl'
plt.style.use(mpl_style)
sns.set_palette(color_palette)

## Load GSS Data

In [None]:
# Loader for the GSS cumulative file, reading from the ../data directory.
loader = GSSLoader(data_dir=Path('../data'))

# Columns to read: the demographic variables plus the first 20 opinion questions.
demographic_vars = ['year', 'age', 'realinc', 'region', 'sex', 'race', 'degree']
key_vars = demographic_vars + OPINION_QUESTIONS[:20]

# Restrict the load to the recent survey years of interest.
survey_years = [2018, 2021, 2022, 2024]
gss = loader.load(years=survey_years, columns=key_vars)

print(f"Loaded {len(gss):,} respondents")
# Last expression of the cell: shows the first rows as the cell output.
gss.head()

## Demographics Overview

In [None]:
# 2x2 overview of respondent demographics taken from the `gss` DataFrame:
# age and income as histograms, region and education as per-code bar charts.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Age distribution
axes[0, 0].hist(gss['age'].dropna(), bins=30, edgecolor='white')
axes[0, 0].set_xlabel('Age')
axes[0, 0].set_ylabel('Count')
axes[0, 0].set_title('Age Distribution')

# Income distribution on a linear axis (not log scale). Values above 200,000
# are clipped to 200,000, so the right-most bin also collects every higher
# income and will look inflated.
axes[0, 1].hist(gss['realinc'].dropna().clip(upper=200000), bins=30, edgecolor='white')
axes[0, 1].set_xlabel('Real Income ($)')
axes[0, 1].set_ylabel('Count')
axes[0, 1].set_title('Income Distribution')

# Region: one bar per code, ordered by code rather than by frequency
gss['region'].value_counts().sort_index().plot(kind='bar', ax=axes[1, 0])
axes[1, 0].set_xlabel('Region Code')
axes[1, 0].set_ylabel('Count')
axes[1, 0].set_title('Region Distribution')

# Education: one bar per degree code, ordered by code
gss['degree'].value_counts().sort_index().plot(kind='bar', ax=axes[1, 1])
axes[1, 1].set_xlabel('Degree Code')
axes[1, 1].set_ylabel('Count')
axes[1, 1].set_title('Education Distribution')

plt.tight_layout()
plt.show()

## Opinion Question Response Distributions

In [None]:
# Look at a few key opinion questions, one panel per variable.
opinion_vars = ['cappun', 'grass', 'gunlaw', 'abany']

fig, axes = plt.subplots(2, 2, figsize=(12, 8))

for ax, var in zip(axes.flat, opinion_vars):
    if var not in gss.columns:
        # The variable is not a column of `gss` (e.g. it was not among the
        # columns requested at load time). Say so in the panel instead of
        # leaving a blank, untitled axes that looks like a rendering error.
        ax.text(0.5, 0.5, f'{var}: not loaded', ha='center', va='center',
                transform=ax.transAxes)
        ax.set_axis_off()
        continue

    # Bars are ordered by response code rather than by frequency.
    gss[var].value_counts().sort_index().plot(kind='bar', ax=ax)
    ax.set_title(f'{var} Response Distribution')
    ax.set_xlabel('Response Code')
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

## Sample Sizes by Year

In [None]:
# Number of respondents in each survey year, drawn as a bar chart.
respondents_per_year = gss.groupby('year').size()
year_ax = respondents_per_year.plot.bar(figsize=(8, 5))
year_ax.set(
    xlabel='Year',
    ylabel='Number of Respondents',
    title='GSS Sample Size by Year',
)
plt.show()

## Missing Data Analysis

In [None]:
# Share of missing values per column, as a percentage of all rows,
# ordered from the most to the least incomplete variable.
missing_counts = gss.isna().sum()
missing_pct = missing_counts.div(len(gss)).mul(100).sort_values(ascending=False)

print("Missing data percentage by variable:")
# Only the 20 variables with the most missing data are shown.
print(missing_pct.head(20))