# Python — Raw dataset exploratory analysis

This notebook loads the project's raw dataset and runs quick exploratory analysis: shape, head, summary stats, missingness, and a simple numeric plot. It uses pandas and pyreadstat to read SPSS `.sav` files.

Required packages: pandas, pyreadstat, matplotlib, seaborn

If a package is missing, install it by running the following in a code cell (`%pip` installs into the environment of the running kernel):

%pip install pandas pyreadstat matplotlib seaborn


In [None]:
# Imports and plotting setup
import pandas as pd
import pyreadstat
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme to every figure created below.
# `sns.set` is only an alias of `sns.set_theme`; the seaborn documentation
# names `set_theme` as the preferred interface and notes the alias may be
# removed in the future.
sns.set_theme(style="whitegrid")

# Default file path (repo-relative). Update if your workspace layout differs.
default_path = "../6ix_Pac_Data_Stats520-Final-Project/data/raw/2025_ED_852_HERI_data.sav"

# Echo the path so a failed load in the next cell is easy to diagnose.
print("Default dataset path:", default_path)


In [None]:
# Load the SPSS .sav file into a pandas DataFrame.
#
# `df` and `meta` are always bound after this block runs. Both stay None when
# no file could be read, so the code that follows can test `df is not None`
# without risking a NameError on either name.
df, meta = None, None
try:
    df, meta = pyreadstat.read_sav(default_path)
    print("Loaded dataset successfully")
except Exception as e:
    print("Failed to load dataset:", e)
    # Fall back to asking the user for the file location. Surrounding
    # whitespace is stripped so a blank or space-only answer counts as abort.
    p = input("Enter path to 2025_ED_852_HERI_data.sav (or press Enter to abort): ").strip()
    if p:
        try:
            df, meta = pyreadstat.read_sav(p)
            print("Loaded dataset successfully")
        except Exception as retry_error:
            # Report the second failure instead of raising from inside the
            # handler; `df` stays None and the inspection step is skipped.
            print("Failed to load dataset:", retry_error)

# Quick inspections if loaded: shape, first rows, dtypes, numeric summary,
# the 20 columns with the most missing values, and one histogram.
if df is not None:
    print("DataFrame shape:", df.shape)
    display(df.head(10))

    print('\nInfo:')
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() would append a stray "None".
    df.info()

    # Determined once up front: both the numeric summary and the histogram
    # only make sense when at least one numeric column exists.
    numeric_cols = df.select_dtypes(include=['number']).columns

    print('\nDescribe (numeric):')
    if len(numeric_cols) > 0:
        display(df.describe(include='number').T)
    else:
        # With no numeric columns describe(include='number') has nothing to
        # summarise and raises, so report the situation instead.
        print('No numeric columns detected to describe.')

    print('\nMissingness summary:')
    print(df.isnull().sum().sort_values(ascending=False).head(20))

    # Plot: histogram of the first numeric column
    if len(numeric_cols) > 0:
        first_numeric = numeric_cols[0]
        # Explicit figure/axes instead of the implicit pyplot state machine.
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.histplot(df[first_numeric].dropna(), kde=True, ax=ax)
        ax.set_title(f'Histogram of {first_numeric}')
        plt.show()
    else:
        print('No numeric columns detected for plotting.')
else:
    print('No DataFrame available. Notebook will stop here.')
