# Notebook 02 — Exploratory Data Analysis

This notebook performs a brief exploratory analysis of the cleaned dataset to understand its structure and main patterns.

# 1. Imports

In [2]:
# Standard library and setup
import sys
from pathlib import Path

# Make the project root importable so the local `utils` package can be found.
# Path().resolve() is the kernel's current working directory; its parent is
# taken as the project root, which holds when the notebook is run from a folder
# one level below the root (e.g. notebooks/). Adjust if the layout differs.
project_root = Path().resolve().parent
# Guard the append so re-running this cell does not stack duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Notebook setup
from utils.notebook_setup import setup_notebook

# Data handling
import pandas as pd
from utils.data_loading import load_sav

#import matplotlib.pyplot as plt
#import seaborn as sns

In [2]:
# Show the directory the kernel is running in, to sanity-check relative paths.
current_dir = Path.cwd()
print("Current working directory:", current_dir)

Current working directory: c:\Users\gabri\Documents\Dokumenty\job-satisfaction-analysis\notebooks


# 2. Settings / Configuration

In [None]:
# Apply the shared notebook setup from utils.notebook_setup.
# NOTE(review): presumably `seed` fixes random-number seeds — confirm in the helper.
RANDOM_SEED = 42
setup_notebook(seed=RANDOM_SEED)

# 3. Loading data

In [4]:
# Load the survey file; load_sav's result is unpacked into a data frame and a
# second object bound to `meta`.
# NOTE(review): the path points at data/raw while the frame is named df_clean —
# confirm whether load_sav performs the cleaning or a cleaned file should be read.
sav_path = "../data/raw/bkl21d.sav"
df_clean, meta = load_sav(sav_path)

In [5]:
# Preview the first five rows of the loaded frame.
df_clean.head(n=5)

Unnamed: 0,id,intid,waga_proba,waga_proba_kor,waga_pop,waga_pop_kor,rodzaj,edycja,rok,datawyw,...,pform4t,neet,neet2,form12m,pform12m1,pform12m2,pform12m,nform12m,dokszt_all,dokszt_all_bo
0,6590.0,03540-2021,0.374529,0.37938,3732.470588,3780.814881,1.0,2021.0,2021.0,2021-09-20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
1,6591.0,02070-2021,0.865094,0.876299,8621.333333,8732.999924,1.0,2021.0,2021.0,2021-09-20,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,6592.0,01987-2021,0.283841,0.287518,2828.7,2865.338333,1.0,2021.0,2021.0,2021-09-20,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
3,6593.0,01686-2021,1.380901,1.398787,13761.75,13939.997104,1.0,2021.0,2021.0,2021-09-20,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
4,6594.0,04358-2021,0.897526,0.909151,8944.545455,9060.398404,1.0,2021.0,2021.0,2021-09-20,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Per-column descriptive statistics (count, mean, std, min, quartiles, max).
summary_stats = df_clean.describe()
summary_stats

Unnamed: 0,id,waga_proba,waga_proba_kor,waga_pop,waga_pop_kor,rodzaj,edycja,rok,powiat,woj,...,pform4t,neet,neet2,form12m,pform12m1,pform12m2,pform12m,nform12m,dokszt_all,dokszt_all_bo
count,2529.0,2529.0,2529.0,2529.0,2529.0,2529.0,2529.0,2529.0,2529.0,2529.0,...,2529.0,2529.0,2529.0,2529.0,2529.0,2529.0,2529.0,2529.0,2529.0,2529.0
mean,7854.0,1.0,1.0,9965.778964,9965.778964,2.256623,2021.0,2021.377224,1688.76038,16.661922,...,0.27758,0.297746,0.284698,0.122578,0.441677,0.218664,0.495453,0.691973,0.791617,0.767497
std,730.203739,0.446621,0.414533,4450.930974,4131.146452,0.621494,0.0,0.484788,871.789594,8.713621,...,0.447894,0.457358,0.451359,0.328017,0.496685,0.413422,0.500078,0.461769,0.406232,0.422511
min,6590.0,0.253951,0.25724,2530.815789,2563.59582,1.0,2021.0,2021.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7222.0,0.733557,0.743058,7310.466667,7405.154444,2.0,2021.0,2021.0,1019.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
50%,7854.0,0.906111,0.917848,9030.105263,9147.066414,2.0,2021.0,2021.0,1465.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
75%,8486.0,1.223727,1.239577,12195.388889,12353.347924,3.0,2021.0,2022.0,2413.0,24.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
max,9118.0,2.835938,2.003588,28262.333333,19967.316869,3.0,2021.0,2022.0,3262.0,32.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# 4. Basic checks: shape, data types, missing values

In [None]:
# Dimensions of the frame as (rows, columns).
print(df_clean.shape)
# Data type of every column; pandas abbreviates the listing for wide frames.
print(df_clean.dtypes)
# Total number of missing cells, summed over all columns.
total_missing = df_clean.isna().sum().sum()
print(total_missing, "total missing values")

(2529, 910)
id                float64
intid              object
waga_proba        float64
waga_proba_kor    float64
waga_pop          float64
                   ...   
pform12m2         float64
pform12m          float64
nform12m          float64
dokszt_all        float64
dokszt_all_bo     float64
Length: 910, dtype: object
1181559 total missing values


Unnamed: 0,id,intid,waga_proba,waga_proba_kor,waga_pop,waga_pop_kor,rodzaj,edycja,rok,datawyw,...,pform4t,neet,neet2,form12m,pform12m1,pform12m2,pform12m,nform12m,dokszt_all,dokszt_all_bo
0,6590.0,03540-2021,0.374529,0.37938,3732.470588,3780.814881,1.0,2021.0,2021.0,2021-09-20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
1,6591.0,02070-2021,0.865094,0.876299,8621.333333,8732.999924,1.0,2021.0,2021.0,2021-09-20,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,6592.0,01987-2021,0.283841,0.287518,2828.7,2865.338333,1.0,2021.0,2021.0,2021-09-20,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
3,6593.0,01686-2021,1.380901,1.398787,13761.75,13939.997104,1.0,2021.0,2021.0,2021-09-20,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
4,6594.0,04358-2021,0.897526,0.909151,8944.545455,9060.398404,1.0,2021.0,2021.0,2021-09-20,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# First five rows again (same preview as the one shown right after loading).
df_clean.head(5)

In [9]:
# Summary statistics restricted to the numeric columns.
# `numeric_cols` holds the labels of every column with a numeric dtype.
numeric_cols = df_clean.select_dtypes(include="number").columns
# Leave the result as the cell's last expression so Jupyter renders it as a
# rich table rather than the line-wrapped plain text that print() produces.
df_clean[numeric_cols].describe()

                id   waga_proba  waga_proba_kor      waga_pop  waga_pop_kor  \
count  2529.000000  2529.000000     2529.000000   2529.000000   2529.000000   
mean   7854.000000     1.000000        1.000000   9965.778964   9965.778964   
std     730.203739     0.446621        0.414533   4450.930974   4131.146452   
min    6590.000000     0.253951        0.257240   2530.815789   2563.595820   
25%    7222.000000     0.733557        0.743058   7310.466667   7405.154444   
50%    7854.000000     0.906111        0.917848   9030.105263   9147.066414   
75%    8486.000000     1.223727        1.239577  12195.388889  12353.347924   
max    9118.000000     2.835938        2.003588  28262.333333  19967.316869   

            rodzaj  edycja          rok       powiat          woj  ...  \
count  2529.000000  2529.0  2529.000000  2529.000000  2529.000000  ...   
mean      2.256623  2021.0  2021.377224  1688.760380    16.661922  ...   
std       0.621494     0.0     0.484788   871.789594     8.713621 