In [8]:
import polars as pl
from pathlib import Path

DATA = Path("data")
survey_path = DATA / "health_survey.csv"
rc_path     = DATA / "ReverseCodingItems.csv"


In [9]:
health_survey = pl.read_csv(survey_path)
reverse_coding = pl.read_csv(rc_path)

health_survey.head()
reverse_coding.head()

Question,Construct,Question # on Qualtrics Survey,Needs Reverse Coding?,Column Name
str,i64,i64,str,str
"""In the future, I plan to parti…",1,1,"""No""","""F1"""
"""Individuals are responsible fo…",5,2,"""Yes""","""F5"""
"""When tryng to understand the p…",2,3,"""No""","""F2"""
"""I plan to become involved in m…",1,4,"""No""","""F1.1"""
"""I can communicate well with ot…",2,5,"""No""","""F2.1"""


In [10]:
col_names = reverse_coding["Column Name"].to_list()
needs_rev = reverse_coding["Needs Reverse Coding?"].to_list()

col_dict = {
    col: f"{col}_reverse" if needs.strip().lower() == "yes" else f"{col}_regular"
    for col, needs in zip(col_names, needs_rev)
}

list(col_dict.items())[:10]


[('F1', 'F1_regular'),
 ('F5', 'F5_reverse'),
 ('F2', 'F2_regular'),
 ('F1.1', 'F1.1_regular'),
 ('F2.1', 'F2.1_regular'),
 ('F6', 'F6_reverse'),
 ('F4', 'F4_regular'),
 ('F3', 'F3_regular'),
 ('F5.1', 'F5.1_regular'),
 ('F1.2', 'F1.2_regular')]

In [11]:
health_survey = health_survey.rename(col_dict)

In [19]:
likert_map = {
    "Strongly Disagree": 1,
    "Somewhat Disagree": 2,
    "Neither Agree nor Disagree": 3,
    "Somewhat Agree": 4,
    "Strongly Agree": 5
}

reverse_cols = reverse_coding.filter(
    pl.col("Needs Reverse Coding?").str.strip_chars().str.to_lowercase() == "yes"
)["Column Name"].to_list()

regular_cols = reverse_coding.filter(
    pl.col("Needs Reverse Coding?").str.strip_chars().str.to_lowercase() == "no"
)["Column Name"].to_list()

health_survey_recoded = health_survey.select(
    [
        *(pl.col(col).replace(likert_map).cast(pl.Int32).alias(f"{col}_regular") for col in regular_cols),
        
        *( (6 - pl.col(col).replace(likert_map).cast(pl.Int32)).alias(f"{col}_reverse") for col in reverse_cols),
    ]
)

health_survey_recoded.head()

F1_regular,F2_regular,F1.1_regular,F2.1_regular,F4_regular,F3_regular,F5.1_regular,F1.2_regular,F2.2_regular,F6.1_regular,F2.3_regular,F2.4_regular,F2.5_regular,F6.2_regular,F1.3_regular,F2.6_regular,F5.3_regular,F4.2_regular,F2.7_regular,F3.1_regular,F2.8_regular,F5.4_regular,F3.2_regular,F1.4_regular,F3.3_regular,F1.5_regular,F1.6_regular,F2.9_regular,F3.4_regular,F2.10_regular,F1.7_regular,F4.4_regular,F5.7_regular,F3.5_regular,F2.11_regular,F5_reverse,F6_reverse,F4.1_reverse,F5.2_reverse,F5.5_reverse,F6.3_reverse,F5.6_reverse,F4.3_reverse,F6.4_reverse
i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4.0,2,4,4,4,3,4,4,4,4,2,4,4,4,4,4,4,4,4,2,2,4,4,2,3,2
4,4,4,4,4,3,3,4,3,4,4,4,4,4,4,4,4,4,4.0,3,4,3,3,4,3,3,4,4,4,4,4,3,4,3,4,4,4,3,2,3,4,4,3,4
5,4,5,5,4,4,4,5,4,3,4,4,5,4,5,4,4,4,,2,4,4,4,2,4,5,5,4,4,4,4,4,5,1,4,3,4,3,4,4,4,4,3,2
4,5,4,5,3,2,4,5,5,4,4,5,5,4,4,5,4,4,4.0,2,4,4,2,3,2,4,5,4,2,4,3,3,4,2,4,2,3,3,4,4,2,4,2,3
5,3,5,4,5,4,3,5,4,4,2,4,5,5,5,5,5,4,4.0,4,4,3,4,4,4,4,5,4,4,4,4,4,5,2,4,5,5,3,5,5,3,5,3,2


In [18]:
df = health_survey_recoded

prefixes = {re.split(r'[_]', col)[0] for col in df.columns}

aggregated = df.clone()

for q in prefixes:
    cols = [c for c in aggregated.columns if c.startswith(q)]
    
    if not cols:
        continue  # skip if nothing found
    
    mean_expr = (
        pl.reduce(lambda acc, x: acc + x, pl.col(cols)) / len(cols)
    ).alias(f"{q}_mean")
    
    aggregated = aggregated.with_columns(mean_expr)
    aggregated = aggregated.drop(cols)

print(aggregated.head())

shape: (5, 6)
┌─────────┬─────────┬─────────┬──────────┬──────────┬─────────┐
│ F5_mean ┆ F1_mean ┆ F6_mean ┆ F3_mean  ┆ F2_mean  ┆ F4_mean │
│ ---     ┆ ---     ┆ ---     ┆ ---      ┆ ---      ┆ ---     │
│ f64     ┆ f64     ┆ f64     ┆ f64      ┆ f64      ┆ f64     │
╞═════════╪═════════╪═════════╪══════════╪══════════╪═════════╡
│ 3.5     ┆ 3.875   ┆ 3.6     ┆ 3.333333 ┆ 4.0      ┆ 3.4     │
│ 3.375   ┆ 3.875   ┆ 4.0     ┆ 3.166667 ┆ 3.916667 ┆ 3.4     │
│ 4.0     ┆ 4.5     ┆ 3.4     ┆ 3.166667 ┆ null     ┆ 3.6     │
│ 3.75    ┆ 4.0     ┆ 3.2     ┆ 2.0      ┆ 4.5      ┆ 3.0     │
│ 4.5     ┆ 4.625   ┆ 3.8     ┆ 3.666667 ┆ 3.916667 ┆ 3.8     │
└─────────┴─────────┴─────────┴──────────┴──────────┴─────────┘


In [17]:
df = health_survey_recoded

prefixes = {re.split(r'[_]', col)[0] for col in df.columns}

agg_exprs = [
    (
        pl.reduce(lambda acc, x: acc + x, pl.col([c for c in df.columns if c.startswith(q)]))
        / len([c for c in df.columns if c.startswith(q)])
    ).alias(f"{q}_mean")
    for q in prefixes
    if any(c.startswith(q) for c in df.columns)  # only if columns exist
]

aggregated = (
    df.with_columns(agg_exprs)
      .drop([c for c in df.columns if any(c.startswith(p) for p in prefixes)])
)

print(aggregated.head())


shape: (5, 44)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ F2.4_mean ┆ F4.4_mean ┆ F2.10_mea ┆ F4.2_mean ┆ … ┆ F2.5_mean ┆ F1.6_mean ┆ F4.1_mean ┆ F2.11_me │
│ ---       ┆ ---       ┆ n         ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ an       │
│ f64       ┆ f64       ┆ ---       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ ---      │
│           ┆           ┆ f64       ┆           ┆   ┆           ┆           ┆           ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 4.0       ┆ 4.0       ┆ 4.0       ┆ 4.0       ┆ … ┆ 4.0       ┆ 4.0       ┆ 2.0       ┆ 4.0      │
│ 4.0       ┆ 3.0       ┆ 4.0       ┆ 4.0       ┆ … ┆ 4.0       ┆ 4.0       ┆ 3.0       ┆ 4.0      │
│ 4.0       ┆ 4.0       ┆ 4.0       ┆ 4.0       ┆ … ┆ 5.0       ┆ 5.0       ┆ 3.0       ┆ 4.0      │
│ 5.0       ┆ 3.0       ┆ 4.0       ┆ 4.0       ┆ … ┆ 5.0       ┆ 5.0       