#**Notebook 1**
## Using Demographics to Predict Responses to a Question or Category of Questions

##Import and read the cleaned dataset

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Load dataset
df = pd.read_csv('/content/UConn_SJI_Cleaned_Data_With_Demographics.csv')  # Update path if needed
df.head()



Unnamed: 0,Participant,S01_c1,S01_c2,S01_c3,S01_c4,S01_c5,S02_c1,S02_c2,S02_c3,S02_c4,...,S18_c3,S18_c4,S18_c5,S19_c1,S19_c2,S19_c3,S19_c4,S19_c5,Race,Gender
0,126,0,1,-1,0,0,1,0,-1,0,...,0,1,0,-1,1,0,0,0,Black,Man
1,127,-1,0,0,1,0,0,-1,1,0,...,-1,0,0,-1,0,0,1,0,Other,Man
2,128,0,1,0,0,-1,0,0,0,-1,...,-1,1,0,0,-1,0,1,0,Other,Man
3,129,-1,1,0,0,0,-1,0,1,0,...,0,1,0,0,-1,0,0,1,White,Man
4,130,-1,1,0,0,0,-1,1,0,0,...,0,0,0,-1,0,0,1,0,Other,Woman


##Create Train and Test Split

In [4]:
# Create consistent 50/50 split
np.random.seed(42)
df['split'] = np.random.choice(['train', 'test'], size=len(df), p=[0.5, 0.5])


##One-hot Encode Demographic

In [5]:
# Define categories explicitly (so "Man" and "White" are baselines)
df['Gender'] = pd.Categorical(df['Gender'], categories=['Man', 'Woman', 'Other'])
df['Race'] = pd.Categorical(df['Race'], categories=['White', 'Black', 'Asian', 'Other'])

# One-hot encode (drops the first category = baseline)
df = pd.get_dummies(df, columns=['Gender', 'Race'], drop_first=True)

# Check which dummy columns were created
[col for col in df.columns if 'Gender' in col or 'Race' in col]


['Gender_Woman', 'Gender_Other', 'Race_Black', 'Race_Asian', 'Race_Other']

In [6]:
# Define predictors and target
demo_cols = ['Gender_Woman', 'Gender_Other', 'Race_Black', 'Race_Asian', 'Race_Other']
target_col = 'S01_c1'

X_train = df[df['split'] == 'train'][demo_cols]
y_train = df[df['split'] == 'train'][target_col]

X_test  = df[df['split'] == 'test'][demo_cols]
y_test  = df[df['split'] == 'test'][target_col]

# Fit model
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)

# Evaluate
print(f"R² Score: {r2_score(y_test, y_pred):.3f}")
print(f"MSE: {mean_squared_error(y_test, y_pred):.3f}")

# Coefficients
pd.DataFrame({
    "Variable": demo_cols,
    "Coefficient": linreg.coef_
})



R² Score: -0.045
MSE: 0.380


Unnamed: 0,Variable,Coefficient
0,Gender_Woman,-0.317042
1,Gender_Other,0.100502
2,Race_Black,-0.693087
3,Race_Asian,-0.742551
4,Race_Other,-0.645196


# Logistic Regression Model

In [8]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv('UConn_SJI_signed_onehot.csv')


In [9]:
# Inspect key columns
print("Columns in dataset:", df.columns.tolist())
print("\nPreview of demographic columns:\n", df[['DQ4', 'DQ5']].head())

Columns in dataset: ['Participant', 'S01_c1', 'S01_c2', 'S01_c3', 'S01_c4', 'S01_c5', 'S02_c1', 'S02_c2', 'S02_c3', 'S02_c4', 'S02_c5', 'S03_c1', 'S03_c2', 'S03_c3', 'S03_c4', 'S03_c5', 'S04_c1', 'S04_c2', 'S04_c3', 'S04_c4', 'S04_c5', 'S05_c1', 'S05_c2', 'S05_c3', 'S05_c4', 'S05_c5', 'S06_c1', 'S06_c2', 'S06_c3', 'S06_c4', 'S06_c5', 'S07_c1', 'S07_c2', 'S07_c3', 'S07_c4', 'S07_c5', 'S08_c1', 'S08_c2', 'S08_c3', 'S08_c4', 'S08_c5', 'S09_c1', 'S09_c2', 'S09_c3', 'S09_c4', 'S09_c5', 'S10_c1', 'S10_c2', 'S10_c3', 'S10_c4', 'S10_c5', 'S11_c1', 'S11_c2', 'S11_c3', 'S11_c4', 'S11_c5', 'S12_c1', 'S12_c2', 'S12_c3', 'S12_c4', 'S12_c5', 'S13_c1', 'S13_c2', 'S13_c3', 'S13_c4', 'S13_c5', 'S14_c1', 'S14_c2', 'S14_c3', 'S14_c4', 'S14_c5', 'S15_c1', 'S15_c2', 'S15_c3', 'S15_c4', 'S15_c5', 'S16_c1', 'S16_c2', 'S16_c3', 'S16_c4', 'S16_c5', 'S17_c1', 'S17_c2', 'S17_c3', 'S17_c4', 'S17_c5', 'S18_c1', 'S18_c2', 'S18_c3', 'S18_c4', 'S18_c5', 'S19_c1', 'S19_c2', 'S19_c3', 'S19_c4', 'S19_c5', 'DQ4', 'DQ5']


In [10]:
# Select features and target
predictors = ['DQ4', 'DQ5']       # Neurodiversity and Gender
target = 'S01_c1'                  # Target question

# Drop any missing values in relevant columns
df = df.dropna(subset=predictors + [target])

In [11]:
# One-hot encode categorical predictors
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_encoded = encoder.fit_transform(df[predictors])

# Create a DataFrame with encoded column names
encoded_cols = encoder.get_feature_names_out(predictors)
X = pd.DataFrame(X_encoded, columns=encoded_cols)

# Target variable
y = df[target]
if y.dtype == 'object':
    y = y.astype('category').cat.codes

In [12]:
# Split data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.5,  # 50%
    test_size=0.5,
    random_state=42,
    stratify=y       # keep class balance
)


In [13]:
# Build logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [14]:
# Predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Misclassification rates
train_misclass = 1 - accuracy_score(y_train, y_pred_train)
test_misclass = 1 - accuracy_score(y_test, y_pred_test)

# Create a table
misclass_table = pd.DataFrame({
    'Dataset': ['Train', 'Test'],
    'Misclassification Rate': [train_misclass, test_misclass],
    'Accuracy': [1 - train_misclass, 1 - test_misclass]
})

print("\nModel Misclassification Rates:")
print(misclass_table.to_string(index=False))



Model Misclassification Rates:
Dataset  Misclassification Rate  Accuracy
  Train                0.479167  0.520833
   Test                0.708333  0.291667


#Function to Create Multiple Models

In [19]:
# Imports
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [43]:
def build_and_eval_question_model(
    df: pd.DataFrame,
    target_col: str,
    demo_cols=('DQ4', 'DQ5'),
    C=1.0,
    random_state=42
):

    # Keep only required columns; drop rows missing either demos or target
    cols_needed = list(demo_cols) + [target_col]
    data = df[cols_needed].dropna(subset=cols_needed).copy()

    # Features: ONLY DQ4 & DQ5
    X = data[list(demo_cols)]
    y = data[target_col]

    # Preprocess demos with OHE (drop first level to avoid collinearity)
    preproc = ColumnTransformer(
        transformers=[('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), list(demo_cols))],
        remainder='drop'
    )

    # Multiclass works automatically; liblinear for binary, lbfgs for multi handled by default settings
    logit = LogisticRegression(
        class_weight='balanced',
        C=C,
        max_iter=1000,
        random_state=random_state
    )

    pipe = Pipeline(steps=[
        ('prep', preproc),
        ('clf', logit)
    ])

    # 50/50 split with stratification to preserve label distribution
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        train_size=0.5,
        test_size=0.5,
        random_state=random_state,
        stratify=y
    )

    # Fit and predict
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    misclass_rate = 1 - acc
    f1_macro = f1_score(y_test, y_pred, average='macro')
    f1_weighted = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)

    print(f"=== Logistic Regression (target: {target_col}) ===")
    print("Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]")
    print("Train/Test split: 50% / 50% (stratified)\n")
    print(f"Accuracy: {acc:.3f} | Misclassification Rate: {misclass_rate:.3f}")
    print(f"F1 (macro): {f1_macro:.3f} | F1 (weighted): {f1_weighted:.3f}\n")


    # Build a coefficient table with odds ratios
    ohe = pipe.named_steps['prep'].named_transformers_['cat']
    feature_names = ohe.get_feature_names_out(list(demo_cols))
    clf = pipe.named_steps['clf']

    # Handle binary vs multiclass: coef_.shape = (n_classes, n_features) for multi
    if clf.coef_.ndim == 1 or clf.coef_.shape[0] == 1:
        coefs = clf.coef_.ravel()
        coef_table = pd.DataFrame({
            'feature': feature_names,
            'coef': coefs,
            'odds_ratio': np.exp(coefs)
        }).sort_values('feature').reset_index(drop=True)
    else:
        rows = []
        for k, cls in enumerate(clf.classes_):
            c = clf.coef_[k]
            rows.append(pd.DataFrame({
                'class': cls,
                'feature': feature_names,
                'coef': c,
                'odds_ratio': np.exp(c)
            }))
        coef_table = pd.concat(rows, ignore_index=True).sort_values(['class','feature']).reset_index(drop=True)

    return {
        'pipeline': pipe,
        'metrics': {
            'accuracy': acc,
            'misclassification_rate': misclass_rate,
            'f1_macro': f1_macro,
            'f1_weighted': f1_weighted,
            'confusion_matrix': cm
        },
        'coef_table': coef_table,
        'X_train': X_train, 'X_test': X_test,
        'y_train': y_train, 'y_test': y_test
    }


## Question 1
"Your first round of tests did not go well and your usual studying habits are not working"

In [25]:
# Question 1 Choice 1
# "Wait until the next round of tests to see if you’ll do better"

res = build_and_eval_question_model(df, target_col="S01_c1")
res['coef_table'].head()


=== Logistic Regression (target: S01_c1) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.396 | Misclassification Rate: 0.604
F1 (macro): 0.323 | F1 (weighted): 0.358



Unnamed: 0,class,feature,coef,odds_ratio
0,-1,DQ4_No,0.482783,1.620578
1,-1,DQ4_Prefer not to answer,-0.149886,0.860806
2,-1,"DQ4_Yes, and I do not have an accommodation",0.406801,1.502006
3,-1,"DQ4_Yes, and I have an accommodation at my uni...",-0.517409,0.596063
4,-1,DQ5_Prefer not to answer,-0.267839,0.765031


In [27]:
# Question 1 Choice 2
# "Review your first round of tests once you get them back to understand your mistakes"

res = build_and_eval_question_model(df, target_col="S01_c2")
res['coef_table'].head()

=== Logistic Regression (target: S01_c2) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.562 | Misclassification Rate: 0.438
F1 (macro): 0.426 | F1 (weighted): 0.558



Unnamed: 0,class,feature,coef,odds_ratio
0,-1,DQ4_No,-1.523292,0.217993
1,-1,DQ4_Prefer not to answer,-0.610557,0.543048
2,-1,"DQ4_Yes, and I do not have an accommodation",-0.84064,0.431434
3,-1,"DQ4_Yes, and I have an accommodation at my uni...",-0.758525,0.468357
4,-1,DQ5_Prefer not to answer,-0.184048,0.831896


In [28]:
# Question 1 Choice 3
# "Make and work with a study group with peers in your classes to better understand the material"

res = build_and_eval_question_model(df, target_col="S01_c3")
res['coef_table'].head()

=== Logistic Regression (target: S01_c3) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.375 | Misclassification Rate: 0.625
F1 (macro): 0.265 | F1 (weighted): 0.396



Unnamed: 0,class,feature,coef,odds_ratio
0,-1,DQ4_No,0.198636,1.219738
1,-1,DQ4_Prefer not to answer,-0.175483,0.839051
2,-1,"DQ4_Yes, and I do not have an accommodation",-0.319608,0.726434
3,-1,"DQ4_Yes, and I have an accommodation at my uni...",0.63452,1.886116
4,-1,DQ5_Prefer not to answer,0.856739,2.355467


In [29]:
# Question 1 Choice 4
# "Start going to office hours regularly for each of your courses"

res = build_and_eval_question_model(df, target_col="S01_c4")
res['coef_table'].head()

=== Logistic Regression (target: S01_c4) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.438 | Misclassification Rate: 0.562
F1 (macro): 0.323 | F1 (weighted): 0.477



Unnamed: 0,class,feature,coef,odds_ratio
0,-1,DQ4_No,1.180853,3.257152
1,-1,DQ4_Prefer not to answer,-0.133332,0.875174
2,-1,"DQ4_Yes, and I do not have an accommodation",-0.339095,0.712415
3,-1,"DQ4_Yes, and I have an accommodation at my uni...",0.652801,1.920913
4,-1,DQ5_Prefer not to answer,-0.214797,0.806705


In [30]:
# Question 1 Choice 5
# "Visit the student success center to learn a new study method"

res = build_and_eval_question_model(df, target_col="S01_c5")
res['coef_table'].head()

=== Logistic Regression (target: S01_c5) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.479 | Misclassification Rate: 0.521
F1 (macro): 0.350 | F1 (weighted): 0.522





Unnamed: 0,class,feature,coef,odds_ratio
0,-1,DQ4_No,-0.20093,0.81797
1,-1,DQ4_Prefer not to answer,0.53338,1.704685
2,-1,"DQ4_Yes, and I do not have an accommodation",0.182837,1.200618
3,-1,"DQ4_Yes, and I have an accommodation at my uni...",0.633203,1.883635
4,-1,DQ5_Woman,-0.276729,0.75826


##Question 7
"You are taking a difficult course load and begin feeling overwhelmed"

In [44]:
# Question 7 Choice 1
# "Push through your challenging courses knowing the semester will eventually end"

res = build_and_eval_question_model(df, target_col="S07_c1")
res['coef_table'].head()

=== Logistic Regression (target: S07_c1) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.333 | Misclassification Rate: 0.667
F1 (macro): 0.325 | F1 (weighted): 0.272



Unnamed: 0,class,feature,coef,odds_ratio
0,-1,DQ4_No,-0.315939,0.729104
1,-1,DQ4_Prefer not to answer,-0.156993,0.85471
2,-1,"DQ4_Yes, and I do not have an accommodation",-0.055303,0.946198
3,-1,"DQ4_Yes, and I have an accommodation at my uni...",0.318677,1.375307
4,-1,DQ5_Prefer not to answer,-0.18454,0.831486


In [32]:
# Question 7 Choice 2
# "Find instructional videos to watch online to support your learning outside of class"

res = build_and_eval_question_model(df, target_col="S07_c2")
res['coef_table'].head()

=== Logistic Regression (target: S07_c2) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.542 | Misclassification Rate: 0.458
F1 (macro): 0.534 | F1 (weighted): 0.559



Unnamed: 0,feature,coef,odds_ratio
0,DQ4_No,-0.096577,0.90794
1,DQ4_Prefer not to answer,0.453732,1.574176
2,"DQ4_Yes, and I do not have an accommodation",-0.813318,0.443384
3,"DQ4_Yes, and I have an accommodation at my uni...",-0.33126,0.718018
4,DQ5_Prefer not to answer,-0.200712,0.818148


In [33]:
# Question 7 Choice 3
# "Make and work with a study group of peers from your classes"

res = build_and_eval_question_model(df, target_col="S03_c3")
res['coef_table'].head()

=== Logistic Regression (target: S03_c3) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.354 | Misclassification Rate: 0.646
F1 (macro): 0.297 | F1 (weighted): 0.410



Unnamed: 0,class,feature,coef,odds_ratio
0,-1,DQ4_No,0.260384,1.297428
1,-1,DQ4_Prefer not to answer,-0.167598,0.845694
2,-1,"DQ4_Yes, and I do not have an accommodation",-0.336148,0.714517
3,-1,"DQ4_Yes, and I have an accommodation at my uni...",0.681204,1.976255
4,-1,DQ5_Prefer not to answer,-0.126612,0.881075


In [34]:
# Question 7 Choice 4
# "Ask your advisor for advice about time management"

res = build_and_eval_question_model(df, target_col="S07_c4")
res['coef_table'].head()

=== Logistic Regression (target: S07_c4) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.458 | Misclassification Rate: 0.542
F1 (macro): 0.325 | F1 (weighted): 0.410



Unnamed: 0,class,feature,coef,odds_ratio
0,-1,DQ4_No,-0.584663,0.557293
1,-1,DQ4_Prefer not to answer,0.006145,1.006164
2,-1,"DQ4_Yes, and I do not have an accommodation",-0.222977,0.800133
3,-1,"DQ4_Yes, and I have an accommodation at my uni...",0.094549,1.099163
4,-1,DQ5_Prefer not to answer,-0.266696,0.765906


In [35]:
# Question 7 Choice 5
# "Get tutoring for your courses from the student success center"

res = build_and_eval_question_model(df, target_col="S07_c5")
res['coef_table'].head()

=== Logistic Regression (target: S07_c5) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.208 | Misclassification Rate: 0.792
F1 (macro): 0.167 | F1 (weighted): 0.255





Unnamed: 0,class,feature,coef,odds_ratio
0,-1,DQ4_No,0.754887,2.12737
1,-1,DQ4_Prefer not to answer,-0.183,0.832768
2,-1,"DQ4_Yes, and I do not have an accommodation",-0.365985,0.693514
3,-1,"DQ4_Yes, and I have an accommodation at my uni...",0.854554,2.350325
4,-1,DQ5_Woman,-1.433327,0.238514


##Question 10

"You are at the risk of failing a required course and the withdraw/drop deadline is approaching"

In [37]:
# Question 10 Choice 1
# "Wait and see how you’re doing in the course once the deadline is closer"

res = build_and_eval_question_model(df, target_col="S10_c1")
res['coef_table'].head()

=== Logistic Regression (target: S10_c1) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.458 | Misclassification Rate: 0.542
F1 (macro): 0.334 | F1 (weighted): 0.417



Unnamed: 0,class,feature,coef,odds_ratio
0,-1,DQ4_No,-0.098712,0.906003
1,-1,DQ4_Prefer not to answer,0.182422,1.200121
2,-1,"DQ4_Yes, and I do not have an accommodation",0.182422,1.200121
3,-1,"DQ4_Yes, and I have an accommodation at my uni...",0.821501,2.273911
4,-1,DQ5_Prefer not to answer,0.23866,1.269547


In [38]:
# Question 10 Choice 2
# "Spend more time studying for the course to try to bring up your grade"

res = build_and_eval_question_model(df, target_col="S10_c2")
res['coef_table'].head()

=== Logistic Regression (target: S10_c2) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.312 | Misclassification Rate: 0.688
F1 (macro): 0.287 | F1 (weighted): 0.243





Unnamed: 0,feature,coef,odds_ratio
0,DQ4_No,0.014114,1.014214
1,"DQ4_Yes, and I do not have an accommodation",-0.936925,0.391831
2,"DQ4_Yes, and I have an accommodation at my uni...",-1.03831,0.354052
3,DQ5_Prefer not to answer,-0.500745,0.606079
4,DQ5_Woman,-0.250523,0.778394


In [39]:
# Question 10 Choice 3
# "Ask a friend for help on the material you are struggling with to try to bring up your grade"

res = build_and_eval_question_model(df, target_col="S10_c3")
res['coef_table'].head()

=== Logistic Regression (target: S10_c3) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.167 | Misclassification Rate: 0.833
F1 (macro): 0.151 | F1 (weighted): 0.197



Unnamed: 0,class,feature,coef,odds_ratio
0,-1,DQ4_No,0.631996,1.881362
1,-1,DQ4_Prefer not to answer,-0.077676,0.925264
2,-1,"DQ4_Yes, and I do not have an accommodation",-0.368456,0.691802
3,-1,"DQ4_Yes, and I have an accommodation at my uni...",-0.19854,0.819927
4,-1,DQ5_Prefer not to answer,-0.056291,0.945264


In [42]:
# Question 10 Choice 4
# "Talk to your academic advisor for advice about dropping the course"

res = build_and_eval_question_model(df, target_col="S10_c4")
res['coef_table'].head()

=== Logistic Regression (target: S10_c4) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.312 | Misclassification Rate: 0.688
F1 (macro): 0.305 | F1 (weighted): 0.321





Unnamed: 0,class,feature,coef,odds_ratio
0,-1,DQ4_No,-0.512414,0.599048
1,-1,DQ4_Prefer not to answer,-0.543074,0.58096
2,-1,"DQ4_Yes, and I have an accommodation at my uni...",-0.185845,0.830403
3,-1,DQ5_Woman,-0.182083,0.833532
4,0,DQ4_No,-0.03622,0.964428


In [41]:
# Question 10 Choice 5
# "Get tutoring for your courses from the student success center"

res = build_and_eval_question_model(df, target_col="S10_c5")
res['coef_table'].head()

=== Logistic Regression (target: S10_c5) ===
Features used: DQ4 (Neurodiversity) + DQ5 (Gender) [one-hot encoded]
Train/Test split: 50% / 50% (stratified)

Accuracy: 0.271 | Misclassification Rate: 0.729
F1 (macro): 0.250 | F1 (weighted): 0.307





Unnamed: 0,class,feature,coef,odds_ratio
0,-1,DQ4_No,-0.350891,0.704061
1,-1,DQ4_Prefer not to answer,-0.214795,0.806707
2,-1,"DQ4_Yes, and I do not have an accommodation",-0.4589,0.631979
3,-1,"DQ4_Yes, and I have an accommodation at my uni...",-0.828659,0.436635
4,-1,DQ5_Woman,-1.044353,0.35192
