#**Notebook 1**
## Using Demographics to Predict Responses to a Question or Category of Questions

##Import and read the cleaned dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Load dataset
df = pd.read_csv('/content/UConn_SJI_Cleaned_Data_With_Demographics.csv')  # Update path if needed
df.head()



Unnamed: 0,Participant,S01_c1,S01_c2,S01_c3,S01_c4,S01_c5,S02_c1,S02_c2,S02_c3,S02_c4,...,S18_c3,S18_c4,S18_c5,S19_c1,S19_c2,S19_c3,S19_c4,S19_c5,Race,Gender
0,126,0,1,-1,0,0,1,0,-1,0,...,0,1,0,-1,1,0,0,0,Black,Man
1,127,-1,0,0,1,0,0,-1,1,0,...,-1,0,0,-1,0,0,1,0,Other,Man
2,128,0,1,0,0,-1,0,0,0,-1,...,-1,1,0,0,-1,0,1,0,Other,Man
3,129,-1,1,0,0,0,-1,0,1,0,...,0,1,0,0,-1,0,0,1,White,Man
4,130,-1,1,0,0,0,-1,1,0,0,...,0,0,0,-1,0,0,1,0,Other,Woman


##Create Train and Test Split

In [None]:
# Create consistent 50/50 split
np.random.seed(42)
df['split'] = np.random.choice(['train', 'test'], size=len(df), p=[0.5, 0.5])


##One-hot Encode Demographic

In [None]:
# Define categories explicitly (so "Man" and "White" are baselines)
df['Gender'] = pd.Categorical(df['Gender'], categories=['Man', 'Woman', 'Other'])
df['Race'] = pd.Categorical(df['Race'], categories=['White', 'Black', 'Asian', 'Other'])

# One-hot encode (drops the first category = baseline)
df = pd.get_dummies(df, columns=['Gender', 'Race'], drop_first=True)

# Check which dummy columns were created
[col for col in df.columns if 'Gender' in col or 'Race' in col]


KeyError: 'Gender'

In [None]:
# Define predictors and target
demo_cols = ['Gender_Woman', 'Gender_Other', 'Race_Black', 'Race_Asian', 'Race_Other']
target_col = 'S01_c1'

X_train = df[df['split'] == 'train'][demo_cols]
y_train = df[df['split'] == 'train'][target_col]

X_test  = df[df['split'] == 'test'][demo_cols]
y_test  = df[df['split'] == 'test'][target_col]

# Fit model
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)

# Evaluate
print(f"R² Score: {r2_score(y_test, y_pred):.3f}")
print(f"MSE: {mean_squared_error(y_test, y_pred):.3f}")

# Coefficients
pd.DataFrame({
    "Variable": demo_cols,
    "Coefficient": linreg.coef_
})



R² Score: -0.045
MSE: 0.380


Unnamed: 0,Variable,Coefficient
0,Gender_Woman,-0.317042
1,Gender_Other,0.100502
2,Race_Black,-0.693087
3,Race_Asian,-0.742551
4,Race_Other,-0.645196


# Logistic Regression Model

In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Load your dataset
df = pd.read_csv('UConn_SJI_signed_onehot.csv')


In [3]:
# Inspect key columns ---
print("Columns in dataset:", df.columns.tolist())
print("\nPreview of demographic columns:\n", df[['DQ4', 'DQ5']].head())

Columns in dataset: ['Participant', 'S01_c1', 'S01_c2', 'S01_c3', 'S01_c4', 'S01_c5', 'S02_c1', 'S02_c2', 'S02_c3', 'S02_c4', 'S02_c5', 'S03_c1', 'S03_c2', 'S03_c3', 'S03_c4', 'S03_c5', 'S04_c1', 'S04_c2', 'S04_c3', 'S04_c4', 'S04_c5', 'S05_c1', 'S05_c2', 'S05_c3', 'S05_c4', 'S05_c5', 'S06_c1', 'S06_c2', 'S06_c3', 'S06_c4', 'S06_c5', 'S07_c1', 'S07_c2', 'S07_c3', 'S07_c4', 'S07_c5', 'S08_c1', 'S08_c2', 'S08_c3', 'S08_c4', 'S08_c5', 'S09_c1', 'S09_c2', 'S09_c3', 'S09_c4', 'S09_c5', 'S10_c1', 'S10_c2', 'S10_c3', 'S10_c4', 'S10_c5', 'S11_c1', 'S11_c2', 'S11_c3', 'S11_c4', 'S11_c5', 'S12_c1', 'S12_c2', 'S12_c3', 'S12_c4', 'S12_c5', 'S13_c1', 'S13_c2', 'S13_c3', 'S13_c4', 'S13_c5', 'S14_c1', 'S14_c2', 'S14_c3', 'S14_c4', 'S14_c5', 'S15_c1', 'S15_c2', 'S15_c3', 'S15_c4', 'S15_c5', 'S16_c1', 'S16_c2', 'S16_c3', 'S16_c4', 'S16_c5', 'S17_c1', 'S17_c2', 'S17_c3', 'S17_c4', 'S17_c5', 'S18_c1', 'S18_c2', 'S18_c3', 'S18_c4', 'S18_c5', 'S19_c1', 'S19_c2', 'S19_c3', 'S19_c4', 'S19_c5', 'DQ4', 'DQ5']


In [4]:
# Select features and target
predictors = ['DQ4', 'DQ5']       # Neurodiversity and Gender
target = 'S01_c1'                  # Target question

# Drop any missing values in relevant columns
df = df.dropna(subset=predictors + [target])

In [5]:
# One-hot encode categorical predictors
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_encoded = encoder.fit_transform(df[predictors])

# Create a DataFrame with encoded column names
encoded_cols = encoder.get_feature_names_out(predictors)
X = pd.DataFrame(X_encoded, columns=encoded_cols)

# Target variable (convert to numeric if necessary)
y = df[target]
if y.dtype == 'object':
    y = y.astype('category').cat.codes

In [6]:
# Split data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

In [7]:
# Build logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [8]:
# Predictions
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

In [9]:
# Evaluations
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"\nModel Performance for {target}:")
print(f"R² Score: {r2:.3f}")
print(f"MSE: {mse:.3f}")


Model Performance for S01_c1:
R² Score: -0.642
MSE: 0.604


In [10]:
# Coefficients summaries
coefficients = pd.DataFrame({
    'Variable': encoded_cols,
    'Coefficient': model.coef_[0]
})
coefficients.loc[len(coefficients)] = ['Intercept', model.intercept_[0]]

print("\nCoefficient Summary:\n", coefficients)


Coefficient Summary:
                                             Variable  Coefficient
0                                             DQ4_No     0.156622
1                           DQ4_Prefer not to answer    -0.193847
2        DQ4_Yes, and I do not have an accommodation     0.175108
3  DQ4_Yes, and I have an accommodation at my uni...     0.368905
4                           DQ5_Prefer not to answer     0.000000
5                                          DQ5_Woman     0.448600
6                                          Intercept    -0.139804
