#**Notebook 1**
## Using Demographics to Predict Responses to a Question or Category of Questions

##Import and read the cleaned dataset

In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Load dataset
df = pd.read_csv('/content/UConn_SJI_Cleaned_Data_With_Demographics.csv')  # Update path if needed
df.head()



Unnamed: 0,Race,Gender,Participant,S01_c1,S01_c2,S01_c3,S01_c4,S01_c5,S02_c1,S02_c2,...,S18_c1,S18_c2,S18_c3,S18_c4,S18_c5,S19_c1,S19_c2,S19_c3,S19_c4,S19_c5
0,Black,Man,126,0,1,-1,0,0,1,0,...,-1,0,0,1,0,-1,1,0,0,0
1,Other,Man,127,-1,0,0,1,0,0,-1,...,0,1,-1,0,0,-1,0,0,1,0
2,Other,Man,128,0,1,0,0,-1,0,0,...,0,0,-1,1,0,0,-1,0,1,0
3,White,Man,129,-1,1,0,0,0,-1,0,...,-1,0,0,1,0,0,-1,0,0,1
4,Other,Woman,130,-1,1,0,0,0,-1,1,...,-1,1,0,0,0,-1,0,0,1,0


##Create Train and Test Split

In [39]:
# Create consistent 80/20 split
np.random.seed(42)
df['split'] = np.random.choice(['train', 'test'], size=len(df), p=[0.5, 0.5])


##One-hot Encode Demographic

In [40]:
# Define categories explicitly (so "Man" and "White" are baselines)
df['Gender'] = pd.Categorical(df['Gender'], categories=['Man', 'Woman', 'Other'])
df['Race'] = pd.Categorical(df['Race'], categories=['White', 'Black', 'Asian', 'Other'])

# One-hot encode (drops the first category = baseline)
df = pd.get_dummies(df, columns=['Gender', 'Race'], drop_first=True)

# Check which dummy columns were created
[col for col in df.columns if 'Gender' in col or 'Race' in col]


['Gender_Woman', 'Gender_Other', 'Race_Black', 'Race_Asian', 'Race_Other']

## Linear Regression Model

In [41]:
# Define predictors and target
demo_cols = ['Gender_Woman', 'Gender_Other', 'Race_Black', 'Race_Asian', 'Race_Other']
target_col = 'S01_c1'

X_train = df[df['split'] == 'train'][demo_cols]
y_train = df[df['split'] == 'train'][target_col]

X_test  = df[df['split'] == 'test'][demo_cols]
y_test  = df[df['split'] == 'test'][target_col]

# Fit model
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)

# Evaluate
print(f"R² Score: {r2_score(y_test, y_pred):.3f}")
print(f"MSE: {mean_squared_error(y_test, y_pred):.3f}")

# Coefficients
pd.DataFrame({
    "Variable": demo_cols,
    "Coefficient": linreg.coef_
})



R² Score: -0.045
MSE: 0.380


Unnamed: 0,Variable,Coefficient
0,Gender_Woman,-0.317042
1,Gender_Other,0.100502
2,Race_Black,-0.693087
3,Race_Asian,-0.742551
4,Race_Other,-0.645196
