In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
import xgboost as xgb
from xgboost import XGBClassifier
pd.set_option('display.max_columns', None)
pd.set_option('future.no_silent_downcasting', True)


In [18]:
# Read the raw export and keep only the features we want.
# Positions are 0-based; the letters are the column labels noted by the author
# (presumably spreadsheet column letters).
selected_positions = [
    1,                  # B
    3,                  # D
    *range(44, 49),     # AS - AW
    *range(50, 56),     # AY - BD
    85,                 # CH
    *range(92, 100),    # CO - CV
    109,                # DF
]
ug = pd.read_csv("raw data.csv").iloc[:, selected_positions]


In [5]:
# Missing-value audit: per-column count and percentage of missing rows,
# ordered from the most to the least incomplete column.
na_mask = ug.isna()
missing_counts = na_mask.sum()
missing_percent = na_mask.mean() * 100

missing_summary = pd.DataFrame(
    {'Missing Count': missing_counts, 'Missing %': missing_percent}
)
missing_summary = missing_summary.sort_values(by='Missing %', ascending=False)



In [19]:
# Keep complete responses only, exclude students who are leaving, then one-hot
# encode the two categorical housing questions.
target_question = 'Did you apply to live in university housing for next academic year? - Selected Choice'
leaving_answer = 'No, I am leaving UC Berkeley (graduating, studying abroad, etc.)'
cols_to_encode = [
    'Please select the unit where you live: - Selected Choice',
    'What is your living accommodation? - Selected Choice',
]

ug = ug.dropna()
ug = ug.loc[ug[target_question] != leaving_answer]

encoder = OneHotEncoder(sparse_output=False)
encoded_array = encoder.fit_transform(ug[cols_to_encode])

# The new columns are named after the bare category labels, in encoder order.
categories = encoder.categories_
flat_categories = []
for labels in categories:
    flat_categories.extend(labels)

# Reuse ug's index so the encoded rows line up when concatenated.
encoded_df = pd.DataFrame(encoded_array, index=ug.index, columns=flat_categories)
df_remaining = ug.drop(columns=cols_to_encode)
ug = pd.concat([df_remaining, encoded_df], axis=1)


In [20]:
# Data cleaning: recode every survey answer to a number.
# Columns are addressed by position. Each entry below pairs a run of positions
# with the answer -> number mapping applied to those columns:
#   0-4, 5-8, 9-10 and 12-19 are 1-5 agreement scales (neutral answers -> 3),
#   11 is the housing-application question (1 = any "Yes", 0 = any "No"),
#   20 is a Yes/No question.
# The answer strings are matched exactly, including their differing capitalisation.
recode_plan = [
    (range(0, 5), {
        "Strongly Disagree": 1,
        'Disagree': 2,
        'No opinion/ I have no experience with my RA in this area': 3,
        'Agree': 4,
        'Strongly Agree': 5,
    }),
    (range(5, 9), {
        "Strongly Disagree": 1,
        'Disagree': 2,
        'I have seen advertisements, but have not attended their events': 3,
        'I have not seen their event advertisements': 3,
        'Agree': 4,
        'Strongly Agree': 5,
    }),
    (range(9, 11), {
        "Strongly disagree": 1,
        'Disagree': 2,
        'I have not attended any Residential Life events': 3,
        'Agree': 4,
        'Strongly agree': 5,
    }),
    (range(11, 12), {
        'No, I will be a student, but plan to live in private housing': 0,
        'Yes, as a resident': 1,
        'Yes, as Residential Life or RHA staff (RA, RSC, etc.)': 1,
        'No, other (please write in)': 0,
        'No, I am leaving UC Berkeley (graduating, studying abroad, etc.)': 0,
        'No, I will live with my family, fraternity, sorority, or in a co-op': 0,
    }),
    (range(12, 20), {
        "Strongly disagree": 1,
        'Disagree': 2,
        'No opinion': 3,
        'Agree': 4,
        'Strongly agree': 5,
    }),
    (range(20, 21), {"No": 0, "Yes": 1}),
]

for positions, mapping in recode_plan:
    for i in positions:
        # One vectorised conversion per column instead of calling pd.to_numeric
        # on every cell; it still raises if an unmapped, non-numeric answer remains.
        ug.iloc[:, i] = pd.to_numeric(ug.iloc[:, i].replace(mapping))

# Independent snapshot of the recoded frame, taken before any later in-place changes to ug.
ug2 = ug.copy(deep=True)


In [41]:
# Standardise the answer columns at positions 0-10 and 12-19; position 11 and
# everything from position 20 onwards are left as they are.
likert_positions = np.r_[:11, 12:20]
likert_block = ug.iloc[:, likert_positions]
ug.iloc[:, likert_positions] = StandardScaler().fit_transform(likert_block)


def _to_numeric_if_object(col):
    """Return ``col`` passed through ``pd.to_numeric`` when its dtype is object, else unchanged."""
    return pd.to_numeric(col) if col.dtypes == "object" else col


# Give every remaining object-dtype column a numeric dtype.
ug = ug.apply(_to_numeric_if_object)
#ug.head()

In [31]:
# Feature matrix (every column except the housing-application question) and
# the pairwise correlations between its columns.
target_col = 'Did you apply to live in university housing for next academic year? - Selected Choice'
X = ug.drop(columns=[target_col])
corr = X.corr()
#plt.figure(figsize=(15, 12))
#sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", center=0, annot_kws={"size":8})
#plt.xticks(rotation=45, ha="right")
#plt.yticks(rotation=0)
#plt.title("Correlation Heatmap of Features", fontsize=14)
#plt.show()   

MODEL 1: Logistic Regression

In [32]:
# Target and design matrix. 'Unit 2' and 'Triple in a residence hall or suite'
# are removed from the features in addition to the target column.
target_col = 'Did you apply to live in university housing for next academic year? - Selected Choice'
y = ug[target_col].astype(int)
X = ug.drop(columns=[target_col, 'Unit 2', 'Triple in a residence hall or suite'])

# Unpenalised logistic regression, refitted on the training part of each of
# five shuffled folds; only the fitted coefficients are collected.
model1 = LogisticRegression(penalty=None, solver='lbfgs', max_iter=1000)
kf = KFold(n_splits=5, shuffle=True, random_state=491)

coefficients = []
for train_index, test_index in kf.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]  # held-out fold, not used below

    model1.fit(X_train, y_train)
    coefficients.append(model1.coef_[0])

# Average each feature's coefficient over the folds, largest first.
coef_array = np.array(coefficients)
coef_means = coef_array.mean(axis=0)

coef_df = pd.DataFrame({'Feature': X.columns, 'Mean_Coefficient': coef_means})
coef_df = coef_df.sort_values(by='Mean_Coefficient', ascending=False)

#print(coef_df)          

In [33]:
# Refit the model with statsmodels on X plus an explicit constant column,
# which gives access to p-values.
X_const = sm.add_constant(X)
logit_model = sm.Logit(y, X_const)
result = logit_model.fit()  # maximum-likelihood fit

# Full statsmodels report (coefficients, std errors, z, p-values):
#print(result.summary())

# One row per model term with its coefficient, p-value and odds ratio
# (exp of the coefficient), ordered by ascending p-value.
scaled_params = result.params
scaled_summary = pd.DataFrame({
    "Feature": X_const.columns,
    "Coef": scaled_params,
    "p_value": result.pvalues,
    "Odds_Ratio": np.exp(scaled_params),
})
scaled_summary = scaled_summary.sort_values(by="p_value")

#print("\nTidy Coefficient Table:")
#print(scaled_summary)


Optimization terminated successfully.
         Current function value: 0.607187
         Iterations 6


In [34]:
# Unscaled version: the same statsmodels fit, run on ug2 (the copy taken
# before standardisation).
ug2 = ug2.apply(lambda col: pd.to_numeric(col) if col.dtype == "object" else col)

target_col = 'Did you apply to live in university housing for next academic year? - Selected Choice'
y2 = ug2[target_col].astype(int)
# Same feature set as the scaled model: the target and the two dropped dummy columns are removed.
X2 = ug2.drop(columns=[target_col, 'Unit 2', 'Triple in a residence hall or suite'])
X_const2 = sm.add_constant(X2)

logit_model2 = sm.Logit(y2, X_const2)
result2 = logit_model2.fit()  # maximum-likelihood fit

# Full statsmodels report (coefficients, std errors, z, p-values):
#print(result2.summary())

# One row per model term with its coefficient, p-value and odds ratio
# (exp of the coefficient), ordered by ascending p-value.
unscaled_params = result2.params
unscaled_summary = pd.DataFrame({
    "Feature": X_const2.columns,
    "Coef": unscaled_params,
    "p_value": result2.pvalues,
    "Odds_Ratio": np.exp(unscaled_params),
})
unscaled_summary = unscaled_summary.sort_values(by="p_value")

#print("\nTidy Coefficient Table:")
#print(unscaled_summary)


Optimization terminated successfully.
         Current function value: 0.607187
         Iterations 6


In [35]:
# Feature groupings: each list names the model columns that belong to one
# question group. assign_group() below maps a column name to its group label.

# Statements about the resident's RA.
ra_feedback_cols = [
    "My RA Is accessible. I know how to get in contact with them (via email, group chat, in-person, etc.)",
    "My RA gets back to me within three business days when I ask them for assistance",
    "My RA manages conflicts (among roommates, floor, individuals)",
    "My RA consistently and fairly addresses behaviors that violate community standards",
    "My RA encourages me to participate in events in my residential building and on campus",
]

# Statements about events.
event_cols = [
    "I enjoy the events hosted by my RA(s)",
    "I enjoy the events hosted by my building's hall association",
    "I enjoy the events hosted by RHA",
    "I enjoy the events hosted by the Resident Faculty",
    "Overall, I enjoy the Residential Life events I attend",
    "I have attended a Residential Life event that has made me think about something in a new way",
]

# Statements about the overall residential experience.
res_experience_cols = [
    "My residential experience is helping me transition to UC Berkeley's academic community\n\nExamples: Intellectual conversations and study groups with other residents. Learning about research, study abroad, and other opportunities. Resident Faculty events.",
    "My residential experience is helping me transition to UC Berkeley's social community",
    "As a result of my residential experience, I have found a supportive community of students",
    "My residential experience is helping me develop healthy wellness practices",
    "My residential experience is helping me develop a greater understanding of others who are different from me \n\nExamples: Class, race, gender, beliefs, etc.",
    "My residential experience is helping me explore different aspects of myself\n\nExamples: Values, emotions, self-expression, etc.",
    "Living in a university residential building has helped me feel that I belong at UC Berkeley",
    "I am satisfied with my overall experience living in university housing",
]

theme_program_cols = [
    "Do you live in a Theme Program?",
]

# One-hot columns for the housing unit.
unit_cols = [
    "Martinez Commons",
    "Panoramic Berkeley",
    "Unit 1",
    "Unit 2",  # baseline: dropped from X, still conceptually part of 'Unit'
    "Unit 3",
    "Unit 4 (Foothill/ Stern/ La Loma)",
]

# One-hot columns for the living accommodation.
accommodation_cols = [
    "Double in a residence hall or suite",
    "I share a room in an apartment",
    "My own room in an apartment",
    "Quad in a residence hall or suite",
    "Single in a residence hall or suite",
    "Triple in a residence hall or suite",  # baseline: dropped from X
]


def assign_group(feature: str) -> str:
    """Return the group label for a model term.

    Parameters
    ----------
    feature : str
        Column / term name as it appears in a coefficient table.

    Returns
    -------
    str
        "Intercept" for the term named "const"; otherwise the label of the
        first grouping list that contains ``feature``; "Other" when no list
        contains it.
    """
    if feature == "const":
        return "Intercept"

    # Checked in this order; the first list containing the feature wins.
    labelled_groups = (
        ("RA Feedback", ra_feedback_cols),
        ("Events", event_cols),
        ("Residential Experience", res_experience_cols),
        ("Theme Program", theme_program_cols),
        ("Unit", unit_cols),
        ("Living Accommodation", accommodation_cols),
    )
    for label, members in labelled_groups:
        if feature in members:
            return label
    return "Other"


In [36]:
# Label every term of the scaled model with its question group and record the
# magnitude of its coefficient.
grouped_scaled = scaled_summary.assign(
    Group=scaled_summary["Feature"].apply(assign_group),
    abs_coef=scaled_summary["Coef"].abs(),
)

# Mean |coefficient| per group, strongest group first.
mean_abs_by_group = grouped_scaled.groupby("Group")["abs_coef"].mean()
between_group_strength = mean_abs_by_group.sort_values(ascending=False)

#print("\nBetween-group strength (mean |Coef|, SCALED model):")
#print(between_group_strength)

In [37]:
# Label every term of the unscaled model with its question group and record
# the magnitude of its coefficient.
grouped_unscaled = unscaled_summary.assign(
    Group=unscaled_summary["Feature"].apply(assign_group),
    abs_coef=unscaled_summary["Coef"].abs(),
)

#print("\nRA Feedback items (UNSCALED, per 1-point increase):")
#print(
#    grouped_unscaled[grouped_unscaled["Group"] == "RA Feedback"]
#    .sort_values("Coef", ascending=False)
#)

#print("\nEvents items (UNSCALED, per 1-point increase):")
#print(
#    grouped_unscaled[grouped_unscaled["Group"] == "Events"]
#    .sort_values("Coef", ascending=False)
#)

#print("\nResidential Experience items (UNSCALED, per 1-point increase):")
#print(
#    grouped_unscaled[grouped_unscaled["Group"] == "Residential Experience"]
#    .sort_values("Coef", ascending=False)
#)

#print("\nUnit effects vs baseline (UNSCALED, per category):")
#print(
#    grouped_unscaled[grouped_unscaled["Group"] == "Unit"]
#    .sort_values("Coef", ascending=False)
#)

#print("\nLiving Accommodation effects vs baseline (UNSCALED, per category):")
#print(
#    grouped_unscaled[grouped_unscaled["Group"] == "Living Accommodation"]
#    .sort_values("Coef", ascending=False)
#)

MODEL 2: Decision Trees

In [38]:
# Depth-2 entropy tree fitted on the unscaled frame (ug2), using only the
# columns at positions 0-10 and 12-19 (no one-hot encoded variables).
target_col = 'Did you apply to live in university housing for next academic year? - Selected Choice'
tree_positions = np.r_[:11, 12:20]

y = ug2[target_col].astype(int)
X = ug2.iloc[:, tree_positions]

model2 = DecisionTreeClassifier(criterion="entropy", max_depth=2, random_state=491)
model2.fit(X, y)

#plt.figure(figsize=(50,24))
#plot_tree(model2, feature_names=X.columns, class_names=["Not Return","Return"], filled=True, fontsize=12)
#plt.show()   

MODEL 3: Random Forests

In [39]:
# Random forest on every feature column of ug (all one-hot columns included).
target_col = 'Did you apply to live in university housing for next academic year? - Selected Choice'
y = ug[target_col].astype(int)
X = ug.drop(columns=[target_col])

model3 = RandomForestClassifier(n_estimators=300, max_features="sqrt", random_state=491)
kf = KFold(n_splits=5, shuffle=True, random_state=491)

# Refit on the training part of each fold and keep that fit's importances.
importances = []
for train_idx, test_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]  # held-out fold, not used below
    model3.fit(X_train, y_train)
    importances.append(model3.feature_importances_)

# Fold-averaged importance per feature, most important first.
avg_importances = np.asarray(importances).mean(axis=0)
feature_importances = pd.Series(avg_importances, index=X.columns)
feature_importances = feature_importances.sort_values(ascending=False)

#print("Top 10 Important Features (averaged across folds):\n")
#print(feature_importances.head(10)) 


MODEL 4: Gradient Boosted Trees (XGBoost)

In [40]:
# Build the target and feature matrix in this cell instead of relying on
# whatever `X` / `y` another cell last assigned. The decision-tree cell binds
# `X` to a 19-column subset of ug2, so running cells out of order would
# otherwise train this model on the wrong data without any error.
target_col = 'Did you apply to live in university housing for next academic year? - Selected Choice'
y = ug[target_col].astype(int)
X = ug.drop(columns=[target_col])

model4 = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=491,
    eval_metric="logloss",
)
kf = KFold(n_splits=5, shuffle=True, random_state=491)

# Refit on the training part of each fold and keep that fit's importances.
importances = []
for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    model4.fit(X_train, y_train)
    importances.append(model4.feature_importances_)

# Fold-averaged importance per feature, most important first.
# NOTE: this rebinds `feature_importances`, replacing any value an earlier cell stored under that name.
avg_importances = np.mean(importances, axis=0)
feature_importances = pd.Series(avg_importances, index=X.columns).sort_values(ascending=False)
#print("Top 10 Important Features (averaged across folds):\n")
#print(feature_importances.head(10))

