In [124]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
import xgboost as xgb
from xgboost import XGBClassifier
pd.set_option('display.max_columns', None)
pd.set_option('future.no_silent_downcasting', True)


In [125]:
# Load the raw survey export and keep only the columns used in this analysis.
# Positions are 0-based; the trailing letters are the matching spreadsheet columns.
selected_positions = [
    1,                  # B
    3,                  # D
    *range(44, 49),     # AS - AW
    *range(50, 56),     # AY - BD
    85,                 # CH
    *range(92, 100),    # CO - CV
    109,                # DF
]
ug = pd.read_csv("raw data.csv").iloc[:, selected_positions]
ug.head()  # preview of the selected columns (comment this line out to hide the output)


Unnamed: 0,Please select the unit where you live: - Selected Choice,What is your living accommodation? - Selected Choice,"My RA Is accessible. I know how to get in contact with them (via email, group chat, in-person, etc.)",My RA gets back to me within three business days when I ask them for assistance,"My RA manages conflicts (among roommates, floor, individuals)",My RA consistently and fairly addresses behaviors that violate community standards,My RA encourages me to participate in events in my residential building and on campus,I enjoy the events hosted by my RA(s),I enjoy the events hosted by my building's hall association,I enjoy the events hosted by RHA,I enjoy the events hosted by the Resident Faculty,"Overall, I enjoy the Residential Life events I attend",I have attended a Residential Life event that has made me think about something in a new way,Did you apply to live in university housing for next academic year? - Selected Choice,"My residential experience is helping me transition to UC Berkeley's academic community\n\nExamples: Intellectual conversations and study groups with other residents. Learning about research, study abroad, and other opportunities. Resident Faculty events.",My residential experience is helping me transition to UC Berkeley's social community,"As a result of my residential experience, I have found a supportive community of students",My residential experience is helping me develop healthy wellness practices,"My residential experience is helping me develop a greater understanding of others who are different from me \n\nExamples: Class, race, gender, beliefs, etc.","My residential experience is helping me explore different aspects of myself\n\nExamples: Values, emotions, self-expression, etc.",Living in a university residential building has helped me feel that I belong at UC Berkeley,I am satisfied with my overall experience living in university housing,Do you live in a Theme Program?
0,Unit 2,Triple in a residence hall or suite,Agree,Agree,No opinion/ I have no experience with my RA in...,Agree,Agree,"I have seen advertisements, but have not atten...","I have seen advertisements, but have not atten...","I have seen advertisements, but have not atten...",I have not seen their event advertisements,I have not attended any Residential Life events,I have not attended any Residential Life events,,Agree,Agree,Strongly agree,Disagree,Agree,Agree,Agree,Agree,No
1,Anchor House,My own room in an apartment,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,"I have seen advertisements, but have not atten...","I have seen advertisements, but have not atten...","I have seen advertisements, but have not atten...","I have seen advertisements, but have not atten...",I have not attended any Residential Life events,I have not attended any Residential Life events,,Strongly agree,Agree,Agree,No opinion,No opinion,No opinion,Strongly agree,Strongly agree,No
2,Anchor House,My own room in an apartment,,,,,,Strongly Agree,"I have seen advertisements, but have not atten...",Agree,"I have seen advertisements, but have not atten...",Agree,Agree,,Agree,Strongly agree,Strongly agree,Agree,Strongly agree,Agree,Agree,Strongly agree,No
3,Clark Kerr Campus,I share a room in an apartment,Strongly Agree,Strongly Agree,Disagree,Disagree,Agree,Agree,Agree,Agree,Agree,Agree,Agree,,Agree,Agree,Agree,Agree,Agree,Agree,Disagree,Agree,No
4,Martinez Commons,Double in a residence hall or suite,Agree,Disagree,No opinion/ I have no experience with my RA in...,No opinion/ I have no experience with my RA in...,No opinion/ I have no experience with my RA in...,Agree,"I have seen advertisements, but have not atten...","I have seen advertisements, but have not atten...","I have seen advertisements, but have not atten...",I have not attended any Residential Life events,I have not attended any Residential Life events,"No, I will be a student, but plan to live in p...",No opinion,Agree,No opinion,No opinion,No opinion,No opinion,Agree,Agree,No


In [126]:
# Missing-value check: per-column count and percentage of NaN cells,
# ordered so the columns with the most missing data come first.
na_mask = ug.isna()
missing_counts = na_mask.sum()
missing_percent = na_mask.mean() * 100

missing_summary = pd.concat(
    [missing_counts.rename('Missing Count'), missing_percent.rename('Missing %')],
    axis=1,
).sort_values(by='Missing %', ascending=False)

#print(missing_summary)   <----- left commented out to hide output

In [127]:
# Keep complete responses only, then remove respondents who answered that they
# are leaving UC Berkeley.
target_col = 'Did you apply to live in university housing for next academic year? - Selected Choice'
leaving_answer = 'No, I am leaving UC Berkeley (graduating, studying abroad, etc.)'
ug = ug.dropna()
ug = ug.loc[ug[target_col] != leaving_answer]

# One-hot encode the two categorical columns (unit and accommodation type).
# Each category value becomes its own 0/1 column, named after the category itself.
cols_to_encode = ['Please select the unit where you live: - Selected Choice','What is your living accommodation? - Selected Choice']
encoder = OneHotEncoder(sparse_output=False)
encoded_array = encoder.fit_transform(ug[cols_to_encode])
categories = encoder.categories_

# encoder.categories_ holds one array per encoded column; flatten them in order
# so the names line up with the columns of encoded_array.
flat_categories = []
for column_categories in categories:
    flat_categories.extend(column_categories)

encoded_df = pd.DataFrame(
    encoded_array,
    index=ug.index,  # keep the filtered row labels so the concat below aligns row-for-row
    columns=flat_categories,
)

# Swap the original text columns for their indicator columns (appended at the end).
df_remaining = ug.drop(columns=cols_to_encode)
ug = pd.concat([df_remaining, encoded_df], axis=1)
#ug     <----- left commented out to hide output

In [128]:
#data cleaning
# Recode every text answer to a number. Columns are addressed by position, so this
# cell relies on the column order produced by the previous cell. All five-point
# scales map to 1..5 with the "no opinion / no experience" style answers at 3.
ra_answer_codes = {"Strongly Disagree":1,'Disagree':2,'No opinion/ I have no experience with my RA in this area':3,'Agree':4,'Strongly Agree':5}
host_event_answer_codes = {"Strongly Disagree":1,'Disagree':2,'I have seen advertisements, but have not attended their events':3,'I have not seen their event advertisements':3,'Agree':4,'Strongly Agree':5}
overall_event_answer_codes = {"Strongly disagree":1,'Disagree':2,'I have not attended any Residential Life events':3,'Agree':4,'Strongly agree':5}
reapply_answer_codes = {
    'No, I will be a student, but plan to live in private housing':0,
    'Yes, as a resident':1,
    'Yes, as Residential Life or RHA staff (RA, RSC, etc.)':1,
    'No, other (please write in)':0,
    'No, I am leaving UC Berkeley (graduating, studying abroad, etc.)':0,
    'No, I will live with my family, fraternity, sorority, or in a co-op':0,
}
experience_answer_codes = {"Strongly disagree":1,'Disagree':2,'No opinion':3,'Agree':4,'Strongly agree':5}
yes_no_codes = {"No":0,"Yes":1}

# (column positions, answer -> code) pairs, applied in this order.
recoding_plan = [
    (range(5), ra_answer_codes),
    (range(5, 9), host_event_answer_codes),
    (range(9, 11), overall_event_answer_codes),
    ([11], reapply_answer_codes),          # binary target: 1 = applied to return, 0 = did not
    (range(12, 20), experience_answer_codes),
    ([20], yes_no_codes),
]
for positions, answer_codes in recoding_plan:
    for position in positions:
        # to_numeric raises on any answer text that the mapping did not cover
        ug.iloc[:, position] = ug.iloc[:, position].replace(answer_codes).apply(pd.to_numeric)

# Independent copy of the recoded frame, taken before any later cell modifies `ug`.
ug2 = ug.copy(deep=True)
#ug           <----- left commented out to hide output

In [129]:
#scale all the values
# Standardise the five-point-scale columns (positions 0-10 and 12-19). Position 11
# (the target) and everything from position 20 onward are left as they are.
likert_positions = np.r_[:11, 12:20]
scaler = StandardScaler()
ug.iloc[:, likert_positions] = scaler.fit_transform(ug.iloc[:, likert_positions])


def _as_numeric_if_object(col):
    """Return `col` converted with pd.to_numeric when it is object dtype, else unchanged."""
    return pd.to_numeric(col) if col.dtypes == "object" else col


# Any column still stored as object dtype is converted to a proper numeric dtype.
ug = ug.apply(_as_numeric_if_object)
#ug           # <----- left commented out to hide output

In [137]:
# Pairwise correlation matrix of all predictors (the target column is excluded).
target_col = 'Did you apply to live in university housing for next academic year? - Selected Choice'
X = ug.drop(columns=[target_col])
corr = X.corr()

# Heatmap of `corr`, left disabled to hide the output:
#plt.figure(figsize=(15, 12))
#sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", center=0, annot_kws={"size":8})
#plt.xticks(rotation=45, ha="right")
#plt.yticks(rotation=0)
#plt.title("Correlation Heatmap of Features", fontsize=14)
#plt.show()

MODEL 1: Logistic Regression

In [130]:
# Logistic regression on the scaled frame `ug`.
# The model is refit on the training part of each of 5 shuffled folds and the
# coefficients are averaged across the fits. The held-out part of each fold is
# NOT scored in this cell, so this yields averaged coefficients only, not a
# cross-validated accuracy estimate.
target_col = 'Did you apply to live in university housing for next academic year? - Selected Choice'
y = ug[target_col].astype(int)
X = ug.drop(columns=[target_col])

model1 = LogisticRegression()
kf = KFold(n_splits=5, shuffle=True, random_state=491)  # fixed seed -> same folds on every run

coefficients = []
for train_index, _ in kf.split(X):
    model1.fit(X.iloc[train_index], y.iloc[train_index])
    # coef_ has one row for a binary target; keep that row for this fold
    coefficients.append(model1.coef_[0])

# Rows = folds, columns = features; average down the fold axis.
coef_array = np.array(coefficients)
coef_means = coef_array.mean(axis=0)

# One row per feature, largest positive mean coefficient first.
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Mean_Coefficient': coef_means
}).sort_values(by='Mean_Coefficient', ascending=False)

#print(coef_df)          # <----- left commented out to hide output

MODEL 2: Decision Trees

In [131]:
# Shallow decision tree fitted on `ug2`, the recoded copy of the data, using all rows.
target_col = 'Did you apply to live in university housing for next academic year? - Selected Choice'
survey_positions = np.r_[:11, 12:20]  # positions 0-10 and 12-19; skips the target at position 11

y = ug2[target_col].astype(int)
X = ug2.iloc[:, survey_positions] #only doing for non-hot encoding variables
model2 = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=491)
model2.fit(X, y)

# Tree diagram, left disabled to hide the output:
#plt.figure(figsize=(50,24))
#plot_tree(model2, feature_names=X.columns, class_names=["Not Return","Return"], filled=True, fontsize=12)
#plt.show()

MODEL 3: Random Forests

In [132]:
# Random forest on the scaled frame `ug`.
# The forest is refit on the training part of each of 5 shuffled folds and the
# feature importances are averaged across the fits. The held-out part of each
# fold is not scored in this cell.
target_col = 'Did you apply to live in university housing for next academic year? - Selected Choice'
y = ug[target_col].astype(int)
X = ug.drop(columns=[target_col])

model3 = RandomForestClassifier(n_estimators=300, random_state=491, max_features="sqrt")
kf = KFold(n_splits=5, shuffle=True, random_state=491)  # fixed seed -> same folds on every run

importances = []
for train_idx, _ in kf.split(X):
    model3.fit(X.iloc[train_idx], y.iloc[train_idx])
    importances.append(model3.feature_importances_)

# Mean importance per feature over the folds, most important first.
avg_importances = np.mean(importances, axis=0)
feature_importances = pd.Series(avg_importances, index=X.columns).sort_values(ascending=False)

print("Top 10 Important Features (averaged across folds):\n")
print(feature_importances.head(10))  # comment this line out to hide the output


Top 10 Important Features (averaged across folds):

I have attended a Residential Life event that has made me think about something in a new way                                                                                                                                                                      0.049510
My residential experience is helping me explore different aspects of myself\n\nExamples: Values, emotions, self-expression, etc.                                                                                                                                  0.046155
My residential experience is helping me transition to UC Berkeley's academic community\n\nExamples: Intellectual conversations and study groups with other residents. Learning about research, study abroad, and other opportunities. Resident Faculty events.    0.045672
My residential experience is helping me develop healthy wellness practices                                                                         

MODEL 4: Gradient Boosted Trees (XGBoost)

In [133]:
# Gradient-boosted trees on the scaled frame `ug`.
# X and y are defined here explicitly rather than inherited from whichever cell
# ran last, so the cell gives the same result regardless of execution order.
target_col = 'Did you apply to live in university housing for next academic year? - Selected Choice'
y = ug[target_col].astype(int)
X = ug.drop(columns=[target_col])

model4 = XGBClassifier(n_estimators=300, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, random_state=491, eval_metric="logloss")
kf = KFold(n_splits=5, shuffle=True, random_state=491)  # fixed seed -> same folds on every run

# Refit on the training part of each fold and collect the feature importances.
# The held-out part is not scored in this cell; the fold variables are kept as
# notebook globals in case later cells read them.
importances = []
for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    model4.fit(X_train, y_train)
    importances.append(model4.feature_importances_)

# Mean importance per feature over the folds, most important first.
avg_importances = np.mean(importances, axis=0)
feature_importances = pd.Series(avg_importances, index=X.columns).sort_values(ascending=False)
print("Top 10 Important Features (averaged across folds):\n")
print(feature_importances.head(10))  # comment this line out to hide the output



Top 10 Important Features (averaged across folds):

I share a room in an apartment                                                                                                                                  0.091555
Martinez Commons                                                                                                                                                0.081943
My own room in an apartment                                                                                                                                     0.079829
Quad in a residence hall or suite                                                                                                                               0.038365
Panoramic Berkeley                                                                                                                                              0.032645
Single in a residence hall or suite                                                                    