In [257]:
import pandas as pd
from scipy.stats import truncnorm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [258]:
# Load the preprocessed dataset; every later cell in this notebook reads from `df`.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_excel("../data/processed/preprocessed_final.xlsx")

In [None]:
# # Step 1: Map unique institutions to their known (non-null) acceptance rates
# institution_acceptance_map = (
#     df[df['acceptance_rate'].notnull()]
#     .groupby('institution')['acceptance_rate']
#     .first()
#     .to_dict()
# )

# # Step 2: Find rows where acceptance_rate is null
# null_acceptance_rows = df[df['acceptance_rate'].isnull()]

# # Step 3 & 4: Fill missing acceptance rates based on institution name
# df.loc[df['acceptance_rate'].isnull(), 'acceptance_rate'] = df.loc[
#     df['acceptance_rate'].isnull(), 'institution'
# ].map(institution_acceptance_map)

# # Optional: Save the updated DataFrame to a new Excel file
# df.to_excel("../data/processed/loaded_data.xlsx")

# df = pd.read_excel("../data/processed/loaded_data.xlsx")

# df = df.dropna(subset=['institution', 'program', 'degree_type', 'decision'])

# df.acceptance_rate.isna()
# 7734

In [259]:
# Sanity check: number of rows that still have no acceptance rate.
df['acceptance_rate'].isna().sum()

0

In [260]:
# Step 1: Select features and target
# Model inputs: the acceptance rate and undergraduate GPA, followed by the
# three GRE-related score columns.
feature_cols = (
    ['acceptance_rate', 'undergrad_gpa']
    + ['gre_quantitative_reasoning', 'gre_verbal_reasoning', 'analytical_writing']
)


In [261]:
# Name of the column the models are trained to predict.
# The value counts displayed further down in this notebook show three levels
# for it (0.0, 0.5 and 1.0).
target_col = 'decision_encoded'

In [262]:
# Step 2: Drop rows with missing target
# The recorded run of this cell failed with `KeyError: ['decision_encoded']`
# because the loaded file has no such column. Check for it up front so the
# failure explains what is missing instead of surfacing from inside pandas.
if target_col not in df.columns:
    raise KeyError(
        f"Target column {target_col!r} is not present in the loaded data; "
        "create it (e.g. by encoding the raw decision column) before modelling."
    )

# Rows without a target value cannot be used for training or evaluation.
df_model = df.dropna(subset=[target_col])


KeyError: ['decision_encoded']

In [None]:
# Step 3: Extract X and y
# Feature matrix and target vector, both taken from the filtered frame.
X, y = df_model[feature_cols], df_model[target_col]


In [None]:
# Step 4: Train/test split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Output the shape of splits
# Shapes are listed in the order X_train, X_test, y_train, y_test.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))


((68768, 5), (17192, 5), (68768,), (17192,))

In [None]:
# Features to use for regression
# Same five columns as the feature selection earlier in this notebook:
# acceptance rate, undergraduate GPA and the three GRE-related scores.
feature_cols = [
    'acceptance_rate', 'undergrad_gpa',
    'gre_quantitative_reasoning', 'gre_verbal_reasoning', 'analytical_writing',
]

In [None]:
# Drop rows with missing values in features or target
# Keep only the modelling columns, then discard every row that has a missing
# value in any of them. The target is referenced through `target_col` so its
# name is spelled out in exactly one place.
df_model = df[feature_cols + [target_col]].dropna()


In [None]:
# Split into features (X) and target (y)
# Use `target_col` rather than repeating the column name as a literal, so the
# target is defined in a single place.
X = df_model[feature_cols]
y = df_model[target_col]


In [None]:
# Train/test split
# 80/20 split with a fixed seed so that all models below are compared on the
# same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Initialize and train the model
# Linear-regression baseline fitted on the training split; `fit` returns the
# estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Predict
# Score the held-out rows with the fitted linear model.
y_pred = model.predict(X_test)

In [None]:
# Evaluate
# Mean squared error and R² of the linear model on the held-out rows.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

In [None]:
# Report both metrics, one per line, rounded to four decimals.
print(f"Mean Squared Error: {mse:.4f}", f"R² Score: {r2:.4f}", sep="\n")

Mean Squared Error: 0.1984
R² Score: 0.0647


___________________________________________________________________________________

In [None]:

# Initialize the model
# Random forest with 100 trees and a fixed seed for reproducibility.
# n_jobs=-1 builds (and predicts with) the trees on all available cores; the
# seed still determines which trees are grown, so only the run time changes.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Train
rf_model.fit(X_train, y_train)

# Predict
y_pred_rf = rf_model.predict(X_test)

# Evaluate on the held-out rows
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest - Mean Squared Error: {mse_rf:.4f}")
print(f"Random Forest - R² Score: {r2_rf:.4f}")


Random Forest - Mean Squared Error: 0.2106
Random Forest - R² Score: 0.0070


________________________________________________________________________________

In [None]:
# XGBoost regressor with 100 estimators, learning rate 0.1 and a fixed seed,
# fitted on the training split.
xgb_model = XGBRegressor(
    random_state=42,
    learning_rate=0.1,
    n_estimators=100,
)
xgb_model.fit(X_train, y_train)

In [None]:
# Score the held-out rows with the fitted XGBoost model.
y_pred_xgb = xgb_model.predict(X_test)


In [None]:
# Mean squared error and R² of the XGBoost model on the held-out rows.
mse_xgb, r2_xgb = mean_squared_error(y_test, y_pred_xgb), r2_score(y_test, y_pred_xgb)

In [None]:
# Report both metrics, one per line, rounded to four decimals.
print(
    f"XGBoost - Mean Squared Error: {mse_xgb:.4f}",
    f"XGBoost - R² Score: {r2_xgb:.4f}",
    sep="\n",
)

XGBoost - Mean Squared Error: 0.1968
XGBoost - R² Score: 0.0722


________________________________________________________________________________

In [None]:
# Define parameter grid to search
# 2 x 3 x 2 x 2 = 24 random-forest configurations; `None` for max_depth is one
# of the searched values alongside the two explicit depth limits.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

In [None]:
# Initialize base model
# Template estimator for the grid search: only the seed is fixed here, the
# searched hyper-parameters are supplied through `param_grid`.
rf = RandomForestRegressor(random_state=42)


In [None]:
# Set up GridSearchCV
# Exhaustive search over `param_grid` with 5-fold cross-validation, ranked by
# R², running on all available cores and printing progress.
grid_search = GridSearchCV(rf, param_grid, scoring='r2', cv=5, verbose=1, n_jobs=-1)


In [None]:
# Run grid search
# Expensive step: the recorded output reports 5 folds x 24 candidates = 120
# forest fits, and that run ended in a KeyboardInterrupt before finishing.
# NOTE(review): consider a smaller grid or a row subsample so that a full
# Restart & Run All stays feasible.
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


KeyboardInterrupt: 

In [None]:
# Best model from search
# `best_estimator_` only exists once `grid_search.fit` has run to completion.
# NOTE(review): the recorded fit in this notebook was interrupted, so on a
# fresh kernel the search must finish before this cell can run.
best_rf = grid_search.best_estimator_

In [None]:
# Score the held-out rows with the tuned forest, then compute its MSE and R².
y_pred_best = best_rf.predict(X_test)
mse_best, r2_best = mean_squared_error(y_test, y_pred_best), r2_score(y_test, y_pred_best)

In [None]:
# Report the winning hyper-parameter combination and the tuned forest's
# held-out metrics, rounded to four decimals.
print("Best Parameters:", grid_search.best_params_)
print(f"Best Random Forest - MSE: {mse_best:.4f}, R²: {r2_best:.4f}")

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Random Forest - MSE: 0.1980, R²: 0.0712


In [None]:
# Distribution of the target: how many rows fall on each encoded decision value.
df[target_col].value_counts()


decision_encoded
1.0    37245
0.0    35986
0.5    12729
Name: count, dtype: int64

# Flow

- Make sure the data is evenly distributed (e.g. balanced target classes)
- Encode the target variable
- Use one-hot encoding for string (categorical) columns
- Run feature engineering
    - to reduce the number of features
- Check the correlation between columns
- Run model training
- Run evaluation on the test dataset
- Create an API endpoint for the model