In [17]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import joblib

In [18]:
df = pd.read_csv("/Users/ananyaagarwal/Desktop/NYU/VML/Updated_StudentsPerformance_1.csv")

print("Dataset shape:", df.shape)
print(df.head())

Dataset shape: (1000, 31)
   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  age  \
0                    none          72             72             74   24   
1               completed          69             90             88   21   
2                    none          90             95             93   28   
3                    none          47             57             44   25   
4                    none          76             78             75   22   

  country_region  ...  library_usage_hours  stress_level homesickness_level  \
0          Other 

In [19]:
# Identify categorical columns
cat_cols = df.select_dtypes(include=['object']).columns

# Label encode categorical columns
le_dict = {}   # store encoders for later use in prediction
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le

In [20]:
target_column = "semester_gpa"   # <-- CHANGE THIS to your target column
X = df.drop(target_column, axis=1)
y = df[target_column]

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.25, random_state=42
)

In [28]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=200),
    "Gradient Boosting": GradientBoostingRegressor(),
    "SVR": SVR(),
    "KNN Regressor": KNeighborsRegressor()
}

results = {}

# Train & evaluate models
for name, model in models.items():
    print("\n---------------------------")
    print(f"Training: {name}")
    print("---------------------------")

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print("RMSE:", rmse)
    print("R² Score:", r2)

    results[name] = (rmse, model)


---------------------------
Training: Linear Regression
---------------------------
RMSE: 0.002861477113157757
R² Score: 0.9999095579138415

---------------------------
Training: Random Forest
---------------------------
RMSE: 0.1440282355998295
R² Score: 0.7708682358292455

---------------------------
Training: Gradient Boosting
---------------------------
RMSE: 0.08049202051159106
R² Score: 0.9284358237883907

---------------------------
Training: SVR
---------------------------
RMSE: 0.08735502026470286
R² Score: 0.9157119987062639

---------------------------
Training: KNN Regressor
---------------------------
RMSE: 0.20679675045802828
R² Score: 0.5276352966698344


In [29]:
best_model_name = min(results, key=lambda k: results[k][0])
best_model = results[best_model_name][1]

print("\n===================================")
print("Best Model:", best_model_name)
print("===================================")


Best Model: Linear Regression


In [30]:
joblib.dump(best_model, "best_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le_dict, "encoders.pkl")

print("\nModel, scaler, and encoders saved!")


Model, scaler, and encoders saved!


In [31]:
def predict_gpa(user_input_dict):
    # Load saved objects
    model = joblib.load("best_model.pkl")
    scaler = joblib.load("scaler.pkl")
    encoders = joblib.load("encoders.pkl")

    # Convert to DataFrame
    user_df = pd.DataFrame([user_input_dict])

    # Ensure ALL columns exist
    required_columns = X.columns  # X must be accessible from the outer scope

    for col in required_columns:
        if col not in user_df.columns:
            # If the model was trained on this column, add default values
            if col in encoders:
                user_df[col] = "unknown"
            else:
                user_df[col] = 0

    # Encode categorical features
    for col in encoders:
        le = encoders[col]
        if user_df[col].iloc[0] not in le.classes_:
            # Add unknown class if needed
            le.classes_ = np.append(le.classes_, user_df[col].iloc[0])
        user_df[col] = le.transform(user_df[col])

    # Reorder columns exactly as training data
    user_df = user_df[required_columns]

    # Scale numerical values
    user_scaled = scaler.transform(user_df)

    # Predict
    prediction = model.predict(user_scaled)[0]
    return prediction

In [32]:
print(predict_gpa({
    "gender": "male",
    "study_hours": 3,
    "parent_education": "highschool",
    "attendance": 85,
    "extra_classes": 0,
    "past_gpa": 7.5
}))

1.1981650342328574
