In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [31]:
def predict_scores(df, train_path='/content/StudentsPerformance.csv'):
    """Predict ``average_score`` for every row of ``df`` with a stacking ensemble.

    The model is re-trained from the CSV at ``train_path`` on every call, so
    each call pays the full training cost.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature rows to score. Columns that do not map onto a training
        feature column after one-hot encoding (including a stray
        ``average_score`` column) are ignored.
    train_path : str, optional
        Location of the training CSV. It must contain an ``average_score``
        column, which is used as the regression target; every other column
        is used as a feature.

    Returns
    -------
    pandas.Series
        Predictions named ``average_score`` and indexed like ``df``.
    """
    # Nothing to score: return an empty result instead of letting the
    # estimator fail on a zero-row matrix (and skip the training cost).
    if len(df) == 0:
        return pd.Series([], index=df.index, name='average_score', dtype=float)

    # train data
    train = pd.read_csv(train_path)
    X = train.drop(columns=['average_score'])
    y = train['average_score']

    # One-hot encode the training features; drop_first removes one baseline
    # level per categorical column.
    X_encoded = pd.get_dummies(X, drop_first=True)

    # Encode the prediction frame WITHOUT drop_first. Which level counts as
    # "first" depends on the levels present in the frame being encoded, so
    # dropping it here could remove a real (non-baseline) category whenever
    # `df` holds only a subset of the training levels, e.g. a single row.
    # Reindexing onto the training columns then discards the baseline and any
    # unseen-level columns and zero-fills levels missing from `df`.
    test = pd.get_dummies(df).reindex(columns=X_encoded.columns, fill_value=0)

    # Define the base learners for the stacking regressor
    base_learners = [
        ('lr', LinearRegression()),
        ('ridge', Ridge()),
        ('rf', RandomForestRegressor(random_state=42)),
        ('svr', SVR())
    ]

    # Define the meta-model that combines the base learners' predictions
    meta_model = Ridge()

    # Create and train the stacking regressor
    stacking_regressor = StackingRegressor(estimators=base_learners, final_estimator=meta_model)
    stacking_regressor.fit(X_encoded, y)

    # Predict on the aligned test matrix
    y_pred = stacking_regressor.predict(test)

    # Return the prediction as a series that lines up with the caller's rows
    return pd.Series(y_pred, index=df.index, name='average_score')

In [32]:
# Function usage. The training file doubles as demo input here, so the target
# column has to be removed before predicting. For a file that has no
# 'average_score' column, skip the drop step and pass the frame directly.
testdata = pd.read_csv('/content/StudentsPerformance.csv')
# Named `features` rather than `input` so the built-in input() stays usable
# in the rest of the kernel session.
features = testdata.drop(columns=['average_score'])
result = predict_scores(features)


In [33]:

# Side-by-side view of the true scores, the model output and the signed error
# (actual minus predicted).
actual_scores = testdata['average_score']
score_gap = actual_scores - result

comparison_df = pd.DataFrame(
    {'Actual': actual_scores, 'Predicted': result, 'Difference': score_gap}
)

# Root-mean-squared error between the true and the predicted scores.
rmse = np.sqrt(mean_squared_error(actual_scores, result))

print(f"RMSE: {rmse}")
print("Comparison DataFrame:", comparison_df, sep="\n")

RMSE: 12.26229217378017
Comparison DataFrame:
        Actual  Predicted  Difference
0    80.333333  67.663255   12.670078
1    75.666667  77.070585   -1.403919
2    65.666667  58.044353    7.622313
3    57.333333  72.910122  -15.576789
4    77.666667  76.649900    1.016767
..         ...        ...         ...
695  95.666667  73.718726   21.947941
696  64.333333  70.273741   -5.940407
697  56.000000  70.541267  -14.541267
698  50.333333  65.773517  -15.440183
699  88.333333  72.605220   15.728113

[700 rows x 3 columns]
