In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import r2_score, root_mean_squared_error



In [3]:
df =  pd.read_csv("Student_DataSet.csv")
y = df["math_percentage"]
x =df.drop(columns=["math_percentage"])
encode_colums=["sex","race/ethnicity","parental_level_of_education","lunch","test_preparation_course"]
x

Unnamed: 0,sex,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_percentage,writing_percentage
0,F,group B,bachelor's degree,standard,none,0.72,0.74
1,F,group C,some college,standard,completed,0.90,0.88
2,F,group B,master's degree,standard,none,0.95,0.93
3,M,group A,associate's degree,free/reduced,none,0.57,0.44
4,M,group C,some college,standard,none,0.78,0.75
...,...,...,...,...,...,...,...
995,F,group E,master's degree,standard,completed,0.99,0.95
996,M,group C,high school,free/reduced,none,0.55,0.55
997,F,group C,high school,free/reduced,completed,0.71,0.65
998,F,group D,some college,standard,completed,0.78,0.77


In [4]:
ct = ColumnTransformer(transformers=[("encoder",OneHotEncoder(),encode_colums)],remainder="passthrough")
x = ct.fit_transform(x)

In [5]:
x_train,x_test,y_train, y_test =train_test_split(x,y, test_size=0.2,random_state=0)

In [6]:
def report(models,y_test):
        for name, pred in models:
            print(f"{name} Report\nR2 score:{r2_score(y_test,pred)}\nRMSE:{root_mean_squared_error(y_test,pred)}") 
    

# Linear Regression

In [7]:
lin_reg = LinearRegression()
lin_reg.fit(x_train,y_train)
lin_reg_pred = lin_reg.predict(x_test)

# Polynomial Regression

In [8]:
poly = PolynomialFeatures(degree=2)
x_poly = poly.fit_transform(x_train)
poly_reg = LinearRegression()
poly_reg.fit(x_poly,y_train)
poly_reg_pred = poly_reg.predict(poly.transform(x_test))

# SVM

In [9]:
svm = SVR(kernel="rbf")
svm.fit(x_train,y_train)
svm_pred = svm.predict(x_test)


# Decision Tree

In [16]:
dt = DecisionTreeRegressor(max_depth=5,random_state=0)
dt.fit(x_train,y_train)
dt_pred = dt.predict(x_test)


# Random Forest

In [11]:
rf = RandomForestRegressor(n_estimators=150)
rf.fit(x_train,y_train)
rf_pred = rf.predict(x_test)

In [12]:
from sklearn.linear_model import Ridge

# Ridge handles correlated features better than simple LinearRegression
ridge = Ridge(alpha=1.0)
ridge.fit(x_train, y_train)
ridge_pred = ridge.predict(x_test)

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0)
gb.fit(x_train, y_train)
gb_pred = gb.predict(x_test)

In [14]:
predictions =[("Linear Regression",lin_reg_pred),("Polynomial Regression",poly_reg_pred),("SVM",svm_pred),("Decision Tree", dt_pred),("Random Forest",rf_pred),("Ridge",ridge_pred),("XGBoost",gb_pred) ]
report(predictions,y_test)

Linear Regression Report
R2 score:0.8629895363812153
RMSE:0.055719646690156235
Polynomial Regression Report
R2 score:0.85777863657398
RMSE:0.056769347808934355
SVM Report
R2 score:0.8396048881919675
RMSE:0.06028746882054266
Decision Tree Report
R2 score:0.8338114153152061
RMSE:0.06136660263637688
Random Forest Report
R2 score:0.857852035808798
RMSE:0.05675469682983874
Ridge Report
R2 score:0.8661475859946599
RMSE:0.0550737425434957
XGBoost Report
R2 score:0.8622467573002877
RMSE:0.0558704798627532
