In [1]:
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [2]:
# Read the preprocessed college dataset (path is relative to the notebook's
# working directory) and show the first five rows as this cell's output.
df = pd.read_csv(filepath_or_buffer='preprocessed_college_data.csv')
df.head(n=5)

Unnamed: 0,Private,Top10perc,Fundergrad,Pundergrad,Books,Personal,Terminal,SFRatio,percalumni,Expend,GradRate,YieldRate,TotalCost
0,1,3.178054,7.967627,6.287859,-0.493143,7.696667,-0.491305,-0.491305,2.564949,-0.491305,60,0.460728,9.281823
1,1,2.833213,7.895063,7.113142,0.54998,7.313887,0.551599,0.551599,2.833213,0.551599,56,0.235951,9.837935
2,1,3.135494,6.944087,4.60517,0.093289,7.061334,0.095869,0.095869,3.433987,0.095869,54,0.267191,9.615872
3,1,4.110874,6.23637,4.158883,1.731005,6.775366,1.72595,1.72595,3.637586,1.72595,59,0.331137,9.820704
4,1,2.833213,5.521461,6.768493,0.634925,7.313887,0.636244,0.636244,1.399039,0.636244,15,0.319698,9.365719


In [3]:
# GradRate is the regression target; every other column is used as a feature.
y = df['GradRate']
x = df.drop('GradRate', axis='columns')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the shape of each split as a quick sanity check.
for _caption, _part in (
    ("X_train shape:", x_train),
    ("X_test shape:", x_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(_caption, _part.shape)

X_train shape: (620, 12)
X_test shape: (156, 12)
y_train shape: (620,)
y_test shape: (156,)


In [4]:
# Fit the standardizer on the training split only, then apply that same
# fitted transform to both splits so no test-set statistics influence scaling.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [11]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression fitted on the standardized training features.
lr_model = LinearRegression()
lr_model.fit(x_train_scaled, y_train)
lr_y_pred = lr_model.predict(x_test_scaled)

# Metrics on the held-out test split.
# NOTE: for scikit-learn regressors `score` returns the coefficient of
# determination (R^2), not a classification accuracy, so `lr_acc` holds the
# same quantity as `lr_r2`. The variable name is kept unchanged in case other
# cells reference it; only the printed label is corrected.
lr_acc = lr_model.score(x_test_scaled, y_test)
lr_rmse = root_mean_squared_error(y_test, lr_y_pred)
lr_mse = mean_squared_error(y_test, lr_y_pred)
lr_r2 = r2_score(y_test, lr_y_pred)

print(f'R-squared via model.score (% of variance explained): {round(lr_acc*100,2)}%')
print(f"Root Mean Squared Error: {lr_rmse}")
print(f'Mean Squared Error: {round(lr_mse,2)}')
print(f'R-squared: {round(lr_r2, 2)}')

Accuracy: 46.53%
Root Mean Squared Error: 12.27022182977131
Mean Squared Error: 150.56
R-squared: 0.47


In [12]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the standardized features. The forest is built with
# randomness (bootstrap sampling of rows), so the seed is pinned to make the
# reported metrics reproducible under Restart & Run All; 42 matches the seed
# already used for train_test_split.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(x_train_scaled, y_train)
rf_y_pred = rf_model.predict(x_test_scaled)

# Metrics on the held-out test split.
# NOTE: for scikit-learn regressors `score` returns the coefficient of
# determination (R^2), not a classification accuracy, so `rf_acc` holds the
# same quantity as `rf_r2`. The variable name is kept unchanged in case other
# cells reference it; only the printed label is corrected.
rf_acc = rf_model.score(x_test_scaled, y_test)
rf_rmse = root_mean_squared_error(y_test, rf_y_pred)
rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)

print(f'R-squared via model.score (% of variance explained): {round(rf_acc*100,2)}%')
print(f"Root Mean Squared Error: {rf_rmse}")
print(f'Mean Squared Error: {rf_mse}')
print(f'R-squared: {rf_r2}')

Accuracy: 43.86%
Root Mean Squared Error: 12.572688068046954
Mean Squared Error: 158.07248525641026
R-squared: 0.43862458151332906


In [13]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees on the standardized features. The seed is pinned so
# the estimator's internal randomness cannot change the reported metrics
# between runs; 42 matches the seed already used for train_test_split.
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(x_train_scaled, y_train)
gb_y_pred = gb_model.predict(x_test_scaled)

# Metrics on the held-out test split.
# NOTE: for scikit-learn regressors `score` returns the coefficient of
# determination (R^2), not a classification accuracy, so `gb_model_acc` holds
# the same quantity as `gb_r2`. The variable name is kept unchanged in case
# other cells reference it; only the printed label is corrected.
gb_model_acc = gb_model.score(x_test_scaled, y_test)
gb_rmse = root_mean_squared_error(y_test, gb_y_pred)
gb_mse = mean_squared_error(y_test, gb_y_pred)
gb_r2 = r2_score(y_test, gb_y_pred)

print(f'R-squared via model.score (% of variance explained): {round(gb_model_acc*100,2)}%')
print(f"Root Mean Squared Error: {gb_rmse}")
print(f'Mean Squared Error: {gb_mse}')
print(f'R-squared: {gb_r2}')

Accuracy: 47.26%
Root Mean Squared Error: 12.186356852407739
Mean Squared Error: 148.50729333422504
R-squared: 0.4725942101271381
