In [1]:
import os
from pathlib import Path

import pandas as pd

# Location of the StudentsPerformance CSV.
# The default is the path this notebook was originally run with; set the
# STUDENTS_CSV environment variable to point at the file on another machine
# instead of editing this cell.
DATA_PATH = Path(os.environ.get(
    "STUDENTS_CSV",
    r"C:\Users\Aidmin\Downloads\archive (1)\StudentsPerformance.csv",
))

# Load dataset
df = pd.read_csv(DATA_PATH)

# Preview data
df.head()


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [2]:
import warnings

# Ordinal scale for parental education: 1 = lowest level, 6 = highest level.
PARENT_EDU_LEVELS = {
    "some high school": 1,
    "high school": 2,
    "some college": 3,
    "associate's degree": 4,
    "bachelor's degree": 5,
    "master's degree": 6,
}

# Map parental education to numerical scale
df['parent_edu_num'] = df['parental level of education'].map(PARENT_EDU_LEVELS)

# Series.map leaves NaN for any label that is not a key of the dict (typo,
# different casing, missing value). Surface that instead of letting silent
# NaNs flow into the model.
unmapped_labels = df.loc[df['parent_edu_num'].isna(), 'parental level of education'].unique()
if len(unmapped_labels) > 0:
    warnings.warn(
        f"'parental level of education' has unmapped values {list(unmapped_labels)}; "
        "'parent_edu_num' is NaN for those rows."
    )

# Add binary flag for test prep completion: 1 when the course value is exactly
# 'completed', 0 for anything else. Vectorised comparison instead of a
# row-by-row lambda; int64 keeps the dtype the lambda version produced.
df['is_prepared'] = df['test preparation course'].eq('completed').astype('int64')


In [3]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the original string labels stay available in `df`.
df_encoded = df.copy()

# Encode gender, race/ethnicity, lunch as integer codes.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder;
# it is applied to feature columns here, which assigns codes in sorted label
# order. Consider OrdinalEncoder / one-hot encoding if a non-tree model is used.
CATEGORICAL_COLS = ['gender', 'race/ethnicity', 'lunch']

# Keep every fitted encoder, keyed by column name, so the integer codes can be
# decoded later, e.g. label_encoders['gender'].inverse_transform([...]).
# Previously only the encoder of the last column survived the loop.
label_encoders = {}
for col in CATEGORICAL_COLS:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le


In [4]:
from sklearn.model_selection import train_test_split

# Model inputs, listed explicitly (same columns, same order, as the previous
# "drop everything else" version produced). Selecting by name rather than by
# exclusion means columns added to df_encoded later on -- such as the
# 'average_score' column derived from the exam scores -- can never slip into X
# and leak the target if this cell is re-run.
FEATURE_COLS = ['gender', 'race/ethnicity', 'lunch', 'parent_edu_num', 'is_prepared']

X = df_encoded[FEATURE_COLS]
y = df_encoded['math score']  # We are predicting 'math score'

# Hold out 20% of the rows for evaluation; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Random forest with hand-picked settings; no hyperparameter search is run in
# this cell, the values below are fixed.
rf_params = {"n_estimators": 200, "max_depth": 10, "random_state": 42}
rf = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics rounded to two decimals. A non-positive R² means the
# model does no better than always predicting the mean of y_test.
print("Tuned Random Forest Regressor:")
for label, value in (("Mean Squared Error:", mse), ("R² Score:", r2)):
    print(label, round(value, 2))


Tuned Random Forest Regressor:
Mean Squared Error: 246.34
R² Score: -0.01


In [7]:
# Per-student mean of the three exam scores, stored as a new column.
score_cols = ['math score', 'reading score', 'writing score']
df_encoded['average_score'] = df_encoded[score_cols].mean(axis='columns')


In [8]:
# New target
y = df_encoded['average_score']

# The three individual scores (and 'average_score' itself) must NOT be used as
# features: the target is computed directly from them, so keeping any of them
# would leak the answer into X. Selecting the inputs by name, rather than by
# dropping a list of columns, keeps X correct even if more score-derived
# columns are added to df_encoded later.
FEATURE_COLS = ['gender', 'race/ethnicity', 'lunch', 'parent_edu_num', 'is_prepared']
X = df_encoded[FEATURE_COLS]

# Train-test split again (same 80/20 proportion and seed as for the math-score model).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Same fixed random-forest configuration as before, now fitted on the
# average-score target.
forest_settings = dict(n_estimators=200, max_depth=10, random_state=42)
rf = RandomForestRegressor(**forest_settings)
rf.fit(X_train, y_train)

# Predictions and metrics on the held-out rows.
y_pred = rf.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Print the header followed by each metric rounded to two decimals.
print("Improved Model - Predicting Average Score:")
for metric_label, metric_value in (("Mean Squared Error:", mse), ("R² Score:", r2)):
    print(metric_label, round(metric_value, 2))


Improved Model - Predicting Average Score:
Mean Squared Error: 219.59
R² Score: -0.02
