In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 


In [49]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [9]:
import os

# Path to the students-performance CSV. Set the STUDENTS_PERFORMANCE_CSV
# environment variable to point at a local copy of the file; when it is unset
# the original hard-coded location is used, so existing setups keep working.
DATA_PATH = os.environ.get(
    'STUDENTS_PERFORMANCE_CSV',
    r'C:\Users\Administrator\Downloads\StudentsPerformance.csv',
)
df = pd.read_csv(DATA_PATH)

In [10]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


Preparing X and Y Variables

Creating a Total Score Column

In [11]:
# Total Score is the row-wise sum of the three subject scores, stored as a new column on df.
df['Total Score'] = df['math score'].add(df['writing score']).add(df['reading score'])

In [12]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,Total Score
0,female,group B,bachelor's degree,standard,none,72,72,74,218
1,female,group C,some college,standard,completed,69,90,88,247
2,female,group B,master's degree,standard,none,90,95,93,278
3,male,group A,associate's degree,free/reduced,none,47,57,44,148
4,male,group C,some college,standard,none,76,78,75,229


Preparing X and Y Variables

In [28]:
# Correlation between the writing and math scores.
corr = df['writing score'].corr(df['math score'])
print(corr)

# The subject scores are strongly correlated with each other, so the stated plan
# is to drop them; this cell itself only adds their mean as a new
# 'Average score' column and does not drop anything.
df['Average score'] = (
    df['math score']
    .add(df['reading score'])
    .add(df['writing score'])
    .div(3)
)

df.head()

0.8026420459498085


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,Total Score,Average score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


In [112]:
# Target: the Total Score column.
y = df['Total Score']

# Features: every column except the two derived ones. The math/reading/writing
# scores stay in x, and Total Score was built as their sum, so the target can be
# reconstructed exactly from these features (target leakage).
x = df.drop(columns=['Average score', 'Total Score'])
print(x.head())

   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  


In [102]:
# Split the feature columns by dtype: object columns are treated as categorical,
# every other column as numeric. The two column indexes are later handed to the
# ColumnTransformer (one transformer per group).
num_features = x.select_dtypes(exclude='object').columns
cat_features = x.select_dtypes(include='object').columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


In [103]:
# One-hot encode the categorical columns and standardize the numeric ones,
# combined into a single preprocessing step.
oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ('OneHotEncoder', oh_transformer, cat_features),
        ('StandardScaler', numeric_transformer, num_features),
    ]
)

In [104]:
# Fit the preprocessor on the full feature frame and transform it in one step.
# Note: this runs before the train/test split, so the encoder and scaler are
# fitted on every row, including the rows that later end up in the test set.
X=preprocessor.fit_transform(x)

In [105]:
print(X.shape)

(1000, 20)


In [106]:
# Separate the dataset into train and test sets: 20% held out for testing,
# with a fixed random_state so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [107]:
X_train.shape,X_test.shape

((800, 20), (200, 20))

Create an Evaluate Function to give all metrics after model Training

In [108]:
def evaluate_model(true,predicted):
  """Compute regression metrics for a set of predictions.

  Args:
    true: Ground-truth target values.
    predicted: Predicted values to compare against ``true``.

  Returns:
    A tuple ``(mae, rmse, r2_square)``: the mean absolute error, the root
    mean squared error, and the R^2 score, in that order.
  """
  mae=mean_absolute_error(true,predicted)
  # Compute the MSE once and derive the RMSE from it instead of calling
  # mean_squared_error a second time.
  mse=mean_squared_error(true,predicted)
  rmse=np.sqrt(mse)
  r2_square=r2_score(true,predicted)
  return mae,rmse,r2_square

In [109]:
# Candidate regressors, keyed by the display name used when reporting results.
# Estimators with a stochastic training procedure are given a fixed seed (42,
# the same value used for the train/test split) so that their metrics are
# reproducible from one run of the notebook to the next.
models={
  "LinearRegression":LinearRegression(),
  "Lasso":Lasso(),
  "Ridge":Ridge(),
  "K-Neighbors Regressor":KNeighborsRegressor(),
  "Decision Tree":DecisionTreeRegressor(random_state=42),
  "Random Forest Regressor":RandomForestRegressor(random_state=42),
  "XGBRegressor":XGBRegressor(random_state=42),
  "CatBoostRegressor":CatBoostRegressor(verbose=False,random_seed=42),
  "AdaBoostRegressor":AdaBoostRegressor(random_state=42)
}

# Filled by the training loop: model names and their test-set R2 scores,
# kept in the same order.
model_list=[]
r2_list=[]


In [110]:
# Start from empty result lists so that re-running this cell does not append
# duplicate entries to lists left over from a previous run.
model_list=[]
r2_list=[]

# Train every candidate model, then report MAE / RMSE / R2 on both the training
# and the test split. Iterating over items() yields each name/estimator pair
# directly instead of rebuilding key and value lists on every iteration.
for model_name,model in models.items():
  model.fit(X_train,y_train) #Train model

  #make predictions
  y_train_pred=model.predict(X_train)
  y_test_pred=model.predict(X_test)

  #Evaluate Train and Test dataset
  model_train_mae,model_train_rmse,model_train_r2=evaluate_model(y_train,y_train_pred)
  model_test_mae,model_test_rmse,model_test_r2=evaluate_model(y_test,y_test_pred)

  print(model_name)
  model_list.append(model_name)

  print('Model performance for Training Set')
  print('Root Mean Square Error: {:.4f}'.format(model_train_rmse))
  print('-Mean Absolute Error: {:.4f}'.format(model_train_mae))
  print('-R2 Score: {:.4f}'.format(model_train_r2))

  print('-------------------------')

  print('Model performance for Test set')
  print('- Root Mean Square Error: {:.4f}'.format(model_test_rmse))
  print('- Mean Absolute Error: {:.4f}'.format(model_test_mae))
  print('- R2 Score: {:.4f}'.format(model_test_r2))
  # Only the test-set R2 is kept, in the same order as model_list.
  r2_list.append(model_test_r2)





LinearRegression
Model performance for Training Set
Root Mean Square Error: 0.0000
-Mean Absolute Error: 0.0000
-R2 Score: 1.0000
-------------------------
Model performance for Test set
- Root Mean Square Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
Lasso
Model performance for Training Set
Root Mean Square Error: 1.0644
-Mean Absolute Error: 0.8505
-R2 Score: 0.9994
-------------------------
Model performance for Test set
- Root Mean Square Error: 1.1139
- Mean Absolute Error: 0.8764
- R2 Score: 0.9994
Ridge
Model performance for Training Set
Root Mean Square Error: 0.0241
-Mean Absolute Error: 0.0195
-R2 Score: 1.0000
-------------------------
Model performance for Test set
- Root Mean Square Error: 0.0264
- Mean Absolute Error: 0.0201
- R2 Score: 1.0000
K-Neighbors Regressor
Model performance for Training Set
Root Mean Square Error: 6.7719
-Mean Absolute Error: 5.3040
-R2 Score: 0.9745
-------------------------
Model performance for Test set
- Root Mean Square Erro

RESULTS & CONCLUSION 

I tried different machine learning models to predict students’ Total Scores based on factors like gender, race/ethnicity, parental education, lunch type, test preparation, and their individual math, reading, and writing scores.


 Conclusion:

All models achieved very high R² scores, and some even gave perfect results. But there's a big problem: I used the math, reading, and writing scores to predict the Total Score, which is simply the sum of those three scores. That’s like giving someone the numbers 2, 3, and 4 and then testing whether they can predict 2 + 3 + 4. The models weren’t really learning anything about students; they were just adding.

This is called data leakage. It happens when the features given to the model already contain the answer (here, the three scores that add up to the target). So, even though the results look great, they don’t show the model’s true ability to make predictions.

To fix this, I should remove math, reading, and writing scores and use other features like gender, race, parental education, lunch, and test prep to see how well a model can predict Total Score without “cheating.”
