In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor

In [2]:
df=pd.read_csv("StudentsPerformance.csv")

In [3]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
X=df.drop(columns=['math score'],axis=1)

In [6]:
X.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [7]:
y=df['math score']

In [10]:
print(y)

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64


In [11]:
X.shape

(1000, 7)

In [14]:
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

In [13]:
df.isnull().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [15]:
print(df['gender'].unique())

['female' 'male']


In [16]:
#race/ethnicity
print(df['race/ethnicity'].unique())

['group B' 'group C' 'group A' 'group D' 'group E']


In [19]:
#handle Categorical Value and Standardize the Numerical data

num_feature=X.select_dtypes(exclude="object").columns
cat_feature=X.select_dtypes(include="object").columns

In [24]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

num_transformer=StandardScaler()
cat_transformer=OneHotEncoder()

preprocessor=ColumnTransformer(
    [
        ("OneHotEncoder",cat_transformer,cat_feature),
        ("StandardScaler",num_transformer,num_feature),
    ]
)


In [25]:
X=preprocessor.fit_transform(X)

In [27]:
X.shape

(1000, 19)

In [28]:
X

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]], shape=(1000, 19))

In [29]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2)


In [30]:
x_train.shape, x_test.shape

((800, 19), (200, 19))

In [31]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,root_mean_squared_error,r2_score
def eval_fun(true_value,predicted_value):
    mae=mean_absolute_error(true_value,predicted_value)
    mse=mean_squared_error(true_value,predicted_value)
    rmse=root_mean_squared_error(true_value,predicted_value)
    r_2_score=r2_score(true_value,predicted_value)

    return mae,mse,rmse,r_2_score

In [33]:
models={
    "linear_regression":LinearRegression(),
    "decision_tree":DecisionTreeRegressor(),
    "random_forest":RandomForestRegressor(),
    "gradient_boosting":GradientBoostingRegressor(),
    "ada_boost":AdaBoostRegressor()
}

model_list=[]
eval_list=[]

In [37]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    #Model Training
    print("Current Model Training:", model)
    model.fit(x_train,y_train)

    #model predition
    y_pred=model.predict(x_test)

    #Model Evaluation
    model_mae,model_mse,model_rmse,model_r2_score=eval_fun(true_value=y_test,predicted_value=y_pred)

    print("Mean Absolute Error",model_mae)
    print("Mean Squred Error",model_mse)
    print("Root Mean Squred Error",model_rmse)
    print("R2 Score",model_r2_score)
    print("\n")

Current Model Training: LinearRegression()
Mean Absolute Error 4.195164138300938
Mean Squred Error 27.849536688756046
Root Mean Squred Error 5.277266024065496
R2 Score 0.8746284266258266


Current Model Training: DecisionTreeRegressor()
Mean Absolute Error 7.21
Mean Squred Error 76.93
Root Mean Squred Error 8.770974860299168
R2 Score 0.65368058910764


Current Model Training: RandomForestRegressor()
Mean Absolute Error 5.068458333333334
Mean Squred Error 39.10062567361111
Root Mean Squred Error 6.253049310025559
R2 Score 0.823978868467338


Current Model Training: GradientBoostingRegressor()
Mean Absolute Error 4.602021572996649
Mean Squred Error 32.48195353131451
Root Mean Squred Error 5.699294125706666
R2 Score 0.8537744571480846


Current Model Training: AdaBoostRegressor()
Mean Absolute Error 4.972441424311671
Mean Squred Error 38.75788872178945
Root Mean Squred Error 6.225583404130849
R2 Score 0.8255217835751754


