In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")


In [3]:
# Load the dataset from "stud.csv"; the path is relative to the kernel's working directory.
df = pd.read_csv("stud.csv")

In [4]:
# Split the independent features (X) from the dependent target (y).

y = df['math_score']
X = df.drop(columns='math_score')
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [5]:
#One-hot encoding categorical features with column transformer

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler

# Group columns by dtype: object columns are treated as categorical,
# every other column as numeric.
cat_features = X.select_dtypes(include=['object']).columns
num_features = X.select_dtypes(exclude=['object']).columns

# One-hot encode the categorical columns (dropping the first level of each)
# and standardise the numeric columns.
transformers = [
    ("encoder", OneHotEncoder(drop='first'), cat_features),
    ("scaler", StandardScaler(), num_features),
]
preprocessor = ColumnTransformer(transformers)

preprocessor


In [6]:
# Fit the preprocessor on the feature frame and rebind X to the transformed matrix.
# NOTE(review): the encoder and scaler are fitted on the full dataset here, before
# the train/test split in a later cell, so test rows contribute to the fitted
# statistics. Consider fitting on the training split only.
# NOTE(review): X is rebound from the DataFrame to the transformed output, so
# re-running this cell on its own will likely fail because the column-name
# selectors no longer apply. Re-run from the cell that defines X.
X = preprocessor.fit_transform(X)
X

array([[ 0.        ,  1.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 0.        ,  1.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]])

In [7]:
#train test split

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Sanity check: shapes of the train and test feature matrices.
X_train.shape, X_test.shape

((800, 14), (200, 14))

In [9]:
#Evaluation function
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score

def evaluate_model(true, pred):
    """Compute regression error metrics for a set of predictions.

    Args:
        true: Observed target values.
        pred: Predicted target values, aligned element-wise with ``true``.

    Returns:
        tuple: ``(mse, mae, rmse, r2)`` - mean squared error, mean absolute
        error, root mean squared error (square root of ``mse``) and the
        R^2 score, in that order.
    """
    squared_error = mean_squared_error(true, pred)
    absolute_error = mean_absolute_error(true, pred)
    determination = r2_score(true, pred)
    root_squared_error = np.sqrt(squared_error)
    return squared_error, absolute_error, root_squared_error, determination
     

In [14]:
%pip install xgboost

Collecting xgboost
  Using cached xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.8/124.9 MB 6.7 MB/s eta 0:00:19
    --------------------------------------- 1.6/124.9 MB 5.2 MB/s eta 0:00:24
   - -------------------------------------- 3.4/124.9 MB 6.3 MB/s eta 0:00:20
   - -------------------------------------- 4.2/124.9 MB 5.7 MB/s eta 0:00:22
   - -------------------------------------- 5.2/124.9 MB 5.3 MB/s eta 0:00:23
   - -------------------------------------- 5.5/124.9 MB 5.2 MB/s eta 0:00:23
   -- ------------------------------------- 6.8/124.9 MB 4.9 MB/s eta 0:00:24
   -- ------------------------------------- 6.8/124.9 MB 4.9 MB/s eta 0:00:24
   -- ------------------------------------- 8.7/124.9 MB 4.6 MB/s eta 0:00:26
   --- ------------------------------------ 9.4/124.9 MB 4.7 MB/s eta 0:00:25


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
my-package 0.0.1 requires catboost, which is not installed.


In [17]:
%pip install catboost

Collecting catboost
  Using cached catboost-1.2.8-cp38-cp38-win_amd64.whl.metadata (1.2 kB)
Collecting plotly (from catboost)
  Using cached plotly-6.1.2-py3-none-any.whl.metadata (6.9 kB)
Using cached catboost-1.2.8-cp38-cp38-win_amd64.whl (102.5 MB)
Using cached plotly-6.1.2-py3-none-any.whl (16.3 MB)
Installing collected packages: plotly, catboost
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'e:\\Github Repositories\\Project1\\venv\\Lib\\site-packages\\plotly\\figure_factory\\_2d_density.py'
Consider using the `--user` option or check the permissions.



In [20]:
#list of models
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Candidate regressors keyed by display name. Every estimator that uses
# randomness is given the same seed as the train/test split (42) so the
# reported metrics are reproducible across kernel restarts.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(random_state=42),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False, random_state=42),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42),
}

ModuleNotFoundError: No module named 'catboost'

In [None]:
# Fit every candidate model, report train/test metrics, and collect the
# test-set R2 score per model (model_list and r2_list stay index-aligned).
model_list = []
r2_list = []

# Iterate over (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every iteration.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset.
    # evaluate_model returns four values (mse, mae, rmse, r2); unpacking only
    # three raised "ValueError: too many values to unpack". MSE is captured
    # for completeness but not printed, since RMSE is reported instead.
    model_train_mse, model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mse, model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')