# <h1 style="text-align:center; color:blue;">Regression algorithms part 2</h1>

**We use the same data as part 1**

In [20]:
import numpy as np
import numpy.typing as npt
import pandas as pd


# 1. Generate synthetic regression dataset with both numerical and categorical features
np.random.seed(42)
n_samples: int = 500

# Numerical features.
# Every draw below advances the same global generator seeded above, so the
# order of these calls determines the generated data.
age: npt.NDArray[np.int64] = np.random.randint(low=18, high=70, size=n_samples)
salary: npt.NDArray[np.float64] = np.random.normal(loc=50000, scale=15000, size=n_samples)
experience: npt.NDArray[np.int64] = np.random.randint(low=0, high=40, size=n_samples)

# Categorical features
departments: npt.NDArray[np.str_] = np.random.choice(
    ['HR', 'Engineering', 'Marketing', 'Sales'],
    size=n_samples,
)
education: npt.NDArray[np.str_] = np.random.choice(
    ['Bachelors', 'Masters', 'PhD'],
    size=n_samples,
)

# Target variable (continuous): a base value, linear effects of age and
# experience, fixed bonuses for Engineering and PhD, plus Gaussian noise.
# 'salary' does not enter the formula.
target: npt.NDArray[np.float64] = (
    20000
    + 150 * age
    + 300 * experience
    + 10000 * (departments == 'Engineering')
    + 5000 * (education == 'PhD')
    + np.random.normal(loc=0, scale=5000, size=n_samples)
)

# Create DataFrame (one column per generated array, target last)
df: pd.DataFrame = pd.DataFrame(
    {
        'Age': age,
        'Salary': salary,
        'Experience': experience,
        'Department': departments,
        'Education': education,
        'Target': target,
    }
)

# 3.2 Outlier removal (using IQR for numerical columns)
def remove_outliers_iqr(data: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """
    Return the rows of `data` whose value in `column` lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of `column` and ``IQR = Q3 - Q1``.
    Values exactly on a fence are kept, so a column with zero IQR (for example
    a constant column) is returned unchanged instead of being emptied.
    Rows where `column` is NaN are dropped.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numerical column to screen.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pd.DataFrame
        An independent copy containing only the retained rows, with the
        original index labels preserved.
    """
    q1: float = data[column].quantile(0.25)
    q3: float = data[column].quantile(0.75)
    iqr: float = q3 - q1
    lower: float = q1 - factor * iqr
    upper: float = q3 + factor * iqr

    # `between` is inclusive on both ends by default and evaluates to False for NaN.
    within_fences: pd.Series = data[column].between(lower, upper)

    # Explicit copy: callers assign columns on the result, which would otherwise
    # raise SettingWithCopyWarning on pandas versions that track slices.
    return data[within_fences].copy()

# Columns screened for outliers. The filter is applied one column after the
# other, so the quartiles for 'Target' are computed on the rows that survived
# the 'Salary' pass.
outliers_cols: list[str] = ['Salary', 'Target']

for col in outliers_cols:
    df = remove_outliers_iqr(data=df, column=col)

# 4. Encode categorical data and scale numerical features

# Splitting categorical and numerical.
# The split is computed before encoding: the integer codes written below would
# otherwise also be selected by 'number'.
# NOTE(review): 'object' only matches string columns stored with object dtype —
# TODO confirm this holds for the pandas version in use.
categorical_df: pd.DataFrame = df.select_dtypes('object')
numerical_df: pd.DataFrame = df.select_dtypes('number')

# Getting the corresponding columns.
# numerical_cols contains 'Target' as well, so the target is standardized
# together with the features and the errors reported later are in
# standard-deviation units rather than in the original target units.
categorical_cols: list[str] = categorical_df.columns.to_list()
numerical_cols: list[str] = numerical_df.columns.to_list()

# Importing the functions
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Encoding: every categorical column is replaced in place by integer codes.
# The single encoder is refit on each column, so after the loop `le` only
# remembers the classes of the last column processed.
# NOTE(review): integer codes impose an ordering on nominal categories such as
# 'Department'; scikit-learn documents LabelEncoder for target labels —
# consider whether one-hot encoding is intended here.
le: LabelEncoder = LabelEncoder()

for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# Scaling: each numerical column is standardized independently and written back.
# The scaler is refit per column, so after the loop `scaler` only holds the
# statistics of the last column in numerical_cols.
# NOTE(review): the statistics are fitted on the full frame, before the
# train/test split below, so test rows contribute to the mean and standard
# deviation used for scaling.
scaler: StandardScaler = StandardScaler()

for col in numerical_cols:
    df[[col]] = scaler.fit_transform(df[[col]])

# 5. Split the data into train and test sets

# Importing the function
from sklearn.model_selection import train_test_split

# Target is the 'Target' column; the features are every other column
y: pd.Series = df['Target']
X: pd.DataFrame = df.drop(columns='Target')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train: pd.DataFrame
X_test: pd.DataFrame
y_train: pd.Series
y_test: pd.Series
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the size of each part as a quick sanity check
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (396, 5)
Shape of X_test: (99, 5)
Shape of y_train: (396,)
Shape of y_test: (99,)


In [21]:
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from typing import Union
import numpy as np

In [23]:
def train_and_evaluate(model: BaseEstimator,
                       X_train: pd.DataFrame,
                       y_train: pd.Series,
                       X_test: pd.DataFrame,
                       y_test: pd.Series) -> dict[str, Union[float, np.float64]]:
    """
    Fit a regression model and report R^2 and RMSE on the train and test sets.

    The model is fitted in place on (X_train, y_train); each split is then
    predicted once and both metrics are derived from those predictions.

    Parameters
    ----------
    model : BaseEstimator
        Unfitted regressor exposing `fit` and `predict`. It is fitted by this call.
    X_train, y_train : pd.DataFrame, pd.Series
        Training features and target.
    X_test, y_test : pd.DataFrame, pd.Series
        Held-out features and target.

    Returns
    -------
    dict[str, float | np.float64]
        Keys "train_r2", "test_r2" (coefficient of determination; this is a
        goodness-of-fit measure, not a classification accuracy) and
        "train_rmse", "test_rmse" (root mean squared error, expressed in the
        units of the supplied y).
    """
    # Imported locally so the function does not depend on an extra file-level import.
    from sklearn.metrics import r2_score

    # Fit the model
    model.fit(X_train, y_train)

    # Predict each split exactly once; both metrics below reuse these arrays.
    y_train_pred: npt.NDArray[np.float64] = model.predict(X_train)
    y_test_pred: npt.NDArray[np.float64] = model.predict(X_test)

    # R^2 from the stored predictions. This is the value a scikit-learn
    # regressor's `score` returns, without running `predict` a second time.
    train_r2: float = r2_score(y_train, y_train_pred)
    test_r2: float = r2_score(y_test, y_test_pred)

    # Error (RMSE)
    train_rmse: np.float64 = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse: np.float64 = np.sqrt(mean_squared_error(y_test, y_test_pred))

    results: dict[str, Union[float, np.float64]] = {
        "train_r2": train_r2,
        "test_r2": test_r2,
        "train_rmse": train_rmse,
        "test_rmse": test_rmse
    }
    return results


### 6- Support Vector Regression (SVR)

In [24]:
from sklearn.svm import LinearSVR

# Linear support vector regressor, constructed with no arguments (library defaults)
svr_linear: BaseEstimator = LinearSVR()

train_and_evaluate(
    model=svr_linear,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)


{'train_r2': 0.5034237800990844,
 'test_r2': 0.4208244085427282,
 'train_rmse': np.float64(0.7131576778295406),
 'test_rmse': np.float64(0.7201679580506448)}

In [25]:
from sklearn.svm import SVR

# Kernel support vector regressor, constructed with no arguments
# (scikit-learn's default kernel for SVR is RBF)
svr_kernel_rbf: BaseEstimator = SVR()

train_and_evaluate(
    model=svr_kernel_rbf,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)


{'train_r2': 0.6688016148637084,
 'test_r2': 0.5108147574739019,
 'train_rmse': np.float64(0.5824205744628743),
 'test_rmse': np.float64(0.661858792696883)}

### 7- Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree, constructed with no arguments (no depth limit is passed)
DTR: BaseEstimator = DecisionTreeRegressor()

train_and_evaluate(
    model=DTR,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)


{'train_r2': 1.0,
 'test_r2': -0.18541812475085195,
 'train_rmse': np.float64(0.0),
 'test_rmse': np.float64(1.0303017817568225)}

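**Note the overfitting: the tree fits the training set perfectly (train R² = 1.0), but its test R² is negative, i.e. worse than always predicting the mean.**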

**Fixing this is not in the course content**

### 8- Random Forest Regression

In [26]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of decision trees, constructed with no arguments (library defaults)
RFR: BaseEstimator = RandomForestRegressor()

train_and_evaluate(
    model=RFR,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)


{'train_r2': 0.9433971470980185,
 'test_r2': 0.4290319231259069,
 'train_rmse': np.float64(0.24077520629005447),
 'test_rmse': np.float64(0.7150469901620776)}

### 9- K-Nearest Neighbors Regression (KNN Regression)

In [27]:
from sklearn.neighbors import KNeighborsRegressor

# K-nearest-neighbours regressor, constructed with no arguments (library defaults)
KNR: BaseEstimator = KNeighborsRegressor()

train_and_evaluate(
    model=KNR,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)


{'train_r2': 0.6779505998597154,
 'test_r2': 0.43197131833104985,
 'train_rmse': np.float64(0.5743198810300899),
 'test_rmse': np.float64(0.7132040519695594)}

### 10- Bayesian Regression

In [28]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression, constructed with no arguments (library defaults)
BR: BaseEstimator = BayesianRidge()

train_and_evaluate(
    model=BR,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)


{'train_r2': 0.5063930709411979,
 'test_r2': 0.43595656322585663,
 'train_rmse': np.float64(0.7110223081753506),
 'test_rmse': np.float64(0.710697756257328)}

**Note: bagging is not part of the course content**

## Boosting algorithms

### 11- AdaBoost Regression

In [29]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost ensemble regressor, constructed with no arguments (library defaults)
ABR: BaseEstimator = AdaBoostRegressor()

train_and_evaluate(
    model=ABR,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)


{'train_r2': 0.6880360292598227,
 'test_r2': 0.5238172057992055,
 'train_rmse': np.float64(0.5652555339106106),
 'test_rmse': np.float64(0.6530035147104616)}

### 12- Gradient Boosting Regression

In [30]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting ensemble regressor, constructed with no arguments (library defaults)
GBR: BaseEstimator = GradientBoostingRegressor()

train_and_evaluate(
    model=GBR,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)


{'train_r2': 0.8463574105180187,
 'test_r2': 0.47880041034185317,
 'train_rmse': np.float64(0.3966874054487283),
 'test_rmse': np.float64(0.6831730122163806)}

### 13- XGBoost

In [31]:
from xgboost import XGBRegressor

# XGBoost regressor (scikit-learn compatible wrapper), constructed with no arguments
XGBR: BaseEstimator = XGBRegressor()

train_and_evaluate(
    model=XGBR,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)


{'train_r2': 0.9997686847434863,
 'test_r2': 0.36175241234665123,
 'train_rmse': np.float64(0.015391988917463547),
 'test_rmse': np.float64(0.7560025498102566)}