In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


## Importing Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

## Importing the dataset

In [None]:
dataset = pd.read_csv('amsterdam_weekdays.csv')
X = dataset.iloc[:, 2:-1].values
y = dataset.iloc[:, 1].values

In [None]:
print(X)

[['Private room' False True ... 98.25389587009934 6.846472824200016
  4.90569]
 ['Private room' False True ... 837.2807567422693 58.34292774344904
  4.90005]
 ['Private room' False True ... 95.3869549262145 6.646700254501156
  4.97512]
 ...
 ['Private room' False True ... 625.9475624336228 43.6169267144573
  4.88897]
 ['Entire home/apt' False False ... 336.5892365740761 23.454022262581173
  4.90688]
 ['Shared room' True False ... 807.4923072437515 56.26722572511592
  4.89295]]


## Missing Data

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 2:-1])
X[:, 2:-1] = imputer.transform(X[:, 2:-1])

## One-Hot Encoding

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X= np.array(ct.fit_transform(X))

In [None]:
print(X)

[[0.0 1.0 0.0 ... 98.25389587009934 6.846472824200016 4.90569]
 [0.0 1.0 0.0 ... 837.2807567422693 58.34292774344904 4.90005]
 [0.0 1.0 0.0 ... 95.3869549262145 6.646700254501156 4.97512]
 ...
 [0.0 1.0 0.0 ... 625.9475624336228 43.6169267144573 4.88897]
 [1.0 0.0 0.0 ... 336.5892365740761 23.454022262581173 4.90688]
 [0.0 0.0 1.0 ... 807.4923072437515 56.26722572511592 4.89295]]


## Training & Testing the Dataset

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

##  

## **BUILDING THE ALGORITHM**

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

## Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(X_train)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y_train)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 42)
regressor.fit(X_train, y_train)

## Cat Boost

In [None]:
from catboost import CatBoostRegressor
regressor = CatBoostRegressor()
regressor.fit(X_train, y_train)

Learning rate set to 0.040139
0:	learn: 431.7619796	total: 50ms	remaining: 50s
1:	learn: 426.9078462	total: 52.6ms	remaining: 26.3s
2:	learn: 420.5601275	total: 55ms	remaining: 18.3s
3:	learn: 414.6202084	total: 57.4ms	remaining: 14.3s
4:	learn: 408.8227364	total: 59.9ms	remaining: 11.9s
5:	learn: 403.8039608	total: 62.2ms	remaining: 10.3s
6:	learn: 399.1975354	total: 64.8ms	remaining: 9.19s
7:	learn: 394.7192807	total: 67.3ms	remaining: 8.34s
8:	learn: 390.1269940	total: 69.6ms	remaining: 7.67s
9:	learn: 385.8837888	total: 72ms	remaining: 7.13s
10:	learn: 381.5570116	total: 74.3ms	remaining: 6.68s
11:	learn: 377.0373766	total: 76.7ms	remaining: 6.31s
12:	learn: 373.1866583	total: 79ms	remaining: 6s
13:	learn: 369.2190560	total: 81.4ms	remaining: 5.73s
14:	learn: 365.4557011	total: 83.7ms	remaining: 5.5s
15:	learn: 362.0770375	total: 86ms	remaining: 5.29s
16:	learn: 358.6813173	total: 88.3ms	remaining: 5.11s
17:	learn: 354.8335099	total: 90.6ms	remaining: 4.94s
18:	learn: 351.6618292	t

<catboost.core.CatBoostRegressor at 0x78e705b530d0>

## Predicting Results

In [None]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 579.97  368.62]
 [ 584.98  899.63]
 [ 587.87  548.12]
 [ 525.72  556.56]
 [ 819.08  632.95]
 [ 372.43  426.03]
 [ 411.98  310.97]
 [1119.9  1044.69]
 [ 282.43  233.64]
 [ 409.42  460.95]
 [ 231.4   295.03]
 [ 523.96  583.74]
 [ 276.48  261.76]
 [ 232.69  270.43]
 [ 660.6   624.28]
 [ 531.01  289.64]
 [ 420.89  399.55]
 [ 563.95  417.83]
 [ 606.91  678.41]
 [ 846.28  854.4 ]
 [ 335.52  356.43]
 [ 344.07  436.11]
 [ 254.7   288.71]
 [ 452.92  307.22]
 [1392.97  737.23]
 [ 553.09  540.62]
 [ 304.81  288.71]
 [ 710.28  546.72]
 [ 339.57  196.61]
 [ 224.44  243.25]
 [ 237.05  393.46]
 [ 581.    614.44]
 [ 342.61  410.56]
 [ 404.59  282.85]
 [ 307.23  221.22]
 [ 477.71  409.39]
 [ 641.56  614.44]
 [ 451.07  245.82]
 [ 303.08  350.1 ]
 [ 940.93  909.47]
 [ 391.19  491.65]
 [ 376.13  351.51]
 [ 459.34  319.41]
 [ 681.54  467.04]
 [ 259.86  510.16]
 [ 349.47  277.69]
 [ 270.89  188.18]
 [ 600.29  921.66]
 [ 799.82  526.09]
 [ 253.69  215.12]
 [ 321.97  319.64]
 [ 270.64  177.16]
 [ 540.06  7

## K-Fold Validation


In [None]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
6:	learn: 415.5260900	total: 43.1ms	remaining: 6.12s
7:	learn: 411.0460141	total: 50.5ms	remaining: 6.27s
8:	learn: 406.0092216	total: 58.5ms	remaining: 6.45s
9:	learn: 401.5720878	total: 66.4ms	remaining: 6.58s
10:	learn: 397.0810090	total: 75.9ms	remaining: 6.83s
11:	learn: 393.0833310	total: 81.4ms	remaining: 6.7s
12:	learn: 388.8315060	total: 92.7ms	remaining: 7.04s
13:	learn: 384.6595279	total: 99.6ms	remaining: 7.01s
14:	learn: 380.6203206	total: 104ms	remaining: 6.82s
15:	learn: 377.1588294	total: 111ms	remaining: 6.86s
16:	learn: 373.4557969	total: 120ms	remaining: 6.94s
17:	learn: 369.6194517	total: 130ms	remaining: 7.08s
18:	learn: 366.7790131	total: 146ms	remaining: 7.54s
19:	learn: 363.2793953	total: 157ms	remaining: 7.69s
20:	learn: 359.9959217	total: 165ms	remaining: 7.71s
21:	learn: 357.6748382	total: 167ms	remaining: 7.43s
22:	learn: 354.3847623	total: 180ms	remaining: 7.66s
23:	learn: 351.7726488	total: 1

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Part 2 - Building the ANN

### Initializing the ANN

In [None]:
ann = tf.keras.models.Sequential()

### Adding the input layer and the first hidden layer

In [None]:
ann.add(tf.keras.layers.Dense(units= 6, activation= 'relu'))

### Adding the second hidden layer

In [None]:
ann.add(tf.keras.layers.Dense(units= 6, activation= 'relu'))

### Adding the output layer

In [None]:
ann.add(tf.keras.layers.Dense(units= 1))

## Part 3 - Training the ANN

### Compiling the ANN

In [None]:
ann.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics = ['mean_squared_error'])

### Training the ANN on the Training set

In [None]:
ann.fit(X_train, y_train, batch_size = 32, epochs = 200)

Epoch 1/200
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 98451.9922 - mean_squared_error: 98451.9922
Epoch 2/200
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 129843.3594 - mean_squared_error: 129843.3594
Epoch 3/200
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 84009.9375 - mean_squared_error: 84009.9375
Epoch 4/200
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 56900.7383 - mean_squared_error: 56900.7383
Epoch 5/200
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 60319.8750 - mean_squared_error: 60319.8750
Epoch 6/200
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 90219.2969 - mean_squared_error: 90219.2969
Epoch 7/200
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 104734.2188 - mean_squared_error: 104734.2188
Epoch 8/200
[1m28/28[0m [32m━━━━━━

<keras.src.callbacks.history.History at 0x7c487e166250>

### Predicting the Test set results

In [None]:
y_pred = ann.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[[ 539.24493408  368.61715839]
 [ 605.49041748  899.63208586]
 [ 754.16918945  548.12176318]
 [ 626.48840332  556.55801092]
 [ 760.31829834  632.95292105]
 [ 443.93789673  426.0305111 ]
 [ 389.89492798  310.96946547]
 [1020.92779541 1044.68867902]
 [ 321.12658691  233.63719448]
 [ 435.01211548  460.94720315]
 [ 307.74899292  295.03433084]
 [ 484.96734619  583.74147587]
 [ 312.9694519   261.75802029]
 [ 276.17492676  270.42860825]
 [ 636.815979    624.28233309]
 [ 724.11352539  289.64450589]
 [ 394.68536377  399.55006679]
 [ 613.8793335   417.82860357]
 [ 327.73654175  678.41492278]
 [1052.65588379  854.40442434]
 [ 422.49145508  356.4314672 ]
 [ 302.93411255  436.10714035]
 [ 257.5632019   288.70714503]
 [ 546.74487305  307.22002203]
 [ 893.15356445  737.23431678]
 [ 684.15252686  540.62287629]
 [ 168.66624451  288.70714503]
 [ 879.67822266  546.71572189]
 [ 236.9283905   196.61144049]
 [ 252.67544556  243.2451433 

## Model Accuracy

Linear Regression


In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error
r2_lr = r2_score(y_test, y_pred)
mse_lr = mean_squared_error(y_test, y_pred)
mae_lr = mean_absolute_error(y_test, y_pred)
print(f"R-squared: {r2_lr:.2f}")
print(f"Mean Squared Error: {mse_lr:.2f}")
print(f"Mean Absolute Error: {mae_lr:.2f}")

R-squared: 0.55
Mean Squared Error: 43786.12
Mean Absolute Error: 154.20


Random Forest

In [None]:
r2_rf = r2_score(y_test, y_pred)
mse_rf = mean_squared_error(y_test, y_pred)
mae_rf = mean_absolute_error(y_test, y_pred)
print(f"R-squared: {r2_rf:.2f}")
print(f"Mean Squared Error: {mse_rf:.2f}")
print(f"Mean Absolute Error: {mae_rf:.2f}")

R-squared: 0.11
Mean Squared Error: 87162.34
Mean Absolute Error: 160.33


Cat Boost

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error
r2_cb = r2_score(y_test, y_pred)
mse_cb = mean_squared_error(y_test, y_pred)
mae_cb = mean_absolute_error(y_test, y_pred)
rmse_cb = root_mean_squared_error(y_test, y_pred)
print(f"R-squared: {r2_cb:.2f}")
print(f"Mean Squared Error: {mse_cb:.2f}")
print(f"Mean Absolute Error: {mae_cb:.2f}")
print(f"Root Mean Squared Error: {rmse_cb:.2f}")

R-squared: 0.61
Mean Squared Error: 38207.22
Mean Absolute Error: 129.60
Root Mean Squared Error: 195.47


Artificial Neural Networks


In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error
r2_ann = r2_score(y_test, y_pred)
mse_ann = mean_squared_error(y_test, y_pred)
mae_ann = mean_absolute_error(y_test, y_pred)
rmse_ann = root_mean_squared_error(y_test, y_pred)
print(f"R-squared: {r2_ann:.2f}")
print(f"Mean Squared Error: {mse_ann:.2f}")
print(f"Mean Absolute Error: {mae_ann:.2f}")
print(f"Root Mean Squared Error: {rmse_ann:.2f}")

R-squared: 0.58
Mean Squared Error: 40558.43
Mean Absolute Error: 145.97
Root Mean Squared Error: 201.39


# Project Summary

This notebook focuses on building and evaluating different regression models to predict a target variable based on the 'amsterdam_weekdays.csv' dataset. The process involved:

1. **Data Preprocessing**: Handling missing data using `SimpleImputer` with a 'mean' strategy and applying one-hot encoding to categorical features using `ColumnTransformer` and `OneHotEncoder`.
2. **Data Splitting**: Dividing the dataset into training and testing sets using `train_test_split`.
3. **Feature Scaling**: Scaling the features using `StandardScaler`.
4. **Model Building**: Implementing and training four different regression models:
    - Linear Regression
    - Polynomial Regression (with degree 4)
    - Random Forest Regressor
    - CatBoost Regressor
    - Artificial Neural Network (ANN) with two hidden layers.
5. **Model Evaluation**: Evaluating each model using common regression metrics: R-squared, Mean Squared Error (MSE), Mean Absolute Error (MAE), and Root Mean Squared Error (RMSE). K-Fold Cross-Validation was also performed for the CatBoost model.

# Model Evaluation and Comparison

## Best Model

Based on the R-squared value (which indicates the proportion of the variance in the dependent variable that is predictable from the independent variables), the **CatBoost Regressor** appears to be the best model among the evaluated ones, with an R-squared of 0.61. It also has the lowest Mean Squared Error (MSE) and Mean Absolute Error (MAE), indicating better overall prediction accuracy and less average error in predictions compared to the other models.
