# Requirements

In [2]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder


def missing_table(data: pd.DataFrame):
    statistics_missing_table = data.isnull().sum().reset_index().rename(columns={"index": "Feature", 0: "CountMissing"})
    percentage_missing = (
        (statistics_missing_table["CountMissing"] / len(data) * 100).reset_index()).reset_index().rename(
        columns={"CountMissing": "PercentageMissing"})
    statistics_missing_table["PercentageMissing"] = percentage_missing["PercentageMissing"]
    statistics_missing_table["Total"] = len(data)
    return statistics_missing_table


def balance_table(data: pd.DataFrame, target_column):
    balance_table = data.groupby(target_column).size().reset_index().rename(
        columns={target_column: "Class", 0: "Count"})
    tmp = (balance_table["Count"] / len(data) * 100).reset_index().rename(
        columns={target_column: "Class", "Count": "Percentage"})
    balance_table["Percentage"] = tmp["Percentage"]
    balance_table["Total"] = len(data)
    return balance_table


def get_numerical_features_names(data: pd.DataFrame):
    result_columns = []
    for column in data.columns:
        if pd.api.types.is_any_real_numeric_dtype(data[column]):
            result_columns.append(column)

    return result_columns


def get_categorical_features_names(data: pd.DataFrame, target_feature=None):
    categorical_features = []
    for column in data.columns:
        if target_feature is not None and column == target_feature:
            continue

        if not pd.api.types.is_any_real_numeric_dtype(data[column]):
            categorical_features.append(column)
    return categorical_features


def encode_data(data: pd.DataFrame, features_to_encode: list):
    encoders = {}
    data_copy = data.copy()
    for feature in features_to_encode:
        encoder = OrdinalEncoder()
        data_copy[[feature]] = encoder.fit_transform(data_copy[[feature]])
        encoders[feature] = encoder
    return data_copy, encoders

# Laboratory Exercise - Run Mode (8 points)

## Introduction
In this laboratory assignment, the focus is on time series forecasting, specifically targeting the prediction of the current **mean temperature** in the city of Delhi. Your task involves employing bagging and boosting methods to forecast the **mean temperature**. To accomplish this use data from the preceding three days, consisting of **mean temperature**, **humidity**, **wind speed**, and **mean pressure**.

**Note: You are required to perform this laboratory assignment on your local machine.**

## The Climate Dataset

## Downloading the Climate Dataset

## Exploring the Climate Dataset
This dataset consists of daily weather records for the city of Delhi spanning a period of 4 years (from 2013 to 2017). The dataset includes the following attributes:

- date - date in the format YYYY-MM-DD,
- meantemp - mean temperature averaged from multiple 3-hour intervals in a day,
- humidity - humidity value for the day (measured in grams of water vapor per cubic meter volume of air),
- wind_speed - wind speed measured in kilometers per hour, and
- meanpressure - pressure reading of the weather (measured in atm).

*Note: The dataset is complete, with no missing values in any of its entries.*

Load the dataset into a `pandas` data frame.

In [32]:
# Write your code here. Add as many boxes as you need.
df_original = pd.read_csv('climate-data.csv')
df = df_original.copy()
df.head(5)

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2013-01-01,10.0,84.5,0.0,1015.666667
1,2013-01-02,7.4,92.0,2.98,1017.8
2,2013-01-03,7.166667,87.0,4.633333,1018.666667
3,2013-01-04,8.666667,71.333333,1.233333,1017.166667
4,2013-01-05,6.0,86.833333,3.7,1016.5


In [53]:
# plt.figure(figsize=(20,10))
# sns.palplot(df)
# plt.show()

<Figure size 2000x1000 with 0 Axes>

ValueError: Image size of 145900x100 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 145900x100 with 1 Axes>

In [33]:
df = df.sort_values(by=['date'])
df.head(20)

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2013-01-01,10.0,84.5,0.0,1015.666667
1,2013-01-02,7.4,92.0,2.98,1017.8
2,2013-01-03,7.166667,87.0,4.633333,1018.666667
3,2013-01-04,8.666667,71.333333,1.233333,1017.166667
4,2013-01-05,6.0,86.833333,3.7,1016.5
5,2013-01-06,7.0,82.8,1.48,1018.0
6,2013-01-07,7.0,78.6,6.3,1020.0
7,2013-01-08,8.857143,63.714286,7.142857,1018.714286
8,2013-01-09,14.0,51.25,12.5,1017.0
9,2013-01-10,11.0,62.0,7.4,1015.666667


Explore the dataset using visualizations of your choice.

# Feauture Extraction
Apply a lag of one, two, and three days to each feature, creating a set of features representing the meteorological conditions from the previous three days. To maintain dataset integrity, eliminate any resulting missing values at the beginning of the dataset.

Hint: Use `df['column_name'].shift(period)`. Check the documentation at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.shift.html.

In [38]:
df.columns[1:]

Index(['meantemp', 'humidity', 'wind_speed', 'meanpressure'], dtype='object')

In [42]:
# Write your code here. Add as many boxes as you need.
for col in df.columns[1:]:
    df[f'{col}_shift1'] = df[col].shift(1)
    df[f'{col}_shift2'] = df[col].shift(2)
    df[f'{col}_shift3'] = df[col].shift(3)

df

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure,meantemp_shift1,meantemp_shift2,meantemp_shift3,humidity_shift1,humidity_shift2,humidity_shift3,wind_speed_shift1,wind_speed_shift2,wind_speed_shift3,meanpressure_shift1,meanpressure_shift2,meanpressure_shift3
0,2013-01-01,10.000000,84.500000,0.000000,1015.666667,,,,,,,,,,,,
1,2013-01-02,7.400000,92.000000,2.980000,1017.800000,10.000000,,,84.500000,,,0.000000,,,1015.666667,,
2,2013-01-03,7.166667,87.000000,4.633333,1018.666667,7.400000,10.000000,,92.000000,84.500000,,2.980000,0.000000,,1017.800000,1015.666667,
3,2013-01-04,8.666667,71.333333,1.233333,1017.166667,7.166667,7.400000,10.000000,87.000000,92.000000,84.500000,4.633333,2.980000,0.000000,1018.666667,1017.800000,1015.666667
4,2013-01-05,6.000000,86.833333,3.700000,1016.500000,8.666667,7.166667,7.400000,71.333333,87.000000,92.000000,1.233333,4.633333,2.980000,1017.166667,1018.666667,1017.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1457,2016-12-28,17.217391,68.043478,3.547826,1015.565217,16.850000,17.142857,14.000000,67.550000,74.857143,94.300000,8.335000,8.784211,9.085000,1017.200000,1016.952381,1014.350000
1458,2016-12-29,15.238095,87.857143,6.000000,1016.904762,17.217391,16.850000,17.142857,68.043478,67.550000,74.857143,3.547826,8.335000,8.784211,1015.565217,1017.200000,1016.952381
1459,2016-12-30,14.095238,89.666667,6.266667,1017.904762,15.238095,17.217391,16.850000,87.857143,68.043478,67.550000,6.000000,3.547826,8.335000,1016.904762,1015.565217,1017.200000
1460,2016-12-31,15.052632,87.000000,7.325000,1016.100000,14.095238,15.238095,17.217391,89.666667,87.857143,68.043478,6.266667,6.000000,3.547826,1017.904762,1016.904762,1015.565217


In [43]:
df = df.dropna(axis=0)
df

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure,meantemp_shift1,meantemp_shift2,meantemp_shift3,humidity_shift1,humidity_shift2,humidity_shift3,wind_speed_shift1,wind_speed_shift2,wind_speed_shift3,meanpressure_shift1,meanpressure_shift2,meanpressure_shift3
3,2013-01-04,8.666667,71.333333,1.233333,1017.166667,7.166667,7.400000,10.000000,87.000000,92.000000,84.500000,4.633333,2.980000,0.000000,1018.666667,1017.800000,1015.666667
4,2013-01-05,6.000000,86.833333,3.700000,1016.500000,8.666667,7.166667,7.400000,71.333333,87.000000,92.000000,1.233333,4.633333,2.980000,1017.166667,1018.666667,1017.800000
5,2013-01-06,7.000000,82.800000,1.480000,1018.000000,6.000000,8.666667,7.166667,86.833333,71.333333,87.000000,3.700000,1.233333,4.633333,1016.500000,1017.166667,1018.666667
6,2013-01-07,7.000000,78.600000,6.300000,1020.000000,7.000000,6.000000,8.666667,82.800000,86.833333,71.333333,1.480000,3.700000,1.233333,1018.000000,1016.500000,1017.166667
7,2013-01-08,8.857143,63.714286,7.142857,1018.714286,7.000000,7.000000,6.000000,78.600000,82.800000,86.833333,6.300000,1.480000,3.700000,1020.000000,1018.000000,1016.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1457,2016-12-28,17.217391,68.043478,3.547826,1015.565217,16.850000,17.142857,14.000000,67.550000,74.857143,94.300000,8.335000,8.784211,9.085000,1017.200000,1016.952381,1014.350000
1458,2016-12-29,15.238095,87.857143,6.000000,1016.904762,17.217391,16.850000,17.142857,68.043478,67.550000,74.857143,3.547826,8.335000,8.784211,1015.565217,1017.200000,1016.952381
1459,2016-12-30,14.095238,89.666667,6.266667,1017.904762,15.238095,17.217391,16.850000,87.857143,68.043478,67.550000,6.000000,3.547826,8.335000,1016.904762,1015.565217,1017.200000
1460,2016-12-31,15.052632,87.000000,7.325000,1016.100000,14.095238,15.238095,17.217391,89.666667,87.857143,68.043478,6.266667,6.000000,3.547826,1017.904762,1016.904762,1015.565217


## Dataset Splitting
Partition the dataset into training and testing sets with an 80:20 ratio.

**WARNING: DO NOT SHUFFLE THE DATASET.**



In [44]:
df = df.drop(columns='date', axis=1)
df

Unnamed: 0,meantemp,humidity,wind_speed,meanpressure,meantemp_shift1,meantemp_shift2,meantemp_shift3,humidity_shift1,humidity_shift2,humidity_shift3,wind_speed_shift1,wind_speed_shift2,wind_speed_shift3,meanpressure_shift1,meanpressure_shift2,meanpressure_shift3
3,8.666667,71.333333,1.233333,1017.166667,7.166667,7.400000,10.000000,87.000000,92.000000,84.500000,4.633333,2.980000,0.000000,1018.666667,1017.800000,1015.666667
4,6.000000,86.833333,3.700000,1016.500000,8.666667,7.166667,7.400000,71.333333,87.000000,92.000000,1.233333,4.633333,2.980000,1017.166667,1018.666667,1017.800000
5,7.000000,82.800000,1.480000,1018.000000,6.000000,8.666667,7.166667,86.833333,71.333333,87.000000,3.700000,1.233333,4.633333,1016.500000,1017.166667,1018.666667
6,7.000000,78.600000,6.300000,1020.000000,7.000000,6.000000,8.666667,82.800000,86.833333,71.333333,1.480000,3.700000,1.233333,1018.000000,1016.500000,1017.166667
7,8.857143,63.714286,7.142857,1018.714286,7.000000,7.000000,6.000000,78.600000,82.800000,86.833333,6.300000,1.480000,3.700000,1020.000000,1018.000000,1016.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1457,17.217391,68.043478,3.547826,1015.565217,16.850000,17.142857,14.000000,67.550000,74.857143,94.300000,8.335000,8.784211,9.085000,1017.200000,1016.952381,1014.350000
1458,15.238095,87.857143,6.000000,1016.904762,17.217391,16.850000,17.142857,68.043478,67.550000,74.857143,3.547826,8.335000,8.784211,1015.565217,1017.200000,1016.952381
1459,14.095238,89.666667,6.266667,1017.904762,15.238095,17.217391,16.850000,87.857143,68.043478,67.550000,6.000000,3.547826,8.335000,1016.904762,1015.565217,1017.200000
1460,15.052632,87.000000,7.325000,1016.100000,14.095238,15.238095,17.217391,89.666667,87.857143,68.043478,6.266667,6.000000,3.547826,1017.904762,1016.904762,1015.565217


In [45]:
from sklearn.model_selection import train_test_split

# Write your code here. Add as many boxes as you need.
X = df.drop(columns='meantemp', axis=1)
Y = df['meantemp']
x_train, x_test, y_train, y_test = train_test_split(X,Y, random_state=0, shuffle=False, test_size=0.2)

## Ensemble Learning Methods

### Bagging

Create an instance of a Random Forest model and train it using the `fit` function.

In [47]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Write your code here. Add as many boxes as you need.
random = RandomForestRegressor()
random.fit(x_train, y_train)
pred1 = random.predict(x_test)
r2_score(y_test, pred1)

0.9370212886349112

In [49]:
# mean_squared_error(y_test, pred1)

Use the trained model to make predictions for the test set.

In [None]:
# Write your code here. Add as many boxes as you need.

Assess the performance of the model by using different metrics provided by the `scikit-learn` library.

In [None]:
# Write your code here. Add as many boxes as you need.

### Boosting

Create an instance of an XGBoost model and train it using the `fit` function.

In [50]:
from xgboost import XGBRegressor

# Write your code here. Add as many boxes as you need.
xgb = XGBRegressor()
xgb.fit(x_train, y_train)
pred1 = xgb.predict(x_test)
r2_score(y_test, pred1)

0.928485182381532

Use the trained model to make predictions for the test set.

In [None]:
# Write your code here. Add as many boxes as you need.

Assess the performance of the model by using different metrics provided by the `scikit-learn` library.

In [None]:
# Write your code here. Add as many boxes as you need.

# Laboratory Exercise - Bonus Task (+ 2 points)

As part of the bonus task in this laboratory assignment, your objective is to fine-tune the number of estimators (`n_estimators`) for the XGBoost model using a cross-validation with grid search and time series split. This involves systematically experimenting with various values for `n_estimators` and evaluating the model's performance using cross-validation. Upon determining the most suitable `n_estimators` value, evaluate the model's performance on a test set for final assessment.

Hints:
- For grid search use the `GridCVSearch` from the `scikit-learn` library. Check the documentation at https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html.
- For cross-validation use the `TimeSeriesSplit` from the `scikit-learn` library. Check the documentation at https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html.


## Dataset Splitting
Partition the dataset into training and testing sets with an 90:10 ratio.

**WARNING: DO NOT SHUFFLE THE DATASET.**

In [55]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
x_train, x_test, y_train, y_test = train_test_split(X,Y, random_state=0, shuffle=False, test_size=0.1)

In [62]:
# Write your code here. Add as many boxes as you need.
params = {
    'n_estimators': [3, 5, 8, 10, 15, 30, 40, 50]
}
model = XGBRegressor()

tscv = TimeSeriesSplit(n_splits=2)
gsearch = GridSearchCV(estimator=model, cv=tscv,
                        param_grid=params)
gsearch.fit(x_train, y_train)
gsearch.best_params_

{'n_estimators': 30}

## Fine-tuning the XGBoost Hyperparameter
Experiment with various values for `n_estimators` and evaluate the model's performance using cross-validation.

In [63]:
# Write your code here. Add as many boxes as you need.
best = gsearch.best_estimator_
pred = best.predict(x_test)
r2_score(y_test, pred)

0.9439702103339025

## Final Assessment of the Model Performance
Upon determining the most suitable `n_estimators` value, evaluate the model's performance on a test set for final assessment.

In [37]:
# Write your code here. Add as many boxes as you need.