In [2]:
%config InlineBackend.figure_format = "svg"

# Time series analysis imports
from statsmodels.tsa.seasonal import seasonal_decompose

# Data pipeline imports
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# Data acquisition, processing and visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

## Loading data

In [3]:
# Relative path of the CSV file to load (resolved against the notebook's working directory)
datapath = "data/train.csv"
# Read the file into `df`; this name is also read at module level by later cells
df = pd.read_csv(datapath)
# Bare last expression so the notebook renders the frame as the cell output
df

Unnamed: 0,country,city,tavg,tmin,tmax,wdir,wspd,pres
0,Hungary,Budapest,3.1,0.6,5.4,48.0,6.5,1011.7
1,Hungary,Budapest,5.0,2.3,7.7,335.0,11.4,1009.5
2,Hungary,Budapest,4.0,0.7,6.6,223.0,12.2,1007.9
3,Hungary,Budapest,4.7,2.2,10.4,190.0,8.0,1005.8
4,Hungary,Budapest,6.4,3.7,9.5,144.0,6.3,1007.3
...,...,...,...,...,...,...,...,...
1726,Hungary,Budapest,12.0,6.8,17.3,300.0,5.3,1020.6
1727,Hungary,Budapest,12.2,5.5,18.2,98.0,8.3,1017.7
1728,Hungary,Budapest,13.7,11.0,16.5,73.0,6.1,1014.5
1729,Hungary,Budapest,14.6,12.4,18.1,327.0,7.9,1008.4


## Creating custom transformers for our data pipeline

As already discussed in the exploratory data analysis, this dataset needs the following preprocessing steps:
- Linear interpolation to fill in missing values
- Classical (additive) decomposition of each time-series signal into trend, seasonal and residual components

As these are not natively supported by sklearn as pipeline transformers, we will write our own custom transformers for performing these operations:

In [5]:
class LinearInterpolator(BaseEstimator, TransformerMixin):
    """
    Stateless pipeline step that applies linear interpolation.

    Nothing is learned from the data: ``fit`` simply hands back the instance,
    while ``transform`` does the actual work by delegating to
    ``pandas.DataFrame.interpolate(method="linear")``.
    """

    def fit(self, X, y=None):
        """No-op fit; returns the transformer itself so it can be used in a Pipeline."""
        return self

    def transform(self, X):
        """
        Wrap ``X`` in a DataFrame and return its linearly interpolated version.

        All other ``interpolate`` options are left at their pandas defaults.
        """
        frame = pd.DataFrame(X)
        return frame.interpolate(method="linear")
    
    
class TSADecomposer(BaseEstimator, TransformerMixin):
    """
    Classical decomposer for time-series data, usable as a pipeline step.

    Each column of the input is run through
    ``statsmodels.tsa.seasonal.seasonal_decompose`` and replaced by three
    columns named ``<col>_t`` (trend), ``<col>_s`` (seasonal) and
    ``<col>_r`` (residual). The transformer learns nothing from the data,
    so ``fit`` leaves it unchanged.

    Parameters
    ----------
    model : str, default="additive"
        Forwarded to ``seasonal_decompose`` as ``model``.
    period : int, default=365
        Forwarded to ``seasonal_decompose`` as ``period``.
        NOTE(review): 365 presumably means one year of daily observations --
        confirm against the dataset's sampling rate.
    extrapolate_trend : int or str, default="freq"
        Forwarded to ``seasonal_decompose`` as ``extrapolate_trend``.
    """

    def __init__(self, model="additive", period=365, extrapolate_trend="freq"):
        # scikit-learn convention: store constructor arguments verbatim so
        # get_params / set_params / clone keep working.
        self.model = model
        self.period = period
        self.extrapolate_trend = extrapolate_trend

    def fit(self, X, y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self, X):
        """
        Decompose every column of ``X``.

        Parameters
        ----------
        X : array-like or DataFrame
            Input signals, one per column. Anything accepted by
            ``pd.DataFrame`` works.

        Returns
        -------
        pandas.DataFrame
            New frame sharing ``X``'s index, holding only the
            ``_t`` / ``_s`` / ``_r`` columns, grouped per original column in
            that order. The input is not modified.
        """
        # Transform to dataframe
        X = pd.DataFrame(X)

        # Collect the components in a fresh mapping instead of adding columns
        # to X, so the caller's data is never mutated.
        components = {}
        for key in X.columns:
            # Decompose the data that was actually passed in (X), not a
            # notebook-level global, so upstream pipeline steps take effect.
            decomposed = seasonal_decompose(
                X[key].values,
                model=self.model,
                period=self.period,
                extrapolate_trend=self.extrapolate_trend,
            )

            # f-strings keep this working for non-string labels (e.g. the
            # integer column labels produced by ndarray input).
            components[f"{key}_t"] = decomposed.trend
            components[f"{key}_s"] = decomposed.seasonal
            components[f"{key}_r"] = decomposed.resid

        # Only the decomposed columns are returned; the originals are dropped.
        return pd.DataFrame(components, index=X.index)

SyntaxError: incomplete input (2649326479.py, line 34)