Write code to prepare a dataset with time series in the columns for a classification task, using different moving averages, Fourier features, and calendar features.

Here is an example of how you can prepare a time series dataset for a classification task using different moving averages, Fourier features, and calendar features in Python. This example assumes that you have a pandas DataFrame df with a datetime index and a target column y.

In [None]:
'''
This code assumes that your DataFrame df has a DatetimeIndex.
The prepare_ts_data function builds calendar features, moving averages and
Fourier features, and adds detrended and deseasonalized versions of the
target column y.

It uses the transformers provided by the sktime library (for example
Deseasonalizer and FourierFeatures). Note that statsmodels' seasonal_decompose
and a Box-Cox transformation are not applied anywhere in this notebook.
'''


import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, pathlib, shutil, random, urllib, zipfile
# sktime
from sktime.transformations.series.detrend import Deseasonalizer
from sktime.transformations.series.fourier import FourierFeatures
from sktime.transformations.series.date import DateTimeFeatures

# Import necessary libraries
#import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# Assuming df is your DataFrame and it has a datetime index and a value column

# Create a new DataFrame with the original series and four lagged copies of it.
# Only the 'value' column is shifted: shifting the whole frame would produce
# more than five columns as soon as df also carries other columns (such as the
# target), and the column rename below would then fail.
lag_columns = ['value_t-%d' % i for i in range(0, 5)]
df_lagged = pd.concat([df['value'].shift(i) for i in range(0, 5)], axis=1)
df_lagged.columns = lag_columns

# Calculate moving averages.
# mean(axis=1) skips NaN by default, so the first rows average over the lags
# that are available instead of being NaN.
df_lagged['moving_average_3'] = df_lagged[['value_t-0', 'value_t-1', 'value_t-2']].mean(axis=1)
# Average over the five lag columns only; averaging the whole frame would also
# include the moving_average_3 column that was just added.
df_lagged['moving_average_5'] = df_lagged[lag_columns].mean(axis=1)

# Calculate exponential moving averages
alpha = 2 / (1 + 3)  # 3-period EMA: alpha = 2 / (span + 1)
df_lagged['ema_3'] = df_lagged['value_t-0'].ewm(alpha=alpha).mean()

# Calculate rolling means (NaN until a full window of 3 observations exists)
df_lagged['rolling_mean_3'] = df_lagged['value_t-0'].rolling(window=3).mean()

# Calculate Fourier features
def fourier_features(df, column_name, n_harmonics=3, period=None):
    """Build sine/cosine Fourier terms over the row positions of ``df``.

    Args:
        df: Frame whose length and index define the output rows. Its values
            are not read.
        column_name: Prefix used for the generated column names.
        n_harmonics: Number of harmonics; harmonic ``k`` runs from 1 to
            ``n_harmonics``.
        period: Length of one full cycle, in rows. Defaults to ``len(df)``,
            i.e. the first harmonic completes one cycle over the whole frame.

    Returns:
        A DataFrame indexed like ``df`` with columns
        ``{column_name}_sin_1 .. _sin_n`` followed by
        ``{column_name}_cos_1 .. _cos_n``.
    """
    n_rows = len(df)
    if period is None:
        period = n_rows
    t = np.arange(n_rows)

    # Build each column under its own name so that labels and values cannot
    # drift apart (sin block first, then cos block).
    columns = {}
    for k in range(1, n_harmonics + 1):
        columns[f'{column_name}_sin_{k}'] = np.sin(2 * np.pi * k * t / period)
    for k in range(1, n_harmonics + 1):
        columns[f'{column_name}_cos_{k}'] = np.cos(2 * np.pi * k * t / period)

    # Reuse df's index so the result can be concatenated column-wise with df.
    return pd.DataFrame(columns, index=df.index)

fourier_features_df = fourier_features(df_lagged, 'value_t-0')
# Give the Fourier frame the same index as df_lagged so the column-wise concat
# lines rows up one-to-one instead of taking the union of two different indexes.
fourier_features_df.index = df_lagged.index
df_lagged = pd.concat([df_lagged, fourier_features_df], axis=1)

# Calculate calendar features from the datetime index
df_lagged['day_of_week'] = df_lagged.index.dayofweek
df_lagged['hour'] = df_lagged.index.hour

# Drop rows with missing values (the leading rows where lags / rolling windows are incomplete)
df_lagged.dropna(inplace=True)

# Prepare a pipeline for transformations
def wide_to_long(df):
    """Melt ``df`` into long format.

    The calendar columns ('day_of_week', 'hour') and every column whose name
    contains 'sin' or 'cos' are kept as identifiers; all remaining columns are
    stacked into a 'feature' / 'value' pair.
    """
    harmonic_columns = [name for name in df.columns if 'sin' in name or 'cos' in name]
    identifier_columns = ['day_of_week', 'hour', *harmonic_columns]
    return df.melt(
        id_vars=identifier_columns,
        var_name='feature',
        value_name='value',
    )

# Column groups selected by the ColumnTransformer.
numeric_features = ['value_t-0', 'value_t-1', 'value_t-2', 'value_t-3', 'value_t-4', 'moving_average_3', 'moving_average_5', 'ema_3', 'rolling_mean_3']
fourier_columns = [col for col in df_lagged.columns if 'sin' in col or 'cos' in col]
calendar_features = ['day_of_week', 'hour']

# All feature groups are passed through unchanged:
# - LabelEncoder is a target encoder (its fit/transform take a single 1-D y),
#   so it cannot be used as a ColumnTransformer step; the calendar columns are
#   already integer-coded and the Fourier terms are continuous.
# - A melt (wide_to_long) changes the number of rows, which a ColumnTransformer
#   cannot stack next to the other feature groups.
transformer = ColumnTransformer(
    transformers=[
        ('numeric', 'passthrough', numeric_features),
        ('calendar', 'passthrough', calendar_features),
        ('fourier', 'passthrough', fourier_columns),
    ],
    remainder='drop'
)

pipeline = Pipeline(steps=[('transformer', transformer)])

# Fit the pipeline and transform the data
X = pipeline.fit_transform(df_lagged)

# Assuming 'target' holds the class labels; replace it with the actual name of
# your target column. Select the rows that survived dropna() so that X and y
# have the same length and order.
y = df.loc[df_lagged.index, 'target']


In [None]:

def prepare_ts_data(df, moving_averages=(3, 7, 15, 30), sp=12, fourier_terms=5, ema_span=7):
    """Build a feature matrix and an aligned target from a time-indexed frame.

    Args:
        df: DataFrame with a column 'y' and an index that ``pd.to_datetime``
            can convert. The caller's frame is not modified.
        moving_averages: Window lengths for the rolling means of 'y'.
        sp: Seasonal period used for the Fourier terms and the deseasonalizer.
        fourier_terms: Number of Fourier terms generated for ``sp``.
        ema_span: Span of the exponential moving average of 'y'.

    Returns:
        Tuple ``(X, y)``: ``X`` holds calendar features, rolling means,
        Fourier terms, the detrended, deseasonalized and exponentially
        smoothed series; rows with missing values are dropped. ``y`` is
        ``df['y']`` restricted to the rows kept in ``X``.
    """
    # Detrender is not among the file's top-level sktime imports, so import it here.
    from sktime.transformations.series.detrend import Detrender

    # Work on a copy so the caller's index is left untouched.
    df = df.copy()
    df.index = pd.to_datetime(df.index)
    # NOTE(review): the sktime transformers below presumably expect a regular,
    # sorted DatetimeIndex -- confirm against the data being passed in.

    # Calendar features taken from the datetime index.
    # (is_month_end / is_quarter_end / is_year_end flags can be added the same way.)
    df_calendar = pd.DataFrame(index=df.index)
    df_calendar['year'] = df.index.year
    df_calendar['month'] = df.index.month
    df_calendar['dayofweek'] = df.index.dayofweek
    df_calendar['day'] = df.index.day
    df_calendar['quarter'] = df.index.quarter

    # Rolling means; each is NaN until its window is full.
    df_moving_averages = pd.DataFrame(index=df.index)
    for ma in moving_averages:
        df_moving_averages[f'ma_{ma}'] = df['y'].rolling(ma).mean()

    # Fourier features: sktime's FourierFeatures takes a list of seasonal
    # periods and a matching list with the number of terms per period.
    fourier_transformer = FourierFeatures(sp_list=[sp], fourier_terms_list=[fourier_terms])
    df_fourier = fourier_transformer.fit_transform(df[['y']])
    df_fourier.index = df.index
    # Keep only the generated terms in case the transformer echoes the input column.
    df_fourier = df_fourier.drop(columns=['y'], errors='ignore')

    # Detrended series, using sktime's default trend model for Detrender.
    detrender = Detrender()
    df_detrended = detrender.fit_transform(df[['y']])
    df_detrended.index = df.index
    df_detrended.columns = ['y_detrended']

    # Exponential moving average computed with pandas.
    df_ema = df[['y']].ewm(span=ema_span, adjust=False).mean()
    df_ema.columns = [f'ema_{ema_span}']

    # Deseasonalized series.
    deseasonalizer = Deseasonalizer(sp=sp)
    df_deseasonalized = deseasonalizer.fit_transform(df[['y']])
    df_deseasonalized.index = df.index
    df_deseasonalized.columns = ['y_deseasonalized']

    # Concatenate all feature frames; every derived column has a distinct name,
    # so X does not end up with several columns all called 'y'.
    X = pd.concat([df_calendar, df_moving_averages, df_fourier, df_detrended, df_deseasonalized, df_ema], axis=1)

    # Drop rows with missing values (incomplete rolling windows at the start).
    X.dropna(inplace=True)

    # Re-code the calendar columns as consecutive integers starting at 0.
    le = LabelEncoder()
    for column in ('year', 'dayofweek', 'month', 'quarter', 'day'):
        X[column] = le.fit_transform(X[column])

    # Keep only the target rows that survived dropna() so X and y line up.
    # NOTE(review): several features are transforms of 'y' at the same
    # timestamp -- confirm 'y' is the observed series and not the label itself.
    y = df.loc[X.index, 'y']
    return X, y


Write code to classify time series using the LightGBM classifier, the XGBoost classifier, and the Histogram Gradient Boosting classifier.

This example assumes that you have already preprocessed your time series data and converted it into a suitable format for these models, such as a feature matrix X and a target vector y.

In [None]:

 # Split the data into train and test sets
# shuffle=False keeps the rows in their original (chronological) order, so the
# test set is the most recent 20% and no future rows leak into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Encode the class labels as integers 0..n_classes-1; recent XGBoost releases
# expect integer-coded labels. The encoder is fitted on all labels so that the
# train and test parts share one mapping.
label_encoder = LabelEncoder().fit(y)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# LightGBM Classifier
# Imported here because the top-of-file lightgbm import is commented out.
import lightgbm as lgb

# No hard-coded objective: the scikit-learn wrappers choose a binary or
# multiclass objective from the labels, so two-class targets work as well.
lgb_clf = lgb.LGBMClassifier()
lgb_clf.fit(X_train, y_train)
y_pred_lgb = lgb_clf.predict(X_test)
print('LightGBM accuracy:', accuracy_score(y_test, y_pred_lgb))

# XGBoost Classifier
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)
print('XGBoost accuracy:', accuracy_score(y_test, y_pred_xgb))

# Histogram Gradient Boosting Classifier
hgb_clf = HistGradientBoostingClassifier()
hgb_clf.fit(X_train, y_train)
y_pred_hgb = hgb_clf.predict(X_test)
print('HistGradientBoostingClassifier accuracy:', accuracy_score(y_test, y_pred_hgb))


In [None]:
import pandas as pd
import numpy as np

# Expected input: df with a datetime index, a numeric 'value' column holding
# the series, and a categorical 'target' column holding the class labels.

# Rolling means of the series over 7 and 30 observations
for window in (7, 30):
    df[f'ma{window}'] = df['value'].rolling(window=window).mean()

# One yearly sine/cosine pair derived from the day of the year
day_of_year = df.index.dayofyear
angle = 2 * np.pi * day_of_year / 365
df['fourier_sin'] = np.sin(angle)
df['fourier_cos'] = np.cos(angle)

# Calendar features read directly from the datetime index
for column, attribute in (
    ('day_of_week', 'dayofweek'),
    ('day_of_year', 'dayofyear'),
    ('month', 'month'),
    ('quarter', 'quarter'),
    ('year', 'year'),
):
    df[column] = getattr(df.index, attribute)

# Remove rows that contain missing values (e.g. incomplete rolling windows)
df = df.dropna()

# Features are every column except the raw series and the target
X = df.drop(columns=['value', 'target'])

# Target as integer category codes
y = pd.Categorical(df['target']).codes