In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from feature_engine.outliers import Winsorizer
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.stattools import pacf
from statsmodels.graphics.tsaplots import plot_pacf
from feature_engine.encoding import OneHotEncoder


In [None]:
timestamp = "Month"
target = "Passengers"
df = pd.read_csv(r"C:\Users\Omar Anwar\Desktop\airline-passengers.csv" , index_col = timestamp)


df["shifted_1"] = df[target].shift(1)
df.dropna(inplace=True)

print(df.head())
# Assuming you have X and y as your input data and target variable
train = df[:int(len(df)*0.8)]
test = df[int(len(df)*0.8):]

In [46]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from scipy.fft import fft, fftfreq
from sklearn.pipeline import Pipeline

class FourierSeriesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, power_threshold=2, length_threshold=30, num_terms=3):
        self.power_threshold = power_threshold
        self.length_threshold = length_threshold
        self.num_terms = num_terms
        self.significant_periods = None
        self.feature_columns = None
        self.fourier_series_features = None

    def fourier_series_features_creator(self, time_series , time , n):
        
        # Generate Fourier series features
        self.fourier_series_features = np.zeros((n, self.num_terms * len(self.significant_periods) * 2))
        for i, period in enumerate(self.significant_periods):
            for j in range(self.num_terms):
                coefficient_sin = np.sin(2 * np.pi * (j + 1) * time / period)
                coefficient_cos = np.cos(2 * np.pi * (j + 1) * time / period)
                self.fourier_series_features[:, i * self.num_terms + j] = coefficient_sin
                self.fourier_series_features[:, i * self.num_terms + j + self.num_terms] = coefficient_cos


    def fit(self, X, y=None):

        # Assuming you have an existing DataFrame named X
        time_series = X[target]
        time = X[target].rank(method='dense').astype(int)

        # Linear detrend
        detrended_series = time_series.values.astype("float") - np.polyval(np.polyfit(time, time_series.values.astype("float"), 1), time)
        
        # Compute the periodogram with boxcar window and spectrum scaling
        n = len(detrended_series)
        power_spectrum = np.abs(fft(detrended_series * np.hanning(n)))**2 / n

        # Apply power threshold and length threshold
        significant_indices = np.where(power_spectrum > self.power_threshold)[0][1:]
        significant_periods = 1 / fftfreq(n)[significant_indices]
        self.significant_periods = significant_periods[np.where(significant_periods > self.length_threshold)[0]]

        self.fourier_series_features_creator(time_series, time, n)

        # Create DataFrame for Fourier series features
        self.feature_columns = [f'Feature_{i+1}' for i in range(self.fourier_series_features.shape[1])]
        self.fourier_df = pd.DataFrame(self.fourier_series_features, columns=self.feature_columns)
        
        return self

    def transform(self, X):
        # Assuming you have an existing DataFrame named X
        time_series = X[target]
        time = X[target].rank(method='dense').astype(int)

        self.fourier_series_features_creator(time_series, time ,len(X))

        # Create DataFrame for Fourier series features
        fourier_df = pd.DataFrame(self.fourier_series_features, columns=self.feature_columns ).set_index(X.index)

        # Concatenate the input DataFrame and Fourier series DataFrame
        combined_df = pd.concat([X, fourier_df], axis=1)
        return combined_df

In [47]:
class StationarityCheckTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.diff=[]

    def fit(self, X, y=None):
        data = X.copy()
        self.columns=data.select_dtypes(include="number").columns.tolist()
        for column in self.columns:
          trial=0
          gg = data[column].values.tolist()
          cleanedList = [x for x in gg if str(x) != 'nan']
          result = adfuller(cleanedList)

          # Extract the p-value from the test result
          p_value = result[1]

          for i in range(1, 4):
            # print(column,trial,p_value)
            if p_value < 0.05:
                self.diff.append(trial)
                break
            else:
              if trial==2:
                self.diff.append(0)
                break
              data[column] = data[column].diff(1)
              gg = data[column].values.tolist()
              cleanedList = [x for x in gg if str(x) != 'nan']
              result = adfuller(cleanedList)
              p_value = result[1]
              trial+=1
        return self

    def transform(self, X):
        data = X.copy()
        # print(self.diff,self.columns)
        for i in range(len(self.columns)):
          data[f"{self.columns[i]}_found"]=data[self.columns[i]]
          for j in range(self.diff[i]):
            data[f"{self.columns[i]}_found"] = data[f"{self.columns[i]}_found"].diff(1)      
        return data

In [49]:
class RemoveConstantFeatures(BaseEstimator, TransformerMixin):
    def __init__(self) :
        self.columns_to_drop=[]
    def fit(self, X, y=None):
        # Check if any column has the same value for all rows
        for column in X.columns:
            if X[column].nunique() == 1:
                self.columns_to_drop.append(column)
        return self
    def transform(self, X):
        X = X.drop(columns=self.columns_to_drop)
        return X

In [50]:
class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoder = None
        self.categorical_cols = []

    def fit(self, X, y=None):
        # Identify categorical columns
        self.categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
        if self.categorical_cols:
            self.encoder = OneHotEncoder(variables=self.categorical_cols).fit(X)
        return self

    def transform(self, X):
        if self.categorical_cols:
            X = self.encoder.transform(X)
        return X

In [51]:
from feature_engine.datetime import DatetimeFeatures

In [52]:
class LagFeatureExtractor(BaseEstimator, TransformerMixin):
    
    def __init__(self, max_lag=30,alpha=0.05):
        self.max_lag = max_lag
        self.alpha = alpha
        self.significant_lags = []
        self.sig_fet = {}
    
    def fit(self, X, y=None):
        cols = [col for col in X.columns if 'found' in col]
        for col in cols:
            self.significant_lags = []
            m = 0
            pacf_values, conf_interval = pacf(X[col], nlags=self.max_lag, method='ols', alpha=self.alpha)

            result = np.zeros((len(pacf_values), 2))
            result[:, 0] = conf_interval[:len(pacf_values), 0] - pacf_values
            result[:, 1] = conf_interval[:len(pacf_values), 1] - pacf_values 

            new = result[1]
            for i in range(len(pacf_values)): 
                if i == 0:
                    continue
                if pacf_values[i] < new[0] or pacf_values[i] > new[1]:
                    self.significant_lags.append(i)
                    self.sig_fet[col] = self.significant_lags
                    
                    m = 0
                else:
                    m += 1
                    if m == 3:
                        break
                    
        return self
    
    def transform(self, X, y=None):
        for key, value in self.sig_fet.items():
            for i in value:
                lag_column = X[key].shift(i)
                new_column_name = f"lag_{key}_{i}"
                X[new_column_name] = lag_column
        return X


In [53]:
class DropNansTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.dropna()
        return X

In [54]:
features_pipeline = make_pipeline(
    StationarityCheckTransformer(), 
    FourierSeriesTransformer(),
    CustomOneHotEncoder(),
    DatetimeFeatures(variables="index",features_to_extract = ["hour", "day_of_week", "month"]),
    DropNansTransformer(),
    LagFeatureExtractor(),
    DropNansTransformer(),
    RemoveConstantFeatures(),
    )

In [55]:
features_pipeline.fit_transform(train)

Unnamed: 0_level_0,Passengers,shifted_1,Passengers_found,shifted_1_found,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,...,Feature_11,Feature_12,day_of_week,month,lag_Passengers_found_1,lag_Passengers_found_2,lag_Passengers_found_4,lag_shifted_1_found_1,lag_shifted_1_found_2,lag_shifted_1_found_4
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1949-08,148,148.0,-13.0,-1.0,0.837166,0.915773,0.164595,0.915773,-0.735724,-0.324699,...,0.945817,-0.879474,0,8,-1.0,22.0,-17.0,22.0,-5.0,8.0
1949-09,136,148.0,-12.0,-13.0,0.656752,0.990522,0.837166,0.990522,0.272103,-0.915773,...,-0.401695,0.986361,3,9,-13.0,-1.0,-5.0,-1.0,22.0,-17.0
1949-10,119,136.0,-5.0,-12.0,0.272103,0.523673,0.735724,0.523673,0.892254,0.996584,...,-0.082579,-0.789141,5,10,-12.0,-13.0,22.0,-13.0,-1.0,-5.0
1949-11,104,119.0,2.0,-5.0,0.055088,0.110008,0.164595,0.110008,0.218681,0.324699,...,0.945817,0.879474,1,11,-5.0,-12.0,-1.0,-12.0,-13.0,22.0
1949-12,118,104.0,29.0,2.0,0.218681,0.426776,0.614213,0.426776,0.771917,0.969400,...,0.245485,-0.401695,3,12,2.0,-5.0,-13.0,-5.0,-12.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1958-03,362,318.0,66.0,-26.0,-0.981451,0.376309,0.837166,0.376309,-0.697297,0.915773,...,-0.401695,-0.986361,5,3,-26.0,-27.0,15.0,-27.0,73.0,6.0
1958-04,348,362.0,-58.0,66.0,-0.936511,0.656752,0.475947,0.656752,-0.990522,0.837166,...,0.546948,0.082579,1,4,66.0,-26.0,73.0,-26.0,-27.0,15.0
1958-05,363,348.0,29.0,-58.0,-0.990522,0.272103,0.915773,0.272103,-0.523673,0.735724,...,-0.677282,-0.945817,3,5,-58.0,66.0,-27.0,66.0,-26.0,73.0
1958-06,435,363.0,57.0,29.0,-0.969400,-0.475947,0.735724,-0.475947,0.837166,-0.996584,...,-0.082579,0.789141,6,6,29.0,-58.0,-26.0,-58.0,66.0,-27.0


In [56]:
features_pipeline.transform(test)

Unnamed: 0_level_0,Passengers,shifted_1,Passengers_found,shifted_1_found,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,...,Feature_11,Feature_12,day_of_week,month,lag_Passengers_found_1,lag_Passengers_found_2,lag_Passengers_found_4,lag_shifted_1_found_1,lag_shifted_1_found_2,lag_shifted_1_found_4
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1959-02,342,360.0,-41.0,-4.0,0.164595,0.324699,0.4759474,0.324699,0.614213,0.8371665,...,0.546948,0.082579,6,2,-4.0,76.0,56.0,76.0,-4.0,-115.0
1959-03,406,342.0,82.0,-41.0,0.614213,0.9694,0.9157733,0.9694,0.475947,-0.7357239,...,-0.677282,0.945817,6,3,-41.0,-4.0,-4.0,-4.0,76.0,56.0
1959-04,396,406.0,-74.0,82.0,0.475947,0.837166,0.9965845,0.837166,0.915773,0.1645946,...,-0.986361,-0.245485,2,4,82.0,-41.0,76.0,-41.0,-4.0,-4.0
1959-05,420,396.0,34.0,-74.0,0.771917,0.981451,0.4759474,0.981451,-0.376309,-0.8371665,...,0.546948,-0.082579,4,5,-74.0,82.0,-4.0,82.0,-41.0,76.0
1959-06,472,420.0,28.0,34.0,0.892254,0.805765,-0.1645946,0.805765,-0.954405,0.3246995,...,0.945817,-0.879474,0,6,34.0,-74.0,-41.0,-74.0,82.0,-4.0
1959-07,548,472.0,24.0,28.0,0.9694,0.475947,-0.7357239,0.475947,-0.837166,0.9965845,...,-0.082579,0.789141,2,7,28.0,34.0,82.0,34.0,-74.0,-41.0
1959-08,559,548.0,-65.0,24.0,0.981451,0.376309,-0.8371665,0.376309,-0.697297,0.9157733,...,-0.401695,0.986361,5,8,24.0,28.0,-74.0,28.0,34.0,82.0
1959-09,463,559.0,-107.0,-65.0,0.866025,0.866025,1.224647e-16,0.866025,-0.866025,-2.449294e-16,...,1.0,-1.0,1,9,-65.0,24.0,34.0,24.0,28.0,-74.0
1959-10,407,463.0,40.0,-107.0,0.656752,0.990522,0.8371665,0.990522,0.272103,-0.9157733,...,-0.401695,0.986361,3,10,-107.0,-65.0,28.0,-65.0,24.0,34.0
1959-11,362,407.0,11.0,40.0,0.324699,0.614213,0.8371665,0.614213,0.9694,0.9157733,...,-0.401695,-0.986361,6,11,40.0,-107.0,24.0,-107.0,-65.0,28.0
