In [2]:
import numpy as np
import pandas as pd
import sweetviz as sv

In [3]:
df = pd.read_csv("Data/df.csv")

In [7]:
df

Unnamed: 0,date,hour,france,italy,belgium,spain,uk,germany
0,2022/01/01,00:00 - 01:00,89.06,170.28,82.02,145.86,0.27,50.05
1,2022/01/01,01:00 - 02:00,78.48,155.72,67.07,114.90,-0.01,41.33
2,2022/01/01,02:00 - 03:00,85.16,147.09,75.11,113.87,0.27,43.22
3,2022/01/01,03:00 - 04:00,50.00,91.00,50.91,97.80,-0.01,45.46
4,2022/01/01,04:00 - 05:00,37.67,104.00,37.67,97.80,-0.01,37.67
...,...,...,...,...,...,...,...,...
8756,2022/12/31,19:00 - 20:00,18.11,333.00,22.09,18.11,228.00,-1.01
8757,2022/12/31,20:00 - 21:00,7.60,310.00,10.55,7.60,204.38,-1.39
8758,2022/12/31,21:00 - 22:00,3.69,270.00,5.22,3.69,175.23,-1.04
8759,2022/12/31,22:00 - 23:00,1.88,217.78,2.39,1.88,177.39,-1.07


In [4]:
def load_data(input_path: str) -> pd.DataFrame:
    """
    This functuion loads the data from the input path
    and returns it as a pandas dataframe.
    args:
        input_path: str, the path to the input file
    returns:
        df: pd.DataFrame, the dataframe of the data
    """
    return pd.read_csv(input_path)
    

In [5]:
def set_data_type(df: pd.DataFrame) -> pd.DataFrame:
    """
    Set the data type of the dataframe
    args:
        df: pd.DataFrame
            it is the dataframe of the csv file
    return:
        df: pd.DataFrame
    """
    df["timestamp"]=pd.to_datetime(df["timestamp"] )
    df["spain"]=df["spain"].astype("float")
    return df
    
    
    

In [6]:
def combine_date_time(df: pd.DataFrame) -> pd.DataFrame:
    """
    Combine the date and time column into one column timestamp
    args:
        df: pd.DataFrame, it is the dataframe of the csv file
    return:
        df: pd.DataFrame, it is the dataframe of the csv file
    """
    df['hour'] = df['hour'].str.split(' - ').str[0]
    df['timestamp'] = (df['date'] + ' ' + df['hour'])
    return df
    
    

In [531]:
df[df.is_holiday==True]

Unnamed: 0,spain,day,month,year,hour,day_of_week,week_of_year,day_of_year,is_weekend,is_month_beginning,...,lag_diff_3,lag_diff_6,lag_diff_12,lag_diff_24,lag_diff_48,lag_diff_96,lag_diff_192,lag_diff_384,cumulative_sum,cumulative_mean
2184,95.90,18,4,2022,0,0,16,108,0,0,...,-93.69,-4.00,45.82,-64.32,-126.53,-136.55,-186.58,-200.52,580354.78,225.906882
2185,81.00,18,4,2022,1,0,16,108,0,0,...,-90.46,-99.51,55.79,-58.97,-110.00,-126.63,-185.30,-206.33,580435.78,225.850498
2186,84.00,18,4,2022,2,0,16,108,0,0,...,-74.07,-107.54,73.00,-44.46,-94.09,-116.14,-186.93,-196.00,580519.78,225.795325
2187,81.00,18,4,2022,3,0,16,108,0,0,...,-14.90,-108.59,77.30,-42.38,-103.99,-117.05,-175.79,-179.54,580600.78,225.739028
2188,85.25,18,4,2022,4,0,16,108,0,0,...,4.25,-86.21,70.25,-48.71,-99.74,-114.72,-161.64,-173.45,580686.03,225.684427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8228,46.24,25,12,2022,19,6,51,359,1,0,...,42.12,42.13,42.13,-24.86,-18.77,-47.40,-106.81,-124.18,1461463.14,169.681080
8229,50.14,25,12,2022,20,6,51,359,1,0,...,27.44,46.03,46.03,-8.55,-17.76,-45.62,-102.76,-116.70,1461513.28,169.667202
8230,50.26,25,12,2022,21,6,51,359,1,0,...,10.26,46.15,46.23,5.26,-9.74,-39.74,-94.54,-108.74,1461563.54,169.653342
8231,50.17,25,12,2022,22,6,51,359,1,0,...,3.93,46.05,48.17,27.57,7.36,-18.73,-81.13,-84.83,1461613.71,169.639474


Unnamed: 0,spain,day,month,year,hour,day_of_week,week_of_year,day_of_year,is_weekend,is_month_beginning,...,lag_diff_3,lag_diff_6,lag_diff_12,lag_diff_24,lag_diff_48,lag_diff_96,lag_diff_192,lag_diff_384,cumulative_sum,cumulative_mean
3960,147.64,1,7,2022,0,4,26,182,0,1,...,-8.36,48.28,20.63,8.31,-19.37,5.21,-38.86,-46.43,894817.50,205.941887
3961,146.48,1,7,2022,1,4,26,182,0,1,...,-12.12,31.78,25.08,9.61,-10.52,16.94,-24.31,-37.02,894963.98,205.928205
3962,143.70,1,7,2022,2,4,26,182,0,1,...,-6.30,3.89,27.70,6.83,-1.30,23.70,-9.71,-33.97,895107.68,205.913890
3963,142.40,1,7,2022,3,4,26,182,0,1,...,-5.24,-13.60,31.80,2.40,-2.60,27.15,-9.54,-30.62,895250.08,205.899282
3964,144.96,1,7,2022,4,4,26,182,0,1,...,-1.52,-13.64,44.55,3.58,-1.04,24.46,-6.88,-28.22,895395.04,205.885270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4699,117.48,31,7,2022,19,6,30,212,1,0,...,15.50,22.48,10.49,-25.30,-15.53,-34.26,-42.52,-32.51,1000143.49,196.723739
4700,152.16,31,7,2022,20,6,30,212,1,0,...,52.18,49.16,57.16,4.16,-1.17,-7.98,-26.13,-22.84,1000295.65,196.714975
4701,165.00,31,7,2022,21,6,30,212,1,0,...,56.83,60.52,74.13,15.71,8.07,0.80,-50.00,-19.01,1000460.65,196.708740
4702,183.03,31,7,2022,22,6,30,212,1,0,...,65.55,81.05,103.03,33.03,32.74,20.77,-38.07,2.97,1000643.68,196.706051


In [265]:
df.head()

Unnamed: 0,timestamp,spain
0,2022-01-01 00:00:00,145.86
1,2022-01-01 01:00:00,114.9
2,2022-01-01 02:00:00,113.87
3,2022-01-01 03:00:00,97.8
4,2022-01-01 04:00:00,97.8


In [74]:
df.columns

Index(['france', 'italy', 'belgium', 'spain', 'uk', 'germany', 'timestamp'], dtype='object')

In [8]:
def drop_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop all columns except the timestamp and spain
    args:
        df: pd.DataFrame, it is the dataframe of the csv file
    return:
        df: pd.DataFrame, it is the dataframe of the csv file
    """
    return df[["timestamp", "spain"]]
    

In [245]:
df

Unnamed: 0,timestamp,spain
0,2022-01-01 00:00:00,145.86
1,2022-01-01 01:00:00,114.90
2,2022-01-01 02:00:00,113.87
3,2022-01-01 03:00:00,97.80
4,2022-01-01 04:00:00,97.80
...,...,...
8756,2022-12-31 19:00:00,18.11
8757,2022-12-31 20:00:00,7.60
8758,2022-12-31 21:00:00,3.69
8759,2022-12-31 22:00:00,1.88


In [9]:
def split_train_test(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    split the data in two diffrent dataset one for train and the other
    for test. i want the split ratio to 0.8 for train and 0.2 for test
    args:
        df: pd.DateFrame
    return:
        tr_df: pd.DateFrame, the train dataframes
        te_df: pd.DateFrame, the test dataframes
    """
    train_size = int(len(df) * 0.8)
    train_df = df[:train_size]
    test_df = df[train_size:].reset_index()
    return train_df, test_df

In [246]:
train_df, test_df = split_train_test(df)

In [247]:
train_df

Unnamed: 0,timestamp,spain
0,2022-01-01 00:00:00,145.86
1,2022-01-01 01:00:00,114.90
2,2022-01-01 02:00:00,113.87
3,2022-01-01 03:00:00,97.80
4,2022-01-01 04:00:00,97.80
...,...,...
7003,2022-10-19 19:00:00,117.00
7004,2022-10-19 20:00:00,136.74
7005,2022-10-19 21:00:00,125.04
7006,2022-10-19 22:00:00,110.00


In [248]:
test_df

Unnamed: 0,index,timestamp,spain
0,7008,2022-10-20 00:00:00,80.00
1,7009,2022-10-20 01:00:00,72.34
2,7010,2022-10-20 02:00:00,68.74
3,7011,2022-10-20 03:00:00,65.00
4,7012,2022-10-20 04:00:00,66.74
...,...,...,...
1748,8756,2022-12-31 19:00:00,18.11
1749,8757,2022-12-31 20:00:00,7.60
1750,8758,2022-12-31 21:00:00,3.69
1751,8759,2022-12-31 22:00:00,1.88


In [10]:
def calculate_holiday_distances(df: pd.DataFrame, holidays: list) -> pd.DataFrame:
    """
    Calculate days to next holiday and days since last holiday.
    Holidays are given as (month, day) tuples.
    """

    # Create a list of holidays as day-of-year
    holiday_days = [pd.Timestamp(month=month, day=day, year=2000).dayofyear for month, day in holidays]

    

    days_to_next = []
    days_since_last = []

    for doy in df['day_of_year']:
        future_holidays = [h - doy for h in holiday_days if h - doy >= 0]
        past_holidays = [doy - h for h in holiday_days if doy - h >= 0]

        days_to_next.append(min(future_holidays) if future_holidays else (365 - doy + min(holiday_days)))
        days_since_last.append(min(past_holidays) if past_holidays else (doy + (365 - max(holiday_days))))

    # Assign back to dataframe
    df['days_to_next_holiday'] = days_to_next
    df['days_since_last_holiday'] = days_since_last

    return df


In [11]:
x = calculate_holiday_distances(df, [
        (1, 1),   # January 1 - New Year's Day
        (1, 6),   # January 6 - Epiphany
        (4, 18),  # April 18 - Good Friday (specific to the year, so this is just an example)
        (5, 1),   # May 1 - Labour Day
        (8, 15),  # August 15 - Assumption of the Virgin Mary
        (10, 12), # October 12 - National Day of Spain
        (11, 1),  # November 1 - All Saints' Day
        (12, 6),  # December 6 - Constitution Day
        (12, 8),  # December 8 - Immaculate Conception
        (12, 25)  # December 25 - Christmas Day
    ])

KeyError: 'day_of_year'

In [251]:
x = df.copy()
x

Unnamed: 0,timestamp,spain
0,2022-01-01 00:00:00,145.86
1,2022-01-01 01:00:00,114.90
2,2022-01-01 02:00:00,113.87
3,2022-01-01 03:00:00,97.80
4,2022-01-01 04:00:00,97.80
...,...,...
8756,2022-12-31 19:00:00,18.11
8757,2022-12-31 20:00:00,7.60
8758,2022-12-31 21:00:00,3.69
8759,2022-12-31 22:00:00,1.88


In [378]:

is_bad_weather()


In [484]:
# df["summer"] =np.where(df['month'] in [6,7,8], 1, 0)
# df["spring"]= np.where(df['month'] in [3,4,5], 1, 0)
# df["winter"] =np.where(df['month'] in [12,1,2], 1, 0)
# df["fall"] =np.where(df['month'] in [9,10,11], 1, 0)

In [12]:
bad_weather_days = [
    ("11", "12", # Storm (Valencia)
    ),
    ("9", "25", # Tropical Storm Hermine
    ),
    ("10", "", # Storm
    ),
    ("12", "3", # Extreme Cold
    ),
    ("12", "4", # Extreme Cold
    ),
    ("12", "5", # Extreme Cold
    ),
    ("7", "", # Wildfires
    )
]
bad_days = set((int(month), int(day)) for month, day in bad_weather_days if day != "")
bad_full_months = set(int(month) for month, day in bad_weather_days if day == "")



def add_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add date features to the dataframe
        - day
        - month
        - year
        - hour
        - day of week
        - week of year
        - is weekend
        - is holiday
        - which season
        - which quarter
        - is month begining
        - is month ending
        - day of the year
        - days to next holidays
        - days since last holidays
        - is good weather
        - is bad weather
        - 2_peak 1_shoulder 0_low season
        - day or night
        ...
    drop timestamp column
    args:
        df: pd.DataFrame, it is the dataframe of the csv file
    return:
        df: pd.DataFrame, it is the dataframe of the csv file
    """
    
    # Extract date and time components
    df['day'] = df['timestamp'].dt.day
    df['month'] = df['timestamp'].dt.month
    df['year'] = df['timestamp'].dt.year
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['week_of_year'] = df['timestamp'].dt.isocalendar().week.apply(int)
    df['day_of_year'] = df['timestamp'].dt.dayofyear
    df['is_weekend'] = df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
    df['is_month_beginning'] = df['timestamp'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['timestamp'].dt.is_month_end.astype(int)

    # Add seasons based on month 
    season_map = {1: 'Winter', 2: 'Winter', 3: 'Spring', 4: 'Spring', 5: 'Spring', 6: 'Summer', 
                  7: 'Summer', 8: 'Summer', 9: 'Fall', 10: 'Fall', 11: 'Fall', 12: 'Winter'}
    
#     df["summer"] np.where(df['month'] in [6,7,8], 1, 0)
#     df["spring"] np.where(df['month'] in [3,4,5], 1, 0)
#     df["winter"] np.where(df['month'] in [12,1,2], 1, 0)
#     df["fall"] np.where(df['month'] in [9,10,11], 1, 0)
    # represent each season occurring as 1, and 0 otherwise
    df["summer"]=df["month"].apply(lambda x: 1 if season_map[x]=="Summer" else 0)
    df["spring"]=df["month"].apply(lambda x: 1 if season_map[x]=="Spring" else 0)
    df["winter"]=df["month"].apply(lambda x: 1 if season_map[x]=="Winter" else 0)
    df["fall"]=df["month"].apply(lambda x: 1 if season_map[x]=="Fall" else 0)
    
    

    
    # Add quarters
    df['quarter'] = df['timestamp'].dt.quarter
    
    # Determine 'day or night' based on the hour, 1 for day, 0 for night
    df['day_or_night'] = df['hour'].apply(lambda x: 1 if 6 <= x < 18 else 0)

         

    
    
    # Seasonal categorization for '2_peak 1_shoulder 0_low season'
    df['season_category'] = df["month"].apply(lambda x: 2 if season_map[x]=="Summer" or  season_map[x]=="Spring"  else (1 if season_map[x]=="Fall" else 0))
    
    # List of holidays (using month/day format for all years)
    holidays = [
        (1, 1),   # January 1 - New Year's Day
        (1, 6),   # January 6 - Epiphany
        (4, 15),  # April 15 - Good Friday 
        (5, 1),   # May 1 - Labour Day
        (8, 15),  # August 15 - Assumption of the Virgin Mary
        (10, 12), # October 12 - National Day of Spain
        (11, 1),  # November 1 - All Saints' Day
        (12, 6),  # December 6 - Constitution Day
        (12, 8),  # December 8 - Immaculate Conception
        (12, 25)  # December 25 - Christmas Day
    ]
    #assign good and bad weather
   
    
    df['is_bad_weather'] = df.apply(
    lambda row: int((row['month'], row['day']) in bad_days or row['month'] in bad_full_months),
    axis=1
)
    df['is_good_weather'] = 1-df["is_bad_weather"]
    
    # Add holiday column (1 if holiday, 0 if not)
    df['is_holiday'] = df.apply(lambda row: 1 if (row['month'], row['day']) in holidays else 0, axis=1)

    df = calculate_holiday_distances(df,holidays )
    # Drop the 'timestamp' column
    df.drop(columns=['timestamp'], inplace=True)
    
    return df


In [270]:
df

Unnamed: 0,spain,day,month,year,hour,day_of_week,week_of_year,day_of_year,is_weekend,is_month_beginning,is_month_end,season,quarter,day_or_night,is_good_weather,is_bad_weather,season_category,is_holiday,days_to_next_holiday,days_since_last_holiday
0,145.86,1,1,2022,0,5,52,1,1,1,0,Winter,1,night,0,1,0,1,0,0
1,114.90,1,1,2022,1,5,52,1,1,1,0,Winter,1,night,0,1,0,1,0,0
2,113.87,1,1,2022,2,5,52,1,1,1,0,Winter,1,night,0,1,0,1,0,0
3,97.80,1,1,2022,3,5,52,1,1,1,0,Winter,1,night,0,1,0,1,0,0
4,97.80,1,1,2022,4,5,52,1,1,1,0,Winter,1,night,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8756,18.11,31,12,2022,19,5,52,365,1,0,1,Winter,4,night,0,1,0,0,1,5
8757,7.60,31,12,2022,20,5,52,365,1,0,1,Winter,4,night,0,1,0,0,1,5
8758,3.69,31,12,2022,21,5,52,365,1,0,1,Winter,4,night,0,1,0,0,1,5
8759,1.88,31,12,2022,22,5,52,365,1,0,1,Winter,4,night,0,1,0,0,1,5


In [13]:
def add_target_value_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add target value features to the dataframe
        - lags [1, 3, 6, 12, 24, 48, 96, 192, 384]
        - moving average [1, 3, 6, 12, 24, 48, 96, 192, 384]
        - exponential moving average [1, 3, 6, 12, 24, 48, 96, 192, 384]
        - Rolling Standard Deviation features [1, 3, 6, 12, 24, 48, 96, 192, 384]
        - lagged difference features [1, 3, 6, 12, 24, 48, 96, 192, 384]
        - cumulative sum
        - cumulative mean
    args:
        df: pd.DataFrame, it is the dataframe of the csv file
    return:
        df: pd.DataFrame, it is the dataframe of the csv file
    """
    # Add lag features
    for lag in [1, 3, 6, 12, 24, 48, 96, 192, 384]:
        df[f'lag_{lag}'] = df['spain'].shift(lag)
    # Add moving average features
    for window in [1, 3, 6, 12, 24, 48, 96, 192, 384]:
        df[f'moving_avg_{window}'] = df['spain'].rolling(window=window).mean()
    # Add exponential moving average features
    for window in [1, 3, 6, 12, 24, 48, 96, 192, 384]:
        df[f'exp_moving_avg_{window}'] = df['spain'].ewm(span=window, adjust=False).mean()
    # Add rolling standard deviation features
    for window in [3, 6, 12, 24, 48, 96, 192, 384]:
        df[f'rolling_std_{window}'] = df['spain'].rolling(window=window, min_periods=1).std()
    # Add lagged difference features
    for lag in [1, 3, 6, 12, 24, 48, 96, 192, 384]:
        df[f'lag_diff_{lag}'] = df['spain'].diff(lag)
    # Add cumulative sum feature
    df['cumulative_sum'] = df['spain'].cumsum()
    # Add cumulative mean feature
    df['cumulative_mean'] = df['spain'].expanding().mean()
    return df

In [14]:
def impute_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Impute missing values in the dataframe. The method of imputation
    is up to the developer to decide. Therefore the developer is Incentivised

    args:
        df: pd.DataFrame, a dataframe with missing values
    return:
        df: pd.DataFrame, a dataframe with no missing values
    """
    df['spain'] = df['spain'].ffill()
    df = df.dropna()
    df = df.reset_index(drop=True)
    return df

In [288]:
df = impute_missing_values(df)

In [408]:
df.head()

Unnamed: 0,index,spain,day,month,year,hour,day_of_week,week_of_year,day_of_year,is_weekend,...,lag_diff_3,lag_diff_6,lag_diff_12,lag_diff_24,lag_diff_48,lag_diff_96,lag_diff_192,lag_diff_384,cumulative_sum,cumulative_mean
0,384,252.53,17,1,2022,0,0,3,17,0,...,4.33,-4.92,47.32,26.61,30.52,62.53,143.35,106.67,72022.83,187.072286
1,385,242.23,17,1,2022,1,0,3,17,0,...,13.71,-23.13,47.21,26.23,27.31,53.51,152.62,127.33,72265.06,187.215181
2,386,232.25,17,1,2022,2,0,3,17,0,...,19.43,-24.88,40.35,24.63,22.64,44.38,152.01,118.38,72497.31,187.33155
3,387,215.01,17,1,2022,3,0,3,17,0,...,-37.52,-33.19,18.09,17.79,18.31,32.13,159.55,117.21,72712.32,187.402887
4,388,209.55,17,1,2022,4,0,3,17,0,...,-32.68,-18.97,8.53,22.85,16.51,22.9,163.73,111.75,72921.87,187.45982


In [342]:
def save_data(df: pd.DataFrame, path: str) -> int:
    """
    Save data to a csv file.
    
    args:
        df: pd.DataFrame
            it is the dataframe of the csv file
        path: str
            it is the path where the csv file will be saved
            
    return:
        0: int, if the data is saved successfully
        1: int, if the data is not saved successfully
    """
    try:
        # Save the dataframe to the provided path
        df.to_csv(path, index=False)  # index=False prevents writing row indices
        return 0  # Return 0 for success
    except Exception as e:
        print(f"Error saving file: {e}")
        return 1  # Return 1 for failure



In [15]:
def visualize_data(df: pd.DataFrame) -> int:
    """
    Visualize the data using Sweetviz to get to know more about it.
    
    args:
        df: pd.DataFrame
            It is the dataframe of the CSV file
            
    return:
        0: int, if the data is shown successfully
        1: int, if the data is not shown successfully
    """
    try:

        # analyzing the dataset
        report = sv.analyze(df, target_feat='spain')

        
        # Show the report in the browser
        
        report.show_html('sweetviz_report.html')  # Saves the report as an HTML file
        return 0  # Return 0 for success
    except Exception as e:
        print(f"Error visualizing data: {e}")
        return 1  # Return 1 for failure


In [480]:
df.columns

Index(['spain', 'day', 'month', 'year', 'hour', 'day_of_week', 'week_of_year',
       'day_of_year', 'is_weekend', 'is_month_beginning', 'is_month_end',
       'summer', 'spring', 'winter', 'fall', 'quarter', 'day_or_night',
       'is_good_weather', 'is_bad_weather', 'season_category', 'is_holiday',
       'days_to_next_holiday', 'days_since_last_holiday', 'lag_1', 'lag_3',
       'lag_6', 'lag_12', 'lag_24', 'lag_48', 'lag_96', 'lag_192', 'lag_384',
       'moving_avg_1', 'moving_avg_3', 'moving_avg_6', 'moving_avg_12',
       'moving_avg_24', 'moving_avg_48', 'moving_avg_96', 'moving_avg_192',
       'moving_avg_384', 'exp_moving_avg_1', 'exp_moving_avg_3',
       'exp_moving_avg_6', 'exp_moving_avg_12', 'exp_moving_avg_24',
       'exp_moving_avg_48', 'exp_moving_avg_96', 'exp_moving_avg_192',
       'exp_moving_avg_384', 'rolling_std_3', 'rolling_std_6',
       'rolling_std_12', 'rolling_std_24', 'rolling_std_48', 'rolling_std_96',
       'rolling_std_192', 'rolling_std_384', 'lag

In [472]:
df.week_of_year = df.week_of_year.apply(int)

In [473]:
df.week_of_year

0        3
1        3
2        3
3        3
4        3
        ..
6930    52
6931    52
6932    52
6933    52
6934    52
Name: week_of_year, Length: 6935, dtype: int64

0        3
1        3
2        3
3        3
4        3
        ..
6930    52
6931    52
6932    52
6933    52
6934    52
Name: week_of_year, Length: 6935, dtype: UInt32

In [568]:
visualize_data(df)

                                             |                                             | [  0%]   00:00 ->…

Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


0

In [17]:
df = load_data("Data/df.csv")

df = combine_date_time(df)
df = set_data_type(df)
df = drop_columns(df)
df = impute_missing_values(df)

df = add_date_features(df)
df = add_target_value_features(df)
df = impute_missing_values(df)

In [18]:
df

Unnamed: 0,spain,day,month,year,hour,day_of_week,week_of_year,day_of_year,is_weekend,is_month_beginning,...,lag_diff_3,lag_diff_6,lag_diff_12,lag_diff_24,lag_diff_48,lag_diff_96,lag_diff_192,lag_diff_384,cumulative_sum,cumulative_mean
0,252.53,17,1,2022,0,0,3,17,0,0,...,4.33,-4.92,47.32,26.61,30.52,62.53,143.35,106.67,72022.83,187.072286
1,242.23,17,1,2022,1,0,3,17,0,0,...,13.71,-23.13,47.21,26.23,27.31,53.51,152.62,127.33,72265.06,187.215181
2,232.25,17,1,2022,2,0,3,17,0,0,...,19.43,-24.88,40.35,24.63,22.64,44.38,152.01,118.38,72497.31,187.331550
3,215.01,17,1,2022,3,0,3,17,0,0,...,-37.52,-33.19,18.09,17.79,18.31,32.13,159.55,117.21,72712.32,187.402887
4,209.55,17,1,2022,4,0,3,17,0,0,...,-32.68,-18.97,8.53,22.85,16.51,22.90,163.73,111.75,72921.87,187.459820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8372,18.11,31,12,2022,19,5,52,365,1,0,...,16.61,17.95,18.00,-3.67,-14.39,-141.55,-46.90,-172.61,1467720.73,167.605428
8373,7.60,31,12,2022,20,5,52,365,1,0,...,3.49,6.60,6.60,-7.40,-23.97,-139.96,-60.30,-173.40,1467728.33,167.587158
8374,3.69,31,12,2022,21,5,52,365,1,0,...,-16.31,2.69,2.69,-0.43,-24.30,-125.11,-56.31,-143.25,1467732.02,167.568446
8375,1.88,31,12,2022,22,5,52,365,1,0,...,-16.23,0.38,1.48,-2.23,-20.08,-105.96,-40.93,-128.62,1467733.90,167.549532
