# In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import random
import numpy as np

# Seed Python's built-in `random` module so that draws made through it in this
# file are reproducible between runs. This does not seed NumPy's generator.
random.seed(42)

def scatter_plot(DataFrame):
    """Show a scatter plot of the 'Aggregate' column against the frame's index.

    Args:
        DataFrame: Frame with an 'Aggregate' column; its index is used for
            the x-axis.
    """
    _, axes = plt.subplots()
    # A marker size of 0.5 keeps the individual dots very small.
    axes.scatter(DataFrame.index, DataFrame['Aggregate'], s=.5)
    axes.set_xlabel('Time')
    axes.set_ylabel('Aggregate')
    plt.show()


def big_plot(dataframe):
    """Plot the frame with its default `.plot()` on a 20x20 figure, line width 1."""
    plot_options = {'linewidth': 1, 'figsize': (20, 20)}
    dataframe.plot(**plot_options)


def read_clean_house(number_of_house, number_of_rows, first_rows=True, drop_appliances=False,
                     data_dir='CLEAN_REFIT_081116', max_aggregate=10500):
    """Load part of a cleaned house CSV and remove extreme 'Aggregate' readings.

    Args:
        number_of_house: Number used to build the file name
            ``CLEAN_House{number_of_house}.csv``.
        number_of_rows: How many data rows to read.
        first_rows: If True, read the first ``number_of_rows`` rows; otherwise
            read the last ``number_of_rows`` rows of the file.
        drop_appliances: If True, drop the columns 'Appliance1' .. 'Appliance9'
            that are present.
        data_dir: Directory that contains the CSV files.
        max_aggregate: Rows whose 'Aggregate' is not strictly below this value
            are removed.

    Returns:
        A new DataFrame indexed by the parsed 'Time' column, without the
        'Unix' and 'Issues' columns.
    """
    filename = f'{data_dir}/CLEAN_House{number_of_house}.csv'

    if first_rows:
        house = pd.read_csv(filename, nrows=number_of_rows)
    else:
        # Count the physical lines (header included) so that everything except
        # the header and the last `number_of_rows` data lines can be skipped.
        with open(filename, 'r') as f:
            total_lines = sum(1 for _ in f)
        lines_to_skip = total_lines - number_of_rows
        # Line 0 is the header and must be kept; an empty range reads everything.
        house = pd.read_csv(filename, skiprows=range(1, lines_to_skip))

    house['Time'] = pd.to_datetime(house['Time'])
    house.set_index('Time', inplace=True)
    house.drop(columns=['Unix', 'Issues'], inplace=True)

    if drop_appliances:
        appliance_columns = [f'Appliance{i}' for i in range(1, 10)]
        house.drop(columns=[col for col in appliance_columns if col in house.columns], inplace=True)

    # Return an explicit copy so that callers can add columns to the result
    # without pandas warning about writing to a slice of `house`.
    return house[house['Aggregate'] < max_aggregate].copy()


def _relative_change(value, reference):
    """Return |value - reference| relative to |reference| without dividing by zero."""
    difference = abs(value - reference)
    if reference == 0:
        # Staying at zero is "no change"; any move away from zero is an
        # unbounded relative change. A NaN difference is passed through.
        return float('inf') if difference > 0 else difference
    return difference / abs(reference)


def process_sequences(dataframe, threshold=0.05):
    """Collapse runs of similar consecutive 'Aggregate' values into plateaus.

    Rows are scanned in order. A new run starts whenever a value differs from
    the immediately preceding value by more than ``threshold`` (relative to
    that preceding value). Every run is then represented by its mean, emitted
    once at the run's first timestamp and once at its last timestamp.

    Args:
        dataframe: Frame with an 'Aggregate' column; its index supplies the
            timestamps.
        threshold: Relative change above which a new run is started.

    Returns:
        A DataFrame indexed by 'Time' (sorted, duplicate timestamps removed)
        with a single 'Aggregate' column holding the run means.
    """
    starts, ends, means = [], [], []
    run_start = run_end = previous = None
    run_total = 0
    run_count = 0

    # Iterating the index and the column directly avoids building a Series
    # per row, which is what `iterrows` does.
    for timestamp, value in zip(dataframe.index, dataframe['Aggregate']):
        if run_count and _relative_change(value, previous) > threshold:
            # Close the current run at the previous row's timestamp.
            starts.append(run_start)
            ends.append(run_end)
            means.append(run_total / run_count)
            run_total, run_count = 0, 0

        if not run_count:
            run_start = timestamp
        run_total += value
        run_count += 1
        previous = value
        run_end = timestamp

    # Close the run that is still open after the last row.
    if run_count:
        starts.append(run_start)
        ends.append(run_end)
        means.append(run_total / run_count)

    # One row per run start followed by one row per run end, same mean for both.
    result = pd.DataFrame({'Time': starts + ends, 'Aggregate': means + means})
    result = result.sort_values(by='Time').set_index('Time')

    # A single-row run has identical start and end timestamps; keep it once.
    return result[~result.index.duplicated(keep='first')]


def correlation_between_original_and_processed(original, processed,
                                                start='2013-11-29 06:00:00',
                                                end='2013-11-29 9:00:00'):
    """Correlate original 'Aggregate' readings with a processed version.

    The original frame is cut to the window ``start``..``end``. The processed
    frame is reindexed onto the window's timestamps with forward fill, so each
    original row is compared with the most recent processed value.

    Args:
        original: Frame with an 'Aggregate' column, sliceable by ``start`` and
            ``end`` on its index.
        processed: Frame with an 'Aggregate' column to compare against.
        start: First label of the comparison window.
        end: Last label of the comparison window.

    Returns:
        The string ``"Correlation coefficient: <value>"``, which is also printed.
    """
    window = original.loc[start:end]
    aligned = processed.reindex(window.index, method='ffill')
    correlation = window['Aggregate'].corr(aligned['Aggregate'])

    message = f"Correlation coefficient: {correlation}"
    print(message)
    return message


def cumulate_consumption(dataframe):
    """Add 'Energy' and 'Cumulative Energy' columns to ``dataframe`` in place.

    Each row is taken to last until one second before the next row's
    timestamp. The last row has no successor and reuses the previous row's
    duration. 'Energy' is 'Aggregate' multiplied by that duration in hours,
    and 'Cumulative Energy' is its running sum.

    Args:
        dataframe: Frame with a datetime index and an 'Aggregate' column.
            NOTE(review): if 'Aggregate' is in watts, 'Energy' is in
            watt-hours — confirm the unit against the data source.

    Returns:
        The same ``dataframe`` object, with the two columns added.
    """
    timestamps = dataframe.index.to_series()
    end_times = timestamps.shift(-1) - pd.Timedelta(seconds=1)

    # `.ffill()` replaces the deprecated `fillna(method='ffill')`; it gives the
    # last row (which has no successor) the duration of the row before it.
    durations_seconds = (end_times - timestamps).dt.total_seconds().ffill()
    durations_hours = durations_seconds / 3600

    dataframe['Energy'] = dataframe['Aggregate'] * durations_hours
    dataframe['Cumulative Energy'] = dataframe['Energy'].cumsum()

    return dataframe


def thirty_min_steps_and_explicit_time_features(house: pd.DataFrame):
    """Resample to 30-minute means and add calendar feature columns.

    Args:
        house: Frame with a datetime index.

    Returns:
        A new frame with one row per 30-minute bin (mean of each column) plus
        the columns 'hour', 'day_of_week' (Monday=0, Sunday=6), 'month' and
        'is_weekend' (1 for Saturday/Sunday, else 0). Rows containing NaN,
        such as bins with no readings, are dropped.
    """
    # '30min' is used instead of '30T': the 'T' alias is deprecated in recent
    # pandas releases, while 'min' is accepted by old and new versions.
    resampled = house.resample('30min').mean()
    resampled['hour'] = resampled.index.hour
    resampled['day_of_week'] = resampled.index.dayofweek  # Monday=0, Sunday=6
    resampled['month'] = resampled.index.month
    resampled['is_weekend'] = resampled['day_of_week'].isin([5, 6]).astype(int)
    resampled.dropna(inplace=True)
    return resampled

def standardizing_and_sequencing(resampled_and_feature_engineered_house: pd.DataFrame, scaler=None,
                                 sequence_length=20, num_sequences=3000, horizon=6):
    """Standardize 'Aggregate', add future-value columns and sample windows.

    The input frame is modified in place: 'Aggregate' is replaced by
    'Aggregate_standardized', and columns 'Aggregate_t+1' ..
    'Aggregate_t+{horizon}' are added holding the standardized value 1 ..
    ``horizon`` rows ahead. The last ``horizon`` rows therefore contain NaN in
    some of these columns, and so may windows that end there.

    Args:
        resampled_and_feature_engineered_house: Frame with an 'Aggregate'
            column.
        scaler: Object with a ``fit_transform`` method. A fresh
            ``StandardScaler`` is created per call when omitted, so scalers
            returned by different calls never share fitted state.
        sequence_length: Number of consecutive rows per window (the default 20
            is 10 hours when each row covers 30 minutes).
        num_sequences: Maximum number of windows to draw.
        horizon: Number of look-ahead columns to add.

    Returns:
        ``(scaler, sequences)`` where ``sequences`` is a list of row slices of
        the modified frame, starting at distinct random positions drawn with
        the ``random`` module.

    Raises:
        ValueError: If the frame has fewer rows than ``sequence_length``.
    """
    frame = resampled_and_feature_engineered_house

    # Validate before touching the frame so a failed call leaves it unchanged.
    max_start_pos = len(frame) - sequence_length
    if max_start_pos < 0:
        raise ValueError(
            f'need at least {sequence_length} rows to build a sequence, got {len(frame)}'
        )

    if scaler is None:
        scaler = StandardScaler()

    frame['Aggregate_standardized'] = scaler.fit_transform(frame[['Aggregate']])
    frame.drop(columns=['Aggregate'], inplace=True)
    for step in range(1, horizon + 1):
        frame[f'Aggregate_t+{step}'] = frame['Aggregate_standardized'].shift(-step)

    # Start points are index positions (not datetime values), drawn without
    # replacement.
    start_positions = random.sample(range(max_start_pos), k=min(num_sequences, max_start_pos))
    sequences = [frame.iloc[pos:pos + sequence_length] for pos in start_positions]

    return scaler, sequences


def print_sequences(sequences, start=0, end=10):
    """Print the sequences in ``sequences[start:end]`` with their positions.

    Args:
        sequences: Indexable collection of sequences.
        start: Position of the first sequence to print.
        end: Position one past the last sequence to print.
    """
    # Slicing the range of positions (rather than enumerating the slice from
    # zero) keeps the printed index correct for any `start`, including
    # negative values.
    for i in range(len(sequences))[start:end]:
        print(f"Sequence with index {i}, being sequence no. {i+1}:")
        print(sequences[i])
        print("\n" + "-"*50 + "\n")


def features_list_and_labels_list(sequences, aggregate_columns=None):
    """Split each sequence into a feature frame and a label row.

    Args:
        sequences: Iterable of DataFrames that all contain the label columns.
        aggregate_columns: Names of the label columns. Defaults to
            'Aggregate_t+1' .. 'Aggregate_t+6'.

    Returns:
        ``(features_list, labels_list)``: for every sequence, a DataFrame
        without the label columns and a Series holding the label columns of
        the sequence's last row.
    """
    if aggregate_columns is None:
        # Built per call so the default is never a shared mutable object.
        aggregate_columns = [f'Aggregate_t+{i}' for i in range(1, 7)]

    features_list = [sequence.drop(columns=aggregate_columns) for sequence in sequences]
    # Select the label columns first, then the last row, so the labels' dtype
    # is not affected by the dtypes of unrelated feature columns.
    labels_list = [sequence[aggregate_columns].iloc[-1] for sequence in sequences]
    return features_list, labels_list

def split_and_to_numpy(features_list, labels_list, train=.7, test=.15):
    """Split features and labels into train/test/validate parts as NumPy arrays.

    The first ``train`` fraction of the items becomes the training part, the
    following ``test`` fraction the test part, and whatever remains the
    validation part. The order of the items is kept.

    Args:
        features_list: List of DataFrames, one per sample.
        labels_list: List of Series, one per sample.
        train: Fraction of samples used for training.
        test: Fraction of samples used for testing.

    Returns:
        ``(X_train, X_test, X_validate, y_train, y_test, y_validate)`` as
        arrays, or ``None`` (after printing a message) when the two lists
        differ in length.
    """
    if len(features_list) != len(labels_list):
        print('feature liste und label liste nicht gleich lang')
        return

    total = len(features_list)
    first_cut = int(train * total)
    second_cut = int(first_cut + total * test)
    parts = (
        slice(None, first_cut),        # train
        slice(first_cut, second_cut),  # test
        slice(second_cut, None),       # validate
    )

    def stack(items):
        # One array per item, stacked along a new leading axis.
        return np.array([item.to_numpy() for item in items])

    feature_arrays = [stack(features_list[part]) for part in parts]
    label_arrays = [stack(labels_list[part]) for part in parts]
    return (*feature_arrays, *label_arrays)


def forward_fill_not_first_NaNs(arr: np.ndarray):
    """Forward-fill NaNs along axis 1 of a 2-D array.

    Every NaN is replaced by the closest non-NaN value to its left in the same
    row. NaNs that have no non-NaN value to their left stay NaN. The input
    array is not modified.

    Args:
        arr: 2-D float array.

    Returns:
        A new array of the same shape with the NaNs filled as described.
    """
    is_nan = np.isnan(arr)
    column_positions = np.arange(is_nan.shape[1])
    # For each cell: its own column if it holds a value, otherwise 0 ...
    source_column = np.where(is_nan, 0, column_positions)
    # ... then the running maximum gives the column of the last value seen.
    source_column = np.maximum.accumulate(source_column, axis=1)
    row_selector = np.arange(source_column.shape[0])[:, None]
    return arr[row_selector, source_column]


def encode_col_cyclical(data: pd.DataFrame, col, max_val):
    """Replace a periodic column by its sine and cosine encoding.

    Args:
        data: Frame containing the column ``col``. It is not modified.
        col: Name of the column to encode.
        max_val: Period of the column's values (the value at which the cycle
            wraps around).

    Returns:
        A new frame without ``col`` and with the columns ``{col}_sin`` and
        ``{col}_cos`` appended.
    """
    angle = 2 * np.pi * data[col]/max_val
    # Build the result on the dropped copy so the caller's frame is not given
    # the new columns as a side effect.
    encoded = data.drop(columns=col)
    encoded[col + '_sin'] = np.sin(angle)
    encoded[col + '_cos'] = np.cos(angle)
    return encoded

def encode_cyclical(house: pd.DataFrame):
    """Apply `encode_col_cyclical` to the 'hour', 'day_of_week' and 'month' columns.

    Args:
        house: Frame containing the columns 'hour', 'day_of_week' and 'month'.

    Returns:
        The frame returned after encoding the three columns in turn, with
        periods 24, 7 and 12 respectively.
    """
    periods = (('hour', 24), ('day_of_week', 7), ('month', 12))
    for column, period in periods:
        house = encode_col_cyclical(house, column, period)
    return house

