# Parse data from ebike datalogger

In [None]:
!pip3 install autokeras

# Load all the functions

In [None]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import autokeras as ak
import gc


import xgboost as xgb
from xgboost import plot_importance, plot_tree, to_graphviz
from sklearn.datasets import load_boston

import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, accuracy_score

import glob 
import os
import scipy as sp
import scipy.signal as sg


from butter_filter import signal_filter
from gen_plots import display_interesting_variables, display_all_variables
from Battery_Kalman.soc_estimator import SocEstimator

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = 'browser'


import tensorflow as tf
from sklearn.preprocessing import StandardScaler

# DANGEROUS DONT DO
pd.options.mode.chained_assignment = None  # default='warn'

#px.set_mapbox_access_token(open(".mapbox_token").read())

BATTERY_ENERGY_CAPACITY = 752.4 # Kilo Joules

#raw_data_path = "D:/OneDrive - Imperial College London/University Storage/Masters project/data_storage/"
raw_data_path = "/home/medad/Downloads/MastersProject/Bike_logger/Data_analysis/data_storage/data_storage/"

# Number of PAS magnets
N_PAS_MAGNETS = 12

# Sea-level reference pressure (QNH) for the area where the readings are taken; presumably in the same unit as the barometer's 'Pressure' readings - confirm.
qnh=1032.57

# read raw data
def read_file(filepath):
    """
    Load one raw datalogger CSV into a DataFrame sorted by timestamp.

    The file is read without a header row into 19 positional columns.
    Column 0 is renamed to 'Datetime' and column 1 to 'sensor'; the other
    columns keep their integer labels (2..18) and are renamed per sensor by
    the process_* functions.

    Rows whose timestamp does not match "%Y-%m-%dT%H:%M:%S.%fZ" are dropped,
    as are rows stamped before 2020-03-12 18:46:00.

    Returns the cleaned DataFrame; the original row labels are kept.
    """
    my_cols = range(19)

    # Column 0 is read as plain text and parsed afterwards instead of going
    # through read_csv's `date_parser` argument, which pandas 2.0 deprecated.
    df = pd.read_csv(filepath,
                     names=my_cols,
                     engine='c',
                     dtype={0: str})

    # errors="coerce" turns malformed timestamps into NaT so they can be
    # dropped below rather than aborting the whole load.
    df[0] = pd.to_datetime(df[0],
                           errors="coerce",
                           format="%Y-%m-%dT%H:%M:%S.%fZ",
                           utc=True)

    df = df.rename(columns={0: 'Datetime',
                            1: 'sensor',
                            })
    df = df.dropna(subset=['Datetime'])
    df = df.sort_values(by='Datetime')

    # Discard everything logged before the cut-off date.
    df = df[~(df['Datetime'] < '2020-03-12 18:46:00')]

    return df


def filter_df_signal(df, input_name, output_name, highcut_f):
    """
    Filter one column of df and store the result in another column.

    Runs `signal_filter` (method 'butterworth_ba', order 5) on
    df[input_name] with cut-off `highcut_f`, writes the output to
    df[output_name] and returns the same df.
    """
    filtered = signal_filter(df[input_name], highcut=highcut_f, method='butterworth_ba', order=5)
    df[output_name] = filtered
    return df


def energy_from_power_time(datetime_series, power_series):
    """
    Integrate power over time and return the total energy divided by 1e6
    (kilojoules if the power samples are in milliwatts).

    Each sample contributes power * (seconds since the previous sample).
    Samples that follow a gap of one second or more are left out of the sum.
    """
    gap_limit_s = 1

    elapsed_s = datetime_series.diff().dt.total_seconds().fillna(0)
    per_sample = power_series*elapsed_s
    within_gap = elapsed_s < gap_limit_s

    return per_sample[within_gap].sum()/1000000

def pulse_width_pas_to_rpm(pulse_width):
    """
    Convert a PAS pulse period to a pedal rotation rate.

    Computes 1e6 / pulse_width / N_PAS_MAGNETS.
    NOTE(review): callers pass the 'pulse_delay_us' column, so this looks
    like revolutions per second rather than per minute - confirm whether
    a factor of 60 is intended.
    """
    pulses_per_unit_time = 1000000/pulse_width
    return pulses_per_unit_time/N_PAS_MAGNETS

def pulse_width_to_rpm(pulse_width):
    """
    Convert a motor-speed pulse period to a rate: 1e6 / pulse_width.

    NOTE(review): callers pass the 'pulse_delay_us' column, so this looks
    like pulses per second; process_motor_speed rescales it afterwards.
    """
    rate = 1000000/pulse_width
    return rate

def get_altitude(pressure,temperature):
    """
    Estimate altitude from a pressure reading and a temperature in deg C.

    Uses the module-level sea-level reference pressure `qnh`:
        ((qnh / pressure) ** (1 / 5.257) - 1) * (temperature + 273.15) / 0.0065
    The temperature should be the outdoor temperature.
    """
    pressure_ratio = qnh / pressure
    kelvin = temperature + 273.15
    return ((pow(pressure_ratio, (1.0 / 5.257)) - 1) * kelvin) / 0.0065

def insert_time(row):
    """
    Return row['Datetime'] with its minute, second and microsecond replaced
    by the row's 'minute', 'second' and 'millisecond' (x1000) fields.
    The year, month, day and hour of 'Datetime' are kept.
    """
    minute = int(row['minute'])
    second = int(row['second'])
    microsecond = int(row['millisecond']*1000)
    return row['Datetime'].replace(minute=minute, second=second, microsecond=microsecond)

def process_gps(df):
    """
    Extract the 'gps' rows of the raw log and derive GPS features.

    Steps: name the positional columns, rebuild 'Datetime' from the
    GPS-reported minute/second/millisecond (see insert_time), keep only rows
    with gnssFixOK == 1, shift timestamps back by 9.5 s, then add
    'gps_acceleration' (speed difference over elapsed seconds) and 'heading'
    (angle of the filtered longitude/latitude differences). Columns that are
    entirely NaN are dropped from the returned frame.
    """
    gps_columns = {2: 'hour',
                   3: 'minute',
                   4: 'second',
                   5: 'millisecond',
                   6: 'latitude',
                   7: 'longitude',
                   8: 'altitude',
                   9: 'GPS Speed',
                   10: 'sats',
                   11: 'gnssFixOK',
                   12: 'fix_type',
                   13: 'vehicle_heading',
                   14: 'horizontal_accuracy', # Horizontal accuracy estimate: mm
                   15: 'vertical_accuracy',   # Vertical accuracy estimate: mm
                   16: 'speed_accuracy',      # Speed accuracy estimate: mm/s
                   17: 'heading_accuracy'     # Heading accuracy estimate (both motion and vehicle): deg
                   }
    df_gps = df[df["sensor"] == 'gps'].rename(columns=gps_columns)

    # Replace the logger's minute/second/sub-second with the GPS-reported ones.
    df_gps['Datetime'] = df_gps.apply(insert_time, axis=1)
    df_gps = df_gps.sort_values(by='Datetime')

    df_gps = df_gps[df_gps['gnssFixOK'] == 1]

    offset = 9.5 # seconds
    df_gps["Datetime"] = df_gps["Datetime"] - pd.Timedelta(offset, unit='s')

    elapsed_s = df_gps["Datetime"].diff().dt.total_seconds().fillna(0)
    df_gps['gps_acceleration'] = df_gps["GPS Speed"].diff()/elapsed_s

    # Sample-to-sample displacement in each axis, filtered before taking the angle.
    x = signal_filter(df_gps["longitude"].diff().fillna(0), highcut=100, method='butterworth_ba', order=5)
    y = signal_filter(df_gps["latitude"].diff().fillna(0), highcut=100, method='butterworth_ba', order=5)

    # cart2pol returns (radius, angle); only the angle is kept as the heading.
    _radius, df_gps['heading'] = cart2pol(x, y)

    return df_gps.dropna(axis=1, how='all')

def cart2pol(x, y):
    """
    Convert Cartesian coordinates to polar.

    Returns a (rho, phi) tuple: rho is the radius sqrt(x^2 + y^2) and phi is
    the angle arctan2(y, x) in radians.
    """
    radius = np.sqrt(x**2 + y**2)
    angle = np.arctan2(y, x)
    return (radius, angle)

def process_imu(df):
    """
    Extract the 'imu' rows of the raw log, name the accelerometer and gyro
    columns, drop all-NaN columns, and add filtered copies of 'gyro_x' and
    'acceleration_x' (cut-off 10) as '<name>_filtered'.
    """
    imu_columns = {2: 'acceleration_x',
                   3: 'acceleration_y',
                   4: 'acceleration_z',
                   5: 'gyro_x',
                   6: 'gyro_y',
                   7: 'gyro_z',
                   }
    df_imu = df[df["sensor"] == 'imu'].rename(columns=imu_columns)
    df_imu = df_imu.dropna(axis=1, how='all')

    for channel in ("gyro_x", "acceleration_x"):
        df_imu = filter_df_signal(df_imu, channel, channel + "_filtered", 10)

    return df_imu

def process_brake(df):
    """
    Extract the 'brake_state' rows of the raw log, name column 2
    'brake_state' and drop every column that is entirely NaN.
    """
    is_brake = df["sensor"] == 'brake_state'
    df_brake = df[is_brake].rename(columns={2: 'brake_state'})
    return df_brake.dropna(axis=1, how='all')


def process_pas(df):
    """
    Extract the 'pas' (pedal-assist sensor) rows of the raw log.

    Names column 2 'pulse_delay_us', drops all-NaN columns, keeps only
    pulses longer than 4000 and adds 'pas_rpm' computed with
    pulse_width_pas_to_rpm.
    """
    df_pas = df[df["sensor"] == 'pas'].rename(columns={2: 'pulse_delay_us'})
    df_pas = df_pas.dropna(axis=1, how='all')

    # Discard implausibly short pulses.
    df_pas = df_pas[df_pas['pulse_delay_us'] > 4000]

    df_pas['pas_rpm'] = df_pas.apply(lambda row: pulse_width_pas_to_rpm(row['pulse_delay_us']), axis=1)

    return df_pas
    
def process_motor_speed(df, df_gps):
    """
    Extract the 'motor_speed' rows of the raw log and derive speed features.

    Keeps pulses longer than 15000, converts them with pulse_width_to_rpm,
    then rescales 'motor_rpm' by the mean ratio of GPS speed to motor rate
    (matched to the nearest GPS sample in time), so 'motor_rpm' ends up on
    the same scale as 'GPS Speed'. Also adds 'filtered_motor_rpm' and
    'motor_acceleration' (difference of the filtered speed over elapsed
    seconds).
    """
    df_ms = df[df["sensor"] == 'motor_speed'].rename(columns={2: 'pulse_delay_us'})
    df_ms = df_ms.dropna(axis=1, how='all')

    # Discard implausibly short pulses.
    df_ms = df_ms[df_ms['pulse_delay_us'] > 15000]

    df_ms['motor_rpm'] = df_ms.apply(lambda row: pulse_width_to_rpm(row['pulse_delay_us']), axis=1)

    # Calibrate against GPS: one global scale factor for the whole log.
    matched = pd.merge_asof(df_ms, df_gps, on = 'Datetime', direction = 'nearest')
    speed_per_rate = matched['GPS Speed'].div(matched['motor_rpm'], axis = 0).mean()
    df_ms['motor_rpm'] = df_ms['motor_rpm'] * speed_per_rate

    df_ms['filtered_motor_rpm'] = signal_filter(df_ms['motor_rpm'], highcut=100, method='butterworth_ba', order=5)

    elapsed_s = df_ms["Datetime"].diff().dt.total_seconds().fillna(0)
    df_ms['motor_acceleration'] = df_ms["filtered_motor_rpm"].diff()/elapsed_s

    return df_ms

def process_ina(df):
    """
    Extract the 'ina226' (power monitor) rows of the raw log.

    Names the columns, drops all-NaN columns and rows with zero voltage,
    then derives 'Current' (shunt voltage / shunt resistance), 'Power'
    (Current * Voltage_V) and 'Power_averaged' (filtered Power). Prints the
    total energy of the log as a side effect.
    """
    SHUNT_RESISTANCE = 0.00215 # ohms

    ina_columns = {2: 'INA226 ID',
                   3: 'Voltage_V',
                   4: 'V_shunt',
                   5: 'Current_uncalibrated',
                   6: 'Power_uncalibrated',
                   }
    df_ina = df[df["sensor"] == 'ina226'].rename(columns=ina_columns)
    df_ina = df_ina.dropna(axis=1, how='all')

    # Zero-voltage rows are treated as invalid readings.
    df_ina = df_ina[df_ina["Voltage_V"] != 0]

    df_ina["Current"] = df_ina["V_shunt"] / SHUNT_RESISTANCE
    df_ina["Power"] = df_ina["Current"] * df_ina["Voltage_V"]

    df_ina["Power_averaged"] = signal_filter(df_ina['Power'], highcut=30, method='butterworth_ba', order=2)
#     df_ina["Current_averaged"] = signal_filter(df_ina['Current'], highcut=6, method='butterworth_ba', order=2)
#     df_ina["Voltage_V_averaged"] = signal_filter(df_ina['Voltage_V'], highcut=6, method='butterworth_ba', order=2)

    total_energy = energy_from_power_time(df_ina["Datetime"],df_ina["Power"])
    print("Total Energy Consumption[KiloJoules]",total_energy)

    return df_ina

def process_baro(df,df_ms, df_gps):
    """
    Extract the 'baro' rows of the raw log and derive altitude features.

    Parameters:
        df     -- raw log as returned by read_file / concat_dfs
        df_ms  -- output of process_motor_speed (needs 'filtered_motor_rpm')
        df_gps -- output of process_gps (needs 'altitude')

    Adds to the barometer frame:
        Baro_Altitude / filtered_Baro_Altitude -- altitude from get_altitude,
            shifted so that its mean matches the GPS altitude
        Barometric_Altitude_Uncalibrated -- altitude before that shift
        vertical_distance -- difference of the filtered altitude per sample
        vertical_velocity -- vertical_distance / elapsed seconds
        slope -- vertical_distance / (elapsed seconds * motor speed)

    Columns that are entirely NaN are dropped from the returned frame.
    """
    mask = df["sensor"] == 'baro'
    df_baro = df[mask].rename(columns={2: 'temperature',
                                       3: 'Pressure',
                                       4: 'humidity',
                                       })

    df_baro['Baro_Altitude'] = df_baro.apply(lambda x: get_altitude(x['Pressure'], x['temperature']), axis=1)
    df_baro['filtered_Baro_Altitude'] = signal_filter(df_baro['Baro_Altitude'], highcut=10, method='butterworth_ba', order=2)

    # merge_asof returns one row per barometer row, in the same order, but
    # with a fresh 0..n-1 index.
    df_baro_ms = pd.merge_asof(df_baro, df_ms, on = 'Datetime', direction = 'forward')
    df_baro_merged = pd.merge_asof(df_baro, df_gps, on = 'Datetime', direction = 'forward')

    # Mean difference between barometric and GPS altitude, used below to
    # calibrate the barometric altitude.
    offset = df_baro_merged['Baro_Altitude'].sub(df_baro_merged['altitude'], axis = 0).mean()

    time_delta = df_baro["Datetime"].diff().dt.total_seconds().fillna(0)
    df_baro['vertical_distance'] = df_baro["filtered_Baro_Altitude"].diff()
    df_baro['vertical_velocity'] = df_baro['vertical_distance']/time_delta

    # df_baro keeps the raw log's row labels while df_baro_ms is numbered
    # 0..n-1, so dividing Series by Series would pair values by label and
    # mismatch them. Use the bare array to pair them row-by-row instead.
    motor_speed = df_baro_ms["filtered_motor_rpm"].to_numpy()
    df_baro['slope'] = df_baro['vertical_distance']/(time_delta * motor_speed)

    df_baro['Barometric_Altitude_Uncalibrated'] = df_baro['Baro_Altitude']
    df_baro['Baro_Altitude'] = df_baro['Baro_Altitude'] - offset
    df_baro['filtered_Baro_Altitude'] = df_baro['filtered_Baro_Altitude'] - offset

    df_baro = df_baro.dropna(axis=1, how='all')

    return df_baro

def process(df):
    """
    Run every per-sensor processing step on the raw log.

    GPS is processed first because the motor-speed and barometer steps use
    its output; the barometer step also uses the motor-speed output.
    Progress is printed after each step.

    Returns (df_ina, df_gps, df_baro, df_pas, df_ms, df_imu, df_brake).
    """
    print("Start GPS process")
    df_gps = process_gps(df)
    print("GPS process DONE....")

    df_pas = process_pas(df)
    print("PAS process DONE....")

    df_ms = process_motor_speed(df, df_gps)
    print("Motor Speed process DONE....")

    df_ina = process_ina(df)
    print("INA226 process DONE....")

    df_baro = process_baro(df, df_ms, df_gps)
    print("Barometer process DONE....")

    df_imu = process_imu(df)
    print("IMU process DONE....")

    df_brake = process_brake(df)
    print("Brake process DONE....")

    processed = (df_ina, df_gps, df_baro, df_pas, df_ms, df_imu, df_brake)
    return processed

def display_gps_positions(df_gps):
    """
    Draw the GPS track on a map, one colour per trip, write it to
    output/GPS_track.html and show it.
    """
    track = px.line_mapbox(
        df_gps,
        lat="latitude",
        lon="longitude",
        color="trip",
        zoom=14,
        hover_data=["Datetime", "altitude","sats", "heading"],
    )

    track.update_layout(mapbox_style="carto-positron")
    track.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    track.write_html("output/GPS_track.html")
    track.show()

def process_charge_data(fps):
    """
    Read the charging logs listed in fps, concatenate them and return the
    processed power-monitor (INA226) frame.
    """
    combined = pd.concat([read_file(path) for path in fps], ignore_index=True)
    return process_ina(combined)

def display_charge_data(df_ina):
    """
    Plot battery voltage, current and power against the frame's index in
    three stacked panels, write the figure to
    output/Charging_Power_variables.html and show it.

    Each panel prefers the '<name>_averaged' column. When that column is
    missing (process_ina only produces 'Power_averaged') the raw column
    '<name>' is plotted instead of raising KeyError.
    """
    FEATURES = ["Voltage_V_averaged","Current_averaged","Power_averaged"]
    TITLES = ["Battery Voltage[V]","Current[mA]","Power[mW]"]

    N_FEATURES = len(FEATURES)
    fig = make_subplots(rows=N_FEATURES, cols=1,
                        shared_xaxes=True,
                        vertical_spacing=0.01)

    fig.update_layout(hovermode="x unified")

    for i, feature in enumerate(FEATURES):
        # Fall back to the unfiltered signal when no averaged column exists.
        if feature in df_ina.columns:
            column = feature
        else:
            column = feature.replace("_averaged", "")

        fig.add_trace(go.Scatter(
            x=df_ina.index,
            y=df_ina[column],
            name=column,
            hoverinfo='y'),
            row=i+1, col=1)

        fig.update_yaxes(title_text=TITLES[i], row=i+1, col=1)

    fig.update_layout(title_text="Power Parameters")

    fig.write_html("output/Charging_Power_variables.html")

    fig.show()

# params_x_for_ml = ['Voltage_V', 'V_shunt', 'Current', 'temperature', 'Pressure', 'humidity',
#                   'Baro_Altitude', 'filtered_Baro_Altitude', 'motor_rpm_x', 'motor_acceleration_x', 'filtered_motor_rpm_x', 'slope',
#                   'latitude', 'longitude', 'altitude', 'GPS Speed', 'sats', 'gnssFixOK', 'fix_type', 'gps_acceleration',
#                   'pas_rpm', 'motor_rpm_y', 'motor_acceleration_y', 'filtered_motor_rpm_y','acceleration_x','acceleration_y','acceleration_z',
#                    'gyro_x','gyro_y','gyro_z', 'brake_state']


# params_x_for_ml = ['temperature', 'Pressure', 'humidity','filtered_Baro_Altitude', 'filtered_motor_rpm_x', 'slope',
#                   'latitude', 'longitude', 'altitude', 'GPS Speed','acceleration_x','acceleration_y',
#                     'acceleration_z','gyro_x','gyro_y','gyro_z'
#                  ]


# Input feature columns of the merged frame used by split_x_y for the power
# model (target column: params_y_for_ml).
params_x_for_ml = ['temperature', 
                   'Pressure', 
                   'humidity',
                   'filtered_Baro_Altitude',
                   'slope',
                   'latitude',
                   'longitude',
                   'altitude',
                   "heading",
                   "SOC",
                   #"vertical_distance",
                   "vertical_velocity",
                   "GPS Speed",
                   'acceleration_x','acceleration_y', 'acceleration_z',
                   'gyro_x','gyro_y','gyro_z',
                   'filtered_motor_rpm',
                   'pas_rpm'
                   
                   
                 ]


# params_x_for_ml = ['temperature', 
#                    'Pressure', 
#                    'humidity',
#                    'filtered_Baro_Altitude',
#                    'slope',
#                    'latitude',
#                    'longitude',
#                    'altitude',
#                    "heading",
#                    "SOC",
#                    "vertical_distance",
# #                    "vertical_velocity",
# #                    "GPS Speed"
#                  ]

params_y_for_ml = "Power"

params_x_for_ml_soc = ['temperature', "Voltage_V", "Current", "Power"]
params_y_for_ml_soc = "SOC"



def drop_sensor_column(df, column_name):
    """
    Return df without `column_name`. The remaining columns come back in the
    sorted order produced by Index.difference.
    """
    kept_columns = df.columns.difference([column_name])
    return df[kept_columns]

    
    
def gen_ml_data(dataframes):
    """
    Merge the per-sensor frames into one frame for machine learning.

    The first frame in `dataframes` should have the highest data rate: every
    other frame is joined onto it with merge_asof on 'Datetime', taking the
    next sample at or after each timestamp (direction='forward').

    Side effect: the 'sensor' column is removed from each input frame in
    place. Frames that no longer have it are left untouched, so the function
    can be re-run on the same frames.

    Non-key columns that occur in more than one frame receive pandas'
    default '_x' / '_y' merge suffixes.
    """
    column_name = "sensor"

    print("Dropping Sensor column....")
    for frame in dataframes:
        # errors="ignore": a second run must not fail on the missing column.
        frame.drop(column_name, axis=1, inplace=True, errors="ignore")

    print("Merging Dataframes....")

    df_ml = dataframes[0]
    for other in dataframes[1:]:
        df_ml = pd.merge_asof(df_ml, other, on = 'Datetime', direction = 'forward')
    print("Done merging Dataframes....")

    return df_ml

def split_x_y(df_ml, params_x_for_ml, params_y_for_ml):
    """
    Return (x, y): the feature columns and the target column of df_ml.
    """
    features = df_ml[params_x_for_ml]
    target = df_ml[params_y_for_ml]
    return features, target

def concat_dfs(fps):
    """
    Read every log file in fps and concatenate them into one frame.

    Each file's rows are tagged with a 'trip' number: 0 for the first file,
    1 for the second, and so on. The result gets a fresh 0..n-1 index.
    """
    frames = []

    for trip_number, path in enumerate(fps):
        frame = read_file(path)
        print("Trip Count: ", trip_number)
        frame["trip"] = trip_number
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)

def get_raw_dfs(fps):
    """
    Load the log files in fps and return the tuple of per-sensor frames
    produced by process(). The combined raw frame is released before
    returning to keep memory use down.
    """
    combined = concat_dfs(fps)
    per_sensor = process(combined)

    del combined
    gc.collect()

    return per_sensor
    

def remove_rows_with_na_in_column(df, column):
    """
    Return the rows of df whose value in `column` is not missing.
    """
    has_value = df[column].notna()
    return df[has_value]

def process_for_ml(fps):
    """
    Take a list of file paths, then load, process and merge them.

    Returns (df_ml, raw_dfs): the merged machine-learning frame and the
    tuple of per-sensor frames it was built from.
    """
    raw_dfs = get_raw_dfs(fps)

    # gen_ml_data expects the highest-data-rate frame first in the tuple.
    merged = gen_ml_data(raw_dfs)
    #df_ml = remove_rows_with_na_in_column(df_ml, "trip")

    return merged, raw_dfs

def get_energy_error(predicted_energy, actual_energy):
    """
    Print and return the signed percentage error of the predicted energy
    relative to the actual energy (0 means a perfect prediction).
    """
    percent_error = 100 * (predicted_energy - actual_energy)/actual_energy
    print("Actual energy:",actual_energy, "Predicted Energy:", predicted_energy, "Error[%](ideal should be 0%):", percent_error, "%")
    return percent_error
    

def print_power_consumption_score(timestamps, ytest, ypred):
    """
    Integrate the measured and predicted power over the same timestamps and
    return (and print) the percentage error between the two energies.
    """
    measured = energy_from_power_time(timestamps, ytest)
    predicted = energy_from_power_time(timestamps, ypred)
    return get_energy_error(predicted, measured)


def print_test_results(xgbr, df_ml):
    """
    Predict power for df_ml with the fitted model and return the percentage
    energy error against the measured power.
    """
    features, measured = split_x_y(df_ml, params_x_for_ml, params_y_for_ml)
    predicted = xgbr.predict(features)

    print("Test Results :: ")
    return print_power_consumption_score(df_ml["Datetime"], measured, predicted)

def do_ml(x,y):
    """
    Fit an XGBoost regressor on (x, y), print its training score, plot the
    feature importances and return the fitted model.
    """
    model = xgb.XGBRegressor(verbosity=1,tree_method='gpu_hist', gpu_id=0)
    print(model)

    model.fit(x, y)
    training_score = model.score(x, y)
    print("Training score: ", training_score)

    plot_importance(model, height=0.9)

    return model


## SOC Calculations

def coulomb_counting(df):
    """
    Return the SOC series determined by coulomb counting.

    Integrates df["Current"] (converted from mA to A) over the elapsed time
    between samples and subtracts the accumulated charge from the pack
    capacity. The result is the remaining fraction of capacity, starting at
    1.0 on the first row.
    """
    capacity_As = 8.708 * 3600 # in As
    elapsed_s = df["Datetime"].diff().dt.total_seconds().fillna(0)
    charge_drawn_As = (df["Current"]/1000) * elapsed_s # Convert current from mA to A
    remaining_As = capacity_As - charge_drawn_As.cumsum()
    return remaining_As / capacity_As

def thevenin_model(df):
    """
    Return an SOC series.

    NOTE: despite its name, this currently performs the same
    current-integration (coulomb counting) calculation as coulomb_counting;
    the voltage column is not used.
    """
    pack_capacity_As = 8.708 * 3600 # in As
    step_s = df["Datetime"].diff().dt.total_seconds().fillna(0)
    used_As = (df["Current"]/1000) * step_s # Convert current from mA to A
    left_As = pack_capacity_As - used_As.cumsum()
    return left_As / pack_capacity_As

def ML_trained_by_coulomb_counting(df):
    """
    Return the SOC predicted by the module-level model `xgbr_SOC` (trained
    on coulomb-counting SOC) from the columns in params_x_for_ml_soc.

    `xgbr_SOC` must have been trained or loaded before this is called.
    """
    # WARNING: the frame used to train xgbr_SOC must be a full discharge of the battery from full to empty
    features = df[params_x_for_ml_soc]
    return xgbr_SOC.predict(features)

def add_soc_feature(df, method):
    """
    Compute SOC with `method(df)` and store it in df's "SOC" column in
    place. Returns None.
    """
    df.loc[:,"SOC"] = method(df)

def plot_predicted_data(timestamps, ytest, ypred):
    """
    Plot measured and predicted power over time on one figure, write it to
    output/Predicted_plot.html and show it.
    """
    fig = go.Figure()
    for series, label in ((ytest, 'Actual data'), (ypred, 'Regression Fit')):
        fig.add_traces(go.Scatter(x=timestamps, y=series, name=label))

    fig.update_layout(
        title="Power consumption, predicted",
        xaxis_title="Time(UTC)",
        yaxis_title="Power[mW]",
    )
    fig.write_html("output/Predicted_plot.html")

    fig.show()
    
def score_predicted_data_xgboost(model, df_ml_test):
    """
    Predict power for df_ml_test and score it.

    Returns (y, ypred, error): measured power, predicted power and the
    percentage energy error.
    """
    features, measured = split_x_y(df_ml_test, params_x_for_ml, params_y_for_ml)
    predicted = model.predict(features)
    energy_error = print_power_consumption_score(df_ml_test["Datetime"], measured, predicted)
    return measured, predicted, energy_error

def make_predictions_xgboost(model, df_ml_test, plot=True):
    """
    Score the model on df_ml_test, optionally plot measured vs predicted
    power, and return the percentage energy error.
    """
    measured, predicted, energy_error = score_predicted_data_xgboost(model, df_ml_test)
    wants_plot = (plot == True)
    if wants_plot:
        plot_predicted_data(df_ml_test["Datetime"], measured, predicted)
    return energy_error

def plot_3d_plot(df_gps, df_baro, df_ina):
    """
    Plot the ride in 3D (longitude, latitude, barometric altitude) with
    markers coloured by power, write it to output/gps_speed_3d.html and
    show it.

    The barometer and power frames are matched to each GPS sample by
    nearest timestamp.
    """
    df = pd.merge_asof(df_gps,df_baro , on = 'Datetime', direction = 'nearest')
    df = pd.merge_asof(df,df_ina , on = 'Datetime', direction = 'nearest')

    marker_style = dict(
        size=4,
        color=df["Power"],
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(
            title="Power[mW]"
        )
    )
    line_style = dict(
        color='darkblue',
        width=2
    )

    trace = go.Scatter3d(
        x=df["longitude"],
        y=df["latitude"],
        z=df["Baro_Altitude"],
        marker=marker_style,
        text = 'Power:' +df["Power"].astype(str),
        line=line_style,
    )
    fig = go.Figure(data=trace)

    fig.update_layout(
        width=800,
        scene=dict(
            xaxis = dict(title='Longitude'),
            yaxis = dict(title='Latitude'),
            zaxis = dict(title='Barometric Altitude'),
        ),
    )

    fig.write_html("output/gps_speed_3d.html")
    fig.show()

def show_correlations_in_df(df):
    """
    Show the correlations between all numeric columns of df as a heatmap,
    write it to output/Correlation_heatmap.html and show it.

    Non-numeric columns (for example 'Datetime') are left out explicitly:
    pandas >= 2.0 no longer drops them silently in DataFrame.corr().
    """
    numeric = df.select_dtypes(include=["number", "bool"])
    fig = px.imshow(numeric.corr(), title='Heatmap of co-relation between variables')
    fig.write_html("output/Correlation_heatmap.html")
    fig.show()

def remove_values_from_list(lst, value):
    """
    Return a new list holding the items of lst minus every occurrence of
    value. lst itself is not modified.
    """
    differs_from_value = (value).__ne__
    return [item for item in lst if differs_from_value(item)]

def plot_linear_correlations(df):
    """
    Scatter-plot columns of df against the output variable Power, one figure
    per column, each written to output/ and shown.

    The "sensor_x"/"sensor_y" columns are left out, and so are the first ten
    of the remaining columns.
    """
    columns = df.columns.tolist()
    for unwanted in ("sensor_y", "sensor_x"):
        columns = remove_values_from_list(columns, unwanted)

    for column in columns[10:]:
        fig = px.scatter(df, x=column, y="Power", title='{0} vs Power'.format(column))
        fig.write_html("output/Correlation_display_{}_vs_Power.html".format(column))

        fig.show()

def get_masked_items(df, column, items):
    """
    Return the rows of df whose value in `column` is one of `items`.
    """
    selected = df[column].isin(items)
    return df[selected]



def special_test_train_split(df_ml, test_size=0.15, random_state=42):
    """
    Split df_ml into test and train frames by whole trips.

    The unique values of the "trip" column are shuffled reproducibly
    (seeded with random_state); the first (1 - test_size) share of them
    becomes the training set and the rest the test set, so no trip ends up
    in both frames.

    Returns (test, train) - note the order.
    """
    trips = df_ml["trip"].unique()

    print(trips)

    n_train = int((1 - test_size) * len(trips)) # number of trips used for training
    print(n_train)

    random.seed(random_state)
    random.shuffle(trips) # in-place shuffle

    train_trips, test_trips = trips[:n_train], trips[n_train:]

    print(trips, train_trips, test_trips)

    test_frame = get_masked_items(df_ml, "trip", test_trips)
    train_frame = get_masked_items(df_ml, "trip", train_trips)
    return test_frame, train_frame

# Do machine learning

## Generate training data

In [None]:
# No need to run if df_ml has been saved

df_ml, raw_dfs = process_for_ml([
# raw_data_path+"hampsted_trip_1-4-2021.csv",
# raw_data_path+"data_19-4-21.csv",
# raw_data_path+"data_icah_20-4-21.csv",
# raw_data_path+"data_27-4-21.csv",
# raw_data_path+"data_28-4-21.csv",                                          
# raw_data_path+"data_6-5-21.csv",
raw_data_path+"data_8-5-2021.csv",
raw_data_path+"data_10-5-21.csv",
raw_data_path+"data_10-5-21-v2.csv",
raw_data_path+"data_11-5-21.csv",
raw_data_path+"data_12-5-21-v1.csv",
raw_data_path+"data_12-5-21-v2.csv",
raw_data_path+"data_13-5-21-clean.csv",
raw_data_path+"data_14-5-21-putney-heath-circuit.csv",
raw_data_path+"data_15-5-21.csv",
raw_data_path+"data_18-5-21_enoch.csv",
raw_data_path+"data_20-5-21_v1.csv",
raw_data_path+"data_20-5-21_v2.csv",
raw_data_path+"data_20-5-21_v3.csv",
raw_data_path+"data_21-5-21_v1.csv",
raw_data_path+"data_21-5-21_v2.csv",
])

df_ml.to_pickle("df_ml.pkl")

In [None]:
# Reload the merged frame and split it by trip. special_test_train_split
# returns (test, train), in that order.
df_ml = pd.read_pickle("df_ml.pkl")
df_ml_test, df_ml_train = special_test_train_split(df_ml, test_size=0.1, random_state=58)
# The full frame is no longer needed once it has been split; free it.
del df_ml
gc.collect()

In [None]:
# No need to run if df_ml_soc_trainer has been saved
df_ml_soc_trainer, _ = process_for_ml([raw_data_path+"data_17-5-21.csv"])
df_ml_soc_trainer.to_pickle("df_ml_soc_trainer.pkl")

### Train ML model to predict SOC from voltage and Current

In [None]:
# No need to run if model has been saved.
df_ml_soc_trainer = pd.read_pickle("df_ml_soc_trainer.pkl")
add_soc_feature(df_ml_soc_trainer, coulomb_counting)
x, y = split_x_y(df_ml_soc_trainer, params_x_for_ml_soc, params_y_for_ml_soc) # WARNING: df_ml_test must be a full discharge of battery from full to empty
xgbr_SOC = do_ml(x, y)
xgbr_SOC.save_model('xgbr_SOC_model.json')

#### Add SOC as a feature

In [None]:
xgbr_SOC = xgb.XGBRegressor()
xgbr_SOC.load_model('xgbr_SOC_model.json')
add_soc_feature(df_ml_train, ML_trained_by_coulomb_counting)
add_soc_feature(df_ml_test, ML_trained_by_coulomb_counting)

## Display variables for initial viewing

In [None]:
display_all_variables(*raw_dfs)

In [None]:
display_gps_positions(raw_dfs[1]) # df_gps is index 1. TODO: use labels instead of indexes for readability

### Process and Display Charging data

In [None]:
# WARNING: Ensure 1970 filter is removed, because the data is not timestamped
df_ina = process_charge_data([raw_data_path+"data_charge_14-5-21.csv"])

In [None]:
def resample_ina_df(df_ina):
    """
    Average the power-monitor frame into one-second bins and display the
    result with display_charge_data. Returns None.

    Only numeric columns are averaged: the frame also carries the string
    'sensor' column, and pandas >= 2.0 raises TypeError when asked for the
    mean of a non-numeric column.
    """
    df_ina = df_ina.set_index("Datetime")
    df_ina = df_ina.select_dtypes(include="number").resample('1S').mean()

    display_charge_data(df_ina)

df_ina_subset = df_ina.head(2500000)
# Display charging profile
resample_ina_df(df_ina_subset)

## View Correlations

In [None]:
show_correlations_in_df(df_ml_train)

In [None]:
from string import ascii_letters
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="white")

# The dataset is the training frame; `rs` is not used further down in this cell.
rs = np.random.RandomState(33)
d = df_ml_train

# Compute the correlation matrix
corr = d.corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
list(df_ml_train)

In [None]:
## WARNING: It will open a lot of tabs!!
plot_linear_correlations(df_ml_train)

## View distributions

In [None]:
# more data exploration through visualizing

def showDistributions(df2, category1, category2, category3):
    """
    Draw the histograms (with KDE curve) of three columns of df2 side by
    side in one figure.
    """
    fig, axes = plt.subplots(1,3, figsize=(25, 5))
    panels = zip(axes, (category1, category2, category3))
    for axis, category in panels:
        sns.histplot(data=df2, x=category, kde=True, color="darkseagreen", ax=axis)
        axis.set_title("Distribution of {}".format(category))

showDistributions(df_ml_train, "Voltage_V", "Current", "Power")
showDistributions(df_ml_train, "GPS Speed", "altitude", "gps_acceleration")
showDistributions(df_ml_train, "heading", "Baro_Altitude", "Pressure")
showDistributions(df_ml_train, "humidity", "slope", "temperature")
showDistributions(df_ml_train, "pas_rpm", "motor_rpm", "brake_state")

## Show pairplots of data

In [None]:
%matplotlib qt
sns.set_theme()

features_to_pair_plot = [
    "Voltage_V",
    "Current",
    "Power",
    "SOC",
    #"heading",
    "Baro_Altitude",
    #"filtered_Baro_Altitude"
    #"Pressure",
    "slope",
    "temperature",
    "pas_rpm",
    "motor_rpm",
#     "brake_state",
#     "GPS Speed",
#     "altitude",
#     "gps_acceleration"
#     "humidity"
]

battery_params = [
    "Voltage_V",
    "Current",
    "Power",
    "SOC",
    "Baro_Altitude",
]

def do_pair_plot(df, features, title):
    """
    Draw a seaborn pair plot of the given feature columns with a title.

    At most 10000 rows are plotted, drawn reproducibly (random_state=1).
    Frames with fewer rows are plotted in full rather than failing, since
    DataFrame.sample raises when n exceeds the number of rows.
    """
    sample_size = min(10000, len(df))
    g = sns.pairplot(
        df[features].sample(n=sample_size, random_state=1),
        plot_kws={"s": 1}
    )

    g.fig.suptitle(
        title,
        y=1.001 # slightly above 1 so the title clears the top row of plots
    )
    plt.show()

In [None]:
do_pair_plot(df_ml_train, features_to_pair_plot, "Pair Plot of E-bike parameters")

In [None]:
do_pair_plot(df_ml_train, battery_params, "Pair Plot of E-bike Battery parameters")

In [None]:
params_of_interest = [
#     'Current',
#  'Current_uncalibrated',
#  'Datetime',
#  'INA226 ID',
#  'Power',
#  'Power_averaged',
#  'Power_uncalibrated',
#  'V_shunt',
#  'Voltage_V',
 'GPS Speed',
#  'altitude',
#  'fix_type',
#  'gnssFixOK',
#  'gps_acceleration',
#  'heading',
#  'hour',
#  'latitude',
#  'longitude',
#  'millisecond',
#  'minute',
#  'sats',
#  'second',
#  'Baro_Altitude',
#  'Pressure',
#  'filtered_Baro_Altitude',
#  'humidity',
#  'slope',
#  'temperature',
#  'pas_rpm',
#  'pulse_delay_us_x',
#  'filtered_motor_rpm',
#  'motor_acceleration',
#  'motor_rpm',
#  'pulse_delay_us_y',
  'acceleration_x',
#   'acceleration_x_filtered',
#  'acceleration_y',
#  'acceleration_z',
#  'gyro_x',
#   'gyro_x_filtered',
  'gyro_y',
#  'gyro_z',
#  'brake_state',
  'SOC'
]
do_pair_plot(df_ml_train, params_of_interest, "Pair Plot of E-bike parameters")

In [None]:
params_of_interest = [
#     'Current',
#  'Current_uncalibrated',
#  'Datetime',
#  'INA226 ID',
#  'Power',
#  'Power_averaged',
#  'Power_uncalibrated',
#  'V_shunt',
#  'Voltage_V',
 'GPS Speed',
#  'altitude',
#  'fix_type',
#  'gnssFixOK',
#  'gps_acceleration',
#  'heading',
#  'hour',
#  'latitude',
#  'longitude',
#  'millisecond',
#  'minute',
  'sats',
#  'second',
#  'Baro_Altitude',
#  'Pressure',
#  'filtered_Baro_Altitude',
#  'humidity',
#  'slope',
#  'temperature',
#  'pas_rpm',
#  'pulse_delay_us_x',
#  'filtered_motor_rpm',
#  'motor_acceleration',
  'motor_rpm',
#  'pulse_delay_us_y',
#   'acceleration_x',
#   'acceleration_x_filtered',
#  'acceleration_y',
#  'acceleration_z',
#  'gyro_x',
#   'gyro_x_filtered',
#   'gyro_y',
#  'gyro_z',
#  'brake_state',
#   'SOC'
]
do_pair_plot(df_ml_train, params_of_interest, "Comparison of GPS Speed, Motor Internal Speed and GPS Sats")

In [None]:
params_of_interest = [
#     'Current',
#  'Current_uncalibrated',
#  'Datetime',
#  'INA226 ID',
    'Power',
#  'Power_averaged',
#  'Power_uncalibrated',
#  'V_shunt',
#  'Voltage_V',
  'GPS Speed',
#  'altitude',
#  'fix_type',
#  'gnssFixOK',
#  'gps_acceleration',
#  'heading',
#  'hour',
#  'latitude',
#  'longitude',
#  'millisecond',
#  'minute',
#   'sats',
#  'second',
#  'Baro_Altitude',
#  'Pressure',
#  'filtered_Baro_Altitude',
#  'humidity',
#  'slope',
#  'temperature',
#  'pas_rpm',
#  'pulse_delay_us_x',
#  'filtered_motor_rpm',
#  'motor_acceleration',
#   'motor_rpm',
#  'pulse_delay_us_y',
#   'acceleration_x',
#   'acceleration_x_filtered',
#  'acceleration_y',
#  'acceleration_z',
#  'gyro_x',
#   'gyro_x_filtered',
#   'gyro_y',
#  'gyro_z',
#  'brake_state',
#   'SOC'
    "vertical_velocity",
    "vertical_distance"
    
]
do_pair_plot(df_ml_train, params_of_interest, "Comparison of GPS Speed[km/h], Power[mW], Vertical Distance[m], Vertical Speed[m/s]")

In [None]:
# NOTE: do_join_plot is defined in a later cell of this notebook; run that cell first.
do_join_plot(df_ml_train, "vertical_velocity", "Power", "vertical_velocity[m/s] vs Power output[mW] from Battery")

In [None]:
# NOTE: do_join_plot is defined in a later cell of this notebook; run that cell first.
# The title names the column actually plotted (vertical_distance).
do_join_plot(df_ml_train, "vertical_distance", "Power", "vertical_distance[m] vs Power output[mW] from Battery")

In [None]:
def do_join_plot(df, x_param, y_param, title):
    """
    Draw a seaborn joint plot (scatter with regression fit plus marginal
    distributions) of y_param against x_param, with a title.

    At most 10000 rows are plotted, drawn reproducibly (random_state=1).
    Frames with fewer rows are plotted in full rather than failing, since
    DataFrame.sample raises when n exceeds the number of rows.
    """
    sample_size = min(10000, len(df))
    p = sns.jointplot(x=x_param,
                  y=y_param,
                  data=df.sample(n=sample_size, random_state=1),
                  kind="reg",
                  scatter_kws={"s": 1})
    p.fig.suptitle(title)
    #p.ax_joint.collections[0].set_alpha(0)
    p.fig.tight_layout()
    p.fig.subplots_adjust(top=0.95) # Shrink the plot area to make room for the title
    
do_join_plot(df_ml_train, "SOC", "Power", "State-of-Charge(SOC) vs Power output[mW] from Battery")

## Export dataframe to pickle

In [None]:
def export_to_pickle(df):
    """Print the first rows of ``df`` and pickle it to ``df_ml_train.pkl``."""
    preview = df.head()
    print(preview)
    df.to_pickle("df_ml_train.pkl")


export_to_pickle(df=df_ml_train)

### Display SOC vs power

In [None]:
# Power against state of charge. The x axis is reversed so the plot runs
# from high SOC on the left to low SOC on the right.
fig = px.line(df_ml_train, x="SOC", y="Power", title='SOC vs Power')
fig.update_xaxes(autorange="reversed").update_yaxes(title="Power[mW]")
fig.write_html("output/soc_vs_power.html")
fig.show()

### Display SOC vs Time

In [None]:
# State of charge of the test data over time.
fig = px.line(df_ml_test, x="Datetime", y="SOC", title='SOC over Time')
# The y axis shows SOC, not power.
fig.update_yaxes(title="SOC")
# Write to a dedicated file so the "SOC vs Power" export is not overwritten.
fig.write_html("output/soc_vs_time.html")
fig.show()

### Display barometric altitude vs GPS

In [None]:
def display_barom_vs_gps(raw_dfs):
    """Plot uncalibrated/calibrated barometric altitude and GPS altitude over time.

    The figure is written to ``output/GPS_vs_Baro.html`` and then shown.

    Args:
        raw_dfs: Sequence of exactly seven per-sensor DataFrames, unpacked in
            the order (ina, gps, baro, pas, ms, imu, brake). Only the GPS and
            barometer frames are used.
    """
    _ina, df_gps, df_baro, _pas, _ms, _imu, _brake = raw_dfs

    fig = make_subplots(rows=1, cols=1,
                        shared_xaxes=True,
                        vertical_spacing=0.01,
                       )
    fig.update_layout(hovermode="x unified")

    # (source frame, altitude column, legend name).
    # Each trace is drawn against the timestamps of its OWN frame: the GPS
    # frame is a separate table from the barometer frame, so pairing GPS
    # altitude with barometer timestamps would misplace the GPS samples.
    # NOTE(review): assumes df_gps has a "Datetime" column like the other frames.
    traces = [
        (df_baro, "Barometric_Altitude_Uncalibrated", "Barometric Altitude Uncalibrated"),
        (df_baro, "Baro_Altitude", "Barometric Altitude Calibrated"),
        (df_gps, "altitude", "GPS Altitude"),
    ]
    for frame, column, label in traces:
        fig.add_trace(go.Scatter(
            x=frame["Datetime"],
            y=frame[column],
            name=label,
            hoverinfo='y'),
            row=1, col=1)

    fig.update_yaxes(title_text="Altitude[m]", row=1, col=1)
    fig.update_xaxes(title_text="Time", row=1, col=1)

    fig.update_layout(title_text="GPS Altitude vs Barometric Altitude")

    fig.write_html("output/GPS_vs_Baro.html")

    fig.show()

display_barom_vs_gps(raw_dfs)

In [None]:
def display_ina(raw_dfs):
    """Plot raw and averaged power for the last 100000 rows of the INA frame.

    The figure is written to ``output/ina.html`` and then shown.

    Args:
        raw_dfs: Sequence of exactly seven per-sensor DataFrames, unpacked in
            the order (ina, gps, baro, pas, ms, imu, brake). Only the first
            one is used.
    """
    ina_all, _gps, _baro, _pas, _ms, _imu, _brake = raw_dfs

    # Only the tail of the log is plotted.
    #df_ina = df_ina.sample(n=100000).sort_values('Datetime')
    recent = ina_all.iloc[-100000:]

    fig = make_subplots(rows=1, cols=1, shared_xaxes=True, vertical_spacing=0.01)
    fig.update_layout(hovermode="x unified")

    for column, legend in (("Power", "Power"), ("Power_averaged", "Power Averaged")):
        trace = go.Scatter(x=recent["Datetime"],
                           y=recent[column],
                           name=legend,
                           hoverinfo='y')
        fig.add_trace(trace, row=1, col=1)

    fig.write_html("output/ina.html")
    fig.show()


display_ina(raw_dfs)

## Display 3D plot of trip

In [None]:
# Passes elements 1, 2 and 0 of raw_dfs_test, in that order. In the unpacking
# used elsewhere in this notebook those positions are the GPS, barometer and
# INA frames -- assumes raw_dfs_test shares that ordering (TODO confirm).
plot_3d_plot(raw_dfs_test[1], raw_dfs_test[2], raw_dfs_test[0])

## Group power into grid squares of longitude/latitude

In [None]:
def group_data_location(df, step=0.0002):
    """Average the columns of ``df`` over a latitude/longitude grid.

    Each row is assigned to the grid square whose lower corner is its
    latitude/longitude rounded down to a multiple of ``step``; all columns
    are then averaged per square.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns. It is not
            modified.
        step: Side length of a grid square, in the same unit as the
            latitude/longitude columns.

    Returns:
        DataFrame of per-square means, indexed by (``latbin``, ``lonbin``).
    """
    # Compute the bins as standalone Series (vectorised) instead of writing
    # helper columns into the caller's DataFrame.
    lat_bin = (np.floor(df["latitude"] / step) * step).rename("latbin")
    lon_bin = (np.floor(df["longitude"] / step) * step).rename("lonbin")
    return df.groupby([lat_bin, lon_bin]).mean()

def group_data_time_interval(df):
    """Average ``df`` into 1-second buckets of its ``Datetime`` column.

    Buckets whose mean contains a missing value (including seconds with no
    rows at all) are dropped. The result is indexed by bucket start time.
    """
    per_second = pd.Grouper(key="Datetime", freq="1s")
    #df["Datetime"] = df.index
    return df.groupby(per_second).mean().dropna()

# Average the training data into 1-second buckets and hand the result to
# display_gps_positions_bins.
df_ml_train_grouped = group_data_time_interval(df_ml_train)
display_gps_positions_bins(df_ml_train_grouped)
# Alternative: average per latitude/longitude grid square instead of per second.
# df_ml_train_grouped = group_data_location(df_ml_train)
# display_gps_positions_bins(df_ml_train_grouped)

## Display distribution of power parameters

In [None]:
import plotly.express as px

# One histogram per electrical quantity in the first training frame:
# (column, figure title, x-axis label).
for column, title, axis_label in (
    ("Power", 'Power Distribution', "Power[mW]"),
    ("Voltage_V", 'Battery Voltage Distribution', "Voltage[V]"),
    ("Current", 'Current Distribution', "Current[mA]"),
):
    fig = px.histogram(raw_dfs_train[0], x=column, title=title)
    fig.update_xaxes(title_text=axis_label)
    fig.show()

### Display motor speed vs time

In [None]:
# Filtered and raw motor RPM over time, one figure each.
# show() returns None, so it is called on its own line; chaining it onto the
# assignment left `fig` bound to None instead of the figure.
fig = px.line(raw_dfs_train[4], x="Datetime", y="filtered_motor_rpm", title='Speed vs Time')
fig.show()
fig = px.line(raw_dfs_train[4], x="Datetime", y="motor_rpm", title='Speed vs Time')
fig.show()

### Display Lat/Long vs power(unbinned)

In [None]:
#latitude_start_pt, longitude_start_pt = 51.45282, -0.2275045
from scipy.signal import find_peaks

# Start point of the loop as [latitude, longitude].
L1 = [51.45282, -0.2275045]
def plot_proximity_to_start_point(df):
    """Split a ride into loops by its closest passes to the start point.

    Adds two columns to ``df`` in place:

    * ``distance``: Euclidean distance, in latitude/longitude units, between
      each row and ``L1``.
    * ``loop_number``: 0 for rows outside any detected loop, otherwise the
      1-based number of the loop the row belongs to.

    Shows the distance-over-time plot, then writes the per-loop power profile
    to ``output/Power_profile_on_each_loop.html`` and shows it.

    Args:
        df: DataFrame with ``latitude``, ``longitude``, ``Datetime_copy`` and
            ``Power`` columns.
    """
    df['distance'] = df[['latitude', 'longitude']].sub(np.array(L1)).pow(2).sum(1).pow(0.5)

    px.line(df, x="Datetime_copy", y="distance", title='Distance from start point').show()

    # Negating the distance turns closest approaches into peaks. A pass only
    # counts when the distance is below 0.0005 and it is at least 2000 rows
    # away from the neighbouring pass.
    indices = find_peaks(-df['distance'], distance=2000, height=-0.0005)[0]

    df["loop_number"] = 0
    # find_peaks returns row POSITIONS, so assign positionally with iloc.
    # Label-based df.loc only matches positions when the index is the default
    # 0..n-1 range, which no longer holds once rows have been dropped.
    loop_col = df.columns.get_loc("loop_number")
    for loop_number, (start, end) in enumerate(zip(indices[:-1], indices[1:]), start=1):
        df.iloc[start:end, loop_col] = loop_number

    fig = px.line(df, x="latitude", y="Power", color='loop_number', title= "Power profile on each loop")
    fig.write_html("output/Power_profile_on_each_loop.html")

    fig.show()


plot_proximity_to_start_point(df_ml_train)

### Calculate Energy Consumption each loop

In [None]:
def calculate_energy_consumption_for_each_loop(df_ml):
    """Compute the energy used in each loop of the ride.

    Args:
        df_ml: DataFrame with ``loop_number``, ``Datetime`` and ``Power``
            columns.

    Returns:
        DataFrame with one row per distinct ``loop_number`` (in order of first
        appearance) and columns ``Loop_number`` and ``Energy_Kilo_Joules``.
    """
    def _loop_energy(loop_id):
        # Energy over just the rows that belong to this loop.
        rows = df_ml[df_ml["loop_number"] == loop_id]
        return energy_from_power_time(rows["Datetime"], rows["Power"])

    records = [(loop_id, _loop_energy(loop_id))
               for loop_id in df_ml['loop_number'].unique()]
    return pd.DataFrame(records, columns=['Loop_number', 'Energy_Kilo_Joules'])

def plot_energy_consumption_per_loop(df_ml):
    """Plot the energy used per loop and save it as HTML.

    The y axis is fixed to the range 48-65. The figure is written to
    ``output/Energy_consumption_per_loop.html`` and then shown.
    """
    fig = px.line(
        calculate_energy_consumption_for_each_loop(df_ml),
        x="Loop_number",
        y="Energy_Kilo_Joules",
        title='Energy consumption of each loop around Putney Heath',
    )
    fig.update_layout(yaxis_range=[48, 65])
    fig.write_html("output/Energy_consumption_per_loop.html")
    fig.show()


plot_energy_consumption_per_loop(df_ml_train)

### Display Lat/Long vs power(binned)

In [None]:
def plot_proximity_to_start_point_binned(df):
    """Label loops by index value and plot the power profile of each loop.

    Adds ``distance``, ``ts`` (a copy of the index values) and ``loop_number``
    columns to ``df`` in place. Loop boundaries are the closest passes to
    ``L1``; rows are labelled by slicing the index between the boundary
    labels. The figure is written to
    ``output/Power_profile_on_each_loop_binned.html`` and then shown.
    """
    start_point = np.array(L1)
    df['distance'] = df[['latitude', 'longitude']].sub(start_point).pow(2).sum(1).pow(0.5)

    # Closest passes are the peaks of the negated distance.
    closest_passes, _ = find_peaks(-df['distance'], distance=2000, height=-0.0005)

    df["ts"] = df.index.values
    df["loop_number"] = 0
    boundaries = zip(closest_passes[:-1], closest_passes[1:])
    for loop_id, (first, last) in enumerate(boundaries, start=1):
        first_label = df["ts"].iloc[first]
        last_label = df["ts"].iloc[last]
        # Label-based slice on the index.
        df.loc[first_label:last_label, "loop_number"] = loop_id

    fig = px.line(df, x="latitude", y="Power", color='loop_number', title= "Power profile on each loop")
    fig.write_html("output/Power_profile_on_each_loop_binned.html")
    fig.show()


plot_proximity_to_start_point_binned(df_ml_train)

In [None]:
# Scatter of power against latitude for the training data.
# show() returns None, so it is called separately to keep the figure in `fig`.
fig = px.scatter(df_ml_train, x="latitude", y="Power", title='Latitude vs Power')
fig.show()

### Display IMU data

In [None]:

def display_imu_plots(df_imu):
    """Scatter x acceleration against x gyro rate, for raw and filtered data.

    Args:
        df_imu: DataFrame with ``gyro_x``, ``acceleration_x``,
            ``gyro_x_filtered`` and ``acceleration_x_filtered`` columns.
    """
    fig = go.Figure()

    # (x column, y column, legend name) for each trace.
    series = (
        ("gyro_x", "acceleration_x", 'Raw data'),
        ("gyro_x_filtered", "acceleration_x_filtered", 'Filtered data'),
    )
    for gyro_col, accel_col, legend in series:
        fig.add_trace(go.Scatter(x=df_imu[gyro_col],
                                 y=df_imu[accel_col],
                                 mode='markers',
                                 marker={"size": 1},
                                 name=legend))

    fig.update_layout(
        title="IMU Acceleration vs Angular velocity(gyro)",
        xaxis_title="Angular Velocity[rad/s]",
        yaxis_title="Acceleration[m/s^2]",
    )
    fig.show()


display_imu_plots(raw_dfs_train[5])

In [None]:
# Raw x acceleration against x gyro rate, drawn with 1-pixel markers.
fig = px.scatter(
    raw_dfs_train[5],
    x="gyro_x",
    y="acceleration_x",
    title='Acceleration[m/s^2] vs Angular velocity[rad/s]',
)
fig.update_traces(marker={"size": 1}).show()

In [None]:
# Raw and filtered x acceleration over time.
# show() returns None, so it is called separately to keep the figure in `fig`.
fig = px.scatter(raw_dfs_train[5], x="Datetime", y=["acceleration_x","acceleration_x_filtered"], title='Acceleration over Time')
fig.show()

In [None]:
# Raw and averaged power over time for the training data.
# show() returns None, so it is called separately to keep the figure in `fig`.
fig = px.line(df_ml_train, x="Datetime", y=["Power","Power_averaged"], title='Power over Time')
fig.show()

## Train ML model XGboost

In [None]:
# Split the training frame into inputs and target using the configured column
# lists, then fit the model with do_ml (presumably an XGBoost regressor, per
# the section heading -- see do_ml).
x, y = split_x_y(df_ml_train, params_x_for_ml, params_y_for_ml)
xgbr = do_ml(x, y)

## Plot predicted and actual data

In [None]:
# Run the fitted model on the held-out test data.
make_predictions_xgboost(xgbr, df_ml_test)

## Create tests of short runs (600 s, i.e. 10 minutes, by default)

In [None]:
def test_short_runs(xgbr, df_ml_test, predictor, scaler = None, test_length = 600, row_timestep = 0.0, show_plots=False):
    """Score a model on consecutive short slices of the test data.

    The test frame is cut row-wise into ``n_rows // (test_length / row_timestep)``
    nearly equal consecutive slices, and ``predictor`` is run on each one.

    Args:
        xgbr: Fitted model; passed through to ``predictor`` unchanged.
        df_ml_test: Test DataFrame to slice.
        predictor: Callable invoked as ``predictor(xgbr, slice, plot=show_plots)``;
            its return value is collected as that slice's score.
        scaler: Unused; kept so existing callers keep working.
        test_length: Duration of one slice, in the same time unit as
            ``row_timestep`` (presumably seconds).
        row_timestep: Time between consecutive rows. Must be positive; the
            default of 0.0 is only a placeholder, so callers have to pass it.
        show_plots: Forwarded to ``predictor`` as ``plot``.

    Returns:
        List with one score per slice (empty if ``df_ml_test`` has no rows).

    Raises:
        ValueError: If ``test_length`` or ``row_timestep`` is not positive.
    """
    if row_timestep <= 0 or test_length <= 0:
        # Fail with a clear message instead of a ZeroDivisionError further down.
        raise ValueError("test_length and row_timestep must both be positive")

    n_rows = len(df_ml_test.index)
    if n_rows == 0:
        return []

    rows_per_test = test_length / row_timestep
    # np.array_split rejects 0 sections, so data shorter than one window is
    # scored as a single run.
    n_tests = max(1, int(n_rows // rows_per_test))

    # Split row positions and slice with iloc, rather than handing the
    # DataFrame itself to np.array_split: that path goes through
    # DataFrame.swapaxes, which recent pandas versions deprecate.
    scores = []
    for positions in np.array_split(np.arange(n_rows), n_tests):
        test = df_ml_test.iloc[positions]
        scores.append(predictor(xgbr, test, plot=show_plots))

    return scores
        

In [None]:
# Score the XGBoost model on consecutive windows of the test data
# (window length 600, rows spaced 0.01 apart).
scores = test_short_runs(
    xgbr,
    df_ml_test,
    make_predictions_xgboost,
    test_length=600,
    row_timestep=0.01,
    show_plots=False,
)

In [None]:
# Distribution of the per-run scores, with a kernel density estimate overlaid.
sns.displot(x=scores, kde=True)

## Display interesting variables

In [None]:
# Helper imported from gen_plots; called with the test data and the fitted model.
display_interesting_variables(df_ml_test, xgbr)

## Use Neural Network with multidimensional input to do regression
Currently, we pass individual rows into the XGBoost model. What if we could instead feed in a snapshot of 10 seconds of data containing all the features, and calculate the energy consumption of that snapshot? It's like a photograph fed to a deep neural network: a 2-dimensional input. Run the StructuredDataRegressor.
You can also leave the epochs unspecified for an adaptive number of epochs.

In [None]:
# Initialize the structured data regressor.
reg = ak.StructuredDataRegressor(
    #overwrite=True,
    max_trials=3
)  # It tries 3 different models.

x, y = split_x_y(df_ml_train, params_x_for_ml, params_y_for_ml)

# Create the scaler in this cell. It was previously only created in the later
# scikit-learn cell, so running the notebook top to bottom hit a NameError
# here. fit_transform refits it on the training inputs either way.
scaler = StandardScaler()
x = scaler.fit_transform(x)

# Feed the structured data regressor with training data.
with tf.device('/gpu:0'):
    reg.fit(x,
            y,
            epochs=5
           )

# Export the best model found as a plain Keras model.
model = reg.export_model()

print(type(model))  # <class 'tensorflow.python.keras.engine.training.Model'>

# Best effort: try the TensorFlow SavedModel format first and fall back to
# HDF5 if that fails.
try:
    model.save("model_autokeras", save_format="tf")
except Exception:
    model.save("model_autokeras.h5")


In [None]:
def plot_predicted_data_autokeras(model, df_ml_test,scaler):
    """Predict on the test data, then print the score and plot the result.

    Args:
        model: Object with a ``predict`` method whose result supports
            ``.flatten()``.
        df_ml_test: Test DataFrame; must contain a ``Datetime`` column.
        scaler: Already-fitted scaler used to transform the inputs.
    """
    features, target = split_x_y(df_ml_test, params_x_for_ml, params_y_for_ml)
    predictions = model.predict(scaler.transform(features)).flatten()

    timestamps = df_ml_test["Datetime"]
    print_power_consumption_score(timestamps, target, predictions)
    plot_predicted_data(timestamps, target, predictions)

In [None]:
# Evaluate the AutoKeras regressor on the test data with the fitted scaler.
plot_predicted_data_autokeras(model=reg, df_ml_test=df_ml_test, scaler=scaler)

## Use Scikit learn simple functions

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import svm


# Standardise the inputs and fit an ordinary least-squares baseline.
scaler = StandardScaler()

# Drop (in place) training rows that are missing any input column.
df_ml_train.dropna(subset=params_x_for_ml, inplace=True)
x, y = split_x_y(df_ml_train, params_x_for_ml, params_y_for_ml)
x = scaler.fit_transform(x)

#reg = svm.SVR().fit(x, y)
reg = LinearRegression()
reg.fit(x, y)
# Score on the training data itself.
print("Score: ", reg.score(x, y))

In [None]:
# Evaluate the scikit-learn model on the test data with the fitted scaler.
plot_predicted_data_autokeras(model=reg, df_ml_test=df_ml_test, scaler=scaler)

In [None]:

def score_predicted_data_sk(model, df_ml_test):
    """Predict on the test data and compute its power-consumption score.

    Uses the module-level ``scaler``, which must already be fitted.

    Returns:
        Tuple ``(actual, predicted, error)`` where ``error`` is whatever
        ``print_power_consumption_score`` returns.
    """
    features, actual = split_x_y(df_ml_test, params_x_for_ml, params_y_for_ml)
    predicted = model.predict(scaler.transform(features))
    error = print_power_consumption_score(df_ml_test["Datetime"], actual, predicted)
    return actual, predicted, error

def make_predictions_sk(model, df_ml_test, plot=True):
    """Score ``model`` on the test data and optionally plot the predictions.

    Args:
        model: Fitted model with a ``predict`` method.
        df_ml_test: Test DataFrame; must contain a ``Datetime`` column.
        plot: When truthy, plot actual against predicted values.

    Returns:
        The error reported by ``score_predicted_data_sk``.
    """
    y, ypred, error = score_predicted_data_sk(model, df_ml_test)
    if plot:
        plot_predicted_data(df_ml_test["Datetime"], y, ypred)
    return error

In [None]:
# Score the scikit-learn model on consecutive windows of the test data
# (window length 5*60, rows spaced 0.01 apart).
scores = test_short_runs(
    reg,
    df_ml_test,
    make_predictions_sk,
    test_length=5*60,
    row_timestep=0.01,
)

In [None]:
# Clamp the scores to [-100, 100] so extreme values do not stretch the plot.
scores = np.clip(scores, -100, 100 )
# Distribution of the clamped scores, with a kernel density estimate overlaid.
sns.displot(x=scores, kde=True)

## Now try to use the ImageRegression for autokeras

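This section is adapted from the AutoKeras image-regression tutorial, which treats the MNIST dataset as a
regression dataset: its prediction targets, integers ranging from 0 to 9, are treated as numerical values so that
they can be used directly as regression targets. Here the "images" are instead fixed-length windows of consecutive
sensor rows, and the regression target is the energy consumed during each window.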


In [None]:
def prep_for_autokeras_image_regressor(df_ml, chunk_rows=100, time_interval=0.010):
    """Cut the data into fixed-length windows with one energy target each.

    The input columns (``params_x_for_ml``) and the power column
    (``params_y_for_ml``) are split into consecutive windows of
    ``chunk_rows`` rows. For each window the inputs become one 2-D sample and
    the power readings are integrated into a single energy value. A line plot
    of the energies is shown as a side effect, and the output shapes are
    printed.

    Args:
        df_ml: DataFrame holding the input and power columns.
        chunk_rows: Rows per window. With the defaults, 100 rows of 0.01 s
            make 1-second windows.
        time_interval: Time between consecutive rows, in seconds.

    Returns:
        Tuple ``(x, y)``: ``x`` stacks the per-window input arrays and ``y``
        holds the matching energies in kilojoules.
    """
    data = df_ml[params_x_for_ml + [params_y_for_ml]].to_numpy()

    # Split at every multiple of chunk_rows. The split points start at 0, so
    # the first piece is always empty, and the last piece may be short; both
    # are dropped so that every remaining window has exactly chunk_rows rows.
    # (The last piece is dropped even when it happens to be full length.)
    chunks = np.array_split(data, range(0, len(df_ml), chunk_rows))[1:-1]

    # The last column is power in mW: mW * s = mJ, and 1 kJ = 1e6 mJ.
    energies = [np.sum(chunk[:, -1] * time_interval / 1000000) for chunk in chunks]
    # Everything except the last column is model input.
    chunks_edited = [chunk[:, :-1] for chunk in chunks]

    px.line(y=energies, title='Energies isolated').show()

    y = np.array(energies)
    x = np.array(chunks_edited)

    print(x.shape)
    print(y.shape)

    return x, y

In [None]:
def do_ml_image_regression(x, y):
    """Fit an AutoKeras ``ImageRegressor`` on ``x``/``y`` and return it.

    The search tries 3 candidate models and overwrites any previous search
    results. All of the data is used for fitting; no hold-out evaluation is
    done here.
    """
    # A train/test split plus reg.evaluate(...) could be added here to score
    # the best model on held-out data.
    regressor = ak.ImageRegressor(max_trials=3, overwrite=True)
    regressor.fit(x, y)
    return regressor


reg  = do_ml_image_regression(*prep_for_autokeras_image_regressor(df_ml_train))

In [None]:
# Predict with the best model.

def predict_and_score(reg, df_ml_test):
    """Predict per-window energy on the test data and compare with the actual values.

    Shows the predicted energies, then predicted and actual energies
    together, and finally passes the total predicted and total actual energy
    to ``get_energy_error``.

    Args:
        reg: Fitted regressor with a ``predict`` method.
        df_ml_test: Test DataFrame, windowed by
            ``prep_for_autokeras_image_regressor``.
    """
    x, y = prep_for_autokeras_image_regressor(df_ml_test)

    # Flatten once to a 1-D array with one value per window and reuse it.
    predicted = np.asarray(reg.predict(x)).flatten()

    px.line(y=predicted.tolist(), title='predicted_y').show()

    length = len(y)

    # Long-format table: predicted values first, then actual values, sharing
    # the same window index so the two lines overlay.
    data = {'Energies': predicted.tolist() + y.tolist(),
            'type_of_data': length * ["Predicted"] + length * ["Actual"],
            "Index": list(range(length)) * 2}
    df = pd.DataFrame(data)

    px.line(df, x="Index", y="Energies", color="type_of_data", title='Energies').show()

    # Sum the flattened predictions so both totals are scalars. The builtin
    # sum() over an un-flattened (n, 1) prediction array yields a length-1
    # array instead.
    get_energy_error(predicted.sum(), y.sum())


predict_and_score(reg, df_ml_test)