## Configuration

In [None]:
### Load Libraries
from pandas.api.indexers import BaseIndexer
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.text as txt
import numpy as np
import seaborn as sns

import warnings # this allows us to better control warning messages

# Set matplotlib to output plots without having to use plt.show()
%matplotlib inline 

# Set max row display
#pd.set_option('display.max_row', 200)

# Set iPython's max column width to 60
#pd.set_option('display.max_columns', 60)

# Plot and display configuration shared by every cell below.
# Default size (width, height) for all matplotlib figures.
plt.rcParams['figure.figsize'] = [30, 30]

# Apply seaborn's default theme to all subsequent plots.
sns.set()

# Displays all columns of each DataFrame when the entire DataFrame is called
pd.set_option('display.max_columns', None)

# Uncomment to stop warning messages from being printed.
#warnings.simplefilter('ignore')

# Load the three input datasets; the CSV files are read relative to the
# current working directory.
DH_plus_dist=pd.read_csv("DH_plus_dist_interpolated.csv")
LA_plus_dist=pd.read_csv("LA_plus_dist_interpolated.csv")
Lucy_16_20_May_plus_dist=pd.read_csv("Lucy_16-20May_plus_dist_interpolated.csv")

### Custom Functions

#### Custom Indexer

In [None]:
class CustomIndexer(BaseIndexer):
    """Centred, record-count rolling-window indexer.

    The window length is read from the ``dynamic_window_size`` attribute,
    which must be supplied as a keyword argument when the indexer is built,
    e.g. ``CustomIndexer(dynamic_window_size=5)``. The inherited
    ``window_size`` attribute is printed but otherwise not used.

    Windows are always centred, regardless of the ``center`` argument, and
    are truncated at both ends of the data instead of running out of bounds.
    """

    # NOTE(review): this signature targets pandas 1.4.x. pandas >= 1.5 also
    # passes a ``step`` keyword, so a ``step=None`` parameter would be needed
    # there. It is not added here because pandas appears to validate that the
    # parameter names match its own BaseIndexer.get_window_bounds, so adding
    # it would break 1.4.x -- confirm against the installed version before
    # upgrading.
    def get_window_bounds(self, num_values, min_periods, center, closed):
        """Return the ``(start, end)`` bounds of the window for each record.

        Bounds behave like ``range(start, end)``: ``start`` is included and
        ``end`` is excluded. For record ``i`` the untruncated window is
        ``[i - w // 2, i + (w - w // 2))`` where ``w`` is
        ``dynamic_window_size``, so it always contains exactly ``w`` records.
        For an even ``w`` the centre therefore sits on the right-hand one of
        the two middle positions.

        Parameters
        ----------
        num_values : int
            Number of records being rolled over.
        min_periods, center, closed
            Accepted for compatibility with pandas; not used.

        Returns
        -------
        tuple of numpy.ndarray
            Two ``int64`` arrays of length ``num_values``.
        """
        print("Custom indexer number of values:",num_values)
        print("Indexer fixed window size (inherited parameter with immutable value and is not used):",self.window_size)
        print("Indexer dynamic window size (custom parameter with mutable value and is used):",self.dynamic_window_size)

        window = int(self.dynamic_window_size)

        # Pure integer arithmetic. The previous round(window / 2) used
        # banker's rounding, so odd sizes such as 5 or 9 lost one record.
        min_shift = window // 2
        max_shift = window - min_shift

        positions = np.arange(num_values, dtype=np.int64)
        # Clip both bounds so windows near either edge (or a window longer
        # than the data itself) never point outside the array.
        start = np.maximum(positions - min_shift, 0).astype(np.int64)
        end = np.minimum(positions + max_shift, num_values).astype(np.int64)

        return start, end

In [None]:
def remove_duplicate_datetimes(dataset):
    """Collapse records that share an index value into a single record.

    Every record's ``distance`` is replaced by the mean distance of all
    records with the same index value, then only the first record for each
    index value is kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``distance`` column. The column is overwritten in
        place with the per-index mean (same side effect as before).

    Returns
    -------
    pandas.DataFrame
        A new DataFrame whose index contains no duplicates.
    """
    is_duplicate = dataset.index.duplicated()
    print("Number of duplicate records found and removed:",is_duplicate.sum())

    # transform() keeps the original index and length, so the means can be
    # written straight back onto the records they were computed from.
    mean_distance = dataset.groupby(dataset.index,sort=False)['distance'].transform('mean')
    dataset['distance'] = mean_distance

    # Deduplicate on the index itself. DataFrame.drop_duplicates() compares
    # column values only, so it left duplicated timestamps in place whenever
    # another column differed, and dropped identical rows that belonged to
    # different timestamps.
    return dataset.loc[~is_duplicate]
    # For potential future testing
    # 1. Count initial number of records.
    # 2. Count number of duplicates within the initial DataFrame.
    # 3. Remove duplicates.
    # 4. Take the difference in record count before and after removal.
    # 5. Does the difference equal the number of duplicates found?

In [None]:
# Regular hourly average and standard deviation distance and centred rolling window average and standard deviation distance

def hourly_summary(dataset):
    """Add hourly and rolling-hour distance statistics to ``dataset``.

    Four columns are written onto ``dataset`` in place: ``hourly_mean`` and
    ``hourly_standard_deviation`` (statistics of the calendar date/hour the
    record falls in) plus ``rolling_hourly_mean`` and
    ``rolling_hourly_standard_deviation`` (centred one-hour rolling window).

    Returns
    -------
    tuple
        ``(dataset, hourly_dataset)``: the same DataFrame object that was
        passed in, and a new DataFrame indexed by (date, hour) holding the
        mean and standard deviation of ``distance`` for each hour.
    """
    # One grouping, keyed on calendar date and hour of the index, feeds both
    # the per-hour summary table and the per-record columns.
    per_hour = dataset.groupby([dataset.index.date, dataset.index.hour])

    # agg() yields one row per (date, hour), so this table has a different
    # shape from the input and is returned separately.
    hourly_dataset = per_hour.agg({'distance': ['mean', 'std']})

    # transform() keeps the input's index and length, so each record receives
    # the statistic of the hour it belongs to.
    per_hour_distance = per_hour['distance']
    mean_per_record = per_hour_distance.transform('mean')
    std_per_record = per_hour_distance.transform('std')
    dataset['hourly_mean'] = mean_per_record
    dataset['hourly_standard_deviation'] = std_per_record

    # Centred one-hour window around every record.
    hour_window = dataset['distance'].rolling('1h', center=True)
    dataset['rolling_hourly_mean'] = hour_window.mean()
    dataset['rolling_hourly_standard_deviation'] = hour_window.std()

    return dataset, hourly_dataset
    

#### Display Time Series Functions

NOTE: To plot only the time of day on the x-axis from a `Timestamp`, call the `time()` method. Accessing `time` as an attribute (without the parentheses) does not return the time value itself.

Relative range: the y-axis is not fixed from 0 to the largest distance value; it spans from the smallest to the largest recorded value instead.

In [None]:
def satisfied_condition_record_count(dataset):
    """Print how many records pass or fail the filtering condition.

    Counts are printed for every distinct ``sample_date`` and then as totals
    over the whole dataset. If ``dataset`` has no
    ``satisfied_filtering_condition`` column, only the two totals are printed
    and both are 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``sample_date`` column.
    """
    dates = dataset.loc[:,'sample_date'].unique()
    total_true_records = 0
    total_false_records = 0

    if "satisfied_filtering_condition" in dataset.columns:
        for date in dates:
            flags = dataset.loc[dataset.sample_date==date,'satisfied_filtering_condition']
            # Compare explicitly against True/False so that missing flags are
            # counted in neither group.
            true_records = int((flags == True).sum())
            false_records = int((flags == False).sum())
            print("Within",date)
            print("Number of records satisfying filtering condition:",true_records)
            print("Number of records not satisfying filtering condition:",false_records)
            # Accumulate plain row counts. DataFrame.count() returns one
            # count per column, which previously turned the totals into
            # Series.
            total_true_records += true_records
            total_false_records += false_records

    print("Total number of records satisfying filtering condition:",total_true_records)
    print("Total of records not satisfying filtering condition:",total_false_records)

In [None]:
def relative_range_time_series_line(dataset, *other_column_names):
    """Draw one line chart of ``distance`` per sample date.

    Each date gets its own subplot spanning 00:00:00 to 23:59:59 on the
    x-axis. The y-axis is left to matplotlib's automatic scaling.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``sample_date``, ``sample_time`` and ``distance``
        columns. Dates and times are concatenated as strings and parsed
        day-first.
    *other_column_names : str
        Extra columns to overlay as dashed lines; names that are not columns
        of ``dataset`` are ignored.
    """
    dates = dataset.loc[:,'sample_date'].unique()
    print(dates)
    # squeeze=False always returns a 2-D array of Axes. Without it a dataset
    # covering a single date yields a bare Axes object and indexing it fails.
    figure, axes = plt.subplots(dates.size, squeeze=False)
    for axis, date in zip(axes[:, 0], dates):
        day_start = pd.to_datetime(date+" "+"00:00:00", dayfirst = True)
        day_end = pd.to_datetime(date+" "+"23:59:59", dayfirst = True)
        print(day_start)
        print(day_end)
        condition = dataset.sample_date==date
        times = dataset.loc[condition,'sample_time']
        datetimes = pd.to_datetime(date+" "+times, dayfirst=True)
        # Fix the x-range to the whole day before plotting.
        axis.set_xlim(day_start,day_end)
        axis.plot(datetimes,dataset.loc[condition,'distance'],color='g')
        for column in other_column_names:
            if column in dataset.columns:
                axis.plot(datetimes, dataset.loc[condition,column],'--')

In [None]:
def relative_range_time_series_plot(dataset, *other_column_names):
    """Draw one marker plot of ``distance`` per sample date.

    Each date gets its own subplot spanning 00:00:00 to 23:59:59 on the
    x-axis. The y-axis is left to matplotlib's automatic scaling. When the
    dataset has a ``satisfied_filtering_condition`` column, records flagged
    True are drawn in green and records flagged False in red; otherwise all
    records are green.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``sample_date``, ``sample_time`` and ``distance``
        columns. Dates and times are concatenated as strings and parsed
        day-first.
    *other_column_names : str
        Extra columns to overlay as dashed lines; names that are not columns
        of ``dataset`` are ignored.
    """
    dates = dataset.loc[:,'sample_date'].unique()
    print(dates)
    # squeeze=False always returns a 2-D array of Axes. Without it a dataset
    # covering a single date yields a bare Axes object and indexing it fails.
    figure, axes = plt.subplots(dates.size, squeeze=False)
    has_filter_flag = "satisfied_filtering_condition" in dataset.columns
    for axis, date in zip(axes[:, 0], dates):
        day_start = pd.to_datetime(date+" "+"00:00:00", dayfirst = True)
        day_end = pd.to_datetime(date+" "+"23:59:59", dayfirst = True)
        condition = dataset.sample_date==date
        times = dataset.loc[condition,'sample_time']
        datetimes = pd.to_datetime(date+" "+times, dayfirst=True)
        axis.set_xlim(day_start,day_end)
        # The filtering condition is only shown on marker plots, where it is
        # easier to read than on lines.
        if has_filter_flag:
            distances = dataset.loc[condition,['distance','satisfied_filtering_condition']]
            passed = distances.loc[distances.satisfied_filtering_condition == True,:]
            failed = distances.loc[distances.satisfied_filtering_condition == False,:]
            # NOTE(review): x-values here come from the DataFrame index, not
            # from sample_date/sample_time -- assumes a matching datetime
            # index; confirm against callers.
            axis.plot(passed.index,passed.distance,'x',markersize=3,color='g')
            axis.plot(failed.index,failed.distance,'x',markersize=3,color='r')
        else:
            axis.plot(datetimes,dataset.loc[condition,'distance'],'x',markersize=3,color='g')
        for column in other_column_names:
            if column in dataset.columns:
                axis.plot(datetimes, dataset.loc[condition,column],'--')

With Y axis range from 0 to largest distance value.

In [None]:
def absolute_range_time_series_line(dataset, *other_column_names):
    """Draw one line chart of ``distance`` per sample date on a shared y-scale.

    Each date gets its own subplot spanning 00:00:00 to 23:59:59 on the
    x-axis. Every subplot uses the same y-range, from 0 to the largest
    distance in the whole dataset, so days can be compared directly.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``sample_date``, ``sample_time`` and ``distance``
        columns. Dates and times are concatenated as strings and parsed
        day-first.
    *other_column_names : str
        Extra columns to overlay as dashed lines; names that are not columns
        of ``dataset`` are ignored.
    """
    dates = dataset.loc[:,'sample_date'].unique()
    largestY = dataset.loc[:,'distance'].max()
    print("Largest distance value:",largestY)
    print(dates)
    # squeeze=False always returns a 2-D array of Axes. Without it a dataset
    # covering a single date yields a bare Axes object and indexing it fails.
    figure, axes = plt.subplots(dates.size, squeeze=False)
    for axis, date in zip(axes[:, 0], dates):
        day_start = pd.to_datetime(date+" "+"00:00:00", dayfirst = True)
        day_end = pd.to_datetime(date+" "+"23:59:59", dayfirst = True)
        condition = dataset.sample_date==date
        times = dataset.loc[condition,'sample_time']
        datetimes = pd.to_datetime(date+" "+times, dayfirst=True)
        axis.set_xlim(day_start,day_end)
        axis.set_ylim(0,largestY)
        axis.plot(datetimes,dataset.loc[condition,'distance'],color='g')
        for column in other_column_names:
            if column in dataset.columns:
                axis.plot(datetimes, dataset.loc[condition,column],'--')

In [None]:
def absolute_range_time_series_plot(dataset, *other_column_names):
    """Draw one marker plot of ``distance`` per sample date on a shared y-scale.

    Each date gets its own subplot spanning 00:00:00 to 23:59:59 on the
    x-axis. Every subplot uses the same y-range, from 0 to the largest
    distance in the whole dataset. When the dataset has a
    ``satisfied_filtering_condition`` column, records flagged True are drawn
    in green and records flagged False in red; otherwise all records are
    green.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``sample_date``, ``sample_time`` and ``distance``
        columns. Dates and times are concatenated as strings and parsed
        day-first.
    *other_column_names : str
        Extra columns to overlay as dashed lines; names that are not columns
        of ``dataset`` are ignored.
    """
    dates = dataset.loc[:,'sample_date'].unique()
    largestY = dataset.loc[:,'distance'].max()
    print("Largest distance value:",largestY)
    print(dates)
    # squeeze=False always returns a 2-D array of Axes. Without it a dataset
    # covering a single date yields a bare Axes object and indexing it fails.
    figure, axes = plt.subplots(dates.size, squeeze=False)
    has_filter_flag = "satisfied_filtering_condition" in dataset.columns
    for axis, date in zip(axes[:, 0], dates):
        day_start = pd.to_datetime(date+" "+"00:00:00", dayfirst = True)
        day_end = pd.to_datetime(date+" "+"23:59:59", dayfirst = True)
        condition = dataset.sample_date==date
        times = dataset.loc[condition,'sample_time']
        datetimes = pd.to_datetime(date+" "+times, dayfirst=True)
        axis.set_xlim(day_start,day_end)
        axis.set_ylim(0,largestY)
        # The filtering condition is only shown on marker plots, where it is
        # easier to read than on lines.
        if has_filter_flag:
            distances = dataset.loc[condition,['distance','satisfied_filtering_condition']]
            passed = distances.loc[distances.satisfied_filtering_condition == True,:]
            failed = distances.loc[distances.satisfied_filtering_condition == False,:]
            # NOTE(review): x-values here come from the DataFrame index, not
            # from sample_date/sample_time -- assumes a matching datetime
            # index; confirm against callers.
            axis.plot(passed.index,passed.distance,'x',markersize=3,color='g')
            axis.plot(failed.index,failed.distance,'x',markersize=3,color='r')
        else:
            axis.plot(datetimes,dataset.loc[condition,'distance'],'x',markersize=3,color='g')
        for column in other_column_names:
            if column in dataset.columns:
                axis.plot(datetimes, dataset.loc[condition,column],'--')

#### Time Series Clustering

For plotting clusters of different distance values on the same graphs.

In [None]:
def inertia_elbow_method_of_whole_duration(dataset):
    """Plot k-means inertia against cluster count (elbow method).

    The clustered points are (unix-epoch seconds, value) pairs pooled from
    three series: the raw ``distance`` column, the 5-record mean and the
    25-record mean. The two means are taken from the ``five_record_mean`` /
    ``twenty_five_record_mean`` columns when present, and otherwise computed
    as centred rolling means of ``distance``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``distance`` column and a datetime index.
    """
    raw_values = dataset.distance

    if "five_record_mean" in dataset.columns:
        five_record_values = dataset.five_record_mean
    else:
        five_record_values = dataset.loc[:,'distance'].rolling(5,center=True).mean()

    if "twenty_five_record_mean" in dataset.columns:
        twenty_five_record_values = dataset.twenty_five_record_mean
    else:
        twenty_five_record_values = dataset.loc[:,'distance'].rolling(25,center=True).mean()

    all_data_points = []
    for values in (raw_values, five_record_values, twenty_five_record_values):
        # Centred rolling means are NaN at both ends of the series and
        # KMeans rejects NaN input, so incomplete points are dropped.
        values = values.dropna()
        # Convert the datetime index to whole seconds since 1970-01-01,
        # because k-means only works on numeric values.
        unix_epoch = (values.index - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
        all_data_points.extend(zip(unix_epoch, values))

    # KMeans cannot form more clusters than there are points, so the usual
    # 1..20 sweep is capped for very small datasets.
    number_of_clusters = range(1, min(20, len(all_data_points)) + 1)

    inertias = []
    for cluster_count in number_of_clusters:
        kmeans = KMeans(n_clusters=cluster_count)
        kmeans.fit(all_data_points)
        inertias.append(kmeans.inertia_)

    plt.plot(number_of_clusters, inertias, marker='o')
    plt.title('Elbow method')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.show()

In [None]:
def k_means_clustering_of_whole_duration(dataset, num_of_clusters):
    """Cluster the distance series with k-means and scatter-plot the result.

    The clustered points are (unix-epoch seconds, value) pairs pooled from
    the ``distance``, ``five_record_mean`` and ``twenty_five_record_mean``
    columns. Each point is coloured by the cluster it is assigned to.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the three columns above and a datetime index.
    num_of_clusters : int
        Number of clusters passed to ``KMeans``.
    """
    kmeans = KMeans(n_clusters=num_of_clusters)

    # Whole seconds since 1970-01-01, because k-means needs numeric input.
    unix_epoch = ((dataset.index - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")).to_numpy()

    all_data_points = np.concatenate([
        np.column_stack((unix_epoch, dataset[column].to_numpy()))
        for column in ('distance', 'five_record_mean', 'twenty_five_record_mean')
    ]).astype(float)

    # Rolling-mean columns can hold NaN at the ends of the series and KMeans
    # rejects NaN input, so incomplete points are dropped.
    all_data_points = all_data_points[~np.isnan(all_data_points).any(axis=1)]

    # fit_predict() fits the model and returns the labels in one step; the
    # separate fit() call that used to precede it only repeated the work.
    y_km = kmeans.fit_predict(all_data_points)

    plt.scatter(all_data_points[:,0], all_data_points[:,1], c=y_km)

An attempt to apply the elbow method to each day of the provided dataset on separate graphs. It was left unfinished because of time constraints.

In [None]:
#Attempt to apply elbow method to each day of the provided dataset on seperate graphs. Time constraints held this back.

#def inertia_elbow_method_of_daily_duration(dataset):
    
#    dates = dataset.loc[:,'sample_date'].unique()

#    figure, axis = plt.subplots(dates.size)
#    i = 0
#    for date in dates:
#        min = pd.to_datetime(date+" "+"00:00:00", dayfirst = True)
#        max = pd.to_datetime(date+" "+"23:59:59", dayfirst = True)
#        condition = dataset.sample_date==date

#        axis[i].set_xlim(min,max)
        # Filterning condition shown only for plots instead of lines as it is more intuative.
        
#        raw_unix_epoch = (raw_values.index - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
    
#        five_record_unix_epoch = (five_record_values.index - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
    
#        twenty_five_record_unix_epoch = (twenty_five_record_values.index - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")

    
#        all_data_points = list(zip(raw_unix_epoch.array, raw_values.array)) + list(zip(five_record_unix_epoch.array, five_record_values.array)) + list(zip(twenty_five_record_unix_epoch.array, twenty_five_record_values.array))
    
    
        #five_record_data_points = list(zip(five_record_unix_epoch.array, five_record_values.array))
    
        #twenty_five_record_data_points = list(zip(twenty_five_record_unix_epoch.array, twenty_five_record_values.array))

#        number_of_values = len(raw_values) + len(five_record_values) + len(twenty_five_record_values)
#        for i in number_of_clusters:
        
#            kmeans = KMeans(n_clusters=i)
        
#            kmeans.fit(all_data_points)
#            inertias.append(kmeans.inertia_)

#        plt.plot(number_of_clusters, inertias, marker='o')
#        plt.title('Elbow method')
#        plt.xlabel('Number of clusters')
#        plt.ylabel('Inertia')
#        plt.show()
        
        
#        if "satisfied_filtering_condition" in dataset.columns:
#            distances = dataset.loc[condition,['distance','satisfied_filtering_condition']]
#            green_datetimes = distances.loc[distances.satisfied_filtering_condition == True,:]
#            red_datetimes = distances.loc[distances.satisfied_filtering_condition == False,:]

#            axis[i].plot(green_datetimes.index,green_datetimes.distance,'x',markersize=3,color='g')
#            axis[i].plot(red_datetimes.index,red_datetimes.distance,'x',markersize=3,color='r')
#        else:
#            distances = dataset.loc[condition,'distance']
#            axis[i].plot(datetimes,distances,'x',markersize=3,color='g')
#        for arg in other_column_names:
#            if arg in dataset.columns:
#                axis[i].plot(datetimes, dataset.loc[condition,arg],'x',markersize=3)
#        i+=1
        
    
#    inertias = []
    
#    number_of_clusters = range(1,21)
    
#    raw_values = dataset.distance
    
#    if "five_record_mean" in dataset.columns:
#        five_record_values = dataset.five_record_mean
#    else:
#        five_record_values = dataset.loc[:,'distance'].rolling(5,center=True).mean()
        
#    if "twenty_five_record_mean" in dataset.columns:
#        twenty_five_record_values = dataset.twenty_five_record_mean
#    else:
#        twenty_five_record_values = dataset.loc[:,'distance'].rolling(25,center=True).mean()
    
    #print(type(str(raw_values.index.array)))
    
    #datetimes = raw_values.index.array.strftime('%d/%m/%y %H:%M:%S')
    #print(type(datetimes[0]))
    #print(datetimes[0])
    
    ## Convert Datetime indexes to integers in the form of Unix epochs (non leap year seconds since 01/01/1970) as k-means clustering only compares against numeric values.
#    raw_unix_epoch = (raw_values.index - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
    
#    five_record_unix_epoch = (five_record_values.index - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
    
#    twenty_five_record_unix_epoch = (twenty_five_record_values.index - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")

    
#    all_data_points = list(zip(raw_unix_epoch.array, raw_values.array)) + list(zip(five_record_unix_epoch.array, five_record_values.array)) + list(zip(twenty_five_record_unix_epoch.array, twenty_five_record_values.array))
    
    
    #five_record_data_points = list(zip(five_record_unix_epoch.array, five_record_values.array))
    
    #twenty_five_record_data_points = list(zip(twenty_five_record_unix_epoch.array, twenty_five_record_values.array))

#    number_of_values = len(raw_values) + len(five_record_values) + len(twenty_five_record_values)
#    for i in number_of_clusters:
        
#            kmeans = KMeans(n_clusters=i)
        
#            kmeans.fit(all_data_points)
#            inertias.append(kmeans.inertia_)

#    plt.plot(number_of_clusters, inertias, marker='o')
#    plt.title('Elbow method')
#    plt.xlabel('Number of clusters')
#    plt.ylabel('Inertia')
#    plt.show()

An attempt to apply clustering to each day of the provided dataset on separate graphs. It was left unfinished because of time constraints.

In [None]:
#Attempt to apply clustering to each day of the provided dataset on seperate graphs. Time constraints held this back.

#def k_means_clustering_of_daily_duration(dataset, num_of_clusters):
    
#    dates = dataset.loc[:,'sample_date'].unique()
#    print(dates)
#    figure, axis = plt.subplots(dates.size)
#    i = 0
#    for date in dates:
#        kmeans = KMeans(n_clusters=num_of_clusters)
        
#        min = pd.to_datetime(date+" "+"00:00:00", dayfirst = True) - pd.Timestamp("1970-01-01") // pd.Timedelta("1s")
#        max = pd.to_datetime(date+" "+"23:59:59", dayfirst = True) - pd.Timestamp("1970-01-01") // pd.Timedelta("1s")
#        condition = dataset.sample_date==date
#        times = dataset.loc[condition,'sample_time']
#        datetimes = pd.to_datetime(date+" "+times, dayfirst=True)
#        axis[i].set_xlim(min,max)

#        distances = dataset.loc[condition,'distance']
#        five_record_means = dataset.loc[condition,'five_record_mean']
#        twenty_five_record_means = dataset.loc[condition,'twenty_five_record_mean']
        
#        unix_epoch = (times.index - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
    
#        all_data_points = np.array(list(zip(unix_epoch.array, distances.distance.array)) + list(zip(unix_epoch.array, five_record_means.five_record_mean.array)) + list(zip(unix_epoch.array, twenty_five_record_means.twenty_five_record_mean.array)))
        
#        kmeans.fit(all_data_points)
    
        # save new clusters for chart 
#        y_km = kmeans.fit_predict(all_data_points)
        
#        axis[i].scatter(all_data_points[:,0], all_data_points[:,1], c=y_km)

#        i+=1

#### Rolling Window Functions

In [None]:
## Old function

# def midpoints(window_size,dataframe):
    
    # Assuming all rolling windows are centered.
    # Because midpoint index must be set to an integer value, if the window size is even it needs to be rounded up to the nearest integer.
#    minShift = round(window_size/2)
#    maxShift = int(window_size/2)
#    indexes = dataframe.index # or indexes = temp.index
#    newIndexes = []
#    count = 0

    # This is to check the positon of the indexes
#    for i in range(indexes.size):

        # This check makes sure the earliest/lower bound DateTime is <= latest/upper bound DateTime
        # Prevents IndexError by goind out of bounds or a ValueError due to the list traversing to the end of the list as this is how arrays work in Python.
#        if i - minShift < 0:
#            earliest = indexes[0]
#            latest = indexes[i+maxShift]
#        elif i + maxShift > indexes.size-1:
#            earliest = indexes[i-minShift]
#            latest = indexes[indexes.size-1]
#        else:
#            earliest = indexes[i-minShift]
#            latest = indexes[i+maxShift]

#        midpoint = pd.Interval(earliest, latest, closed='both').mid
#        newIndexes.append(midpoint)
    
#    return newIndexes


In [None]:
def midpoints(window_size,dataframe):
    """Return the midpoint of every centred rolling window over the index.

    For the record at position ``i`` the window runs from position
    ``i - window_size // 2`` to ``i + (window_size - window_size // 2 - 1)``,
    both inclusive. For an even ``window_size`` the window therefore reaches
    one record further to the left than to the right. Windows are truncated
    at the first and last record. The midpoint is halfway between the first
    and last index value of the window.

    Parameters
    ----------
    window_size : int
        Number of records in a full window.
    dataframe : pandas.DataFrame
        Its index supplies the values the midpoints are computed from.

    Returns
    -------
    list
        One midpoint per record, in index order.
    """
    window_size = int(window_size)
    # Integer arithmetic only. The previous window_size / 2 produced floats
    # for even sizes, and positional indexing with a float is deprecated in
    # pandas 1.x (an IndexError from pandas 2.0, as far as I recall).
    min_shift = window_size // 2
    max_shift = window_size - min_shift - 1

    indexes = dataframe.index
    last_position = indexes.size - 1
    new_indexes = []

    for i in range(indexes.size):
        # Clamp both ends so windows near the edges, or a window longer than
        # the data itself, never read outside the index.
        earliest = indexes[max(i - min_shift, 0)]
        latest = indexes[min(i + max_shift, last_position)]
        new_indexes.append(pd.Interval(earliest, latest, closed='both').mid)

    return new_indexes


In [None]:
def statistical_summary(dataset):
    """Build a table of change and rolling-window statistics for ``distance``.

    The result starts with the record-to-record ``difference`` and
    ``fractional_change``. Then, for centred windows of three, five and ten
    records, it adds six columns named ``<size>_record_<statistic>``:
    midpoint, mean, standard_deviation, variance, difference and
    fractional_change. The last two are computed on the rolling mean.

    Time-based windows ('1min', '3min', '5min', '10min', 'h') were tried
    previously with the same set of statistics; only record-count windows
    are produced now.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``distance`` column.

    Returns
    -------
    pandas.DataFrame
        One row per record of ``dataset``.
    """
    distance = dataset.loc[:,'distance']

    rolling_window_summary = pd.DataFrame()
    rolling_window_summary['difference'] = distance.diff()
    rolling_window_summary['fractional_change'] = distance.pct_change()

    for label, size in (('three', 3), ('five', 5), ('ten', 10)):
        prefix = label + '_record_'
        window = distance.rolling(size,center=True)
        window_mean = window.mean()

        # Column order within each window size is part of the output layout.
        rolling_window_summary[prefix + 'midpoint'] = midpoints(size, dataset)
        rolling_window_summary[prefix + 'mean'] = window_mean
        rolling_window_summary[prefix + 'standard_deviation'] = window.std()
        rolling_window_summary[prefix + 'variance'] = window.var()
        rolling_window_summary[prefix + 'difference'] = window_mean.diff()
        rolling_window_summary[prefix + 'fractional_change'] = window_mean.pct_change()

    return rolling_window_summary


In [None]:
def differences_separate(dataset, rolling_window):
    """Plot distance alongside its differences, one figure per window size.

    Four figures are created (per record, and three-, five- and ten-record
    windows), each with one subplot row per unique ``sample_date``. Every
    subplot draws the whole ``distance`` series plus the matching difference
    column; the x-limits restrict the view to that date. For each date the
    array of unique dates and a running count of non-null ``difference``
    values are printed as a sanity check.

    Args:
        dataset: DataFrame with ``sample_date`` (day-first date strings) and
            ``distance`` columns; its index supplies the x-values.
        rolling_window: DataFrame with the ``difference`` column and the
            ``*_record_midpoint`` / ``*_record_difference`` columns. It is
            masked with a boolean Series built from ``dataset``, so it
            presumably shares ``dataset``'s index -- confirm.
    """
    dates = dataset.loc[:, 'sample_date'].unique()

    # (figure title, x column or None for the index, y column), in creation order
    panels = [
        ("Differences per record", None, 'difference'),
        ("Differences per every three records", 'three_record_midpoint', 'three_record_difference'),
        ("Differences per every five records", 'five_record_midpoint', 'five_record_difference'),
        ("Differences per every ten records", 'ten_record_midpoint', 'ten_record_difference'),
    ]

    axes_by_panel = []
    for title, _, _ in panels:
        # squeeze=False always yields a 2-D Axes array; without it a dataset
        # with a single date returns a bare Axes that cannot be indexed.
        figure, axes_grid = plt.subplots(dates.size, squeeze=False)
        figure.suptitle(title)
        axes_by_panel.append(axes_grid[:, 0])

    # Expected number of measurements plotted
    plotted_count = 0
    for row, date in enumerate(dates):
        day_start = pd.to_datetime(date + " " + "00:00:00", dayfirst=True)
        day_end = pd.to_datetime(date + " " + "23:59:59", dayfirst=True)
        on_this_date = dataset.sample_date == date

        for axes in axes_by_panel:
            axes[row].set_xlim(day_start, day_end)

        print(dates)
        # Every subplot receives the full series; the x-limits set above crop
        # the view to the current date.
        for axes, (_, x_column, y_column) in zip(axes_by_panel, panels):
            if x_column is None:
                x_values = rolling_window.index
            else:
                x_values = rolling_window.loc[:, x_column]
            axes[row].plot(dataset.index, dataset.loc[:, 'distance'])
            axes[row].plot(x_values, rolling_window.loc[:, y_column])

        # Each subplot adds to total number of values plotted
        plotted_count = plotted_count + rolling_window.loc[on_this_date, 'difference'].count()
        # Checks to see if all values are plotted.
        print(plotted_count)

In [None]:
def fractional_changes_separate(dataset, rolling_window):
    """Plot distance alongside its fractional changes, one figure per window size.

    Four figures are created (per record, and three-, five- and ten-record
    windows), each with one subplot row per unique ``sample_date``. Every
    subplot draws the whole ``distance`` series plus the matching
    fractional-change column; the x-limits restrict the view to that date.
    For each date the array of unique dates and a running count of non-null
    ``fractional_change`` values are printed as a sanity check.

    Args:
        dataset: DataFrame with ``sample_date`` (day-first date strings) and
            ``distance`` columns; its index supplies the x-values.
        rolling_window: DataFrame with the ``fractional_change`` column and
            the ``*_record_midpoint`` / ``*_record_fractional_change``
            columns. It is masked with a boolean Series built from
            ``dataset``, so it presumably shares ``dataset``'s index --
            confirm.
    """
    dates = dataset.loc[:, 'sample_date'].unique()

    # (figure title, x column or None for the index, y column), in creation order
    panels = [
        ("Fractional changes per record", None, 'fractional_change'),
        ("Fractional changes per every three records", 'three_record_midpoint', 'three_record_fractional_change'),
        ("Fractional changes per every five records", 'five_record_midpoint', 'five_record_fractional_change'),
        ("Fractional changes per every ten records", 'ten_record_midpoint', 'ten_record_fractional_change'),
    ]

    axes_by_panel = []
    for title, _, _ in panels:
        # squeeze=False always yields a 2-D Axes array; without it a dataset
        # with a single date returns a bare Axes that cannot be indexed.
        figure, axes_grid = plt.subplots(dates.size, squeeze=False)
        figure.suptitle(title)
        axes_by_panel.append(axes_grid[:, 0])

    # Expected number of measurements plotted
    plotted_count = 0
    for row, date in enumerate(dates):
        day_start = pd.to_datetime(date + " " + "00:00:00", dayfirst=True)
        day_end = pd.to_datetime(date + " " + "23:59:59", dayfirst=True)
        on_this_date = dataset.sample_date == date

        for axes in axes_by_panel:
            axes[row].set_xlim(day_start, day_end)

        print(dates)
        # Every subplot receives the full series; the x-limits set above crop
        # the view to the current date.
        for axes, (_, x_column, y_column) in zip(axes_by_panel, panels):
            if x_column is None:
                x_values = rolling_window.index
            else:
                x_values = rolling_window.loc[:, x_column]
            axes[row].plot(dataset.index, dataset.loc[:, 'distance'])
            axes[row].plot(x_values, rolling_window.loc[:, y_column])

        # Each subplot adds to total number of values plotted
        plotted_count = plotted_count + rolling_window.loc[on_this_date, 'fractional_change'].count()
        # Checks to see if all values are plotted.
        print(plotted_count)

In [None]:
def means_separate(dataset, rolling_window):
    """Plot distance alongside its rolling means, one figure per window size.

    Three figures are created (three-, five- and ten-record windows), each
    with one subplot row per unique ``sample_date``. Every subplot draws the
    whole ``distance`` series plus the matching rolling-mean column; the
    x-limits restrict the view to that date. For each date the array of
    unique dates and a running count of non-null ``three_record_mean``
    values are printed as a sanity check.

    Args:
        dataset: DataFrame with ``sample_date`` (day-first date strings) and
            ``distance`` columns; its index supplies the x-values.
        rolling_window: DataFrame with the ``*_record_midpoint`` and
            ``*_record_mean`` columns. It is masked with a boolean Series
            built from ``dataset``, so it presumably shares ``dataset``'s
            index -- confirm.
    """
    dates = dataset.loc[:, 'sample_date'].unique()

    # (figure title, x column, y column), in creation order
    panels = [
        ("Mean per every three records", 'three_record_midpoint', 'three_record_mean'),
        ("Mean per every five records", 'five_record_midpoint', 'five_record_mean'),
        ("Mean per every ten records", 'ten_record_midpoint', 'ten_record_mean'),
    ]

    axes_by_panel = []
    for title, _, _ in panels:
        # squeeze=False always yields a 2-D Axes array; without it a dataset
        # with a single date returns a bare Axes that cannot be indexed.
        figure, axes_grid = plt.subplots(dates.size, squeeze=False)
        figure.suptitle(title)
        axes_by_panel.append(axes_grid[:, 0])

    # Expected number of measurements plotted
    plotted_count = 0
    for row, date in enumerate(dates):
        day_start = pd.to_datetime(date + " " + "00:00:00", dayfirst=True)
        day_end = pd.to_datetime(date + " " + "23:59:59", dayfirst=True)
        on_this_date = dataset.sample_date == date

        for axes in axes_by_panel:
            axes[row].set_xlim(day_start, day_end)

        print(dates)

        # Every subplot receives the full series; the x-limits set above crop
        # the view to the current date.
        for axes, (_, x_column, y_column) in zip(axes_by_panel, panels):
            axes[row].plot(dataset.index, dataset.loc[:, 'distance'])
            axes[row].plot(rolling_window.loc[:, x_column], rolling_window.loc[:, y_column])

        # Each subplot adds to total number of values plotted
        plotted_count = plotted_count + rolling_window.loc[on_this_date, 'three_record_mean'].count()
        # Checks to see if all values are plotted.
        print(plotted_count)

In [None]:
def standard_deviations_separate(dataset, rolling_window):
    """Plot distance alongside its rolling standard deviations, one figure per window size.

    Three figures are created (three-, five- and ten-record windows), each
    with one subplot row per unique ``sample_date``. Every subplot draws the
    whole ``distance`` series plus the matching standard-deviation column;
    the x-limits restrict the view to that date. For each date the array of
    unique dates and a running count of non-null
    ``three_record_standard_deviation`` values are printed as a sanity check.

    Args:
        dataset: DataFrame with ``sample_date`` (day-first date strings) and
            ``distance`` columns; its index supplies the x-values.
        rolling_window: DataFrame with the ``*_record_midpoint`` and
            ``*_record_standard_deviation`` columns. It is masked with a
            boolean Series built from ``dataset``, so it presumably shares
            ``dataset``'s index -- confirm.
    """
    dates = dataset.loc[:, 'sample_date'].unique()

    # (figure title, x column, y column), in creation order
    panels = [
        ("Standard Deviation per every three records", 'three_record_midpoint', 'three_record_standard_deviation'),
        ("Standard Deviation per every five records", 'five_record_midpoint', 'five_record_standard_deviation'),
        ("Standard Deviation per every ten records", 'ten_record_midpoint', 'ten_record_standard_deviation'),
    ]

    axes_by_panel = []
    for title, _, _ in panels:
        # squeeze=False always yields a 2-D Axes array; without it a dataset
        # with a single date returns a bare Axes that cannot be indexed.
        figure, axes_grid = plt.subplots(dates.size, squeeze=False)
        figure.suptitle(title)
        axes_by_panel.append(axes_grid[:, 0])

    # Expected number of measurements plotted
    plotted_count = 0
    for row, date in enumerate(dates):
        day_start = pd.to_datetime(date + " " + "00:00:00", dayfirst=True)
        day_end = pd.to_datetime(date + " " + "23:59:59", dayfirst=True)
        on_this_date = dataset.sample_date == date

        for axes in axes_by_panel:
            axes[row].set_xlim(day_start, day_end)

        print(dates)

        # Every subplot receives the full series; the x-limits set above crop
        # the view to the current date.
        for axes, (_, x_column, y_column) in zip(axes_by_panel, panels):
            axes[row].plot(dataset.index, dataset.loc[:, 'distance'])
            axes[row].plot(rolling_window.loc[:, x_column], rolling_window.loc[:, y_column])

        # Each subplot adds to total number of values plotted
        plotted_count = plotted_count + rolling_window.loc[on_this_date, 'three_record_standard_deviation'].count()
        # Checks to see if all values are plotted.
        print(plotted_count)

In [None]:
def variances_separate(dataset, rolling_window):
    """Plot distance alongside its rolling variances, one figure per window size.

    Three figures are created (three-, five- and ten-record windows), each
    with one subplot row per unique ``sample_date``. Every subplot draws the
    whole ``distance`` series plus the matching variance column; the x-limits
    restrict the view to that date. For each date the array of unique dates
    and a running count of non-null ``three_record_variance`` values are
    printed as a sanity check.

    Args:
        dataset: DataFrame with ``sample_date`` (day-first date strings) and
            ``distance`` columns; its index supplies the x-values.
        rolling_window: DataFrame with the ``*_record_midpoint`` and
            ``*_record_variance`` columns. It is masked with a boolean Series
            built from ``dataset``, so it presumably shares ``dataset``'s
            index -- confirm.
    """
    dates = dataset.loc[:, 'sample_date'].unique()

    # (figure title, x column, y column), in creation order
    panels = [
        ("Variance per every three records", 'three_record_midpoint', 'three_record_variance'),
        ("Variance per every five records", 'five_record_midpoint', 'five_record_variance'),
        ("Variance per every ten records", 'ten_record_midpoint', 'ten_record_variance'),
    ]

    axes_by_panel = []
    for title, _, _ in panels:
        # squeeze=False always yields a 2-D Axes array; without it a dataset
        # with a single date returns a bare Axes that cannot be indexed.
        figure, axes_grid = plt.subplots(dates.size, squeeze=False)
        figure.suptitle(title)
        axes_by_panel.append(axes_grid[:, 0])

    # Expected number of measurements plotted
    plotted_count = 0
    for row, date in enumerate(dates):
        day_start = pd.to_datetime(date + " " + "00:00:00", dayfirst=True)
        day_end = pd.to_datetime(date + " " + "23:59:59", dayfirst=True)
        on_this_date = dataset.sample_date == date

        for axes in axes_by_panel:
            axes[row].set_xlim(day_start, day_end)

        print(dates)

        # Every subplot receives the full series; the x-limits set above crop
        # the view to the current date.
        for axes, (_, x_column, y_column) in zip(axes_by_panel, panels):
            axes[row].plot(dataset.index, dataset.loc[:, 'distance'])
            axes[row].plot(rolling_window.loc[:, x_column], rolling_window.loc[:, y_column])

        # Each subplot adds to total number of values plotted
        plotted_count = plotted_count + rolling_window.loc[on_this_date, 'three_record_variance'].count()
        # Checks to see if all values are plotted.
        print(plotted_count)

In [None]:
def differences_combined(dataset, rolling_window):
    """Overlay distance and every difference series on one figure.

    A single figure is created with one subplot row per unique
    ``sample_date``. Each subplot draws the whole ``distance`` series and the
    per-record, three-, five- and ten-record difference columns; the x-limits
    restrict the view to that date. For each date the array of unique dates
    and a running count of non-null ``difference`` values are printed as a
    sanity check. A figure-level legend is added at the end.

    Args:
        dataset: DataFrame with ``sample_date`` (day-first date strings) and
            ``distance`` columns; its index supplies the x-values.
        rolling_window: DataFrame with the ``difference`` column and the
            ``*_record_midpoint`` / ``*_record_difference`` columns. It is
            masked with a boolean Series built from ``dataset``, so it
            presumably shares ``dataset``'s index -- confirm.
    """
    dates = dataset.loc[:, 'sample_date'].unique()

    # squeeze=False always yields a 2-D Axes array; without it a dataset with
    # a single date returns a bare Axes that cannot be indexed.
    figure, axes_grid = plt.subplots(dates.size, squeeze=False)
    axis = axes_grid[:, 0]

    figure.suptitle("Differences against each other")

    # (x column or None for the index, y column), in plotting order
    overlays = [
        (None, 'difference'),
        ('three_record_midpoint', 'three_record_difference'),
        ('five_record_midpoint', 'five_record_difference'),
        ('ten_record_midpoint', 'ten_record_difference'),
    ]

    # Expected number of measurements plotted
    plotted_count = 0
    for row, date in enumerate(dates):
        day_start = pd.to_datetime(date + " " + "00:00:00", dayfirst=True)
        day_end = pd.to_datetime(date + " " + "23:59:59", dayfirst=True)
        on_this_date = dataset.sample_date == date

        axis[row].set_xlim(day_start, day_end)

        print(dates)
        # The subplot receives the full series; the x-limits set above crop
        # the view to the current date.
        axis[row].plot(dataset.index, dataset.loc[:, 'distance'])
        for x_column, y_column in overlays:
            if x_column is None:
                x_values = rolling_window.index
            else:
                x_values = rolling_window.loc[:, x_column]
            axis[row].plot(x_values, rolling_window.loc[:, y_column])
        # Each subplot adds to total number of values plotted
        plotted_count = plotted_count + rolling_window.loc[on_this_date, 'difference'].count()
        # Checks to see if all values are plotted.
        print(plotted_count)
    figure.legend(["distance","difference","three_record_difference","five_record_difference","ten_record_difference"])

In [None]:
def fractional_changes_combined(dataset, rolling_window):
    """Overlay distance and every fractional-change series on one figure.

    A single figure is created with one subplot row per unique
    ``sample_date``. Each subplot draws the whole ``distance`` series and the
    per-record, three-, five- and ten-record fractional-change columns; the
    x-limits restrict the view to that date. For each date the array of
    unique dates and a running count of non-null ``fractional_change`` values
    are printed as a sanity check. A figure-level legend is added at the end.

    Args:
        dataset: DataFrame with ``sample_date`` (day-first date strings) and
            ``distance`` columns; its index supplies the x-values.
        rolling_window: DataFrame with the ``fractional_change`` column and
            the ``*_record_midpoint`` / ``*_record_fractional_change``
            columns. It is masked with a boolean Series built from
            ``dataset``, so it presumably shares ``dataset``'s index --
            confirm.
    """
    dates = dataset.loc[:, 'sample_date'].unique()

    # squeeze=False always yields a 2-D Axes array; without it a dataset with
    # a single date returns a bare Axes that cannot be indexed.
    figure, axes_grid = plt.subplots(dates.size, squeeze=False)
    axis = axes_grid[:, 0]

    figure.suptitle("Fractional changes against each other")

    # (x column or None for the index, y column), in plotting order
    overlays = [
        (None, 'fractional_change'),
        ('three_record_midpoint', 'three_record_fractional_change'),
        ('five_record_midpoint', 'five_record_fractional_change'),
        ('ten_record_midpoint', 'ten_record_fractional_change'),
    ]

    # Expected number of measurements plotted
    plotted_count = 0
    for row, date in enumerate(dates):
        day_start = pd.to_datetime(date + " " + "00:00:00", dayfirst=True)
        day_end = pd.to_datetime(date + " " + "23:59:59", dayfirst=True)
        on_this_date = dataset.sample_date == date

        axis[row].set_xlim(day_start, day_end)

        print(dates)
        # The subplot receives the full series; the x-limits set above crop
        # the view to the current date.
        axis[row].plot(dataset.index, dataset.loc[:, 'distance'])
        for x_column, y_column in overlays:
            if x_column is None:
                x_values = rolling_window.index
            else:
                x_values = rolling_window.loc[:, x_column]
            axis[row].plot(x_values, rolling_window.loc[:, y_column])
        # Each subplot adds to total number of values plotted
        plotted_count = plotted_count + rolling_window.loc[on_this_date, 'fractional_change'].count()
        # Checks to see if all values are plotted.
        print(plotted_count)
    figure.legend(["distance","fractional_change","three_record_fractional_change","five_record_fractional_change","ten_record_fractional_change"])

In [None]:
def means_combined(dataset, rolling_window):
    """Overlay distance and every rolling-mean series on one figure.

    A single figure is created with one subplot row per unique
    ``sample_date``. Each subplot draws the whole ``distance`` series and the
    three-, five- and ten-record mean columns; the x-limits restrict the view
    to that date. For each date the array of unique dates and a running count
    of non-null ``three_record_mean`` values are printed as a sanity check.
    A figure-level legend is added at the end.

    Args:
        dataset: DataFrame with ``sample_date`` (day-first date strings) and
            ``distance`` columns; its index supplies the x-values.
        rolling_window: DataFrame with the ``*_record_midpoint`` and
            ``*_record_mean`` columns. It is masked with a boolean Series
            built from ``dataset``, so it presumably shares ``dataset``'s
            index -- confirm.
    """
    dates = dataset.loc[:, 'sample_date'].unique()

    # squeeze=False always yields a 2-D Axes array; without it a dataset with
    # a single date returns a bare Axes that cannot be indexed.
    figure, axes_grid = plt.subplots(dates.size, squeeze=False)
    axis = axes_grid[:, 0]

    figure.suptitle("Means against each other")

    # (x column, y column), in plotting order
    overlays = [
        ('three_record_midpoint', 'three_record_mean'),
        ('five_record_midpoint', 'five_record_mean'),
        ('ten_record_midpoint', 'ten_record_mean'),
    ]

    # Expected number of measurements plotted
    plotted_count = 0
    for row, date in enumerate(dates):
        day_start = pd.to_datetime(date + " " + "00:00:00", dayfirst=True)
        day_end = pd.to_datetime(date + " " + "23:59:59", dayfirst=True)
        on_this_date = dataset.sample_date == date

        axis[row].set_xlim(day_start, day_end)

        print(dates)
        # The subplot receives the full series; the x-limits set above crop
        # the view to the current date.
        axis[row].plot(dataset.index, dataset.loc[:, 'distance'])

        for x_column, y_column in overlays:
            axis[row].plot(rolling_window.loc[:, x_column], rolling_window.loc[:, y_column])
        # Each subplot adds to total number of values plotted
        plotted_count = plotted_count + rolling_window.loc[on_this_date, 'three_record_mean'].count()
        # Checks to see if all values are plotted.
        print(plotted_count)
    figure.legend(["distance","three_record_mean","five_record_mean","ten_record_mean"])

In [None]:
def standard_deviations_combined(dataset, rolling_window):
    """Overlay distance and every rolling standard-deviation series on one figure.

    A single figure is created with one subplot row per unique
    ``sample_date``. Each subplot draws the whole ``distance`` series and the
    three-, five- and ten-record standard-deviation columns; the x-limits
    restrict the view to that date. For each date the array of unique dates
    and a running count of non-null ``three_record_standard_deviation``
    values are printed as a sanity check. A figure-level legend is added at
    the end.

    Args:
        dataset: DataFrame with ``sample_date`` (day-first date strings) and
            ``distance`` columns; its index supplies the x-values.
        rolling_window: DataFrame with the ``*_record_midpoint`` and
            ``*_record_standard_deviation`` columns. It is masked with a
            boolean Series built from ``dataset``, so it presumably shares
            ``dataset``'s index -- confirm.
    """
    dates = dataset.loc[:, 'sample_date'].unique()

    # squeeze=False always yields a 2-D Axes array; without it a dataset with
    # a single date returns a bare Axes that cannot be indexed.
    figure, axes_grid = plt.subplots(dates.size, squeeze=False)
    axis = axes_grid[:, 0]

    figure.suptitle("Standard deviations against each other")

    # (x column, y column), in plotting order
    overlays = [
        ('three_record_midpoint', 'three_record_standard_deviation'),
        ('five_record_midpoint', 'five_record_standard_deviation'),
        ('ten_record_midpoint', 'ten_record_standard_deviation'),
    ]

    # Expected number of measurements plotted
    plotted_count = 0
    for row, date in enumerate(dates):
        day_start = pd.to_datetime(date + " " + "00:00:00", dayfirst=True)
        day_end = pd.to_datetime(date + " " + "23:59:59", dayfirst=True)
        on_this_date = dataset.sample_date == date

        axis[row].set_xlim(day_start, day_end)

        print(dates)
        # The subplot receives the full series; the x-limits set above crop
        # the view to the current date.
        axis[row].plot(dataset.index, dataset.loc[:, 'distance'])
        for x_column, y_column in overlays:
            axis[row].plot(rolling_window.loc[:, x_column], rolling_window.loc[:, y_column])
        # Each subplot adds to total number of values plotted
        plotted_count = plotted_count + rolling_window.loc[on_this_date, 'three_record_standard_deviation'].count()
        # Checks to see if all values are plotted.
        print(plotted_count)
    figure.legend(["distance","three_record_standard_deviation","five_record_standard_deviation","ten_record_standard_deviation"])

In [None]:
def variances_combined(dataset, rolling_window):
    """Overlay distance and every rolling-variance series on one figure.

    A single figure is created with one subplot row per unique
    ``sample_date``. Each subplot draws the whole ``distance`` series and the
    three-, five- and ten-record variance columns; the x-limits restrict the
    view to that date. For each date the array of unique dates and a running
    count of non-null ``three_record_variance`` values are printed as a
    sanity check. A figure-level legend is added at the end.

    Args:
        dataset: DataFrame with ``sample_date`` (day-first date strings) and
            ``distance`` columns; its index supplies the x-values.
        rolling_window: DataFrame with the ``*_record_midpoint`` and
            ``*_record_variance`` columns. It is masked with a boolean Series
            built from ``dataset``, so it presumably shares ``dataset``'s
            index -- confirm.
    """
    dates = dataset.loc[:, 'sample_date'].unique()

    # squeeze=False always yields a 2-D Axes array; without it a dataset with
    # a single date returns a bare Axes that cannot be indexed.
    figure, axes_grid = plt.subplots(dates.size, squeeze=False)
    axis = axes_grid[:, 0]

    figure.suptitle("Variances against each other")

    # (x column, y column), in plotting order
    overlays = [
        ('three_record_midpoint', 'three_record_variance'),
        ('five_record_midpoint', 'five_record_variance'),
        ('ten_record_midpoint', 'ten_record_variance'),
    ]

    # Expected number of measurements plotted
    plotted_count = 0
    for row, date in enumerate(dates):
        day_start = pd.to_datetime(date + " " + "00:00:00", dayfirst=True)
        day_end = pd.to_datetime(date + " " + "23:59:59", dayfirst=True)
        on_this_date = dataset.sample_date == date

        axis[row].set_xlim(day_start, day_end)

        print(dates)
        # The subplot receives the full series; the x-limits set above crop
        # the view to the current date.
        axis[row].plot(dataset.index, dataset.loc[:, 'distance'])
        for x_column, y_column in overlays:
            axis[row].plot(rolling_window.loc[:, x_column], rolling_window.loc[:, y_column])
        # Each subplot adds to total number of values plotted
        plotted_count = plotted_count + rolling_window.loc[on_this_date, 'three_record_variance'].count()
        # Checks to see if all values are plotted.
        print(plotted_count)
    figure.legend(["distance","three_record_variance","five_record_variance","ten_record_variance"])

## First Dataset

### First Dataset Basic Plot

In [None]:
# Index every record by the day-first datetime built from its date and time
# columns (the source columns are kept), then sort so the records are in
# chronological order.
DH_plus_dist = DH_plus_dist.set_index(
    pd.to_datetime(
        DH_plus_dist['sample_date'] + " " + DH_plus_dist['sample_time'],
        dayfirst=True,
    ),
    drop=False,
).sort_index()

In [None]:
print(DH_plus_dist)
print(DH_plus_dist.describe())

In [None]:
print("Number of missing values:\n",DH_plus_dist.isna().sum())

In [None]:
DH_plus_dist.index.has_duplicates

In [None]:
DH_plus_dist = remove_duplicate_datetimes(DH_plus_dist)

In [None]:
# Non function equivalent

# x = DH_plus_dist.groupby(DH_plus_dist.index,sort=False)['distance'].transform('mean')
# DH_plus_dist.loc[:,'distance'] = x.loc[:]
# DH_plus_dist = DH_plus_dist.drop_duplicates()

In [None]:
DH_plus_dist.index.has_duplicates

In [None]:
DH_plus_dist, DH_hourly = hourly_summary(DH_plus_dist)

In [None]:
print(DH_plus_dist)
print(DH_plus_dist.describe())

In [None]:
print("Number of missing values:\n",DH_plus_dist.isna().sum())

In [None]:
print(DH_hourly)
print(DH_hourly.describe())

In [None]:
print("Number of missing hourly values:\n",DH_hourly.isna().sum())

In [None]:
sns.displot(data=DH_plus_dist.loc[:,'distance'], kde=True)

In [None]:
sns.displot(data=DH_hourly.loc[:,'distance']['mean'], kde=True)

In [None]:
sns.displot(data=DH_hourly.loc[:,'distance']['std'], kde=True)

In [None]:
sns.displot(data=DH_plus_dist.loc[:,'rolling_hourly_mean'], kde=True)

In [None]:
sns.displot(data=DH_plus_dist.loc[:,'rolling_hourly_standard_deviation'], kde=True)

In [None]:
sns.displot(data=(DH_plus_dist.loc[:,'distance'],DH_plus_dist.loc[:,'rolling_hourly_mean']), kde=True)

In [None]:
sns.pairplot(DH_plus_dist,height=10)

In [None]:
sns.boxplot(x=DH_plus_dist["sample_date"], y=DH_plus_dist["distance"])

NOTE: To plot only the time of day on the x-axis from a Timestamp, call the `time()` method. Referencing `time` without parentheses returns the method object itself rather than the time value.

Without Y axis range from 0 to largest distance value. Range from smallest to largest recorded values.

In [None]:
relative_range_time_series_line(DH_plus_dist)

In [None]:
relative_range_time_series_plot(DH_plus_dist)

With Y axis range from 0 to largest distance value.

In [None]:
absolute_range_time_series_line(DH_plus_dist)

In [None]:
absolute_range_time_series_plot(DH_plus_dist)

### First Dataset Rolling Window Summaries

In [None]:
#DataFrame of statistical summaries of dataset.

DH_rolling_window_summary = statistical_summary(DH_plus_dist)

In [None]:
DH_rolling_window_summary

In [None]:
DH_rolling_window_summary.describe()

In [None]:
# Expected number of values which are not null or NaN for each of the DH_plus_dist windowing methods.
DH_rolling_window_summary.count()

#### All on separate subplots

In [None]:
differences_separate(DH_plus_dist, DH_rolling_window_summary)

In [None]:
fractional_changes_separate(DH_plus_dist, DH_rolling_window_summary)

In [None]:
means_separate(DH_plus_dist, DH_rolling_window_summary)

In [None]:
standard_deviations_separate(DH_plus_dist, DH_rolling_window_summary)

In [None]:
variances_separate(DH_plus_dist, DH_rolling_window_summary)

#### Combined

In [None]:
differences_combined(DH_plus_dist, DH_rolling_window_summary)

In [None]:
fractional_changes_combined(DH_plus_dist, DH_rolling_window_summary)

In [None]:
means_combined(DH_plus_dist, DH_rolling_window_summary)

In [None]:
standard_deviations_combined(DH_plus_dist, DH_rolling_window_summary)

In [None]:
variances_combined(DH_plus_dist, DH_rolling_window_summary)

## Second Dataset

### Second Dataset Basic Plot

In [None]:
# Index every record by the day-first datetime built from its date and time
# columns (the source columns are kept), then sort so the records are in
# chronological order.
LA_plus_dist = LA_plus_dist.set_index(
    pd.to_datetime(
        LA_plus_dist['sample_date'] + " " + LA_plus_dist['sample_time'],
        dayfirst=True,
    ),
    drop=False,
).sort_index()

In [None]:
print(LA_plus_dist)
LA_plus_dist.describe()

In [None]:
print("Number of missing values:\n",LA_plus_dist.isna().sum())

In [None]:
LA_plus_dist.index.has_duplicates

In [None]:
LA_plus_dist = remove_duplicate_datetimes(LA_plus_dist)

In [None]:
# Non function equivalent

# x = LA_plus_dist.groupby(LA_plus_dist.index,sort=False)['distance'].transform('mean')
# LA_plus_dist.loc[:,'distance'] = x.loc[:]
# LA_plus_dist = LA_plus_dist.drop_duplicates()

In [None]:
LA_plus_dist.index.has_duplicates

In [None]:
LA_plus_dist, LA_hourly = hourly_summary(LA_plus_dist)

In [None]:
print(LA_plus_dist)
print(LA_plus_dist.describe())

In [None]:
print("Number of missing values:\n",LA_plus_dist.isna().sum())

In [None]:
print(LA_hourly)
print(LA_hourly.describe())

In [None]:
print("Number of missing hourly values:\n",LA_hourly.isna().sum())

In [None]:
sns.displot(data=LA_plus_dist.loc[:,'distance'], kde=True)

In [None]:
sns.displot(data=LA_hourly.loc[:,'distance']['mean'], kde=True)

In [None]:
sns.displot(data=LA_hourly.loc[:,'distance']['std'], kde=True)

In [None]:
sns.displot(data=LA_plus_dist.loc[:,'rolling_hourly_mean'], kde=True)

In [None]:
sns.displot(data=LA_plus_dist.loc[:,'rolling_hourly_standard_deviation'], kde=True)

In [None]:
sns.displot(data=(LA_plus_dist.loc[:,'distance'],LA_plus_dist.loc[:,'rolling_hourly_mean']), kde=True)

In [None]:
sns.pairplot(LA_plus_dist,height=10)

In [None]:
sns.boxplot(x=LA_plus_dist["sample_date"], y=LA_plus_dist["distance"])

In [None]:
# Ad-hoc inspection: take the rows whose index labels fall in the slice
# '2023-04-05' to '2023-04-08' and plot their distance against the index.
# NOTE(review): presumably the index is still the datetime index set earlier,
# which makes this a date-range slice -- confirm.
something = LA_plus_dist.loc['2023-04-05':'2023-04-08']


plt.plot(something.index, something["distance"])

Without Y axis range from 0 to largest distance value. Range from smallest to largest recorded values.

In [None]:
relative_range_time_series_line(LA_plus_dist)

In [None]:
relative_range_time_series_plot(LA_plus_dist)

With Y axis range from 0 to largest distance value.

In [None]:
absolute_range_time_series_line(LA_plus_dist)

In [None]:
absolute_range_time_series_plot(LA_plus_dist)

### Second Dataset Rolling Window Summaries

In [None]:
#DataFrame of statistical summaries of dataset.

LA_rolling_window_summary = statistical_summary(LA_plus_dist)

In [None]:
LA_rolling_window_summary.describe()

In [None]:
# Expected number of values which are not null or NaN for each of the LA_plus_dist windowing methods.
LA_rolling_window_summary.count()

#### All on separate subplots

In [None]:
differences_separate(LA_plus_dist, LA_rolling_window_summary)

In [None]:
fractional_changes_separate(LA_plus_dist, LA_rolling_window_summary)

In [None]:
means_separate(LA_plus_dist, LA_rolling_window_summary)

In [None]:
standard_deviations_separate(LA_plus_dist, LA_rolling_window_summary)

In [None]:
variances_separate(LA_plus_dist, LA_rolling_window_summary)

#### Combined

In [None]:
differences_combined(LA_plus_dist, LA_rolling_window_summary)

In [None]:
fractional_changes_combined(LA_plus_dist, LA_rolling_window_summary)

In [None]:
means_combined(LA_plus_dist, LA_rolling_window_summary)

In [None]:
standard_deviations_combined(LA_plus_dist, LA_rolling_window_summary)

In [None]:
variances_combined(LA_plus_dist, LA_rolling_window_summary)

## Third Dataset

### Third Dataset Basic Plot

In [None]:
# Index every record by the day-first datetime built from its date and time
# columns (the source columns are kept), then sort so the records are in
# chronological order.
Lucy_16_20_May_plus_dist = Lucy_16_20_May_plus_dist.set_index(
    pd.to_datetime(
        Lucy_16_20_May_plus_dist['sample_date'] + " " + Lucy_16_20_May_plus_dist['sample_time'],
        dayfirst=True,
    ),
    drop=False,
).sort_index()

In [None]:
print(Lucy_16_20_May_plus_dist)
Lucy_16_20_May_plus_dist.describe()

In [None]:
print("Number of missing values:\n",Lucy_16_20_May_plus_dist.isna().sum())

In [None]:
Lucy_16_20_May_plus_dist.index.has_duplicates

In [None]:
Lucy_16_20_May_plus_dist = remove_duplicate_datetimes(Lucy_16_20_May_plus_dist)

In [None]:
# Non function equivalent

# x = Lucy_16_20_May_plus_dist.groupby(Lucy_16_20_May_plus_dist.index,sort=False)['distance'].transform('mean')
# Lucy_16_20_May_plus_dist.loc[:,'distance'] = x.loc[:]
# Lucy_16_20_May_plus_dist = Lucy_16_20_May_plus_dist.drop_duplicates()

In [None]:
Lucy_16_20_May_plus_dist.index.has_duplicates

In [None]:
Lucy_16_20_May_plus_dist,Lucy_16_20_May_hourly = hourly_summary(Lucy_16_20_May_plus_dist)

In [None]:
print(Lucy_16_20_May_plus_dist)
print(Lucy_16_20_May_plus_dist.describe())

In [None]:
print("Number of missing values:\n",Lucy_16_20_May_plus_dist.isna().sum())

In [None]:
print(Lucy_16_20_May_hourly)
print(Lucy_16_20_May_hourly.describe())

In [None]:
print("Number of missing hourly values:\n",Lucy_16_20_May_hourly.isna().sum())

In [None]:
sns.displot(data=Lucy_16_20_May_plus_dist.loc[:,'distance'], kde=True)

In [None]:
sns.displot(data=Lucy_16_20_May_hourly.loc[:,'distance']['mean'], kde=True)

In [None]:
sns.displot(data=Lucy_16_20_May_hourly.loc[:,'distance']['std'], kde=True)

In [None]:
sns.displot(data=Lucy_16_20_May_plus_dist.loc[:,'rolling_hourly_mean'], kde=True)

In [None]:
sns.displot(data=Lucy_16_20_May_plus_dist.loc[:,'rolling_hourly_standard_deviation'], kde=True)

In [None]:
sns.displot(data=(Lucy_16_20_May_plus_dist.loc[:,'distance'],Lucy_16_20_May_plus_dist.loc[:,'rolling_hourly_mean']), kde=True)

In [None]:
sns.pairplot(Lucy_16_20_May_plus_dist,height=10)

In [None]:
sns.boxplot(x=Lucy_16_20_May_plus_dist["sample_date"], y=Lucy_16_20_May_plus_dist["distance"])

Without Y axis range from 0 to largest distance value. Range from smallest to largest recorded values.

In [None]:
relative_range_time_series_line(Lucy_16_20_May_plus_dist)

In [None]:
relative_range_time_series_plot(Lucy_16_20_May_plus_dist)

With Y axis range from 0 to largest distance value.

In [None]:
absolute_range_time_series_line(Lucy_16_20_May_plus_dist)

In [None]:
absolute_range_time_series_plot(Lucy_16_20_May_plus_dist)

### Third Dataset Rolling Window Summaries

In [None]:
#DataFrame of statistical summaries of dataset.

Lucy_rolling_window_summary = statistical_summary(Lucy_16_20_May_plus_dist)

In [None]:
Lucy_rolling_window_summary

In [None]:
Lucy_rolling_window_summary.describe()

In [None]:
# Expected number of values which are not null or NaN for each of the Lucy_16_20_May_plus_dist windowing methods.
Lucy_rolling_window_summary.count()

#### All on separate subplots

In [None]:
differences_separate(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
fractional_changes_separate(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
means_separate(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
standard_deviations_separate(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
variances_separate(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

#### Combined

In [None]:
differences_combined(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
fractional_changes_combined(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
means_combined(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
standard_deviations_combined(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
variances_combined(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

# Data Preprocessing & Feature Engineering

1. Load datasets.
2. Format and add datetime indexes to each record.
3. Apply the custom method for removing duplicate records, which uses the mean distance value of multiple distance values.
4. Remove records where the distance value is 0. We know that the bracelet has been taken off and shouldn't be picking up new observations.

n. Final step: display a summary of the processed data, to compare the raw data against the new data.

In [None]:
def feature_engineering(dataset_path):
    """Clean a distance CSV, add hourly rolling features and plot them.

    The file is read, indexed by the day-first datetime built from
    ``sample_date`` and ``sample_time``, sorted chronologically, passed
    through ``remove_duplicate_datetimes`` and stripped of records whose
    ``distance`` is not greater than zero. A centred one-hour rolling mean
    and standard deviation of ``distance`` are then added as columns, and
    the result is printed and handed to ``relative_range_time_series_plot``.

    Args:
        dataset_path: Source passed to ``pd.read_csv``; the data must have
            ``sample_date``, ``sample_time`` and ``distance`` columns.

    Returns:
        None. The function only prints summaries and calls the plot helper.
    """
    dataset = pd.read_csv(dataset_path)
    print("Dataset before feature engineering:\n",dataset)
    # DateTime indexes
    dataset = dataset.set_index(pd.to_datetime(dataset['sample_date']+" "+dataset['sample_time'], dayfirst=True), drop = False)
    # Makes sure records are in chronological order
    dataset = dataset.sort_index()
    print(dataset.describe())
    print("Number of missing values:\n",dataset.isna().sum())

    # Duplicates are counted on the index before the helper removes them.
    print("Number of duplicate records removed:\n",dataset.index.duplicated().sum())
    dataset = remove_duplicate_datetimes(dataset)

    zero_distance_count = dataset.loc[dataset.distance == 0,'distance'].count()
    print("Number of zero distance records removed:\n",zero_distance_count)
    # NOTE(review): the filter keeps distance > 0, so negative or NaN
    # distances are dropped as well but are not included in the count above.
    # .copy() detaches the filtered frame so the column assignments below do
    # not raise SettingWithCopyWarning.
    dataset = dataset.loc[dataset.distance > 0,:].copy()

    # Centred one-hour, time-based window over distance, shared by both statistics.
    hourly_window = dataset.loc[:,'distance'].rolling('1h',center=True)
    dataset['rolling_hourly_mean'] = hourly_window.mean()
    dataset['rolling_hourly_standard_deviation'] = hourly_window.std()

    print("Dataset after feature engineering:\n",dataset)
    print(dataset.describe())
    # Alternative visualisations kept for reference:
    #sns.displot(data=dataset.loc[:,'distance'], kde=True)
    #sns.boxplot(x=dataset["sample_date"], y=dataset["distance"])
    #plt.plot(dataset.loc[:,'distance'])
    #plt.plot(dataset.loc[:,'distance'].rolling('1h',center=True).std(),'x')
    relative_range_time_series_plot(dataset,'rolling_hourly_mean','rolling_hourly_standard_deviation')
    #absolute_range_time_series_line(dataset.loc[:,'distance'].rolling('1h',center=True).std())

In [None]:
feature_engineering("DH_plus_dist_interpolated.csv")

In [None]:
feature_engineering("LA_plus_dist_interpolated.csv")

In [None]:
feature_engineering("Lucy_16-20May_plus_dist_interpolated.csv")

In [None]:
def feature_engineering_rolling_window_imputation(dataset_path):
    """Clean a distance CSV, add rolling features and flag records by a std-based filter.

    The file is read, indexed by the day-first datetime built from
    ``sample_date`` and ``sample_time``, sorted chronologically, passed
    through ``remove_duplicate_datetimes`` and stripped of records whose
    ``distance`` is not greater than zero. Centred one-hour and ten-record
    rolling means and standard deviations of ``distance`` are added as
    columns, followed by a boolean ``satisfied_filtering_condition`` column.
    The result is printed, passed to ``satisfied_condition_record_count`` and
    ``relative_range_time_series_plot``, and printed again.

    Args:
        dataset_path: Source passed to ``pd.read_csv``; the data must have
            ``sample_date``, ``sample_time`` and ``distance`` columns.

    Returns:
        None. The function only prints summaries and calls the helpers.
    """
    dataset = pd.read_csv(dataset_path)
    print("Dataset before feature engineering:\n",dataset)
    # DateTime indexes
    dataset = dataset.set_index(pd.to_datetime(dataset['sample_date']+" "+dataset['sample_time'], dayfirst=True), drop = False)
    # Makes sure records are in chronological order
    dataset = dataset.sort_index()
    print(dataset.describe())
    print("Number of missing values:\n",dataset.isna().sum())

    # Duplicates are counted on the index before the helper removes them.
    print("Number of duplicate records removed:\n",dataset.index.duplicated().sum())
    dataset = remove_duplicate_datetimes(dataset)

    zero_distance_count = dataset.loc[dataset.distance == 0,'distance'].count()
    print("Number of zero distance records removed:\n",zero_distance_count)
    # NOTE(review): the filter keeps distance > 0, so negative or NaN
    # distances are dropped as well but are not included in the count above.
    # .copy() detaches the filtered frame so the column assignments below do
    # not raise SettingWithCopyWarning.
    dataset = dataset.loc[dataset.distance > 0,:].copy()

    # One rolling object per window definition, shared by mean and std.
    hourly_window = dataset.loc[:,'distance'].rolling('1h',center=True)
    ten_record_window = dataset.loc[:,'distance'].rolling(10,center=True)
    dataset['rolling_hourly_mean'] = hourly_window.mean()
    dataset['rolling_hourly_standard_deviation'] = hourly_window.std()
    dataset['rolling_ten_record_mean'] = ten_record_window.mean()
    dataset['rolling_ten_record_standard_deviation'] = ten_record_window.std()

    # Removes too many records and there are still distinct outliers present
    # dataset = dataset.loc[dataset.distance < (dataset.rolling_hourly_mean-dataset.rolling_hourly_standard_deviation).abs(),:]

    # NOTE(review): this compares distance with itself +/- the overall
    # standard deviation, so the result is the same for every row (True
    # whenever the std is positive) and no record is actually filtered. The
    # rolling columns above are not used here. The intended reference series
    # (e.g. a rolling mean) needs confirming before the expression is changed.
    distance_std = dataset["distance"].std()
    dataset['satisfied_filtering_condition'] = (dataset["distance"] < (dataset["distance"] + distance_std)) & (dataset["distance"] > (dataset["distance"] - distance_std))
    print("Total number of records meeting filtering condition\n:",dataset.loc[:,'satisfied_filtering_condition'].value_counts())

    print("Dataset after feature engineering:\n",dataset)
    print(dataset.describe())
    satisfied_condition_record_count(dataset)

    relative_range_time_series_plot(dataset)
    # Shape of the rows flagged True; the boolean column is used directly as the mask.
    print(dataset.loc[dataset.satisfied_filtering_condition,'satisfied_filtering_condition'].shape)
    print(dataset)

In [None]:
feature_engineering_rolling_window_imputation("DH_plus_dist_interpolated.csv")

In [None]:
feature_engineering_rolling_window_imputation("LA_plus_dist_interpolated.csv")

In [None]:
feature_engineering_rolling_window_imputation("Lucy_16-20May_plus_dist_interpolated.csv")

In [None]:
def feature_engineering_rolling_window_imputation_two(dataset_path):
    """Flag records using a centred ten-minute rolling mean and standard deviation of distance.

    The CSV is loaded, indexed by the day-first timestamp built from its
    ``sample_date`` and ``sample_time`` columns, sorted chronologically, passed
    through ``remove_duplicate_datetimes`` (defined elsewhere in this notebook)
    and restricted to records with a positive ``distance``.

    Columns added:
        rolling_ten_minutes_mean, rolling_ten_minutes_standard_deviation:
            statistics of ``distance`` over a centred '10min' window.
        satisfied_filtering_condition:
            True where the rolling mean lies strictly within one rolling
            standard deviation of the record's own distance.

    Nothing is returned: the summaries are printed and the dataset is passed to
    ``satisfied_condition_record_count`` and ``relative_range_time_series_plot``.

    Parameters:
        dataset_path: CSV source accepted by ``pd.read_csv``; it must provide
            the ``sample_date``, ``sample_time`` and ``distance`` columns.
    """

    dataset = pd.read_csv(dataset_path)
    print("Dataset before feature engineering:\n",dataset)
    # DateTime indexes
    dataset = dataset.set_index(pd.to_datetime(dataset['sample_date']+" "+dataset['sample_time'], dayfirst=True), drop = False)
    # Makes sure records are in chronological order
    dataset = dataset.sort_index()
    print(dataset.describe())
    print("Number of missing values:\n",dataset.isna().sum())

    print("Number of duplicate records removed:\n",dataset.index.duplicated().sum())
    dataset = remove_duplicate_datetimes(dataset)

    zero_distance_count = dataset.loc[dataset.distance == 0,'distance'].count()
    print("Number of zero distance records removed:\n",zero_distance_count)
    # Explicit copy: the new columns below are then written to this frame itself
    # rather than to a filtered selection (avoids pandas' SettingWithCopyWarning).
    dataset = dataset.loc[dataset.distance > 0,:].copy()

    # One centred time-based window supplies both statistics
    ten_minute_window = dataset['distance'].rolling('10min',center=True)
    dataset['rolling_ten_minutes_mean'] = ten_minute_window.mean()
    dataset['rolling_ten_minutes_standard_deviation'] = ten_minute_window.std()

    # Earlier attempt using hourly statistics (those columns are not computed in this function):
    # it removed too many records and distinct outliers were still present.
    # dataset = dataset.loc[dataset.distance < (dataset.rolling_hourly_mean-dataset.rolling_hourly_standard_deviation).abs(),:]

    rolling_mean = dataset["rolling_ten_minutes_mean"]
    rolling_std = dataset["rolling_ten_minutes_standard_deviation"]
    dataset['satisfied_filtering_condition'] = (rolling_mean < (dataset["distance"] + rolling_std)) & (rolling_mean > (dataset["distance"] - rolling_std))
    print("Total number of records meeting filtering condition:\n",dataset.loc[:,'satisfied_filtering_condition'].value_counts())

    # Already tried this (band built from the standard deviation of the whole distance column)
    #std = dataset.distance.std()
    #print("Standard Deviation:" ,std)
    #print(dataset.loc[(dataset.distance < (dataset.distance + std)) & (dataset.distance > (dataset.distance - std)),'distance'].count())

    print("Dataset after feature engineering:\n",dataset)
    print(dataset.describe())

    satisfied_condition_record_count(dataset)
    relative_range_time_series_plot(dataset,'rolling_ten_minutes_mean','rolling_ten_minutes_standard_deviation')

In [None]:
feature_engineering_rolling_window_imputation_two("DH_plus_dist_interpolated.csv")

In [None]:
feature_engineering_rolling_window_imputation_two("LA_plus_dist_interpolated.csv")

In [None]:
feature_engineering_rolling_window_imputation_two("Lucy_16-20May_plus_dist_interpolated.csv")

In [None]:
def feature_engineering_rolling_window_imputation_three(dataset_path):
    """Flag records using a centred fifty-record rolling mean and standard deviation of distance.

    The CSV is loaded, indexed by the day-first timestamp built from its
    ``sample_date`` and ``sample_time`` columns, sorted chronologically, passed
    through ``remove_duplicate_datetimes`` (defined elsewhere in this notebook)
    and restricted to records with a positive ``distance``.

    Columns added:
        rolling_fifty_record_mean, rolling_fifty_record_standard_deviation:
            statistics of ``distance`` over a centred window of 50 records.
        satisfied_filtering_condition:
            True where the rolling mean lies strictly within one rolling
            standard deviation of the record's own distance.

    Nothing is returned: the summaries are printed and the dataset is passed to
    ``satisfied_condition_record_count`` and ``relative_range_time_series_plot``.

    Parameters:
        dataset_path: CSV source accepted by ``pd.read_csv``; it must provide
            the ``sample_date``, ``sample_time`` and ``distance`` columns.
    """

    dataset = pd.read_csv(dataset_path)
    print("Dataset before feature engineering:\n",dataset)
    # DateTime indexes
    dataset = dataset.set_index(pd.to_datetime(dataset['sample_date']+" "+dataset['sample_time'], dayfirst=True), drop = False)
    # Makes sure records are in chronological order
    dataset = dataset.sort_index()
    print(dataset.describe())
    print("Number of missing values:\n",dataset.isna().sum())

    print("Number of duplicate records removed:\n",dataset.index.duplicated().sum())
    dataset = remove_duplicate_datetimes(dataset)

    zero_distance_count = dataset.loc[dataset.distance == 0,'distance'].count()
    print("Number of zero distance records removed:\n",zero_distance_count)
    # Explicit copy: the new columns below are then written to this frame itself
    # rather than to a filtered selection (avoids pandas' SettingWithCopyWarning).
    dataset = dataset.loc[dataset.distance > 0,:].copy()

    # One centred fixed-size window supplies both statistics
    fifty_record_window = dataset['distance'].rolling(50,center=True)
    dataset['rolling_fifty_record_mean'] = fifty_record_window.mean()
    dataset['rolling_fifty_record_standard_deviation'] = fifty_record_window.std()

    # Earlier attempt using hourly statistics (those columns are not computed in this function):
    # it removed too many records and distinct outliers were still present.
    # dataset = dataset.loc[dataset.distance < (dataset.rolling_hourly_mean-dataset.rolling_hourly_standard_deviation).abs(),:]

    rolling_mean = dataset["rolling_fifty_record_mean"]
    rolling_std = dataset["rolling_fifty_record_standard_deviation"]
    dataset['satisfied_filtering_condition'] = (rolling_mean < (dataset["distance"] + rolling_std)) & (rolling_mean > (dataset["distance"] - rolling_std))
    print("Total number of records meeting filtering condition:\n",dataset.loc[:,'satisfied_filtering_condition'].value_counts())

    print("Dataset after feature engineering:\n",dataset)
    print(dataset.describe())

    satisfied_condition_record_count(dataset)
    relative_range_time_series_plot(dataset,'rolling_fifty_record_mean','rolling_fifty_record_standard_deviation')

In [None]:
feature_engineering_rolling_window_imputation_three("DH_plus_dist_interpolated.csv")

In [None]:
feature_engineering_rolling_window_imputation_three("LA_plus_dist_interpolated.csv")

In [None]:
feature_engineering_rolling_window_imputation_three("Lucy_16-20May_plus_dist_interpolated.csv")

In [None]:
def hourly_box_plots(dataset_path):
    """Draw a box plot of distance for each hour of the day found in a dataset.

    The CSV is loaded, indexed by the day-first timestamp built from its
    ``sample_date`` and ``sample_time`` columns, sorted chronologically, passed
    through ``remove_duplicate_datetimes`` (defined elsewhere in this notebook)
    and restricted to records with a positive ``distance``. An ``hour`` column
    taken from the DateTime index is added and used as the box plot category.

    Nothing is returned: summaries are printed and the plot is drawn with seaborn.

    Parameters:
        dataset_path: CSV source accepted by ``pd.read_csv``; it must provide
            the ``sample_date``, ``sample_time`` and ``distance`` columns.
    """

    dataset = pd.read_csv(dataset_path)
    print("Dataset before feature engineering:\n",dataset)
    # DateTime indexes
    dataset = dataset.set_index(pd.to_datetime(dataset['sample_date']+" "+dataset['sample_time'], dayfirst=True), drop = False)
    # Makes sure records are in chronological order
    dataset = dataset.sort_index()
    print(dataset.describe())
    print("Number of missing values:\n",dataset.isna().sum())

    print("Number of duplicate records removed:\n",dataset.index.duplicated().sum())
    dataset = remove_duplicate_datetimes(dataset)

    zero_distance_count = dataset.loc[dataset.distance == 0,'distance'].count()
    print("Number of zero distance records removed:\n",zero_distance_count)
    # Explicit copy: the 'hour' column below is then written to this frame itself
    # rather than to a filtered selection (avoids pandas' SettingWithCopyWarning).
    dataset = dataset.loc[dataset.distance > 0,:].copy()
    # Hour component of each record's timestamp, used to group the boxes
    dataset['hour'] = dataset.index.hour
    sns.boxplot(x=dataset["hour"], y=dataset["distance"])

In [None]:
hourly_box_plots("DH_plus_dist_interpolated.csv")

In [None]:
hourly_box_plots("LA_plus_dist_interpolated.csv")

In [None]:
hourly_box_plots("Lucy_16-20May_plus_dist_interpolated.csv")

In [None]:
def hourly_average_box_plot(dataset_path):
    """Draw a single box plot of the hourly mean distance of a dataset.

    The CSV is loaded, indexed by the day-first timestamp built from its
    ``sample_date`` and ``sample_time`` columns, sorted chronologically, passed
    through ``remove_duplicate_datetimes`` (defined elsewhere in this notebook)
    and restricted to records with a positive ``distance``. The numeric and
    boolean columns are then resampled to hourly means and the resulting
    ``distance`` values are plotted.

    Nothing is returned: summaries are printed and the plot is drawn with seaborn.

    Parameters:
        dataset_path: CSV source accepted by ``pd.read_csv``; it must provide
            the ``sample_date``, ``sample_time`` and ``distance`` columns.
    """

    dataset = pd.read_csv(dataset_path)
    print("Dataset before feature engineering:\n",dataset)
    # DateTime indexes
    dataset = dataset.set_index(pd.to_datetime(dataset['sample_date']+" "+dataset['sample_time'], dayfirst=True), drop = False)
    # Makes sure records are in chronological order
    dataset = dataset.sort_index()
    print(dataset.describe())
    print("Number of missing values:\n",dataset.isna().sum())

    print("Number of duplicate records removed:\n",dataset.index.duplicated().sum())
    dataset = remove_duplicate_datetimes(dataset)

    zero_distance_count = dataset.loc[dataset.distance == 0,'distance'].count()
    print("Number of zero distance records removed:\n",zero_distance_count)
    # Actually remove the records reported above, as the other functions in this
    # notebook do, so they do not pull the hourly means down.
    dataset = dataset.loc[dataset.distance > 0,:]
    # Average only numeric/boolean columns: the text columns kept by drop=False
    # (sample_date, sample_time) cannot be averaged and make mean() raise on
    # pandas 2.x. Selecting by dtype works on old and new pandas alike.
    dataset = dataset.select_dtypes(include=['number','bool']).resample('h').mean()
    print(dataset)
    print(dataset.describe())
    sns.boxplot(x=dataset['distance'])

In [None]:
hourly_average_box_plot("DH_plus_dist_interpolated.csv")

In [None]:
hourly_average_box_plot("LA_plus_dist_interpolated.csv")

In [None]:
hourly_average_box_plot("Lucy_16-20May_plus_dist_interpolated.csv")

In [None]:
def greater_than_lower_quartile(dataset_path):
    """Flag records whose distance is above the lower quartile of the dataset's distances.

    The CSV is loaded, indexed by the day-first timestamp built from its
    ``sample_date`` and ``sample_time`` columns, sorted chronologically, passed
    through ``remove_duplicate_datetimes`` (defined elsewhere in this notebook)
    and restricted to records with a positive ``distance``.

    Columns added:
        rolling_hourly_mean, rolling_hourly_standard_deviation:
            statistics of ``distance`` over a centred '1h' window (used for plotting).
        satisfied_filtering_condition:
            True where ``distance`` is strictly greater than its 0.25 quantile.

    Nothing is returned: the summaries are printed and the dataset is passed to
    ``satisfied_condition_record_count`` and ``relative_range_time_series_plot``.

    Parameters:
        dataset_path: CSV source accepted by ``pd.read_csv``; it must provide
            the ``sample_date``, ``sample_time`` and ``distance`` columns.
    """

    dataset = pd.read_csv(dataset_path)
    print("Dataset before feature engineering:\n",dataset)
    # DateTime indexes
    dataset = dataset.set_index(pd.to_datetime(dataset['sample_date']+" "+dataset['sample_time'], dayfirst=True), drop = False)
    # Makes sure records are in chronological order
    dataset = dataset.sort_index()
    print(dataset.describe())
    print("Number of missing values:\n",dataset.isna().sum())

    print("Number of duplicate records removed:\n",dataset.index.duplicated().sum())
    dataset = remove_duplicate_datetimes(dataset)

    zero_distance_count = dataset.loc[dataset.distance == 0,'distance'].count()
    print("Number of zero distance records removed:\n",zero_distance_count)
    # Explicit copy: the new columns below are then written to this frame itself
    # rather than to a filtered selection (avoids pandas' SettingWithCopyWarning).
    dataset = dataset.loc[dataset.distance > 0,:].copy()

    # One centred time-based window supplies both statistics
    hourly_window = dataset['distance'].rolling('1h',center=True)
    dataset['rolling_hourly_mean'] = hourly_window.mean()
    dataset['rolling_hourly_standard_deviation'] = hourly_window.std()

    # The quartile is computed after the non-positive distances have been removed
    lower_quartile = dataset["distance"].quantile(0.25)
    dataset['satisfied_filtering_condition'] = dataset["distance"] > lower_quartile
    print("Total number of records above the lower quartile:\n",dataset.loc[:,'satisfied_filtering_condition'].value_counts())
    print("Dataset after feature engineering:\n",dataset)
    print(dataset.describe())
    satisfied_condition_record_count(dataset)

    relative_range_time_series_plot(dataset,'rolling_hourly_mean','rolling_hourly_standard_deviation')

In [None]:
greater_than_lower_quartile("DH_plus_dist_interpolated.csv")

In [None]:
greater_than_lower_quartile("LA_plus_dist_interpolated.csv")

In [None]:
greater_than_lower_quartile("Lucy_16-20May_plus_dist_interpolated.csv")

In [None]:
def stationary(dataset_path):
    """Flag records that look stationary, based on 25-record rolling statistics of distance.

    The CSV is loaded, indexed by the day-first timestamp built from its
    ``sample_date`` and ``sample_time`` columns, sorted chronologically, passed
    through ``remove_duplicate_datetimes`` (defined elsewhere in this notebook)
    and restricted to records with a positive ``distance``.

    Columns added:
        rolling_hourly_mean, rolling_hourly_standard_deviation: centred '1h' window.
        difference: change in distance from the previous record.
        rolling_daily_mean, rolling_daily_standard_deviation: centred '1D' window.
        ten_record_difference: record-to-record change of the centred 10-record mean.
        twenty_five_record_mean, twenty_five_record_standard_deviation,
        twenty_five_record_difference: centred 25-record window and the
            record-to-record change of its mean.
        satisfied_filtering_condition: True where the 25-record standard
            deviation is below 0.1 or the absolute change from the previous
            record is below 0.25.
        shifted_datetime: timestamp of the previous record.

    Nothing is returned: the summaries are printed and the dataset is passed to
    ``satisfied_condition_record_count`` and ``relative_range_time_series_plot``.

    Parameters:
        dataset_path: CSV source accepted by ``pd.read_csv``; it must provide
            the ``sample_date``, ``sample_time`` and ``distance`` columns.
    """
    dataset = pd.read_csv(dataset_path)
    print("Dataset before feature engineering:\n",dataset)
    # DateTime indexes
    dataset = dataset.set_index(pd.to_datetime(dataset['sample_date']+" "+dataset['sample_time'], dayfirst=True), drop = False)
    # Makes sure records are in chronological order
    dataset = dataset.sort_index()
    print(dataset.describe())
    print("Number of missing values:\n",dataset.isna().sum())

    print("Number of duplicate records removed:\n",dataset.index.duplicated().sum())
    dataset = remove_duplicate_datetimes(dataset)

    zero_distance_count = dataset.loc[dataset.distance == 0,'distance'].count()
    print("Number of zero distance records removed:\n",zero_distance_count)
    # Explicit copy: the new columns below are then written to this frame itself
    # rather than to a filtered selection (avoids pandas' SettingWithCopyWarning).
    dataset = dataset.loc[dataset.distance > 0,:].copy()

    distance = dataset['distance']

    hourly_window = distance.rolling('1h',center=True)
    dataset['rolling_hourly_mean'] = hourly_window.mean()
    dataset['rolling_hourly_standard_deviation'] = hourly_window.std()
    dataset['difference'] = distance.diff()

    daily_window = distance.rolling('1D',center=True)
    dataset['rolling_daily_mean'] = daily_window.mean()
    dataset['rolling_daily_standard_deviation'] = daily_window.std()

    dataset["ten_record_difference"] = distance.rolling(10,center=True).mean().diff()

    # Window sizes tried previously (their columns are referenced by some of the alternatives listed further down):
    #rolling_ten_record_mean = dataset.loc[:,'distance'].rolling(10,center=True).mean()
    #rolling_ten_record_standard_deviation = dataset.loc[:,'distance'].rolling(10,center=True).std()
    #dataset['rolling_ten_record_mean'] = rolling_ten_record_mean
    #dataset['rolling_ten_record_standard_deviation'] = rolling_ten_record_standard_deviation

    #rolling_fifty_record_mean = dataset.loc[:,'distance'].rolling(50,center=True).mean()
    #rolling_fifty_record_standard_deviation = dataset.loc[:,'distance'].rolling(50,center=True).std()
    #dataset['rolling_fifty_record_mean'] = rolling_fifty_record_mean
    #dataset['rolling_fifty_record_standard_deviation'] = rolling_fifty_record_standard_deviation

    #five_record_mean = dataset.loc[:,'distance'].rolling(5,center=True).mean()
    #five_record_standard_deviation = dataset.loc[:,'distance'].rolling(5,center=True).std()
    #dataset['five_record_mean'] = five_record_mean
    #dataset['five_record_standard_deviation'] = five_record_standard_deviation

    # The rolling mean is computed once and reused for its record-to-record difference
    twenty_five_record_window = distance.rolling(25,center=True)
    twenty_five_record_mean = twenty_five_record_window.mean()
    dataset['twenty_five_record_mean'] = twenty_five_record_mean
    dataset['twenty_five_record_standard_deviation'] = twenty_five_record_window.std()
    dataset['twenty_five_record_difference'] = twenty_five_record_mean.diff()

    # Alternative filtering conditions tried previously:
    #dataset['satisfied_filtering_condition'] = (dataset["distance"] > (dataset["distance"].mean() - dataset["distance"].std())) & (dataset["distance"] < (dataset["distance"].mean() + dataset["distance"].std()))
    #dataset['satisfied_filtering_condition'] = (dataset["distance"] > (dataset["rolling_hourly_mean"] - dataset["rolling_hourly_standard_deviation"])) & (dataset["distance"] < (dataset["rolling_hourly_mean"] + dataset["rolling_hourly_standard_deviation"]))
    #dataset['satisfied_filtering_condition'] = (dataset["distance"] > (dataset["rolling_ten_record_mean"] - dataset["rolling_ten_record_standard_deviation"])) & (dataset["distance"] < (dataset["rolling_ten_record_mean"] + dataset["rolling_ten_record_standard_deviation"]))
    #dataset['satisfied_filtering_condition'] = (dataset["rolling_ten_record_mean"] > (dataset["distance"] - dataset["rolling_ten_record_standard_deviation"]))
    #dataset['satisfied_filtering_condition'] = (dataset["rolling_ten_record_standard_deviation"]<dataset["rolling_ten_record_mean"])
    #dataset['satisfied_filtering_condition'] = (dataset["rolling_hourly_standard_deviation"]<dataset["rolling_hourly_mean"])
    #dataset['satisfied_filtering_condition'] = (dataset["five_record_standard_deviation"]<dataset["five_record_mean"])
    #dataset['satisfied_filtering_condition'] = (dataset["distance"].diff() < dataset["rolling_hourly_standard_deviation"]) # Possibly useful for detecting changes between passive and active states. Standard deviation or variance for stationarity.
    #dataset['satisfied_filtering_condition'] = (dataset["distance"].diff().abs() < dataset["distance"].std()**2)
    #dataset['satisfied_filtering_condition'] = (dataset["distance"] < dataset["rolling_daily_mean"] + dataset["rolling_daily_standard_deviation"])
    #dataset['satisfied_filtering_condition'] = (dataset["ten_record_difference"].abs() < dataset["distance"].std())
    #dataset['satisfied_filtering_condition'] = ((dataset["distance"] < (dataset["twenty_five_record_mean"] + dataset["distance"].std())) & (dataset["distance"] > (dataset["twenty_five_record_mean"] - dataset["distance"].std())))
    #dataset['satisfied_filtering_condition'] = (dataset["twenty_five_record_difference"].abs()<=0.25)
    #dataset['satisfied_filtering_condition'] = ((dataset["distance"] < (dataset["twenty_five_record_mean"] + dataset["twenty_five_record_standard_deviation"])) & (dataset["distance"] > (dataset["twenty_five_record_mean"] - dataset["twenty_five_record_standard_deviation"])))
    #dataset['satisfied_filtering_condition'] = (dataset["distance"] < (dataset["twenty_five_record_mean"] + 1)) & (dataset["distance"] > (dataset["twenty_five_record_mean"] - 1)) | (dataset["distance"].diff().abs() < 0.25)  # Finds noise
    #dataset['satisfied_filtering_condition'] = (dataset["distance"].diff().abs() < 0.25) # Useful for retaining useful records at the ends of stationary phases.
    #dataset['satisfied_filtering_condition'] = dataset['twenty_five_record_standard_deviation'] <= dataset["distance"].std() # Might also be used to support indication of transition between active and passive states.
    #dataset['satisfied_filtering_condition'] = (dataset.index.diff().abs() < 10)

    # Try out 5, 15 and 25 records and see how they compare.

    # Stationary. The second condition (change from the previous value) was added so that
    # the ends of passive phases are not misclassified.
    # The number of consecutive records meeting this condition could then be counted to
    # determine whether a section is in an active or a passive phase.
    dataset['satisfied_filtering_condition'] = (dataset["twenty_five_record_standard_deviation"] < 0.1) | (dataset["difference"].abs() < 0.25)

    dataset['shifted_datetime'] = dataset.index
    dataset['shifted_datetime'] = dataset['shifted_datetime'].shift(1)

    # Count the records by whether the gap since the previous record is at most 15 minutes.
    # size() is required: printing the GroupBy object itself only shows its repr.
    # The first record has no predecessor, so its gap is missing and it is counted under False.
    gap_within_fifteen_minutes = (dataset.index - dataset['shifted_datetime']) <= pd.Timedelta(15, "m")
    print(dataset.groupby(gap_within_fifteen_minutes).size())
    #dataset['satisfied_filtering_condition'] = (dataset.index - dataset['shifted_datetime'] <= pd.Timedelta(15, "m")) # If gaps between recordings are greater than 15 minutes then the rolling can restart, as long gaps between recordings can skew rolling window calculations.

    print("Total number of records meeting filtering condition:\n",dataset.loc[:,'satisfied_filtering_condition'].value_counts())

    print("Dataset after feature engineering:\n",dataset)
    print(dataset.describe())
    satisfied_condition_record_count(dataset)

    #relative_range_time_series_plot(dataset,'rolling_hourly_mean','rolling_hourly_standard_deviation')
    #relative_range_time_series_plot(dataset,'difference')
    #relative_range_time_series_plot(dataset,'ten_record_difference')
    relative_range_time_series_plot(dataset,'twenty_five_record_mean','twenty_five_record_standard_deviation','twenty_five_record_difference')

In [None]:
stationary("DH_plus_dist_interpolated.csv")

In [None]:
stationary("LA_plus_dist_interpolated.csv")

In [None]:
stationary("Lucy_16-20May_plus_dist_interpolated.csv")

In [None]:
def long_term(dataset_path):
    """Flag records using a centred 25-record window and report the intervals between failing records.

    The CSV is loaded, indexed by the day-first timestamp built from its
    ``sample_date`` and ``sample_time`` columns, sorted chronologically, passed
    through ``remove_duplicate_datetimes`` (defined elsewhere in this notebook)
    and restricted to records with a positive ``distance``.

    Columns added:
        twenty_five_record_mean, twenty_five_record_standard_deviation,
        twenty_five_record_difference: centred 25-record window statistics and
            the record-to-record change of the rolling mean.
        satisfied_filtering_condition: True where the absolute change from the
            previous record is at most 0.5 or the 25-record standard deviation
            is at most 0.25.
        shifted_datetime: timestamp of the previous record.
        false_condition_interval: for records failing the condition, the time
            elapsed since the previous failing record (missing elsewhere).

    Nothing is returned: the summaries are printed and the dataset is passed to
    ``satisfied_condition_record_count`` and ``relative_range_time_series_plot``.

    Parameters:
        dataset_path: CSV source accepted by ``pd.read_csv``; it must provide
            the ``sample_date``, ``sample_time`` and ``distance`` columns.
    """
    dataset = pd.read_csv(dataset_path)
    print("Dataset before feature engineering:\n",dataset)
    # DateTime indexes
    dataset = dataset.set_index(pd.to_datetime(dataset['sample_date']+" "+dataset['sample_time'], dayfirst=True), drop = False)
    # Makes sure records are in chronological order
    dataset = dataset.sort_index()
    print(dataset.describe())
    print("Number of missing values:\n",dataset.isna().sum())

    print("Number of duplicate records removed:\n",dataset.index.duplicated().sum())
    dataset = remove_duplicate_datetimes(dataset)

    zero_distance_count = dataset.loc[dataset.distance == 0,'distance'].count()
    print("Number of zero distance records removed:\n",zero_distance_count)
    # Explicit copy: the new columns below are then written to this frame itself
    # rather than to a filtered selection (avoids pandas' SettingWithCopyWarning).
    dataset = dataset.loc[dataset.distance > 0,:].copy()

    # The rolling mean is computed once and reused for its record-to-record difference
    twenty_five_record_window = dataset['distance'].rolling(25,center=True)
    twenty_five_record_mean = twenty_five_record_window.mean()
    dataset['twenty_five_record_mean'] = twenty_five_record_mean
    dataset['twenty_five_record_standard_deviation'] = twenty_five_record_window.std()
    dataset['twenty_five_record_difference'] = twenty_five_record_mean.diff()

    # Alternative filtering conditions tried previously:
    #dataset['satisfied_filtering_condition'] = (dataset["twenty_five_record_standard_deviation"] < 0.1) | (dataset["distance"].diff().abs() < 0.25) # Stationary; the second condition (change from the previous value) was added so that the ends of passive phases are not misclassified.
    # The number of consecutive records meeting this condition could then be counted to determine whether a section is in an active or a passive phase.
    #dataset['satisfied_filtering_condition'] = (dataset["distance"] < (dataset["twenty_five_record_mean"] + 1)) & (dataset["distance"] > (dataset["twenty_five_record_mean"] - 1)) | (dataset["distance"].diff().abs() < 0.25)  # Finds noise
    #dataset['satisfied_filtering_condition'] = (dataset["distance"].diff().abs() < 0.25) # Useful for retaining useful records at the ends of stationary phases.
    #dataset['satisfied_filtering_condition'] = dataset['twenty_five_record_standard_deviation'] <= dataset["distance"].std() # Might also be used to support indication of transition between active and passive states.
    #dataset['satisfied_filtering_condition'] = (dataset.index.diff().abs() < 10)
    #dataset['satisfied_filtering_condition'] = (dataset["distance"].diff().abs() < 0.1) # Alternative method to determine passive periods.
    #dataset['satisfied_filtering_condition'] = (dataset["distance"].diff().abs() <= 1)

    dataset['satisfied_filtering_condition'] = (dataset["distance"].diff().abs() <= 0.5) | (dataset['twenty_five_record_standard_deviation'] <= 0.25)

    # Timestamps of the records that fail the condition, replaced by the time
    # elapsed since the previous failing record.
    subset = dataset.loc[~dataset['satisfied_filtering_condition'],:].index.to_frame(name='false_condition_interval')
    subset['false_condition_interval'] = subset['false_condition_interval'].diff()
    print(subset)

    dataset['shifted_datetime'] = dataset.index
    dataset['shifted_datetime'] = dataset['shifted_datetime'].shift(1)
    # Align the intervals with the full dataset on the DateTime index
    dataset = pd.concat([dataset,subset], axis=1)
    #dataset['satisfied_filtering_condition'] = (dataset.index - dataset['shifted_datetime'] <= pd.Timedelta(15, "m")) # If gaps between recordings are greater than 15 minutes then the rolling can restart, as long gaps between recordings can skew rolling window calculations.

    print("Total number of records meeting filtering condition:\n",dataset.loc[:,'satisfied_filtering_condition'].value_counts())

    print("Dataset after feature engineering:\n",dataset)
    print(dataset.describe())
    satisfied_condition_record_count(dataset)

    #relative_range_time_series_plot(dataset,'rolling_hourly_mean','rolling_hourly_standard_deviation')
    #relative_range_time_series_plot(dataset,'difference')
    #relative_range_time_series_plot(dataset,'ten_record_difference')
    relative_range_time_series_plot(dataset,'twenty_five_record_mean','twenty_five_record_standard_deviation','twenty_five_record_difference')

In [None]:
def middle_term(dataset_path):
    """Flag records by their change from the previous record and plot 15-record rolling statistics.

    The CSV is loaded, indexed by the day-first timestamp built from its
    ``sample_date`` and ``sample_time`` columns, sorted chronologically, passed
    through ``remove_duplicate_datetimes`` (defined elsewhere in this notebook)
    and restricted to records with a positive ``distance``.

    Columns added:
        rolling_fifteen_record_mean, rolling_fifteen_record_standard_deviation,
        rolling_fifteen_record_difference: centred 15-record window statistics
            and the record-to-record change of the rolling mean.
        satisfied_filtering_condition: True where the absolute change in
            distance from the previous record is below 0.25.
        shifted_datetime: timestamp of the previous record.

    Nothing is returned: the summaries are printed and the dataset is passed to
    ``satisfied_condition_record_count`` and ``relative_range_time_series_plot``.

    Parameters:
        dataset_path: CSV source accepted by ``pd.read_csv``; it must provide
            the ``sample_date``, ``sample_time`` and ``distance`` columns.
    """
    dataset = pd.read_csv(dataset_path)
    print("Dataset before feature engineering:\n",dataset)
    # DateTime indexes
    dataset = dataset.set_index(pd.to_datetime(dataset['sample_date']+" "+dataset['sample_time'], dayfirst=True), drop = False)
    # Makes sure records are in chronological order
    dataset = dataset.sort_index()
    print(dataset.describe())
    print("Number of missing values:\n",dataset.isna().sum())

    print("Number of duplicate records removed:\n",dataset.index.duplicated().sum())
    dataset = remove_duplicate_datetimes(dataset)

    zero_distance_count = dataset.loc[dataset.distance == 0,'distance'].count()
    print("Number of zero distance records removed:\n",zero_distance_count)
    # Explicit copy: the new columns below are then written to this frame itself
    # rather than to a filtered selection (avoids pandas' SettingWithCopyWarning).
    dataset = dataset.loc[dataset.distance > 0,:].copy()

    # The rolling mean is computed once and reused for its record-to-record difference
    fifteen_record_window = dataset['distance'].rolling(15,center=True)
    rolling_fifteen_record_mean = fifteen_record_window.mean()
    dataset['rolling_fifteen_record_mean'] = rolling_fifteen_record_mean
    dataset['rolling_fifteen_record_standard_deviation'] = fifteen_record_window.std()
    dataset['rolling_fifteen_record_difference'] = rolling_fifteen_record_mean.diff()

    # Alternative filtering conditions tried previously:
    #dataset['satisfied_filtering_condition'] = (dataset["rolling_fifteen_record_standard_deviation"] < 0.1) | (dataset["distance"].diff().abs() < 0.25) # Stationary; the second condition (change from the previous value) was added so that the ends of passive phases are not misclassified.
    # The number of consecutive records meeting this condition could then be counted to determine whether a section is in an active or a passive phase.
    #dataset['satisfied_filtering_condition'] = (dataset["distance"] < (dataset["rolling_fifteen_record_mean"] + 1)) & (dataset["distance"] > (dataset["rolling_fifteen_record_mean"] - 1)) | (dataset["distance"].diff().abs() < 0.25)  # Finds noise
    #dataset['satisfied_filtering_condition'] = dataset['rolling_fifteen_record_standard_deviation'] <= dataset["distance"].std() # Might also be used to support indication of transition between active and passive states.
    #dataset['satisfied_filtering_condition'] = (dataset.index.diff().abs() < 10)

    # Alternative method to determine passive periods; also useful for retaining
    # useful records at the ends of stationary phases.
    dataset['satisfied_filtering_condition'] = (dataset["distance"].diff().abs() < 0.25)

    dataset['shifted_datetime'] = dataset.index
    dataset['shifted_datetime'] = dataset['shifted_datetime'].shift(1)

    #dataset['satisfied_filtering_condition'] = (dataset.index - dataset['shifted_datetime'] <= pd.Timedelta(15, "m")) # If gaps between recordings are greater than 15 minutes then the rolling can restart, as long gaps between recordings can skew rolling window calculations.

    print("Total number of records meeting filtering condition:\n",dataset.loc[:,'satisfied_filtering_condition'].value_counts())

    print("Dataset after feature engineering:\n",dataset)
    print(dataset.describe())
    satisfied_condition_record_count(dataset)

    #relative_range_time_series_plot(dataset,'rolling_hourly_mean','rolling_hourly_standard_deviation')
    #relative_range_time_series_plot(dataset,'difference')
    #relative_range_time_series_plot(dataset,'ten_record_difference')
    relative_range_time_series_plot(dataset,'rolling_fifteen_record_mean','rolling_fifteen_record_standard_deviation','rolling_fifteen_record_difference')

In [None]:
def short_term(dataset_path):
    """Flag records by their change from the previous record and plot 5-record rolling statistics.

    The CSV is loaded, indexed by the day-first timestamp built from its
    ``sample_date`` and ``sample_time`` columns, sorted chronologically, passed
    through ``remove_duplicate_datetimes`` (defined elsewhere in this notebook)
    and restricted to records with a positive ``distance``.

    Columns added:
        five_record_mean, five_record_standard_deviation, five_record_difference:
            centred 5-record window statistics and the record-to-record change
            of the rolling mean.
        satisfied_filtering_condition: True where the absolute change in
            distance from the previous record is below 0.25.
        shifted_datetime: timestamp of the previous record.

    Nothing is returned: the summaries are printed and the dataset is passed to
    ``satisfied_condition_record_count`` and ``relative_range_time_series_plot``.

    Parameters:
        dataset_path: CSV source accepted by ``pd.read_csv``; it must provide
            the ``sample_date``, ``sample_time`` and ``distance`` columns.
    """
    dataset = pd.read_csv(dataset_path)
    print("Dataset before feature engineering:\n",dataset)
    # DateTime indexes
    dataset = dataset.set_index(pd.to_datetime(dataset['sample_date']+" "+dataset['sample_time'], dayfirst=True), drop = False)
    # Makes sure records are in chronological order
    dataset = dataset.sort_index()
    print(dataset.describe())
    print("Number of missing values:\n",dataset.isna().sum())

    print("Number of duplicate records removed:\n",dataset.index.duplicated().sum())
    dataset = remove_duplicate_datetimes(dataset)

    zero_distance_count = dataset.loc[dataset.distance == 0,'distance'].count()
    print("Number of zero distance records removed:\n",zero_distance_count)
    # Explicit copy: the new columns below are then written to this frame itself
    # rather than to a filtered selection (avoids pandas' SettingWithCopyWarning).
    dataset = dataset.loc[dataset.distance > 0,:].copy()

    # The rolling mean is computed once and reused for its record-to-record difference
    five_record_window = dataset['distance'].rolling(5,center=True)
    five_record_mean = five_record_window.mean()
    dataset['five_record_mean'] = five_record_mean
    dataset['five_record_standard_deviation'] = five_record_window.std()
    dataset['five_record_difference'] = five_record_mean.diff()

    # Alternative filtering conditions tried previously:
    #dataset['satisfied_filtering_condition'] = (dataset["five_record_standard_deviation"] < 0.1) | (dataset["distance"].diff().abs() < 0.25) # Stationary; the second condition (change from the previous value) was added so that the ends of passive phases are not misclassified.
    # The number of consecutive records meeting this condition could then be counted to determine whether a section is in an active or a passive phase.
    #dataset['satisfied_filtering_condition'] = (dataset["distance"] < (dataset["five_record_mean"] + 1)) & (dataset["distance"] > (dataset["five_record_mean"] - 1)) | (dataset["distance"].diff().abs() < 0.25)  # Finds noise
    #dataset['satisfied_filtering_condition'] = dataset['five_record_standard_deviation'] <= dataset["distance"].std() # Might also be used to support indication of transition between active and passive states.
    #dataset['satisfied_filtering_condition'] = (dataset.index.diff().abs() < 10)

    # Alternative method to determine passive periods; also useful for retaining
    # useful records at the ends of stationary phases.
    dataset['satisfied_filtering_condition'] = (dataset["distance"].diff().abs() < 0.25)

    dataset['shifted_datetime'] = dataset.index
    dataset['shifted_datetime'] = dataset['shifted_datetime'].shift(1)

    #dataset['satisfied_filtering_condition'] = (dataset.index - dataset['shifted_datetime'] <= pd.Timedelta(15, "m")) # If gaps between recordings are greater than 15 minutes then the rolling can restart, as long gaps between recordings can skew rolling window calculations.

    print("Total number of records meeting filtering condition:\n",dataset.loc[:,'satisfied_filtering_condition'].value_counts())

    print("Dataset after feature engineering:\n",dataset)
    print(dataset.describe())
    satisfied_condition_record_count(dataset)

    #relative_range_time_series_plot(dataset,'rolling_hourly_mean','rolling_hourly_standard_deviation')
    #relative_range_time_series_plot(dataset,'difference')
    #relative_range_time_series_plot(dataset,'ten_record_difference')
    relative_range_time_series_plot(dataset,'five_record_mean','five_record_standard_deviation','five_record_difference')

In [None]:
long_term("DH_plus_dist_interpolated.csv")

In [None]:
long_term("LA_plus_dist_interpolated.csv")

In [None]:
long_term("Lucy_16-20May_plus_dist_interpolated.csv")

In [None]:
middle_term("DH_plus_dist_interpolated.csv")

In [None]:
middle_term("LA_plus_dist_interpolated.csv")

In [None]:
middle_term("Lucy_16-20May_plus_dist_interpolated.csv")

In [None]:
short_term("DH_plus_dist_interpolated.csv")

In [None]:
short_term("LA_plus_dist_interpolated.csv")

In [None]:
short_term("Lucy_16-20May_plus_dist_interpolated.csv")

# Labelling

## Final Iteration

In [None]:
def rolling_window_with_custom_window(dataset_path):
    """Load a distance CSV, add rolling-window features and return the rows passing the filter.

    Steps, in order:
      1. Read the CSV and index it by ``sample_date`` + ``sample_time`` (day-first), sorted.
      2. Remove duplicate timestamps (via ``remove_duplicate_datetimes``), rows whose
         distance is not positive, and rows containing any missing value.
      3. Add 5-record and 25-record features (using ``CustomIndexer``) and centred
         15-minute and 1-hour features: mean, standard deviation, difference of the mean.
      4. Flag every record with ``satisfied_filtering_condition`` and record the time
         between consecutive records that fail it (``false_condition_interval``).
      5. Print diagnostics, plot the hourly features, and return only the records
         that satisfy the condition.

    Parameters
    ----------
    dataset_path : path of a CSV with ``sample_date``, ``sample_time`` and
        ``distance`` columns.

    Returns
    -------
    The feature-engineered DataFrame restricted to rows where
    ``satisfied_filtering_condition`` is True.
    """
    dataset = pd.read_csv(dataset_path)
    #print(dataset)
    print("Dataset before feature engineering:\n",dataset)
    
    # Index by timestamp; drop=False keeps the original columns in place.
    dataset = dataset.set_index(pd.to_datetime(dataset['sample_date']+" "+dataset['sample_time'], dayfirst=True), drop = False)
    # Make sure records are in chronological order.
    dataset = dataset.sort_index()
    print(dataset.describe())
    print("Number of missing values:\n",dataset.isna().sum())
    
    # The count is taken before removal, so it reports how many rows share an earlier timestamp.
    print("Number of duplicate records removed:\n",dataset.index.duplicated().sum())
    dataset = remove_duplicate_datetimes(dataset)

    zero_distance_count = dataset.loc[dataset.distance == 0,'distance'].count()
    print("Number of zero distance records removed:\n",zero_distance_count)
    # NOTE(review): `> 0` is also False for negative and NaN distances, so those rows
    # are removed here as well although only zeros are counted above.
    dataset = dataset.loc[dataset.distance > 0,:]
    
    # Drop rows (not columns) with missing values: datasets 1 and 2 each have one missing
    # distance value, which is missing completely at random.
    dataset = dataset.dropna()
    
    # Time elapsed since the previous record.
    dataset['lag'] = dataset.index
    dataset['lag'] = dataset['lag'].diff()
    # DataFrame.count() is per column, so this prints one count for every column.
    print("Number of lags greater than 15 minutes:", (dataset[dataset.lag > pd.Timedelta(15, "m")]).count())
    
    # Record-count windows; the size is passed through CustomIndexer's dynamic_window_size.
    five_record_custom_indexer = CustomIndexer(dataset = dataset, dynamic_window_size = 5)
    
    twenty_five_record_custom_indexer = CustomIndexer(dataset = dataset, dynamic_window_size = 25)

    # Each "difference" feature is the first difference of the corresponding rolling mean.
    five_record_mean = dataset.loc[:,'distance'].rolling(window=five_record_custom_indexer).mean()
    five_record_standard_deviation = dataset.loc[:,'distance'].rolling(window=five_record_custom_indexer).std()
    five_record_difference = dataset.loc[:,'distance'].rolling(window=five_record_custom_indexer).mean().diff()
    dataset['five_record_mean'] = five_record_mean
    dataset['five_record_standard_deviation'] = five_record_standard_deviation
    dataset['five_record_difference'] = five_record_difference
    
    twenty_five_record_mean = dataset.loc[:,'distance'].rolling(window=twenty_five_record_custom_indexer).mean()
    twenty_five_record_standard_deviation = dataset.loc[:,'distance'].rolling(window=twenty_five_record_custom_indexer).std()
    twenty_five_record_difference = dataset.loc[:,'distance'].rolling(window=twenty_five_record_custom_indexer).mean().diff()
    dataset['twenty_five_record_mean'] = twenty_five_record_mean
    dataset['twenty_five_record_standard_deviation'] = twenty_five_record_standard_deviation
    dataset['twenty_five_record_difference'] = twenty_five_record_difference
    
    # Time-based centred windows (these rely on the DatetimeIndex set above).
    rolling_fifteen_minute_mean = dataset.loc[:,'distance'].rolling('15min',center=True).mean()
    rolling_fifteen_minute_standard_deviation = dataset.loc[:,'distance'].rolling('15min',center=True).std()
    rolling_fifteen_minute_difference = dataset.loc[:,'distance'].rolling('15min',center=True).mean().diff()
    dataset['rolling_fifteen_minute_mean'] = rolling_fifteen_minute_mean
    dataset['rolling_fifteen_minute_standard_deviation'] = rolling_fifteen_minute_standard_deviation
    dataset['rolling_fifteen_minute_difference'] = rolling_fifteen_minute_difference
    
    rolling_hourly_mean = dataset.loc[:,'distance'].rolling('1h', center=True).mean()
    rolling_hourly_standard_deviation = dataset.loc[:,'distance'].rolling('1h', center=True).std()
    rolling_hourly_difference = dataset.loc[:,'distance'].rolling('1h', center=True).mean().diff()
    dataset['rolling_hourly_mean'] = rolling_hourly_mean
    dataset['rolling_hourly_standard_deviation'] = rolling_hourly_standard_deviation
    dataset['rolling_hourly_difference'] = rolling_hourly_difference
    
    
    # --- Filtering conditions tried during development (kept for reference) ---
    #dataset['satisfied_filtering_condition'] = (dataset["twenty_five_record_standard_deviation"] < 0.1) | (dataset["distance"].diff().abs() < 0.25) # Stationary; the extra condition on the difference from the previous value stops the ends of passive phases being misclassified.
    # The number of consecutive records meeting this condition could then be counted to decide whether the section is in an active or passive phase.
    
    #dataset['satisfied_filtering_condition'] = (dataset["distance"] < (dataset["twenty_five_record_mean"] + 1)) & (dataset["distance"] > (dataset["twenty_five_record_mean"] - 1)) | (dataset["distance"].diff().abs() < 0.25)  # Finds noise
    
    #dataset['satisfied_filtering_condition'] = (dataset["distance"].diff().abs() < 0.25) # Useful for retaining useful records at the ends of stationary phases.
    
    #dataset['satisfied_filtering_condition'] = dataset['twenty_five_record_standard_deviation'] <= dataset["distance"].std() # Might also be used to support indication of transition between active and passive states.

    #dataset['satisfied_filtering_condition'] = (dataset.index.diff().abs() < 10)
    
    #dataset['satisfied_filtering_condition'] = (dataset["distance"].diff().abs() < 0.1) # Alternative method to determine passive periods.
    
    #dataset['satisfied_filtering_condition'] = (dataset["distance"].diff().abs() <= 0.25) | (dataset['twenty_five_record_standard_deviation'] <= 0.25)
    
    #dataset['satisfied_filtering_condition'] = (dataset["distance"].diff().abs() <= 1) 
    
    # Condition in use: a record passes when it moved at most 0.5 from the previous record
    # OR the 25-record standard deviation around it is at most 0.25.
    dataset['satisfied_filtering_condition'] = (dataset["distance"].diff().abs() <= 0.5) | (dataset['twenty_five_record_standard_deviation'] <= 0.25)
    
    # Timestamps of the records that fail the condition, then the time between consecutive failures.
    subset = dataset.loc[dataset.satisfied_filtering_condition == False,:].index.to_frame(name='false_condition_interval')

    subset['false_condition_interval'] = subset.diff()

    print(subset)


    dataset['shifted_datetime'] = dataset.index
    dataset['shifted_datetime'] = dataset['shifted_datetime'].shift(1)
    # Column-wise concat aligns on the timestamp index, so only failing records get an interval.
    dataset = pd.concat([dataset,subset], axis=1)
    #dataset['satisfied_filtering_condition'] = (dataset.index - dataset['shifted_datetime'] <= pd.Timedelta(15, "m")) # If gaps between recordings are greater than 15 minutes the rolling could restart, because long gaps can skew rolling-window calculations.
    
    print("Total number of records meeting filtering condition\n:",dataset.loc[:,'satisfied_filtering_condition'].value_counts())
    
    
    print("Dataset after feature engineering:\n",dataset)
    print(dataset.describe())
    satisfied_condition_record_count(dataset)
    
    # NOTE(review): false_condition_interval is only populated on failing records, so it
    # appears to be entirely missing in the rows kept below -- confirm this is intended.
    dataset = dataset.loc[dataset["satisfied_filtering_condition"] == True ]
    
    relative_range_time_series_plot(dataset,'rolling_hourly_mean','rolling_hourly_standard_deviation','rolling_hourly_difference')
    
    return dataset

In [None]:
dataset_one = rolling_window_with_custom_window("DH_plus_dist_interpolated.csv")

In [None]:
sns.boxplot(x=dataset_one["sample_date"], y=dataset_one["distance"])

In [None]:
inertia_elbow_method_of_whole_duration(dataset_one)

In [None]:
dataset_two = rolling_window_with_custom_window("LA_plus_dist_interpolated.csv")

In [None]:
sns.boxplot(x=dataset_two["sample_date"], y=dataset_two["distance"])

In [None]:
inertia_elbow_method_of_whole_duration(dataset_two)

In [None]:
dataset_three = rolling_window_with_custom_window("Lucy_16-20May_plus_dist_interpolated.csv")

In [None]:
sns.boxplot(x=dataset_three["sample_date"], y=dataset_three["distance"])

In [None]:
inertia_elbow_method_of_whole_duration(dataset_three)

In [None]:
k_means_clustering_of_whole_duration(dataset_one,3)

In [None]:
k_means_clustering_of_whole_duration(dataset_two,3)

In [None]:
k_means_clustering_of_whole_duration(dataset_three,3)

# Miscellaneous

In [None]:
print(DH_plus_dist.loc[:,'distance'].rolling(10,center=True).count())

In [None]:
DH_plus_dist.loc[:,'distance'].rolling(3,center=True).quantile(.5)

In [None]:
print(DH_plus_dist.loc[:,'distance'].index[0])
a = (DH_plus_dist.loc[:,'distance'].index[1] - DH_plus_dist.loc[:,'distance'].index[0])/2
b = DH_plus_dist.loc[:,'distance'].index[0] + a
print(b)

In [None]:
DH_plus_dist.loc[:,'distance'].rolling(3,center=True).sum().index[:]

In [None]:
DH_plus_dist.loc[:,'distance'].rolling(3,center=True).sum().apply(lambda x: print(x.index))

In [None]:
even = DH_plus_dist.loc[:,'distance'].index[0::2]
odd = DH_plus_dist.loc[:,'distance'].index[1::2]
diff = odd-even
mid = diff/2
print(even.size)
print(odd)
print(diff.size)
print(mid)
even+mid

In [None]:
def midpoint(window_size):
    """Print the ``DH_plus_dist['distance']`` index split into ``window_size`` interleaved strides.

    Stride ``k`` holds every ``window_size``-th index label starting at position ``k``.
    """
    strides = [
        DH_plus_dist.loc[:, 'distance'].index[offset::window_size]
        for offset in range(window_size)
    ]
    print(strides)
    

In [None]:
midpoint(3)

In [None]:
print(type(DH_plus_dist.loc[:,'distance'].rolling(3,center=True)))

In [None]:
# Inspect the Rolling class itself; passing its dotted name as a string would list the attributes of `str`.
dir(pd.core.window.rolling.Rolling)

In [None]:
DH_plus_dist.loc[:,'distance'].rolling(3,center=True).__str__()

In [None]:
temp = pd.DataFrame()
temp.loc[:,'test'] = DH_plus_dist.loc[:,'distance'].rolling(3,center=True,min_periods=3).mean()
idx = DH_plus_dist.loc[:,'distance'].index
print(len(idx))

# Midpoint of every complete three-record window (previous, current, next record).
# The range starts at 1 because position 0 has no previous record: idx[-1] would wrap
# round to the last record and produce an interval whose left bound exceeds its right.
window_midpoints = [
    pd.Interval(idx[i - 1], idx[i + 1], closed='both').mid
    for i in range(1, idx.size - 1)
]

print(len(window_midpoints))
print(temp.count())
print(temp.shape)
temp

In [None]:
#temp = pd.DataFrame(index=DH_plus_dist.index)
# The RolingWindow object automatically checks for if the Window size exceeds the min number of records per window by throwing an exception.
#temp.loc[:,'test'] = DH_plus_dist.loc[:,'distance'].rolling(3,center=True,min_periods=3).mean()

# Because midpoint index is always rounded up if the window size is even
minShift = round(3/2)
maxShift = int(3/2)
indexes = DH_plus_dist.index # or indexes = temp.index
newIndexes = []
count = 0

# This is to check the positon of the indexes
for i in range(indexes.size) :

    #earliest = indexes[i-minShift]
    #latest = indexes[i+maxShift]
    
    # This check makes sure the earliest/lower bound DateTime is <= latest/upper bound DateTime
    # Prevents IndexError by goind out of bounds or a ValueError due to the list traversing to the end of the list as this is how arrays work in Python.
    if i - minShift < 0:
        earliest = indexes[0]
        latest = indexes[i+maxShift]
    elif i + maxShift > indexes.size-1:
        earliest = indexes[i-minShift]
        latest = indexes[indexes.size-1]
    else:
        earliest = indexes[i-minShift]
        latest = indexes[i+maxShift]

    midpoint = pd.Interval(earliest, latest, closed='both').mid
    newIndexes.append(midpoint)
    #count+=1
    #print(i)

#temp.reset_index(drop=True)
#temp.set_index(newIndexes)
print(len(DH_plus_dist))
print(len(newIndexes))
print(temp.index.size)
#print(count)
#print(DH_plus_dist.loc[:,'distance'])
val = DH_plus_dist.loc[:,'distance']
val.reset_index(drop=True)
print(val)

temp2 = pd.DataFrame(data = {'col1':newIndexes,'col2':val})
#temp.loc[:'val'] = DH_plus_dist.loc[:,'distance']
temp2.reset_index(drop=True, inplace=True)
temp2.set_index('col1')
temp2

In [None]:
temp2.set_index('col1')

In [None]:
plt.plot(temp2.col1,temp2.col2)

In [None]:
def midpoints(window_size,dataframe):
    """Return the midpoint of the index span covered by each centred rolling window.

    For the record at position ``i`` the window spans ``window_size // 2`` records
    before it and ``(window_size - 1) // 2`` records after it, which is how a centred
    fixed-size pandas window is laid out (for an even size the extra record sits on
    the left). Windows that would run past either end of the data are truncated at
    the first/last record.

    Parameters
    ----------
    window_size : number of records per window; must be at least 1.
    dataframe : frame whose index supplies the window bounds. The index must be
        sorted ascending because each span is built as a ``pd.Interval``.

    Returns
    -------
    A list with one midpoint per row of ``dataframe`` (empty for an empty frame).

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    # Integer division avoids round()'s banker's rounding, which made odd sizes
    # lopsided (3 -> 2 before / 1 after) and even sizes one record too wide.
    records_before = window_size // 2
    records_after = (window_size - 1) // 2

    indexes = dataframe.index
    last_position = indexes.size - 1
    newIndexes = []

    for i in range(indexes.size):
        # Clamp both bounds so short frames and edge rows never index out of range
        # (a negative position would silently wrap round to the end of the index).
        earliest = indexes[max(i - records_before, 0)]
        latest = indexes[min(i + records_after, last_position)]
        newIndexes.append(pd.Interval(earliest, latest, closed='both').mid)

    return newIndexes


In [None]:
def statistical_summary(test_dataset):
    """Build a frame of change and centred rolling-window statistics for ``distance``.

    Columns, in order: ``difference`` and ``fractional_change`` of the raw distance,
    then for each of the three-, five- and ten-record windows: ``<n>_record_midpoint``
    (from ``midpoints``), ``_mean``, ``_standard_deviation``, ``_variance``,
    ``_difference`` (first difference of the rolling mean) and ``_fractional_change``
    (percentage change of the rolling mean).

    Parameters
    ----------
    test_dataset : DataFrame with a ``distance`` column.

    Returns
    -------
    A new DataFrame with the columns described above.
    """
    distance = test_dataset.loc[:, 'distance']

    rolling_window_summary = pd.DataFrame()
    rolling_window_summary.loc[:, 'difference'] = distance.diff()
    rolling_window_summary.loc[:, 'fractional_change'] = distance.pct_change()

    for label, size in (('three', 3), ('five', 5), ('ten', 10)):
        prefix = label + '_record_'
        windows = distance.rolling(size, center=True)
        # The difference and fractional change are both derived from the rolling mean.
        window_mean = windows.mean()

        rolling_window_summary.loc[:, prefix + 'midpoint'] = midpoints(size, test_dataset)
        rolling_window_summary.loc[:, prefix + 'mean'] = window_mean
        rolling_window_summary.loc[:, prefix + 'standard_deviation'] = windows.std()
        rolling_window_summary.loc[:, prefix + 'variance'] = windows.var()
        rolling_window_summary.loc[:, prefix + 'difference'] = window_mean.diff()
        rolling_window_summary.loc[:, prefix + 'fractional_change'] = window_mean.pct_change()

    return rolling_window_summary


In [None]:
statistical_summary(LA_plus_dist) 

In [None]:
statistical_summary(LA_plus_dist).count()

In [None]:
print(temp2)

In [None]:
int(135.9)

In [None]:
year_2017 = pd.Interval(pd.Timestamp('2017-01-01 00:00:00'),
                        pd.Timestamp('2017-01-01 00:00:00'),
                        closed='both')
year_2017.mid

In [None]:
idx.size

In [None]:
DH_plus_dist.loc[:,'distance'].tail()

In [None]:
DH_plus_dist.loc[:,'distance'].rolling(3,center=True,min_periods=3,closed='both').count()

In [None]:
DH_plus_dist.loc[:,'distance'].rolling(3,center=True,min_periods=3).count()

In [None]:
DH_plus_dist.loc[:,'distance'].rolling(3,center=True,min_periods=3, closed='left').count()

In [None]:
DH_plus_dist.loc[:,'distance'].rolling(3).count()

In [None]:
DH_plus_dist.loc[:,'distance'].rolling(1,center=True,closed='both').count()

In [None]:
s = pd.Series([2, 3, 7, 9, 19, 12, 14])
print(s.rolling(4,center=True).sum())

In [None]:
DH_rolling_window_summary = statistical_summary(DH_plus_dist)

In [None]:
dates = DH_plus_dist.loc[:,'sample_date'].unique()

# One subplot per sampled date. squeeze=False keeps the axes indexable even when
# there is only a single date (plt.subplots would otherwise return a bare Axes).
figure, axis = plt.subplots(dates.size, squeeze=False)
axis = axis[:, 0]

figure.suptitle("Means against each other")

# Running total of non-null three-record means, printed after each date as a check
# that every measurement has been covered once all dates are processed.
numOfDistances = 0
for i, date in enumerate(dates):
    # Named day_start/day_end so the builtins min() and max() are not overwritten
    # for the rest of the notebook session.
    day_start = pd.to_datetime(date+" "+"00:00:00", dayfirst = True)
    day_end = pd.to_datetime(date+" "+"23:59:59", dayfirst = True)
    condition = DH_plus_dist.sample_date==date

    # Every subplot receives the full series; the x-limits restrict the view to one day.
    axis[i].set_xlim(day_start,day_end)

    print(date)
    axis[i].plot(DH_plus_dist.index,DH_plus_dist.loc[:,'distance'])

    axis[i].plot(DH_rolling_window_summary.loc[:,'three_record_midpoint'],DH_rolling_window_summary.loc[:,'three_record_mean'])
    axis[i].plot(DH_rolling_window_summary.loc[:,'five_record_midpoint'],DH_rolling_window_summary.loc[:,'five_record_mean'])
    axis[i].plot(DH_rolling_window_summary.loc[:,'ten_record_midpoint'],DH_rolling_window_summary.loc[:,'ten_record_mean'])

    numOfDistances = numOfDistances + DH_rolling_window_summary.loc[condition,'three_record_mean'].count()
    print(numOfDistances)
figure.legend(["distance","three_record_mean","five_record_mean","ten_record_mean"])

In [None]:
dates = DH_plus_dist.loc[:,'sample_date'].unique()

# One figure per window size, one subplot per sampled date. squeeze=False keeps the
# axes indexable even when there is only a single date.
figure_one, axis_one = plt.subplots(dates.size, squeeze=False)
figure_two, axis_two = plt.subplots(dates.size, squeeze=False)
figure_three, axis_three = plt.subplots(dates.size, squeeze=False)
axis_one = axis_one[:, 0]
axis_two = axis_two[:, 0]
axis_three = axis_three[:, 0]

figure_one.suptitle("Mean per every three records")
figure_two.suptitle("Mean per every five records")
figure_three.suptitle("Mean per every ten records")

# Running total of non-null three-record means, printed after each date as a check
# that every measurement has been covered once all dates are processed.
numOfDistances = 0
for i, date in enumerate(dates):
    # Named day_start/day_end so the builtins min() and max() are not overwritten
    # for the rest of the notebook session.
    day_start = pd.to_datetime(date+" "+"00:00:00", dayfirst = True)
    day_end = pd.to_datetime(date+" "+"23:59:59", dayfirst = True)
    condition = DH_plus_dist.sample_date==date

    # Every subplot receives the full series; the x-limits restrict the view to one day.
    axis_one[i].set_xlim(day_start,day_end)
    axis_two[i].set_xlim(day_start,day_end)
    axis_three[i].set_xlim(day_start,day_end)

    print(date)

    axis_one[i].plot(DH_plus_dist.index,DH_plus_dist.loc[:,'distance'])
    axis_one[i].plot(DH_rolling_window_summary.loc[:,'three_record_midpoint'],DH_rolling_window_summary.loc[:,'three_record_mean'])
    axis_two[i].plot(DH_plus_dist.index,DH_plus_dist.loc[:,'distance'])
    axis_two[i].plot(DH_rolling_window_summary.loc[:,'five_record_midpoint'],DH_rolling_window_summary.loc[:,'five_record_mean'])
    axis_three[i].plot(DH_plus_dist.index,DH_plus_dist.loc[:,'distance'])
    axis_three[i].plot(DH_rolling_window_summary.loc[:,'ten_record_midpoint'],DH_rolling_window_summary.loc[:,'ten_record_mean'])

    numOfDistances = numOfDistances + DH_rolling_window_summary.loc[condition,'three_record_mean'].count()
    print(numOfDistances)

### First test_dataset

In [None]:
#DataFrame of statistical summaries of test_dataset.

DH_rolling_window_summary = statistical_summary(DH_plus_dist)

In [None]:
DH_rolling_window_summary

In [None]:
DH_rolling_window_summary.describe()

In [None]:
# Number of non-null (non-NaN) values in each column of the DH_plus_dist rolling-window summary.
DH_rolling_window_summary.count()

#### All on separate subplots

In [None]:
differences_separate(DH_plus_dist, DH_rolling_window_summary)

In [None]:
fractional_changes_separate(DH_plus_dist, DH_rolling_window_summary)

In [None]:
means_separate(DH_plus_dist, DH_rolling_window_summary)

In [None]:
standard_deviations_separate(DH_plus_dist, DH_rolling_window_summary)

In [None]:
variances_separate(DH_plus_dist, DH_rolling_window_summary)

#### Combined

In [None]:
differences_combined(DH_plus_dist, DH_rolling_window_summary)

In [None]:
fractional_changes_combined(DH_plus_dist, DH_rolling_window_summary)

In [None]:
means_combined(DH_plus_dist, DH_rolling_window_summary)

In [None]:
standard_deviations_combined(DH_plus_dist, DH_rolling_window_summary)

In [None]:
variances_combined(DH_plus_dist, DH_rolling_window_summary)

### Second test_dataset

In [None]:
#DataFrame of statistical summaries of test_dataset.

LA_rolling_window_summary = statistical_summary(LA_plus_dist)

In [None]:
LA_rolling_window_summary.describe()

In [None]:
# Number of non-null (non-NaN) values in each column of the LA_plus_dist rolling-window summary.
LA_rolling_window_summary.count()

#### All on separate subplots

In [None]:
differences_separate(LA_plus_dist, LA_rolling_window_summary)

In [None]:
fractional_changes_separate(LA_plus_dist, LA_rolling_window_summary)

In [None]:
means_separate(LA_plus_dist, LA_rolling_window_summary)

In [None]:
standard_deviations_separate(LA_plus_dist, LA_rolling_window_summary)

In [None]:
variances_separate(LA_plus_dist, LA_rolling_window_summary)

#### Combined

In [None]:
differences_combined(LA_plus_dist, LA_rolling_window_summary)

In [None]:
fractional_changes_combined(LA_plus_dist, LA_rolling_window_summary)

In [None]:
means_combined(LA_plus_dist, LA_rolling_window_summary)

In [None]:
standard_deviations_combined(LA_plus_dist, LA_rolling_window_summary)

In [None]:
variances_combined(LA_plus_dist, LA_rolling_window_summary)

### Third test_dataset

In [None]:
#DataFrame of statistical summaries of test_dataset.

Lucy_rolling_window_summary = statistical_summary(Lucy_16_20_May_plus_dist)

In [None]:
Lucy_rolling_window_summary

In [None]:
Lucy_rolling_window_summary.describe()

In [None]:
# Number of non-null (non-NaN) values in each column of the Lucy_16_20_May_plus_dist rolling-window summary.
Lucy_rolling_window_summary.count()

#### All on separate subplots

In [None]:
differences_separate(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
fractional_changes_separate(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
means_separate(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
standard_deviations_separate(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
variances_separate(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

#### Combined

In [None]:
differences_combined(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
fractional_changes_combined(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
means_combined(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
standard_deviations_combined(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

In [None]:
variances_combined(Lucy_16_20_May_plus_dist, Lucy_rolling_window_summary)

### Indexing, multiindexing and hourly average

In [None]:
example = pd.read_csv("DH_plus_dist_interpolated.csv")
example = example.set_index(pd.to_datetime(example['sample_date']+" "+example['sample_time'], dayfirst=True))
example.loc[:,'hour_average'] = 0

# Multilevel index, an array of tuples with 2 values the date and hour number from 0 to 23.

# transform() method so hourly averages can be added to each index of the original DataFrame instead of adding them to new idexes based on the groupby parameters/criteria.
# Original and grouped DataFrames now have the same number of records making it simple to copy hour_average column values from the grouped DataFrame to the original without any dimensionality conflict.
example.loc[:,'hour_average'] = example.groupby([example.index.date, example.index.hour])['distance'].transform("mean")

# Multilevel index, an array of tuples with 2 values the date and hour number from 0 to 23.
hour_averages = example.groupby([example.index.date, example.index.hour])['distance'].mean()
#if example.index.date == hour_averages.index[0] and example.index.hour == hour_averages.index[0]: 
#    example.loc[:,'hour_average'] = hour_averages
#example
print(type(hour_averages.index))
print(hour_averages)

In [None]:
print("Date + hour index count:",hour_averages.index.size)
print("DateTime index count:",example.index.size)

In [None]:
example

In [None]:
example.index.hour

In [None]:
example.index.day

In [None]:
hour_averages.index

In [None]:
hour_averages.index[:][0]

In [None]:
hour_averages

In [None]:
example_hour = pd.read_csv("DH_plus_dist_interpolated.csv")
# Index the frame that was just loaded, so this cell does not depend on (or reuse the
# extra columns of) the `example` frame built in an earlier cell.
example_hour = example_hour.set_index(pd.to_datetime(example_hour['sample_date']+" "+example_hour['sample_time'], dayfirst=True))
example_hour = hourly_average(example_hour)

In [None]:
example_hour

In [None]:
example_hour2 = pd.read_csv("DH_plus_dist_interpolated.csv")
# Index the frame that was just loaded, so this cell does not depend on the `example`
# frame built in an earlier cell.
example_hour2 = example_hour2.set_index(pd.to_datetime(example_hour2['sample_date']+" "+example_hour2['sample_time'], dayfirst=True))
# transform("mean") broadcasts each (date, hour) group's mean back onto every record of that group.
example_hour2.loc[:,'hour_average'] = example_hour2.groupby([example_hour2.index.date, example_hour2.index.hour])['distance'].transform("mean")

In [None]:
example_hour2

In [None]:
print(example_hour2)

In [None]:
pd.isna(example_hour2.loc[:,'hour_average']).sum()

In [None]:
example.groupby([example.index.date, example.index.hour])['distance'].transform("mean")

In [None]:
def hourly_standard_deviation(test_dataset):
    """Plot the hourly-resampled mean of ``hour_average`` together with the per-record values.

    NOTE: despite its name this function works with means, not standard deviations;
    the name is kept so existing calls continue to work.

    Prints, in order: the number of missing values in the hourly-resampled series,
    the number of missing values in the per-record ``hour_average`` column, and the
    dataset itself. Both series are drawn on the current matplotlib axes.

    Parameters
    ----------
    test_dataset : DataFrame with a DatetimeIndex (required by ``resample``) and an
        ``hour_average`` column.
    """
    hour_average = test_dataset.loc[:, 'hour_average']

    # Resample only the plotted column: averaging the whole frame would also try to
    # aggregate text columns such as the date and time, which newer pandas rejects.
    hourly_mean = hour_average.resample('h').mean()

    print(hourly_mean.isna().sum())
    print(hour_average.isna().sum())
    plt.plot(hourly_mean.index, hourly_mean)

    print(test_dataset)
    plt.plot(test_dataset.index, hour_average)

In [None]:
hourly_standard_deviation(DH_plus_dist)

In [None]:
hourly_standard_deviation(LA_plus_dist)

In [None]:
hourly_standard_deviation(Lucy_16_20_May_plus_dist)

### Differences between adjacent datetimes/record interval durations

In [None]:
# Interval between each record and the one before it. The first record has no
# predecessor, so its interval is zero; seeding it outside the loop keeps the result
# the same length as the index even for a single-record frame.
record_index = DH_plus_dist.index
diff = [record_index[0] - record_index[0]] if len(record_index) else []
diff.extend(later - earlier for earlier, later in zip(record_index[:-1], record_index[1:]))
diff = pd.to_timedelta(diff)
DH_plus_dist['interval_durations'] = diff
plt.plot(DH_plus_dist.index,DH_plus_dist.loc[:,'interval_durations'])

In [None]:
# Interval between each record and the one before it. The first record has no
# predecessor, so its interval is zero; seeding it outside the loop keeps the result
# the same length as the index even for a single-record frame.
record_index = LA_plus_dist.index
diff = [record_index[0] - record_index[0]] if len(record_index) else []
diff.extend(later - earlier for earlier, later in zip(record_index[:-1], record_index[1:]))
diff = pd.to_timedelta(diff)
plt.plot(LA_plus_dist.index,diff)

In [None]:
# Interval between each record and the one before it. The first record has no
# predecessor, so its interval is zero; seeding it outside the loop keeps the result
# the same length as the index even for a single-record frame.
record_index = Lucy_16_20_May_plus_dist.index
diff = [record_index[0] - record_index[0]] if len(record_index) else []
diff.extend(later - earlier for earlier, later in zip(record_index[:-1], record_index[1:]))
diff = pd.to_timedelta(diff)
plt.plot(Lucy_16_20_May_plus_dist.index,diff)

In [None]:
sns.displot(data=LA_plus_dist.loc[(LA_plus_dist.distance>12.75) & (LA_plus_dist.distance<13.5),'distance'], kde=True)

In [None]:
LA_plus_dist.loc[(LA_plus_dist.distance>16.75) & (LA_plus_dist.distance<18),'distance']

### Understanding how the BaseIndexer class works and how it is used in inheritance

In [None]:
from pandas.api.indexers import BaseIndexer
class CustomIndexer(BaseIndexer):
    """Forward-looking window: row ``i`` covers rows ``i`` to ``i + window_size - 1``."""

    # pandas >= 1.5 passes ``step``; the default keeps the class usable on older
    # versions, which do not pass it.
    def get_window_bounds(self, num_values, min_periods, center, closed, step=None):
        """Return the ``(start, end)`` int64 position arrays, one entry per row.

        ``end`` is exclusive. ``min_periods``, ``center``, ``closed`` and ``step``
        are accepted because the rolling machinery supplies them, but are not used.
        """
        start = np.arange(num_values, dtype=np.int64)
        # Clamp so windows near the tail never point past the last record.
        end = np.minimum(start + self.window_size, num_values)
        return start, end
df = pd.DataFrame({"values": range(5)})

indexer = CustomIndexer(window_size=2)
df.rolling(window = indexer).sum()

In [None]:
from pandas.api.indexers import BaseIndexer
class CustomIndexer(BaseIndexer):
    """Forward-looking window of three rows, deliberately ignoring ``window_size``.

    The width is hard-coded to show that the bounds returned here, not the
    ``window_size`` given to the constructor, decide what each window contains.
    """

    # pandas >= 1.5 passes ``step``; the default keeps the class usable on older
    # versions, which do not pass it.
    def get_window_bounds(self, num_values, min_periods, center, closed, step=None):
        """Return the ``(start, end)`` int64 position arrays, one entry per row.

        ``end`` is exclusive. ``min_periods``, ``center``, ``closed`` and ``step``
        are accepted because the rolling machinery supplies them, but are not used.
        """
        start = np.arange(num_values, dtype=np.int64)
        # Clamp so windows near the tail never point past the last record.
        end = np.minimum(start + 3, num_values)
        return start, end
df = pd.DataFrame({"values": range(5)})

indexer = CustomIndexer(window_size=2)
df.rolling(window = indexer).sum()

In [None]:
use_expanding = [True, False, True, False, True]

use_expanding

df = pd.DataFrame({"values": range(5)})

df

In [None]:
class CustomIndexer(BaseIndexer):
    """Experiment: forward two-row window that also sets ``index_array`` and prints its state."""

    # pandas >= 1.5 passes ``step``; the default keeps the class usable on older
    # versions, which do not pass it.
    def get_window_bounds(self, num_values, min_periods, center, closed, step=None):
        """Return the ``(start, end)`` int64 position arrays, printing them on the way.

        ``index_array`` is only assigned here, so it keeps the constructor's value
        until the first rolling computation runs. ``end`` is exclusive;
        ``min_periods``, ``center``, ``closed`` and ``step`` are not used.
        """
        self.index_array = pd.DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15']) # self.index
        start = np.arange(num_values, dtype=np.int64)
        # Clamp so windows near the tail never point past the last record.
        end = np.minimum(start + 2, num_values)
        for i in range(num_values):
            print(i)
        print(num_values)
        print(start,end)
        print(self.index_array)
        return start, end

indexer = CustomIndexer()
print(indexer.index_array)

df.rolling(indexer).sum()

In [None]:
from pandas.api.indexers import BaseIndexer
class CustomIndexer(BaseIndexer):
    """Forward-looking window of ``window_size`` rows that prints the bounds it produces."""

    # pandas >= 1.5 passes ``step``; the default keeps the class usable on older
    # versions, which do not pass it.
    def get_window_bounds(self, num_values, min_periods, center, closed, step=None):
        """Return the ``(start, end)`` int64 position arrays, one entry per row.

        ``end`` is exclusive. ``min_periods``, ``center``, ``closed`` and ``step``
        are accepted because the rolling machinery supplies them, but are not used.
        """
        start = np.arange(num_values, dtype=np.int64)
        # Clamp so windows near the tail never point past the last record.
        end = np.minimum(start + self.window_size, num_values)
        print(start,end)
        return start, end
df = pd.DataFrame({"values": range(5)})
indexer = CustomIndexer(window_size=2)
print(df.rolling(indexer).sum())



baseIndexer = BaseIndexer()

print(dir(baseIndexer))

print(baseIndexer.index_array)

In [None]:
from pandas.api.indexers import BaseIndexer
class CustomIndexer(BaseIndexer):
    """Vectorised forward-looking window of ``window_size`` rows that prints its bounds."""

    # pandas >= 1.5 passes ``step``; the default keeps the class usable on older
    # versions, which do not pass it.
    def get_window_bounds(self, num_values, min_periods, center, closed, step=None):
        """Return the ``(start, end)`` int64 position arrays, one entry per row.

        ``end`` is exclusive. ``min_periods``, ``center``, ``closed`` and ``step``
        are accepted because the rolling machinery supplies them, but are not used.
        """
        start = np.arange(num_values, dtype=np.int64)
        # Clamp so windows near the tail never point past the last record.
        end = np.minimum(start + self.window_size, num_values)
        print(num_values)
        print(start, end)
        return start, end
df = pd.DataFrame({"values": range(5)})
indexer = CustomIndexer(window_size=2)
df.rolling(indexer).sum()
print(dir(indexer))
#from pandas.api.indexers import BaseIndexer
#class CustomIndexer(BaseIndexer):
#    def get_window_bounds(self, num_values, min_periods, center, closed):
#        start = pd.DatetimeIndex()
#        end = pd.DatetimeIndex()
#        print(self.index)
#        for i in range(num_values):
#            print(i)
#            start[i] = i
#            end[i] = i
#        print(start,end)
#        return start, end

#df = pd.DataFrame(index = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00",  "3/1/2020 11:00:00+00:00",  "4/1/2020 11:00:00+00:00",  "5/1/2020 11:00:00+00:00"]),data = {"Data" : range(5)})
#indexer = CustomIndexer(window_size=2, index = df.index)
#df.index
#print(df.rolling(indexer).sum())
#print(indexer)
#print(pd.api.types.is_numeric_dtype(df.index))
#print(pd.api.types.is_numeric_dtype(df['Data']))

In [None]:
from pandas.api.indexers import BaseIndexer
class CustomIndexer(BaseIndexer):
    """Centred rolling-window indexer whose windows never cross a recording gap.

    Build it as ``CustomIndexer(dataset=..., window_size=...)``. ``BaseIndexer``
    stores extra keyword arguments as attributes, which this class reads back:

    * ``dataset`` -- frame with a ``lag`` column holding the time elapsed since
      the previous record (``NaT`` for the first record).
    * ``window_size`` -- maximum number of records in a window.
    * ``max_gap`` -- optional; a lag strictly greater than this starts a new
      segment. Defaults to 15 minutes.
    """

    # NOTE: pandas >= 1.5 passes an extra `step` argument to get_window_bounds, and rolling()
    # rejects an indexer whose parameter names differ from BaseIndexer's own. This signature
    # therefore targets pandas < 1.5 (1.4.4 is in use); add `step` when upgrading.
    # All rolling windows are centred regardless of the `center` parameter.
    def get_window_bounds(self, num_values, min_periods, center, closed):
        """Return ``(start, end)`` int64 arrays describing one window per record.

        The records are split into segments at every lag greater than the gap
        threshold. Inside a segment of length ``n`` the window width is
        ``min(window_size, n)`` and the window of record ``k`` is centred on
        ``k``, then clipped to the segment. Bounds follow ``range(a, b)``
        semantics: ``start`` is included, ``end`` is excluded.

        Parameters
        ----------
        num_values : int
            Number of records being rolled over.
        min_periods, center, closed
            Accepted for interface compatibility; not used.

        Returns
        -------
        tuple of numpy.ndarray
            ``(start, end)``, each of length ``num_values``.

        Prints ``num_values``, ``window_size``, the number of segments and the
        number of records covered.
        """
        start = np.empty(num_values, dtype=np.int64)
        end = np.empty(num_values, dtype=np.int64)
        print(num_values)
        print(self.window_size)

        # A record whose lag exceeds the threshold is the first record of a new segment.
        # NaT (the first record's lag) compares False, so it never opens a segment itself.
        max_gap = getattr(self, "max_gap", pd.Timedelta(15, 'm'))
        opens_segment = (self.dataset.lag > max_gap).to_numpy()[:num_values]

        # Segment edges: 0, every gap position, and num_values. np.unique sorts them and
        # removes repeats, so a gap on the very last record still yields its own
        # one-record segment and no zero-length segment is produced.
        edges = np.unique(np.concatenate(([0], np.flatnonzero(opens_segment), [num_values])))
        segments = list(zip(edges[:-1], edges[1:]))
        print("Number of split datasets:",len(segments))

        size = 0
        for seg_start, seg_end in segments:
            seg_len = int(seg_end - seg_start)
            width = min(self.window_size, seg_len)

            # Even width: equal halves, so the centre sits nearer the right edge.
            # Odd width: the right half is one larger because `end` is exclusive.
            # Integer arithmetic avoids round()'s half-to-even behaviour (round(12.5) == 12).
            before = width // 2
            after = width - before

            offsets = np.arange(seg_len)
            # Clip to the segment so a window never reaches into a neighbouring segment
            # or past the end of the data.
            start[seg_start:seg_end] = seg_start + np.maximum(offsets - before, 0)
            end[seg_start:seg_end] = seg_start + np.minimum(offsets + after, seg_len)
            size += seg_len
        print("Number of records:",size)

        return start, end
        

        #for i in range(num_values):
            #print(i)
            # This check makes sure the earliest/lower bound DateTime is <= latest/upper bound DateTime
            # Prevents IndexError by goind out of bounds or a ValueError due to the list traversing to the end of the list as this is how arrays work in Python.
        #    if i - minShift < 0:
        #        start[i] = 0
        #        end[i] = i + maxShift
                #earliest = indexes[0]
                #latest = indexes[i+maxShift]
        #    elif i + maxShift > num_values - 1:
        #        start[i] = i - minShift
        #        end[i] = num_values 
                #earliest = indexes[i-minShift]
                #latest = indexes[indexes.size-1]
        #    else:
        #        start[i] = i - minShift
        #        end[i] = i + maxShift
        #start = pd.DatetimeIndex(start)
        #end = pd.DatetimeIndex(end)
        #print(start)
        #print(self.datetimeIndexes.size)
        #return start, end
    

        #for i in range(num_values):
            #if self.lags[i] <= pd.Timedelta(15,'m') and between_duration == False:
                #start = np.append(start, i)
                #between_duration = True
            #elif self.lags[i] > pd.Timedelta(15,'m') and between_duration == True:
                #end = np.append(end, i)
                #between_duration = False
        #print(start)    
        #return start, end
        
        #for j in range(minShift+1):
        #    if self.lags[i-minShift] > pd.Timedelta(15,'m'):
        #        start[i] = i - j
        #        break
                
        #for k in range(1, maxShift+1):
        #    if self.lags[i+maxShift] > pd.Timedelta(15,'m'):
        #        end[i] = i + k - 1
        #        break
        
        #for i in range(num_values):
            #print(i)

            # This check makes sure the earliest/lower bound DateTime is <= latest/upper bound DateTime
            # Prevents IndexError by goind out of bounds or a ValueError due to the list traversing to the end of the list as this is how arrays work in Python.
        #    if i - minShift < 0:
        #        for j in range(i):
        #            if self.lags[j] > pd.Timedelta(15,'m'):
        #                start[i] = i - j
        #                break
                #start[i] = 0
        #        for k in range(1, maxShift+1):
        #            if self.lags[k] > pd.Timedelta(15,'m'):
        #                end[i] = i + k - 1
        #                break
                #end[i] = i + maxShift

        #    elif i + maxShift > num_values - 1:
        #        for j in range(minShift+1):
        #            if self.lags[j] > pd.Timedelta(15,'m'):
        #                start[i] = i - j
        #                break
                #start[i] = i - minShift
        #        for k in range(1, num_values - i + 1):
        #            if self.lags[k] > pd.Timedelta(15,'m'):
        #                end[i] = i + k - 1
        #                break
                #end[i] = num_values
                
        #    else:
        #        for j in range(minShift+1):
        #            if self.lags[j] > pd.Timedelta(15,'m'):
        #                start[i] = i - j
        #                break
                #start[i] = i - minShift
        #        for k in range(1, maxShift+1):
        #            if self.lags[k] > pd.Timedelta(15,'m'):
        #                end[i] = i + k - 1
        #                break
                #end[i] = i + maxShift

In [None]:
def custom_term(dataset_path):
    """Load a distance CSV, engineer gap-aware rolling features, and plot them.

    Steps, in order:

    1. Read the CSV, index it by ``sample_date`` + ``sample_time`` (day-first)
       and sort chronologically.
    2. Drop duplicate timestamps (via ``remove_duplicate_datetimes``) and
       records whose ``distance`` is not positive.
    3. Compute ``lag`` (time since the previous record) and, per section
       between gaps longer than 15 minutes, the usable ``window_size``.
    4. Add a 25-record rolling mean and standard deviation computed with
       ``CustomIndexer``, plus the difference of a plain centred rolling mean.
    5. Flag records whose step in ``distance`` is at most 1 and record the
       time between consecutive records that fail that condition.
    6. Print summaries along the way and call
       ``relative_range_time_series_plot`` on the result.

    Parameters
    ----------
    dataset_path : str
        Path of a CSV with ``sample_date``, ``sample_time`` and ``distance`` columns.

    Returns
    -------
    None

    ``remove_duplicate_datetimes``, ``relative_range_time_series_plot`` and
    ``CustomIndexer`` are expected to be defined elsewhere in the notebook.
    """
    window_size = 25
    max_gap = pd.Timedelta(15, "m")

    dataset = pd.read_csv(dataset_path)
    print("Dataset before feature engineering:\n",dataset)
    # DateTime indexes
    dataset = dataset.set_index(pd.to_datetime(dataset['sample_date']+" "+dataset['sample_time'], dayfirst=True), drop = False)
    # Makes sure records are in chronological order
    dataset = dataset.sort_index()
    print(dataset.describe())
    print("Number of missing values:\n",dataset.isna().sum())

    print("Number of duplicate records removed:\n",dataset.index.duplicated().sum())
    dataset = remove_duplicate_datetimes(dataset)

    zero_distance_count = dataset.loc[dataset.distance == 0,'distance'].count()
    print("Number of zero distance records removed:\n",zero_distance_count)
    # Copy so the column assignments below write to an independent frame, not to a
    # filtered view (which triggers SettingWithCopyWarning).
    dataset = dataset.loc[dataset.distance > 0,:].copy()

    # Time elapsed since the previous record; NaT for the first record.
    dataset['lag'] = dataset.index.to_series().diff()
    starts_new_section = dataset['lag'] > max_gap
    print("Number of lags greater than 15 minutes:", dataset[starts_new_section].count())

    # Records that open a section: the first record plus every record that follows a long gap.
    sections = dataset[starts_new_section]
    print("If the first rows are identical",sections.head(1).equals(dataset.head(1)))
    print(sections)
    print(dataset)
    sections = pd.concat([dataset.head(1), sections]).drop_duplicates()
    print(sections)
    print("Number of sections:",len(sections))
    print("Head:",dataset.first_valid_index())

    # Number every record with the section it belongs to, then cap that section's length at
    # the window size. Counting by section id avoids label slices such as .loc[a:b], which
    # include both end labels and would count each section one record too long.
    section_id = starts_new_section.cumsum()
    section_length = section_id.map(section_id.value_counts())
    dataset['window_size'] = section_length.clip(upper=window_size)
    print("Number of records less than window size 25:",len(dataset[dataset.window_size<window_size]))

    custom_indexer = CustomIndexer(dataset = dataset, window_size = window_size)
    print(dir(custom_indexer))
    gap_aware_windows = dataset.loc[:,'distance'].rolling(window=custom_indexer)
    twenty_five_record_mean = gap_aware_windows.mean()
    twenty_five_record_standard_deviation = gap_aware_windows.std()
    dataset['twenty_five_record_mean'] = twenty_five_record_mean
    dataset['twenty_five_record_standard_deviation'] = twenty_five_record_standard_deviation
    # NOTE(review): this uses a plain centred window that spans recording gaps, unlike the
    # mean and standard deviation above -- confirm whether that is intended.
    twenty_five_record_difference = dataset.loc[:,'distance'].rolling(window_size,center=True).mean().diff()
    dataset['twenty_five_record_difference'] = twenty_five_record_difference

    # Filtering condition. Alternatives explored previously (kept for reference):
    #   (std25 < 0.1) | (distance.diff().abs() < 0.25)
    #       Stationary; the second term stops the ends of passive phases being misclassified.
    #       Counting consecutive records that satisfy it could then decide whether a section
    #       is in an active or passive phase.
    #   ((distance < mean25 + 1) & (distance > mean25 - 1)) | (distance.diff().abs() < 0.25)
    #       Finds noise.
    #   distance.diff().abs() < 0.25
    #       Useful for retaining records at the ends of stationary phases.
    #   std25 <= distance.std()
    #       Might also support detecting transitions between active and passive states.
    #   distance.diff().abs() < 0.1
    #       Alternative method to determine passive periods.
    #   (distance.diff().abs() <= 0.25) | (std25 <= 0.25)
    #   index - shifted_datetime <= 15 minutes
    #       Restart the rolling after long gaps, which can skew rolling-window calculations.
    dataset['satisfied_filtering_condition'] = (dataset["distance"].diff().abs() <= 1)

    # Time between consecutive records that fail the condition.
    subset = dataset.loc[~dataset['satisfied_filtering_condition'],:].index.to_frame(name='false_condition_interval')
    subset['false_condition_interval'] = subset['false_condition_interval'].diff()
    print(subset)

    # Timestamp of the previous record, then attach the interval column (NaN where the
    # condition was satisfied).
    dataset['shifted_datetime'] = dataset.index.to_series().shift(1)
    dataset = pd.concat([dataset,subset], axis=1)

    print("Total number of records meeting filtering condition\n:",dataset.loc[:,'satisfied_filtering_condition'].value_counts())

    print("Dataset after feature engineering:\n",dataset)
    print(dataset.describe())

    relative_range_time_series_plot(dataset,'twenty_five_record_mean','twenty_five_record_standard_deviation','twenty_five_record_difference')

In [None]:
# Run the feature-engineering and plotting pipeline on the DH dataset.
custom_term("DH_plus_dist_interpolated.csv")

In [None]:
# Run the feature-engineering and plotting pipeline on the LA dataset.
custom_term("LA_plus_dist_interpolated.csv")

In [None]:
# Run the feature-engineering and plotting pipeline on the Lucy 16-20 May dataset.
custom_term("Lucy_16-20May_plus_dist_interpolated.csv")

In [None]:
# Report the versions of the main libraries, one line per library.
for _library_label, _library in (("Pandas", pd), ("Matplotlib", mpl), ("Numpy", np), ("Seaborn", sns)):
    print(f"{_library_label} version:", _library.__version__)

In [None]:
# The strings are day/month/year. State that explicitly rather than relying on the
# parser's fallback when it meets "13/12/2024"; this also matches the dayfirst=True
# used when the datasets are indexed.
abc = pd.DatetimeIndex(["12/12/2024", "13/12/2024", "14/12/2024"], dayfirst=True)

In [None]:
# Seven-row frame with a single column 'B' holding 0..6, used by the rolling examples.
example = pd.DataFrame(data={'B': range(7)})

In [None]:
# Reference output: built-in centred rolling mean over 3 records.
example.rolling(window=3, center=True).mean()

In [None]:
# Reference output: built-in centred rolling mean over all 7 records.
example.rolling(window=7, center=True).mean()