In [1]:
import pandas as pd
import os
#import pyproj as pj # for reliable gps
# or from pyproj import Geod (and remove the pj when executing the functionality)
from pyproj import Geod
import numpy as np # for reliable gps
from collections import Counter # for reliable gps
import datetime as dt # for reliable gps

In [2]:
class TurtleData:
    """Commom base class for all turtle's data """
    
    C1 = 'Acquisition Time'
    C2 ='Acquisition Start Time'
    C3 ='Iridium CEP Radius'
    C4 ='Iridium Latitude'
    C5 ='Iridium Longitude'
    C6 ='GPS Fix Time'
    C7 ='GPS Fix Attempt'
    C8 ='GPS Latitude'
    C9 ='GPS Longitude'
    C10 ='GPS UTM Zone'
    C11 ='GPS UTM Northing'
    C12 ='GPS UTM Easting'
    C13 ='GPS Altitude'
    C14 ='GPS Horizontal Error'
    C15 ='GPS Horizontal Dilution'
    C16 ='GPS Satellite Bitmap'
    C17 ='GPS Satellite Count'
    C18 ='Underwater Percentage'
    C19 ='Dive Count'
    C20 ='Average Dive Duration'
    C21 ='Dive Duration Standard Deviation'
    C22 ='Maximum Dive Duration'
    C23 ='Maximum Dive Depth'
    C24 ='Duration Limit 1 Dive Count'
    C25 ='Duration Limit 2 Dive Count'
    C26 ='Duration Limit 3 Dive Count'
    C27 ='Duration Limit 4 Dive Count'
    C28 ='Duration Limit 5 Dive Count'
    C29 ='Duration Limit 6 Dive Count'
    C30 ='Layer 1 Percentage'
    C31 ='Layer 2 Percentage'
    C32 ='Layer 3 Percentage'
    C33 ='Layer 4 Percentage'
    C34 ='Layer 5 Percentage'
    C35 ='Layer 6 Percentage'
    C36 ='Layer 7 Percentage'
    C37 ='Layer 8 Percentage'
    C38 ='Layer 9 Percentage'
    C39 ='Layer 10 Percentage'
    C40 ='Layer 1 Dive Count'
    C41 ='Layer 2 Dive Count'
    C42 ='Layer 3 Dive Count'
    C43 ='Layer 4 Dive Count'
    C44 ='Layer 5 Dive Count'
    C45 ='Layer 6 Dive Count'
    C46 ='Layer 7 Dive Count'
    C47 ='Layer 8 Dive Count'
    C48 ='Layer 9 Dive Count'
    C49 ='Layer 10 Dive Count'
    C50 ='Temperature'
    C51 ='Satellite Uplink'
    C52 ='Receive Time'
    C53 ='Repetition Count'
    C54 ='Low Voltage'
    C55 ='Mortality'
    C56 ='Saltwater Failsafe'
    C57 ='Iridium Command'
    C58 ='Schedule Set'
    C59 ='Diagnostic Dive Data'
    C60 ='Predeployment Data'
    C61 ='Error'
    col_names = list([
        C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, 
        C11, C12, C13, C14, C15, C16, C17, C18, C19, C20, 
        C21, C22, C23, C24, C25, C26, C27, C28, C29, C30, 
        C31, C32, C33, C34, C35, C36, C37, C38, C39, C40, 
        C41, C42, C43, C44, C45, C46, C47, C48, C49, C50, 
        C51, C52, C53, C54, C55, C56, C57, C58, C59, C60, 
        C61
    ])
    gps_col_names = list([
        C1, C2, C6, C7, C8, C9
    ])
    ID_ALLGPSDF_COLUMN_NAME = "All GPS's Track ID"

    @staticmethod
    def basedNamesForCsv(lastEntryRowDF, selfDfNameString, selfTurtleTag):
        for value in enumerate(lastEntryRowDF):
            #print(value[1][0])
            lastDate = value[1][0]
            date = dt.datetime.strptime(lastDate, "%Y.%m.%d")
            stringDate = date.strftime("%Y") + "_" + date.strftime("%b")
            print(f"The Last Entry in the Dataframe for {selfTurtleTag} is from: ")
            print(stringDate)
            # Give the CSV a Name based on this values above
            # name = allGpsDf_tag_xxxxx_until_lastdate
            cvsName = selfDfNameString + "_Tag_" + selfTurtleTag + "_" + stringDate +".csv"
            print(f"The Name of the {selfDfNameString} CSV for the turtleData {selfTurtleTag} is: ")
            print(cvsName)
            print('--------------')
            return cvsName 

    @staticmethod
    def calculateDistance(geodRef, lon1, lat1, lon2, lat2):
        # # compute forward and back azimuths, plus distance
        az12,az21,dist = geodRef.inv(lon1, lat1, lon2, lat2) #Take the second row and the first row on the count. it shoul give 3 values, but I only need the dist.
        # f"{az12:.3f} {az21:.3f} {dist:.3f}"
        return dist #Put the dist inside the distances variable once empty.
    
    @staticmethod
    def convertUnixTimeFromString(timeString):
        return dt.datetime.strptime(timeString, '%Y.%m.%d %H:%M:%S').timestamp() #[i] is the position in an array
    
    @staticmethod
    def calculateSpeed(d, t1, t2):
        speed = d / (t2 - t1)
        return speed

    def __init__(self, tag):
        self.turtleTag = tag
        self.df = pd.DataFrame()
        self.allGpsDf = pd.DataFrame()
        self.allGpsDfCsvName = ""
        #self.allGpsDf2019 = pd.DataFrame()
        self.allCleanedGpsDf = pd.DataFrame()
        self.allCleanedGpsDfCsvName = ""
        self.reliableGpsDf = pd.DataFrame()
    #def addElement(self, row, header):
        #self.__dict__= dict(zip(header, row))

    def addDataFromCsv(self, filename):
        temporaryDf = pd.read_csv(filename, skiprows=23, names=TurtleData.col_names)
        
        #print(f' ITS CURRENT DF IS: {self.df}') 
        #print('--------------')
        #print(f' ITS TEMPORARY DF IS {temporaryDf}') 
        
        self.df = self.df.append(temporaryDf, ignore_index=True)
        self.df.sort_values("Acquisition Time", inplace = True)

    def getTag(self):
        return self.turtleTag

    def getDf(self):        
        return self.df
    
    def giveAllGpsDf(self):
        # see all the columns in the df
        #print(self.df.columns)
        # see one column at a time        
        self.allGpsDf = self.df.copy()
        print(TurtleData.gps_col_names)
        tempList = TurtleData.gps_col_names.copy()
        for c in self.allGpsDf.columns:
            print(c)            
            if c not in tempList:
                self.allGpsDf.drop(c, inplace=True, axis=1)
            else:
                tempList.remove(c)        

        if tempList:
            print("Colummn Data missing in!")
        else:
            print("The dataframe contains all the GPS columns")
        
        print('-----DF with NaN values ---------')
        print(self.allGpsDf)       
        
        #### Eliminate those GPS's null (NaN) rows from the dataframe
        self.allGpsDf.drop(self.allGpsDf[~self.allGpsDf['GPS Latitude'].notna()].index, inplace=True)
        self.allGpsDf.reset_index(drop=True, inplace=True) # reset index

        print('-----SAME DF without NaN values ---------')
        print(self.allGpsDf)

        ####Create a column for id GPS points to the left
        trackId = self.allGpsDf.index + 1
        self.allGpsDf.insert(0, TurtleData.ID_ALLGPSDF_COLUMN_NAME, trackId)
        
        print(self.allGpsDf)        
        print(' End of all GPS Df ^')
        print('--------------')    
    
    def generateAllGpsDfCsvName(self):
        # Last entry:
        lastEntry = self.allGpsDf['Acquisition Time'].tail(1)
        #print(lastEntry)
        # separing date from time in that column
        lastEntry = pd.Series([[y for y in x.split()] for x in lastEntry])
        #print(lastEntry)
        # assign the Name in the Class Variable
        self.allGpsDfCsvName = TurtleData.basedNamesForCsv(lastEntry, "allGpsDf", self.turtleTag)        
    
    def saveAllGpsData(self, pathToFilePlusCsvName):
        self.allGpsDf.to_csv(pathToFilePlusCsvName, index=False)
    
    def giveAllCleanedGpsDf(self):
        # without 2019 date and without duplicate rows
        precedentYearRowsTemporaryDf = self.allGpsDf.copy()
        print(f"Before cleaning, the AllGpsDf called: {self.allGpsDfCsvName}, contained {len(precedentYearRowsTemporaryDf.index)} rows")
        
        ##### ---------- This Part is not needed ---------- #####
        # Remove 2019 data from 'Acquisition Time column:
        dateColumn = precedentYearRowsTemporaryDf['Acquisition Time']
        #print(dateColumn)
        # separing date from time in that column
        dateColumn = pd.Series([[y for y in x.split()] for x in dateColumn])
        #print(dateColumn)
        all2019DateData = []
        allOtherDateData = []        
        for value in enumerate(dateColumn):
            #print(value[1][0])
            # assign the date rows to the variable dateData
            dateData = value[1][0]
            # removing 2019 from the list
            if dateData.startswith('2019'):
                #print(f"dateData that starts with 2019 = {dateData}")
                # append the 2019 data to the list
                all2019DateData.append(dateData)
            else:
                #print(f"dateData that do not starts with 2019 = {dateData}")
                # append any other data to this list
                allOtherDateData.append(dateData)
        #print(f" 2019 list = {all2019DateData}")
        #print(f" 2020/2021 list = {allOtherDateData}")
        ##### ---------- END OF the Part not needed ---------- #####

        #### Eliminate those 2019 data rows from the dataframe
        ### example: df = df[~df['c'].astype(str).str.startswith('1')]
        print(f"Removing 2019 data from the {self.allGpsDfCsvName}")
        precedentYearRowsTemporaryDf.drop(precedentYearRowsTemporaryDf[precedentYearRowsTemporaryDf['Acquisition Time'].astype(str).str.startswith('2019')].index, inplace=True)
        precedentYearRowsTemporaryDf.reset_index(drop=True, inplace=True) # reset index
        #print(precedentYearRowsTemporaryDf)
        print(f"After removing 2019 data, the AllGpsDf called: {self.allGpsDfCsvName}, contained {len(precedentYearRowsTemporaryDf.index)} rows")

        ### Eliminate duplicate rows
        # Select duplicate rows except first occurrence based on all columns
        ## example of Selection by Position, to see example duplicated rows ----------------------------------
        ## df.iloc[row_indexer,column_indexer]
        print('--------------')
        duplicateRowsTemporaryDf = precedentYearRowsTemporaryDf
        duplicateRowsTemporaryDf = duplicateRowsTemporaryDf.drop_duplicates(
            [
                'Acquisition Time','Acquisition Start Time', 'GPS Fix Time', 'GPS Fix Attempt', 'GPS Latitude', 'GPS Longitude'
            ], keep='first'
        )
        print(duplicateRowsTemporaryDf)
        print(duplicateRowsTemporaryDf.iloc[13:19,1])
        print(f"Without duplicated rows, the dataframe has now {len(duplicateRowsTemporaryDf.index)} rows")
        print("The df without duplicated rows is the duplicateRowsTemporaryDf")
        print('--------------')
        print("Saving this temporary df into the allCleanedGpsDf...")
        self.allCleanedGpsDf = self.allCleanedGpsDf.append(duplicateRowsTemporaryDf, ignore_index=True)
        print(self.allCleanedGpsDf)
        print(self.allCleanedGpsDf.iloc[13:19,1])
        print("The df without duplicated rows is now the self.allCleanedGpsDf")
        print('------- END -------')

    def generateAllCleanedGpsDfCsvName(self):
        # Last entry:
        lastEntry = self.allCleanedGpsDf['Acquisition Time'].tail(1)
        #print(lastEntry)
        # separing date from time in that column
        lastEntry = pd.Series([[y for y in x.split()] for x in lastEntry])
        #print(lastEntry)
        # assign the Name in the Class Variable
        self.allCleanedGpsDfCsvName = TurtleData.basedNamesForCsv(lastEntry, "allCleanedGpsDf", self.turtleTag)
    
    def saveAllCleanedGpsData(self, pathToFilePlusCsvName):
        self.allCleanedGpsDf.to_csv(pathToFilePlusCsvName, index=False)

    def giveReliableGpsDf(self):
        '''
        Remove GPS Errors by Angular velocity/Rotational speed 
        (degree per second)
        Geod Object for Calculations is used as objec to calculate 
        distances between points expressed in lat/lon (in degree)
        Choosing a Reference Ellipsoid - distance in degree more 
        accurate than a spherical method
        '''
        gpsErrorsTemporaryDf = self.allCleanedGpsDf.copy()
        #print(gpsErrorsTemporaryDf)
        wgs84_geod = Geod(ellps='WGS84')
        ## Converting data to a NumPy array.        
        latitudes = gpsErrorsTemporaryDf[['GPS Latitude']].to_numpy() 
        longitudes = gpsErrorsTemporaryDf[['GPS Longitude']].to_numpy()
        acquisitionTimes = gpsErrorsTemporaryDf[['Acquisition Time']].to_numpy()
        
        #latitudes = gpsErrorsTemporaryDf['GPS Latitude'].reset_index().values
        #longitudes = gpsErrorsTemporaryDf['GPS Longitude'].reset_index().values
        ##acquisitionTimes = gpsErrorsTemporaryDf[['Acquisition Time']].reset_index().values
        #acquisitionTimes = gpsErrorsTemporaryDf[['Acquisition Time']].to_numpy()        
        
        #print(latitudes.dtype)
        #print(longitudes.dtype)
        #print(acquisitionTimes.dtype)
        
        #print(latitudes)
        #print(longitudes)
        #print(acquisitionTimes)

        distances = []
        tripTimes = []
        speeds = []
        remSpeeds = []
        pointsToRemove = []        
        
        distances.append(0)
        tripTimes.append(0)
        speeds.append(0)

        i=1
        while i < (len(latitudes)):
            foundS = False
            previous = i-1
            D = 0
            S = 100
            while (S > 1.111) and (i < len(latitudes)):
                D = TurtleData.calculateDistance(wgs84_geod, longitudes[previous], latitudes[previous], longitudes[i], latitudes[i])
                t1 = TurtleData.convertUnixTimeFromString(acquisitionTimes[previous,0])
                t2 = TurtleData.convertUnixTimeFromString(acquisitionTimes[i,0])
                S = TurtleData.calculateSpeed(D,t1,t2)
                #print(f" D = {D}")
                #print('dist: %.3f' % D)
                #print(f" S = {S}")
                #print('S: %.3f' % S)
                if(S > 1.111):
                    remSpeeds.append(S)
                    #print(f"remSpeeds List: {remSpeeds}")                    
                    pointsToRemove.append(acquisitionTimes[i,0])
                    #print(pointsToRemove)
                    i+=1
                else:
                    foundS = True
            if(foundS):
                distances.append(D)
                tripTimes.append(t2-t1)
                speeds.append(S)
            i+=1
        print("Length of pointsToRemove List: ")
        print(len(pointsToRemove))
        print(f"remSpeeds List: {remSpeeds}")
        print(f"pointsToRemove List: {pointsToRemove}")
        #---------
        #print(self.turtleTag)
        #print(remSpeeds)
        #print(pointsToRemove)        
        
        #cond = gpsErrorsTemporaryDf['Acquisition Time'].isin(pointsToRemove)
        ##removedPointsRowDf = gpsErrorsTemporaryDf[gpsErrorsTemporaryDf['Acquisition Time'].isin(pointsToRemove)].index, inplace=True)
        ##removedPointsRowDf.reset_index(drop=True, inplace=True) # reset index
        ##removedPointsRowDf['Speeds > 1,11111 m/s'] = remSpeeds
        #removedPointsRowDf = gpsErrorsTemporaryDf.drop(gpsErrorsTemporaryDf[cond].index, inplace = True)
        #gpsErrorsTemporaryDf.drop(gpsErrorsTemporaryDf[cond].index, inplace = True)
        #gpsErrorsTemporaryDf.reset_index(drop=True, inplace=True)
         
        #gpsErrorsTemporaryDf['Length (m)'] = distances
        #gpsErrorsTemporaryDf['Length (m)'] = gpsErrorsTemporaryDf['Length (m)'].str[0] #remove the brackets of the values in the column
        #gpsErrorsTemporaryDf['Time (s)'] = tripTimes
        #gpsErrorsTemporaryDf['Speed m/s'] = speeds
        #gpsErrorsTemporaryDf['Speed m/s'] = gpsErrorsTemporaryDf['Speed m/s'].str[0] #remove the brackets of the values in the column	
        #print(gpsErrorsTemporaryDf.dtypes)
        #gpsErrorsTemporaryDf['Time (h)'] = pd.to_timedelta(gpsErrorsTemporaryDf['Time (s)'], unit='s') # Add a Column with the Time passed from on Point to another in hours
         
        ##removedPointsRowDf.loc[:, ('Speeds > 1,11111 m/s')] = remSpeeds        
        #print(removedPointsRowDf)
        print('--------------')
        #print(gpsErrorsTemporaryDf)
        #print('--------------')
        
        # Saving Removed Points in another dataframe removedPointsRowDf and dropping those out of the gpsErrorsTemporaryDf
        #removedPointsRowDf = gpsErrorsTemporaryDf[gpsErrorsTemporaryDf['Acquisition Time'].isin(pointsToRemove)]
        #removedPointsRowDf.loc[:,'Speeds > 1,11111 m/s'] = remSpeeds
        ## reseting index
        #removedPointsRowDf.reset_index(drop=True, inplace=True) 
        #removedGPSPoints = removedPointsRowDf.index + 1 
        ## Creating a Column for ID Removed Track Points on the Left
        #removedPointsRowDf.insert(0, 'Removed GPS by Speed', removedGPSPoints) 
        ## Saving the amount of removed points data
        #qtyremovedGPSpointsSept = len(removedPointsRowDf.index) 
        #print(f'QTY OF REMOVED POINTS: {qtyremovedGPSpointsSept}')
        #print(removedPointsRowDf)
        
        ## Add the list values as New Columns of the DataFrame
        #gpsErrorsTemporaryDf.reset_index(drop=True, inplace=True) 
        #gpsErrorsTemporaryDf['Length (m)'] = distances
        #gpsErrorsTemporaryDf['Length (m)'] = gpsErrorsTemporaryDf['Length (m)'].str[0] #remove the brackets of the values in the column
        #gpsErrorsTemporaryDf['Time (s)'] = tripTimes
        #gpsErrorsTemporaryDf['Speed m/s'] = speeds
        #gpsErrorsTemporaryDf['Speed m/s'] = gpsErrorsTemporaryDf['Speed m/s'].str[0] #remove the brackets of the values in the column	
        #print(gpsErrorsTemporaryDf.dtypes)
        #gpsErrorsTemporaryDf['Time (h)'] = pd.to_timedelta(gpsErrorsTemporaryDf['Time (s)'], unit='s') # Add a Column with the Time passed from on Point to another in hours
        #print('--------------')
        #print(gpsErrorsTemporaryDf)
        #print('--------------')

In [3]:
# To run with terminal OR jupyter notebook:
ASSETS_FOLDER = "assets"
ASSETS_FOLDER_ITENS = os.listdir(ASSETS_FOLDER)# ("assets")

DATACLEANINGRESULTS_FOLDER = "dataCleaningResults"
DATACLEANINGRESULTS_FOLDER_ITENS = os.listdir(DATACLEANINGRESULTS_FOLDER)# ("data_analysis/dataCleaningResults")

In [4]:
TAG_TURTLE_1 = '710333A'
TAG_TURTLE_2 = '710348A'

INITIAL_TAG_DIGITS = '7103'

# Replace spaces in filenames with underlines
def replace_space_with_underline(file_name):
    return file_name.replace(" ", "_")

# Convert excel files into csv
def converting_excel_file_into_csv_file(folder_obj, file):        
    # read excel   
    df_xlsx = pd.read_excel(os.path.join(folder_obj, file))
    # change file format
    file_in_csv = file.replace(".xlsx", ".csv")
    # transform excel to csv file with path to store the CSV file
    df_xlsx.to_csv(os.path.join(folder_obj, file_in_csv), index = False)        

# Check if some excel file has not been converted into csv yet
def check_for_excel_files():
    all_my_files = []
    n = 0
    for file in ASSETS_FOLDER_ITENS:
        # put all the file names in the same format
        file = replace_space_with_underline(file).lower()
        all_my_files.append(file)
    
    # Create a copy of list
    for file in all_my_files[:]:
        if file.endswith('.xlsx'):
            print('- Excel file = ' + file)
            file_name = file.split('.', 1)[0] # remove everything (the format) after the dot
            # remove the excel file from my all_my_files list
            all_my_files.remove(file)            
            # check if another file with the same name in the folder exists
            if any(file_name in word for word in all_my_files):            
                print(f"-- Excellent! We've already converted the excel file \'{file_name}\' into csv file")
            else:
                print(f'-- Oh No! The excel file \'{file_name}\' has been not converted. Converting it into csv file...')
                # Call function "Convert excel files into csv"
                converting_excel_file_into_csv_file(ASSETS_FOLDER, file)
                file_in_csv = file.replace(".xlsx", ".csv") 
                all_my_files.append(file_in_csv)
                print('---> ' + file_in_csv + ' has been created!')
                
    # Updated all_my_files List
    print('--- CSV files in the assets folder: ', all_my_files)

def getTurtlesData():
    split_char = '_'
    csvs = []        
    turtlesData = []
    #turtleDfs = []
    for file in ASSETS_FOLDER_ITENS:
        if file.endswith('.csv'):
            # put all the file names in the same format
            csv_string_filename = replace_space_with_underline(file).lower()
            filename_splitted = csv_string_filename.split(split_char)                        
            for word in filename_splitted:
                if word.startswith(INITIAL_TAG_DIGITS):
                    csvs.append(file)
                    currentFileCsv = ASSETS_FOLDER + '\\' + file
                    print('--------------')
                    print("Found TAG ("+ word +") in filename , check if tag is already associated with an object...")

                    #--------------------
                                
                    foundTurtleData = None
                    # check inside the list if the turtle has already been created with that tag (word)
                    for obj in turtlesData:
                        if obj.getTag() == word:
                            foundTurtleData = obj
                            break    
                    #--------------------    
                                    
                    if foundTurtleData == None:
                        print("Instance for TAG ("+ word +") NOT found! Creating a new instance...")
                        # create a TurtleData obj with the turtle tag
                        foundTurtleData = TurtleData(word)
                        turtlesData.append(foundTurtleData)
                        print("Instance for TAG ("+ word +") CREATED!")
                    else:
                        print("Instance for TAG ("+ word +") ALREADY EXISTS, skipping object creation!")
                        print('--------------')

                    # for the instances turtleData objs in the list (for each turtle tag):
                    foundTurtleData.addDataFromCsv(currentFileCsv)                    

    return turtlesData

def checkInstancesAndItsDfs(turtlesData):
    print('Created instances for Obj turtleData: ')
    for turtleData in turtlesData:        
        print(turtleData.getTag())
    print('--------------')
    print('Created Dataframes: ')
    i = 0
    for turtleData in turtlesData: 
        print(f'turtlesData[{i}].df')
        print(turtleData.turtleTag)
        print(turtleData.df)
        print('--------------')
        i+=1

def getAllGpsDataframes(turtlesData):
    for turtleData in turtlesData:
        turtleData.giveAllGpsDf()

def displayAllGpsDf(turtlesData):
    i = 0
    for turtleData in turtlesData:
        print(f'turtlesData[{i}].allGpsDf')
        print(turtleData.turtleTag)
        print(turtleData.allGpsDf)

def createAllGpsDfCsvNameForEachInstance(turtlesData):
    # create a AllGpsDf's name for each turtleData
    for turtleData in turtlesData:
        turtleData.generateAllGpsDfCsvName()

def checkIfAllGpsDfHasBeenSaved(turtlesData):
    filesInResultsFolder = []    
    
    for file in DATACLEANINGRESULTS_FOLDER_ITENS:
        filesInResultsFolder.append(file)    
    print(filesInResultsFolder)

    for turtleData in turtlesData:
        if not filesInResultsFolder:
            print(f"The filename {turtleData.allGpsDfCsvName} is not yet in the folder... saving csv")
            pathToFilePlusCsvName = os.path.join(DATACLEANINGRESULTS_FOLDER, turtleData.allGpsDfCsvName)
            turtleData.saveAllGpsData(pathToFilePlusCsvName)
            print(f"{turtleData.allGpsDfCsvName} has been saved in the results folder!")
            #append file in list

        elif turtleData.allGpsDfCsvName in filesInResultsFolder:
            print(f"The CSV {turtleData.allGpsDfCsvName} has already been saved in the results folder")
        else:
            print(f"The filename {turtleData.allGpsDfCsvName} is not yet in the folder... saving csv")
            pathToFilePlusCsvName = os.path.join(DATACLEANINGRESULTS_FOLDER, turtleData.allGpsDfCsvName)
            turtleData.saveAllGpsData(pathToFilePlusCsvName)
    
def getAllCleanedGpsDataframes(turtlesData):
    for turtleData in turtlesData:
        turtleData.giveAllCleanedGpsDf()

def createAllCleanedGpsDfCsvNameForEachInstance(turtlesData):
    # create a allCleanedGpsDf's name for each turtleData
    for turtleData in turtlesData:
        turtleData.generateAllCleanedGpsDfCsvName()

def checkIfAllCleanedGpsDfHasBeenSaved(turtlesData):
    filesInResultsFolder = []    
    
    for file in DATACLEANINGRESULTS_FOLDER_ITENS:
        filesInResultsFolder.append(file)    
    print(filesInResultsFolder)

    for turtleData in turtlesData:
        if not filesInResultsFolder:
            ## Saving AllGpsDf Data 
            #print(f"The filename {turtleData.allGpsDfCsvName} is not yet in the folder... saving csv")
            #pathToFilePlusCsvName = os.path.join(DATACLEANINGRESULTS_FOLDER, turtleData.allGpsDfCsvName)                       
            #turtleData.saveAllGpsData(pathToFilePlusCsvName)
            #print(f"{turtleData.allGpsDfCsvName} has been saved in the results folder!")            
            #print('--------------')
            ## Saving AllCleanedGps Data
            print(f"The filename {turtleData.allCleanedGpsDfCsvName} is not yet in the folder... saving csv")
            pathToFilePlusCsvName = os.path.join(DATACLEANINGRESULTS_FOLDER, turtleData.allCleanedGpsDfCsvName)
            turtleData.saveAllCleanedGpsData(pathToFilePlusCsvName)
            print(f"{turtleData.allCleanedGpsDfCsvName} has been saved in the results folder!")

        #elif turtleData.allGpsDfCsvName in filesInResultsFolder:
            #print(f"The CSV {turtleData.allGpsDfCsvName} has already been saved in the results folder")
        elif turtleData.allCleanedGpsDfCsvName in filesInResultsFolder:
            print(f"The CSV {turtleData.allCleanedGpsDfCsvName} has already been saved in the results folder")
        else:            
            ## Saving AllGpsDf Data
            #print(f"The filename {turtleData.allGpsDfCsvName} is not yet in the folder... saving csv")
            #pathToFilePlusCsvName = os.path.join(DATACLEANINGRESULTS_FOLDER, turtleData.allGpsDfCsvName)
            #turtleData.saveAllGpsData(pathToFilePlusCsvName)
            #print(f"{turtleData.allGpsDfCsvName} has been saved in the results folder!")              
            #print('--------------')
            ## Saving AllCleanedGps Data
            print(f"The filename {turtleData.allCleanedGpsDfCsvName} is not yet in the folder... saving csv")
            pathToFilePlusCsvName = os.path.join(DATACLEANINGRESULTS_FOLDER, turtleData.allCleanedGpsDfCsvName)
            turtleData.saveAllCleanedGpsData(pathToFilePlusCsvName)
            print(f"{turtleData.allCleanedGpsDfCsvName} has been saved in the results folder!")
        #print(filesInResultsFolder)
        print('--------------')

        # THIS FUNCTION ABOVE IS THE SAME FUNCTION TO SAVE THE ALL GPS DF, TRY TO DO ONLY ONE FUNCTION TO BOTH,
        # AND ALSO TRY TO MAKE THIS ONE FUNCTION TO WAIT UNTIL THE CLEANING HAS BEEN MADE TO THEN SAVE THE
        # ALL CLEANED GPS DF

In [5]:
def getReliableGpsDataframes(turtlesData):
    for turtleData in turtlesData:
        turtleData.giveReliableGpsDf()

In [6]:
# Check if some excel file has not been converted into csv yet
check_for_excel_files()
turtlesData = getTurtlesData()

# see instances for Obj turtleData created and its dfs
checkInstancesAndItsDfs(turtlesData)
#turtlesData[0].df
#turtlesData[1].df

# build dfs of all gps
getAllGpsDataframes(turtlesData)

# see dfs of all gps
displayAllGpsDf(turtlesData)
# or
#turtlesData[0].allGpsDf
#turtlesData[1].allGpsDf

# get name for each ALL GPS DF turtleData
createAllGpsDfCsvNameForEachInstance(turtlesData)

# SAVE THE ALL GPS DATAFRAME in the Results Folder
checkIfAllGpsDfHasBeenSaved(turtlesData)

# now we need to look at the all gps df and delete the duplicates rows, before calculating the errors by speed
# deleting duplicate rows and 2019 date
getAllCleanedGpsDataframes(turtlesData)

# get name for each ALL CLEANED GPS DF turtleData
createAllCleanedGpsDfCsvNameForEachInstance(turtlesData)

# SAVE THE ALL CLEANED GPS DATAFRAME in the Results Folder
checkIfAllCleanedGpsDfHasBeenSaved(turtlesData)

- Excel file = mytest.xlsx
-- Excellent! We've already converted the excel file 'mytest' into csv file
- Excel file = tag_710333a_20_sept.xlsx
-- Excellent! We've already converted the excel file 'tag_710333a_20_sept' into csv file
--- CSV files in the assets folder:  ['710333a_93_condensed.csv', '710348a_49_condensed.csv', 'mytest.csv', 'tag_710333a_20_sept.csv']
--------------
Found TAG (710333a) in filename , check if tag is already associated with an object...
Instance for TAG (710333a) NOT found! Creating a new instance...
Instance for TAG (710333a) CREATED!
--------------
Found TAG (710348a) in filename , check if tag is already associated with an object...
Instance for TAG (710348a) NOT found! Creating a new instance...
Instance for TAG (710348a) CREATED!
--------------
Found TAG (710333a) in filename , check if tag is already associated with an object...
Instance for TAG (710333a) ALREADY EXISTS, skipping object creation!
--------------
Created instances for Obj turtleData: 
71

GPS Horizontal Dilution
GPS Satellite Bitmap
GPS Satellite Count
Underwater Percentage
Dive Count
Average Dive Duration
Dive Duration Standard Deviation
Maximum Dive Duration
Maximum Dive Depth
Duration Limit 1 Dive Count
Duration Limit 2 Dive Count
Duration Limit 3 Dive Count
Duration Limit 4 Dive Count
Duration Limit 5 Dive Count
Duration Limit 6 Dive Count
Layer 1 Percentage
Layer 2 Percentage
Layer 3 Percentage
Layer 4 Percentage
Layer 5 Percentage
Layer 6 Percentage
Layer 7 Percentage
Layer 8 Percentage
Layer 9 Percentage
Layer 10 Percentage
Layer 1 Dive Count
Layer 2 Dive Count
Layer 3 Dive Count
Layer 4 Dive Count
Layer 5 Dive Count
Layer 6 Dive Count
Layer 7 Dive Count
Layer 8 Dive Count
Layer 9 Dive Count
Layer 10 Dive Count
Temperature
Satellite Uplink
Receive Time
Repetition Count
Low Voltage
Mortality
Saltwater Failsafe
Iridium Command
Schedule Set
Diagnostic Dive Data
Predeployment Data
Error
The dataframe contains all the GPS columns
-----DF with NaN values ---------
    

      All GPS's Track ID     Acquisition Time Acquisition Start Time  \
0                     12  2020.06.27 08:03:28    2020.06.27 08:02:40   
1                     13  2020.06.27 15:39:07    2020.06.27 15:35:59   
2                     14  2020.06.29 09:24:06    2020.06.29 09:22:54   
3                     15  2020.06.29 10:00:07    2020.06.29 10:00:00   
4                     16  2020.06.29 10:30:07    2020.06.29 10:30:00   
...                  ...                  ...                    ...   
3324                4648  2021.02.10 19:28:58    2021.02.10 19:00:00   
3325                4649  2021.02.10 22:42:32    2021.02.10 22:00:00   
3326                4650  2021.02.11 02:28:20    2021.02.11 02:00:00   
3327                4651  2021.02.11 05:55:38    2021.02.11 05:00:00   
3328                4652  2021.02.11 06:00:10    2021.02.11 06:00:00   

             GPS Fix Time GPS Fix Attempt  GPS Latitude  GPS Longitude  
0     2020.06.27 08:03:28       Succeeded     37.995522      1

In [7]:
# see dfs of reliable gps (Remove GPS Errors by Angular velocity/Rotational speed)
getReliableGpsDataframes(turtlesData)



Length of pointsToRemove List: 
101
remSpeeds List: [array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([1.18930457]), array([1.13096368]), array([inf]), array([inf]), array([inf]), array([inf]), array([42.3998984]), array([inf]), array([inf]), array([988.21761595]), array([inf]), array([282.15571977]), array([4.03555561]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([inf]), array([171.75018604]), array([59.



Length of pointsToRemove List: 
14
remSpeeds List: [array([32.6084126]), array([1.170557]), array([177.62047653]), array([177.47599964]), array([1.55690435]), array([192.31635835]), array([168.02518142]), array([1.17419418]), array([42.03013665]), array([5.1905697]), array([10.26749834]), array([44.55296201]), array([1.25665037]), array([21.49260763])]
pointsToRemove List: ['2020.08.13 02:04:04', '2020.08.14 21:01:15', '2020.08.16 21:01:00', '2020.08.17 02:36:48', '2020.08.20 13:09:28', '2020.09.07 01:30:11', '2020.09.17 21:09:34', '2020.09.27 20:00:17', '2020.10.11 18:09:28', '2020.11.22 12:00:48', '2020.11.22 18:02:52', '2020.12.04 00:00:49', '2020.12.21 19:00:03', '2021.01.27 18:00:17']
--------------


###### The errors is because we have still 2 data of the same time that must be removed the second one, that is that one that is giving us inf values

###### We need, first to remove the 2 acquisition time identicals and then do this same calculating distance and speed operation 