<a href="https://colab.research.google.com/github/MCasari-PMEL/IoTAS/blob/master/IoTAS_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IoTAS Preprocessing


## Overview
This notebook preprocesses the raw IoTAS Short Burst Data (SBD) messages that have been pulled out of an inbox into Google Drive.

## Order of operation
1. Authenticate with Google Drive
1. Convert Raw Files
  1. Loop through Raw Data Folders
  1. Loop through the files in each Raw Data Folder
  1. Copy file into temporary Colab workspace file
  1. Run file contents through IoTAS byte parser
  1. Append data to appropriate system CSV file
  1. Copy temporary file to archived data folder
  1. Delete temporary file
  1. Remove original file
1. Create XXX_processed.csv file with all historic data
  1. Loop through CSV Folders
  1. Import Old _processed.csv file into pandas dataframe
  1. Import New CSV Files into Pandas Dataframe
  1. Append New dataframe to old dataframe
  1. Sort Dataframe (oldest to newest)
  1. Remove duplicate entries
  1. Save dataframe (overwrite previous _processed.csv)



## Authentication

In [1]:
# Packages needed for accessing google drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [2]:
# Authenticating with your user account
# Prompts for Colab user authentication, then builds the PyDrive client
# (`drive`) that the later cells use for listing, creating and deleting files.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to PyDrive so it does not run its
# own auth flow.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

## Path Variables

In [3]:
#@markdown ---
#@markdown ## Do not change this unless necessary
#@markdown ### Enter a Path to Top Data Folder:
# NOTE: data_path must end with '/'. The raw-data cell builds file paths as
# data_path + <folder title> + '/' + <file name> without inserting a separator.
data_path = "Test/" #@param {type:"string"}

#@markdown ### Enter a Path to Archived Data Folder:
archive_path = "Test/Archive" #@param {type:"string"}

#@markdown ### Enter the Path to Converted Data CSV:
# NOTE: save_path must NOT end with '/'. The CSV cell joins it to the folder
# title with an explicit '/'.
save_path = "Test/Converted" #@param {type:"string"}


## Packages to include

In [4]:
# Fetch the colab_util.py helper module from a GitHub gist, move it into the
# working directory so it can be imported, then remove the temporary clone.
# NOTE(review): the gist is cloned without pinning a revision, so the helper
# code that gets executed can change between runs - consider pinning a commit.
!git clone https://gist.github.com/dc7e60aa487430ea704a8cb3f2c5d6a6.git /tmp/colab_util_repo
!mv /tmp/colab_util_repo/colab_util.py colab_util.py 
!rm -r /tmp/colab_util_repo

Cloning into '/tmp/colab_util_repo'...
remote: Enumerating objects: 40, done.[K
remote: Total 40 (delta 0), reused 0 (delta 0), pack-reused 40[K
Unpacking objects:   2% (1/40)   Unpacking objects:   5% (2/40)   Unpacking objects:   7% (3/40)   Unpacking objects:  10% (4/40)   Unpacking objects:  12% (5/40)   Unpacking objects:  15% (6/40)   Unpacking objects:  17% (7/40)   Unpacking objects:  20% (8/40)   Unpacking objects:  22% (9/40)   Unpacking objects:  25% (10/40)   Unpacking objects:  27% (11/40)   Unpacking objects:  30% (12/40)   Unpacking objects:  32% (13/40)   Unpacking objects:  35% (14/40)   Unpacking objects:  37% (15/40)   Unpacking objects:  40% (16/40)   Unpacking objects:  42% (17/40)   Unpacking objects:  45% (18/40)   Unpacking objects:  47% (19/40)   Unpacking objects:  50% (20/40)   Unpacking objects:  52% (21/40)   Unpacking objects:  55% (22/40)   Unpacking objects:  57% (23/40)   Unpacking objects:  60% (24/40)   Unpacking objects:  62

In [5]:
# Load the helper module fetched in the previous cell and create the handler
# used below for path lookups, folder listings, downloads and uploads.
# GoogleDriveHandler is presumably supplied by the wildcard import from
# colab_util - TODO confirm and import it by name.
from colab_util import *
drive_handler = GoogleDriveHandler()
import os
import math


In [6]:
# Download the IoTAS parser module from Google Drive into the Colab workspace
# as 'iotas_parser.py' so that the next cell can import it.
drive_handler.download('iotas_parser.py', target_path="Colab Scripts/iotas_parser.py")

In [7]:
import iotas_parser as iotas

In [8]:
# Install/upgrade gspread and create an authorised client from the
# application-default credentials.
# NOTE(review): the install is unpinned, so the gspread version can change
# between runs; confirm the installed version still accepts oauth2client
# credentials in authorize().
# NOTE(review): `gc` is not referenced by any other cell visible in this
# notebook, and the name is the same as the standard-library `gc` module.
!pip install --upgrade --quiet gspread
import gspread
gc = gspread.authorize(GoogleCredentials.get_application_default())

## Raw Data Processing 

In [9]:
# Resolve the three configured Drive paths (top data folder, archive folder,
# converted-CSV folder) to their Drive folder IDs, in that order.
data_path_id, archive_path_id, save_path_id = [
    drive_handler.path_to_id(drive_path)
    for drive_path in (data_path, archive_path, save_path)
]

In [10]:
# List the entries directly under the top data folder. The next cell treats
# each entry as one folder of raw files and reads its 'title' and 'id'.
data_folder = drive_handler.list_folder(root_folder_id=data_path_id)

In [11]:
# Convert every raw SBD file found under the top data folder into a CSV.
# For each raw-data folder: copy each file into the archive, parse it with
# the IoTAS parser, upload the parsed text as a CSV into the converted-data
# folder, and finally delete from the raw folder ONLY the files that went
# through all of those steps.
for folder in data_folder:
  # The archive and converted-output folders may sit under the same top
  # folder as the raw data; they are outputs, not inputs.
  folder_title = folder['title'].lower()
  if 'archive' in folder_title or 'converted' in folder_title:
    continue

  # Snapshot of the raw files to process in this run
  files = drive_handler.list_folder(root_folder_id=folder['id'])

  # Folder that receives the converted CSV files for this raw-data folder
  save_folder = drive_handler.create_folder(folder['title'], parent_path=save_path)

  # Drive IDs of the raw files that were archived and converted successfully
  processed_ids = set()

  for file in files:
    # Take the title from the snapshot instead of re-listing the folder on
    # every iteration: re-listing costs one Drive API call per file and can
    # pick a different file if the folder contents change during the run.
    file_name = file['title']
    file_path = data_path + folder['title'] + '/' + file_name

    # Copy the raw file into the Colab workspace, then on to the archive
    drive_handler.download(file_name, target_path=file_path)
    drive_handler.upload(file_name, parent_path=archive_path, overwrite=False)

    # Open the file and read and parse contents
    with open(file_name, 'rb') as raw_file:
      (i_date, contents) = iotas.read_file(raw_file)

    # Create CSV in Data Folder
    save_file_name = file_name.replace('sbd', 'csv')
    uploaded = drive.CreateFile({
        'title': save_file_name,
        'parents': [{'kind': 'drive#fileLink', 'id': save_folder}]})
    uploaded.SetContentString(contents)
    uploaded.Upload()

    # Delete temporary file from Colab workspace. os.remove avoids shell
    # interpolation, which breaks on names with spaces or metacharacters.
    os.remove(file_name)

    processed_ids.add(file['id'])

  # Delete files already transferred. Files that appeared in the folder after
  # the snapshot was taken were never archived or converted, so they are
  # left in place for the next run.
  if processed_ids:
    print(f"Delete files in folder: {folder['title']}\n")
    temp_str = f"\'{folder['id']}\' in parents"
    file_list = drive.ListFile({'q': temp_str}).GetList()

    for drive_file in file_list:
      if drive_file['id'] not in processed_ids:
        continue
      print(f"Deleting file {drive_file['title']} from data")
      drive_file.Delete()
    

300434064218040 already exists
300434064219040 already exists
300434064215020 already exists


## CSV Data Processing
Convert all of the CSV files in each directory into a single Pandas dataframe, and ultimately into one processed CSV file for easy data access.

In [12]:
import pandas as pd
# Column names of the processed CSV, in output order. This is a list rather
# than a set so the order is deterministic and so that
# pd.DataFrame(columns=columns) is accepted (pandas 2.x rejects a set).
columns = [
    'Date (yyyy-mm-dd hh:mm:ss)',
    'latitude',
    'longitude',
    'pressure (mbar)',
    'latitude offset',
    'longitude offset',
]

In [13]:
# List the entries directly under the converted-data folder. The next cell
# builds one <title>_processed.csv per entry.
csv_folder = drive_handler.list_folder(root_folder_id=save_path_id)


In [29]:
# Build (or refresh) the cumulative <folder>_processed.csv for every folder
# under the converted-data path:
#   1. load the existing processed CSV, if one can be downloaded
#   2. ingest every new CSV in the folder and copy it to the Archive subfolder
#   3. delete the ingested originals
#   4. de-duplicate and sort by date, then replace the processed CSV on Drive
for folder in csv_folder:
  # Load or create the processed CSV
  processed_name = folder['title'] + "_processed.csv"
  processed_path = save_path + '/' + folder['title'] + '/'
  save_folder_id = drive_handler.path_to_id(processed_path)
  processed_file_path = processed_path + processed_name
  file_exists = True
  try:
    drive_handler.download(processed_name, target_path=processed_file_path)
  except Exception as e:
    # Any download failure is treated as "no processed file yet", as before,
    # but the reason is now reported instead of being silently dropped.
    file_exists = False
    print(f"No existing {processed_name} could be downloaded; starting a new one ({e})")

  # Get the new files in the current folder. The Archive subfolder and the
  # processed CSV are filtered out by title; removing them by positions
  # looked up in a separate title list can drop the wrong entry once the
  # first removal has shifted the list.
  files = [
      f for f in drive_handler.list_folder(root_folder_id=folder['id'])
      if f['title'] not in ('Archive', processed_name)
  ]

  # Create the dataframe (list() keeps this working if `columns` is a set)
  data = pd.DataFrame(columns=list(columns))
  if file_exists:
    try:
      # Ingest the old data, if it exists already. The file is written below
      # with its index as the first column, so read that column back as the
      # index; otherwise one extra 'Unnamed: 0*' column piles up per run.
      data = pd.read_csv(processed_name, delimiter=',', header=0, index_col=0)
      # Drop index columns accumulated by earlier runs
      legacy_index = data.columns.astype(str).str.startswith('Unnamed: 0')
      data = data.loc[:, ~legacy_index]
    except Exception as e:
      print(f"Could not read existing {processed_name}; starting a new one ({e})")

  # Create an archive folder
  save_folder = drive_handler.create_folder('Archive', parent_path=processed_path)

  # Ingest the new files
  new_frames = []
  archived_ids = set()
  for f in files:
    file_name = f['title']
    downloaded = drive.CreateFile({'id': f['id']})
    downloaded.GetContentFile(file_name)
    try:
      # header=7: the column names are on the 8th line of each converted CSV
      # (presumably after a metadata preamble written by the parser - TODO confirm)
      new_frames.append(pd.read_csv(file_name, header=7))
    except Exception as e:
      print(f"Failure to injest {file_name}")
      print(e)

    # Save original file to Archive (also done when ingestion failed, so the
    # file is preserved for inspection)
    uploaded = drive.CreateFile({
        'title': file_name,
        'parents': [{'kind': 'drive#fileLink', 'id': save_folder}]})
    uploaded.SetContentFile(file_name)
    uploaded.Upload()

    # Delete the local instance
    os.remove(file_name)
    archived_ids.add(f['id'])

  # Append all new data in one concat instead of once per file
  if new_frames:
    data = pd.concat([data] + new_frames).reset_index(drop=True)

  # Delete the original files, but only those that were archived above
  temp_str = f"\'{folder['id']}\' in parents"
  file_list = drive.ListFile({'q': temp_str}).GetList()

  for f in file_list:
    if f['id'] in archived_ids:
      print(f"Deleting file {f['title']} from data")
      f.Delete()

  # Drop duplicate data
  data = data.drop_duplicates(subset=['Date (yyyy-mm-dd hh:mm:ss)'], keep='last')

  # Sort the data by date
  data = data.sort_values(['Date (yyyy-mm-dd hh:mm:ss)'])

  # Save the dataframe as a csv
  data.to_csv(processed_name)

  # Move CSV to Google Drive (drive_handler.upload fails with overwrite, do it manually).
  # Upload the replacement first and delete the previous copies afterwards,
  # so a failed upload cannot leave Drive without a processed file.
  stale_files = [
      f for f in drive.ListFile({'q': temp_str}).GetList()
      if f['title'] == processed_name
  ]
  file = drive.CreateFile(
      {
          'title': processed_name,
          'parents': [{'kind': 'drive#fileLink', 'id': save_folder_id}]
      }
  )
  file.SetContentFile(processed_name)
  file.Upload()
  for stale_file in stale_files:
    stale_file.Delete()

  # Delete the local processed file
  os.remove(processed_name)

Archive already exists
Archive already exists
Archive already exists
