## This notebook processes ITP Level 3 data

In [2]:
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin

In [2]:

# URL of the HTML page
url = "https://scienceweb.whoi.edu/itp/data/"

def download_zip(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the subdirectories
    subdirectories = [a['href'] for a in soup.find_all('a', href=lambda href: href and href.endswith('/') and href not in ['../', './'])]

    # Choose the base directory path to save the downloaded files
    base_directory = '/Users/ko389/Documents/Arctic_Water_Masses/Arctic_data/T_S_data_processing/data/itp_Arctic'

    # Create the base directory if it doesn't exist
    os.makedirs(base_directory, exist_ok=True)

    # Iterate through each subdirectory
    for subdirectory in subdirectories:
        subdirectory_url = urljoin(url, subdirectory)

        # Send a GET request to the subdirectory URL
        subdirectory_response = requests.get(subdirectory_url)

        # Parse the HTML content of the subdirectory
        subdirectory_soup = BeautifulSoup(subdirectory_response.text, 'html.parser')

        # Find all the <a> tags with href ending in 'final.zip' in the subdirectory
        links = subdirectory_soup.find_all('a', href=lambda href: href and href.endswith('final.zip'))

        # Download each file in the subdirectory
        for link in links:
            # Construct the absolute URL for the file
            file_url = urljoin(subdirectory_url, link['href'])
            file_name = link['href']
            file_path = os.path.join(base_directory, file_name)

            # Ensure the subdirectory structure is created
            subdirectory_path = os.path.join(base_directory, subdirectory)
            os.makedirs(subdirectory_path, exist_ok=True)

            # Send a GET request to download the file
            file_response = requests.get(file_url)

            # Save the file to disk
            with open(file_path, 'wb') as file:
                file.write(file_response.content)

            print(f"Downloaded: {file_name}")

    print("All files downloaded successfully.")

# Call the function with the provided URL
download_zip(url)


Downloaded: itp1final.zip
Downloaded: itp2final.zip
Downloaded: itp3final.zip
Downloaded: itp4final.zip
Downloaded: itp5final.zip
Downloaded: itp6final.zip
Downloaded: itp7final.zip
Downloaded: itp8final.zip
Downloaded: itp9final.zip
Downloaded: itp10final.zip
Downloaded: itp11final.zip
Downloaded: itp12final.zip
Downloaded: itp13final.zip
Downloaded: itp14final.zip
Downloaded: itp15final.zip
Downloaded: itp16final.zip
Downloaded: itp17final.zip
Downloaded: itp18final.zip
Downloaded: itp19final.zip
Downloaded: itp21final.zip
Downloaded: itp22final.zip
Downloaded: itp23final.zip
Downloaded: itp24final.zip
Downloaded: itp25final.zip
Downloaded: itp26final.zip
Downloaded: itp27final.zip
Downloaded: itp28final.zip
Downloaded: itp29final.zip
Downloaded: itp30final.zip
Downloaded: itp32final.zip
Downloaded: itp33final.zip
Downloaded: itp34final.zip
Downloaded: itp35final.zip
Downloaded: itp36final.zip
Downloaded: itp37final.zip
Downloaded: itp38final.zip
Downloaded: itp39_1final.zip
Download

In [3]:
import zipfile

directory_path = '/Users/ko389/Documents/Arctic_Water_Masses/Arctic_data/T_S_data_processing/data/itp_Arctic'

# Iterate over each .zip file in the directory
for file_name in os.listdir(directory_path):
    if file_name.endswith('.zip'):
        zip_file_path = os.path.join(directory_path, file_name)

        # Extract the contents of the .zip file
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            # Iterate over each file in the .zip file
            for inner_file_name in zip_ref.namelist():
                # Extract the file
                extracted_file = zip_ref.extract(inner_file_name, directory_path)
        # Delete the file
        os.remove(zip_file_path)

print(f"Downloaded {file_name} to {directory_path}")

Downloaded itpsys79 to /Users/ko389/Documents/Arctic_Water_Masses/Arctic_data/T_S_data_processing/data/itp_Arctic


In [4]:
def get_itp_and_profile_number(filename):
    match = re.search(r'itp(\d+)grd(\d+)', filename)
    if match:
        itp_number, profile_number = match.groups()
        return int(itp_number), int(profile_number)
    else:
        return None, None

In [5]:
import re
import pandas as pd

directory = '/Users/ko389/Documents/Arctic_Water_Masses/Arctic_data/T_S_data_processing/data/itp_Arctic'
dfs = []

for filename in os.listdir(directory):
    if filename.endswith('.dat') and ('sami' not in filename.lower() and 'micro' not in filename.lower()):
        try:
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r') as dat_file:
                lines = dat_file.readlines()
                # Remove the '%endofdat' at end of profiles
                lines = [line for line in lines if line != '%endofdat\n']
                # define headers
                header_1 = re.sub(r'%ITP \d+, profile \d+:', '', lines[0]).split()
                header_2 = lines[2].split()
                # get variable data
                itp_number, profile_number = get_itp_and_profile_number(filename)
                year, day, longitude, latitude, max_depth = lines[1].split()
                data = [line.strip() for line in lines[3:] if line.strip()]
                split_data = [line.split() for line in data]
                # add nans to itp profiles that don't have dissolved oxygen
                if len(header_2) == 4:
                    for sublist in split_data:
                        sublist.insert(3, 'NaN')
                # remove velocity data from itp profiles that have MAVS and no dissolved oxygen
                if len(header_2) == 8:
                    for sublist in split_data:
                        sublist[:] = sublist[:-4]
                        sublist.insert(3, 'NaN')
                # remove velocity data from itp profiles that have MAVS and dissolved oxygen
                if len(header_2) == 9:
                    for sublist in split_data:
                        sublist[:] = sublist[:-4]

                columns = ['pressure', 'temperature', 'salinity', 'dissolved_oxygen', 'nobs']
                df = pd.DataFrame(split_data, columns=columns)
                df['year'] = year
                df['day'] = day
                df['longitude'] = longitude
                df['latitude'] = latitude
                df['itp_no'] = itp_number
                df['prof_no'] = profile_number

                dfs.append(df)

        except Exception as e:
            print(f"Error processing file {filename}: {str(e)}")

# Now you can check the printed error messages to identify which file is causing the problem. 
# Hopefully no files are causing the problem


In [None]:
# Remove .dat files
import os
directory_path = "/Users/ko389/Documents/Arctic_Water_Masses/Arctic_data/T_S_data_processing/data/itp_Arctic"

for file_name in os.listdir(directory_path):
    if file_name.endswith('.dat'):
        dat_file_path = os.path.join(directory_path, file_name)
        os.remove(dat_file_path)

In [6]:
itp= pd.concat(dfs)
itp

Unnamed: 0,pressure,temperature,salinity,dissolved_oxygen,nobs,year,day,longitude,latitude,itp_no,prof_no
0,6.1,-1.5021,27.5793,,8,2020,7.34653,-148.8983,74.9978,114,1769
1,6.9,-1.5021,27.5794,,8,2020,7.34653,-148.8983,74.9978,114,1769
2,8.0,-1.5021,27.5794,,9,2020,7.34653,-148.8983,74.9978,114,1769
3,8.8,-1.5020,27.5794,,9,2020,7.34653,-148.8983,74.9978,114,1769
0,5.8,-1.6201,29.6559,,16,2013,120.75003,-117.6934,89.4396,61,80
...,...,...,...,...,...,...,...,...,...,...,...
244,251.0,-1.0485,33.9098,,5,2014,186.25141,-154.3825,77.0370,70,3592
245,252.1,-1.0369,33.9262,,13,2014,186.25141,-154.3825,77.0370,70,3592
246,252.5,-1.0375,,,2,2014,186.25141,-154.3825,77.0370,70,3592
0,6.7,-1.1376,29.0884,,1443,2019,190.81432,-129.8973,76.8103,103,4761


In [7]:
itp.sort_values(by='itp_no', inplace=True)

In [13]:
itp = itp.apply(pd.to_numeric, errors='coerce')

In [19]:
# Get datetime from year and day of year
itp['datetime'] = pd.to_datetime(itp['year'].astype(str) + ' ' + itp['day'].astype(int).astype(str), format='%Y %j')
itp.drop(columns=['year', 'day'], inplace=True)

In [22]:
# Has to have both T and S
itp.dropna(subset=['temperature', 'salinity'], inplace=True)

# Add a column for depth
import gsw as gsw
itp['depth'] = -gsw.z_from_p(itp['pressure'].values, itp['latitude'].values)

# Create a unique 'nprof' column based on 'itp_no' and 'prof_no'
itp['nprof'] = pd.factorize(itp['itp_no'].astype(str) + '_' + itp['prof_no'].astype(str))[0] + 1
itp.drop(['itp_no', 'prof_no', 'nobs'], axis=1, inplace=True)

# Rename temperature to potential temperature
itp.rename(columns={'temperature':'potential_temperature', 'salinity':'practical_salinity'},inplace=True)

# Name source
itp['source']='itp'


In [8]:
""""
import xarray as xr

ds = itp.to_xarray()

# Define the output file path and name for the NetCDF file
output_file_path = "/Users/ko389/Documents/Arctic_Water_Masses/Arctic_data/T_S_data_processing/data/itp_Arctic/"
output_file_name = "itp_processed.nc"

# Save the xarray Dataset as a NetCDF file
ds.to_netcdf(output_file_path + output_file_name)

print("DataFrame saved as NetCDF file successfully.")

"""

DataFrame saved as NetCDF file successfully.


In [30]:
# Save as .csv

itp.to_csv("/Users/ko389/Documents/Arctic_Water_Masses/Arctic_data/T_S_data_processing/data/itp_processed.csv")
