## This notebook processes Argo data


In [1]:
# Concatenate all .csv files in the argo_Arctic folder into a single DataFrame.
# argo_Arctic folder = Argo floats >60N downloaded from https://dataselection.euro-argo.eu

import os 
import pandas as pd

folder_path = '/Users/ko389/Documents/Arctic_Water_Masses/Arctic_data/argo_Arctic'

# Get a list of all .csv files in the folder.
# os.listdir() returns names in arbitrary order, so sort them: the row order of
# the concatenated frame (and anything derived from row order later on) is then
# the same on every run and every machine.
csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

# Fail early with a readable message instead of pandas' generic
# "No objects to concatenate" error when the folder holds no .csv files.
if not csv_files:
    raise ValueError(f"No .csv files found in {folder_path}")

dataframes = []

# Loop through each .csv file and read it into a pandas DataFrame
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Concatenate all DataFrames into a single DataFrame;
# ignore_index=True gives the result one continuous 0..N-1 row index.
argo = pd.concat(dataframes, ignore_index=True)
argo

Unnamed: 0,PLATFORM_CODE,DATE (YYYY-MM-DDTHH:MI:SSZ),DATE_QC,LATITUDE (degree_north),LONGITUDE (degree_east),POSITION_QC,PRES (decibar),PRES_QC,PRES_ADJUSTED (decibar),PRES_ADJUSTED_QC,DOX2_ADJUSTED (micromole/kg),DOX2_ADJUSTED_QC,TEMP_ADJUSTED (degree_Celsius),TEMP_ADJUSTED_QC,PSAL_ADJUSTED (psu),PSAL_ADJUSTED_QC
0,4902602,2021-10-26T14:23:07Z,1,72.71641,-66.70568,1,0.0,1,,,361.36469,1.0,,,,
1,4902602,2021-10-26T14:23:07Z,1,72.71641,-66.70568,1,0.1,1,,,361.86853,1.0,,,,
2,4902602,2021-10-26T14:23:07Z,1,72.71641,-66.70568,1,1.4,1,,,361.93671,1.0,,,,
3,4902602,2021-10-26T14:23:07Z,1,72.71641,-66.70568,1,2.4,1,,,362.13568,1.0,,,,
4,4902602,2021-10-26T14:23:07Z,1,72.71641,-66.70568,1,2.9,1,,,362.38297,1.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19182322,6901231,2015-06-28T22:54:17Z,1,70.14200,7.56400,1,1799.7,1,1799.9,1.0,,,-0.693,1.0,34.90789,1.0
19182323,6901231,2015-06-28T22:54:17Z,1,70.14200,7.56400,1,1849.9,1,1850.1,1.0,,,-0.706,1.0,34.90889,1.0
19182324,6901231,2015-06-28T22:54:17Z,1,70.14200,7.56400,1,1900.4,1,1900.6,1.0,,,-0.716,1.0,34.90789,1.0
19182325,6901231,2015-06-28T22:54:17Z,1,70.14200,7.56400,1,1949.8,1,1950.0,1.0,,,-0.727,1.0,34.90889,1.0


In [2]:
# Remove the QC-flag columns and the unadjusted pressure column.
# The result is assigned back instead of using inplace=True, so the frame
# displayed by the loading cell is not silently mutated. The original list
# named 'DOX2_ADJUSTED_QC' twice; each column is listed once here.
argo = argo.drop(columns=['DATE_QC', 'POSITION_QC', 'PRES (decibar)', 'PRES_QC', 'PRES_ADJUSTED_QC',
                          'DOX2_ADJUSTED_QC', 'TEMP_ADJUSTED_QC', 'PSAL_ADJUSTED_QC'])

# Rename the remaining headers to short snake_case names.
# Note that 'pressure' is the *adjusted* pressure (PRES_ADJUSTED).
argo = argo.rename(columns={'DATE (YYYY-MM-DDTHH:MI:SSZ)': 'date',
                            'LATITUDE (degree_north)': 'latitude',
                            'LONGITUDE (degree_east)': 'longitude',
                            'PRES_ADJUSTED (decibar)': 'pressure',
                            'DOX2_ADJUSTED (micromole/kg)': 'dissolved_oxygen',
                            'TEMP_ADJUSTED (degree_Celsius)': 'insitu_temperature',
                            'PSAL_ADJUSTED (psu)': 'practical_salinity'})
argo

Unnamed: 0,PLATFORM_CODE,date,latitude,longitude,pressure,dissolved_oxygen,insitu_temperature,practical_salinity
0,4902602,2021-10-26T14:23:07Z,72.71641,-66.70568,,361.36469,,
1,4902602,2021-10-26T14:23:07Z,72.71641,-66.70568,,361.86853,,
2,4902602,2021-10-26T14:23:07Z,72.71641,-66.70568,,361.93671,,
3,4902602,2021-10-26T14:23:07Z,72.71641,-66.70568,,362.13568,,
4,4902602,2021-10-26T14:23:07Z,72.71641,-66.70568,,362.38297,,
...,...,...,...,...,...,...,...,...
19182322,6901231,2015-06-28T22:54:17Z,70.14200,7.56400,1799.9,,-0.693,34.90789
19182323,6901231,2015-06-28T22:54:17Z,70.14200,7.56400,1850.1,,-0.706,34.90889
19182324,6901231,2015-06-28T22:54:17Z,70.14200,7.56400,1900.6,,-0.716,34.90789
19182325,6901231,2015-06-28T22:54:17Z,70.14200,7.56400,1950.0,,-0.727,34.90889


In [3]:
# Drop rows that are missing temperature, salinity or pressure
# (rows with a missing dissolved_oxygen value are kept).
# .copy() makes the filtered result an independent frame, so the column
# assignments below no longer trigger pandas' SettingWithCopyWarning.
argo = argo.dropna(subset=['insitu_temperature', 'practical_salinity', 'pressure'], how='any').copy()

# Convert the date column to datetime format
from datetime import datetime
argo['datetime'] = pd.to_datetime(argo['date'])
argo = argo.drop(columns=['date'])

# Get depth from pressure and latitude.
# The gsw.z_from_p result is negated so that depth is stored as a positive number.
import gsw as gsw
argo['depth'] = -gsw.z_from_p(argo['pressure'], argo['latitude'])

# Add nprof based on platform code: one 1-based integer per unique
# PLATFORM_CODE, numbered in order of first appearance.
# NOTE(review): every row from the same float gets the same nprof, even when the
# float recorded several casts on different dates -- confirm this is intended.
argo['nprof'] = pd.factorize(argo['PLATFORM_CODE'])[0] + 1
argo = argo.drop(columns=['PLATFORM_CODE'])

# Add source column
argo['source']='argo'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  argo['datetime'] = pd.to_datetime(argo['date'])


Unnamed: 0,latitude,longitude,pressure,dissolved_oxygen,insitu_temperature,practical_salinity,datetime,depth,nprof,source
74833,62.8446,-6.758,12.4,,12.418,35.34700,2009-08-09 11:41:54+00:00,12.280025,1,argo
74834,62.8446,-6.758,15.4,,12.401,35.34700,2009-08-09 11:41:54+00:00,15.250888,1,argo
74835,62.8446,-6.758,18.5,,12.330,35.34900,2009-08-09 11:41:54+00:00,18.320734,1,argo
74836,62.8446,-6.758,21.7,,12.250,35.35200,2009-08-09 11:41:54+00:00,21.489559,1,argo
74837,62.8446,-6.758,24.2,,12.190,35.35300,2009-08-09 11:41:54+00:00,23.965170,1,argo
...,...,...,...,...,...,...,...,...,...,...
19182322,70.1420,7.564,1799.9,,-0.693,34.90789,2015-06-28 22:54:17+00:00,1774.018005,549,argo
19182323,70.1420,7.564,1850.1,,-0.706,34.90889,2015-06-28 22:54:17+00:00,1823.281135,549,argo
19182324,70.1420,7.564,1900.6,,-0.716,34.90789,2015-06-28 22:54:17+00:00,1872.827122,549,argo
19182325,70.1420,7.564,1950.0,,-0.727,34.90889,2015-06-28 22:54:17+00:00,1921.282705,549,argo


In [12]:
# Write the processed table to .csv (needed for chunking).
# index=False leaves the DataFrame's row index out of the file.
output_path = '/Users/ko389/Documents/Arctic_Water_Masses/Arctic_data/T_S_data_processing/data/argo_processed.csv'
argo.to_csv(output_path, index=False)