# Extracting files directly from OPENDAC website

### libraries : netCDF4, pydap.client (READ netCDF files)
### Library: 'BeautifulSoup' to read the contents of the webpage
* Reading the contents from the desired web page (here, OCO2)
* We pass the parameters: Instrument and year
* Save the links of different datasets in a list
* open up the lists using <b>netCDF</b>

### Website: 
https://oco2.gesdisc.eosdis.nasa.gov/opendap/

# Data Retrieving parameter:
### Instrument and version: 
* OCO2_L2_Lite_FP.10r
### Year: 2020

In [1]:
from pydap import client
import netCDF4 as nc

# for data-preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot

In [2]:
# Reading Single file from the link
my_df= nc.Dataset('https://oco2.gesdisc.eosdis.nasa.gov/opendap/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200109_B10206Ar_200728203551s.nc4')

In [3]:
len(my_df.variables['xco2'])

148936

In [4]:
data_set= client.open_url('https://oco2.gesdisc.eosdis.nasa.gov/opendap/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200109_B10206Ar_200728203551s.nc4')

In [5]:
my_df.variables.keys()

dict_keys(['xco2_apriori', 'file_index', 'xco2_qf_simple_bitflag', 'pressure_levels', 'xco2', 'time', 'pressure_weight', 'Preprocessors_co2_ratio', 'Preprocessors_max_declocking_sco2', 'Preprocessors_max_declocking_o2a', 'Preprocessors_xco2_strong_idp', 'Preprocessors_max_declocking_wco2', 'Preprocessors_dp_abp', 'Preprocessors_co2_ratio_offset_per_footprint', 'Preprocessors_h2o_ratio', 'Preprocessors_xco2_weak_idp', 'Preprocessors_h2o_ratio_offset_per_footprint', 'solar_zenith_angle', 'longitude', 'xco2_qf_bitflag', 'latitude', 'sensor_zenith_angle', 'Meteorology_psurf_apriori_o2a', 'Meteorology_psurf_apriori_wco2', 'Meteorology_psurf_apriori_sco2', 'Meteorology_windspeed_u_met', 'Meteorology_windspeed_v_met', 'xco2_quality_flag', 'xco2_averaging_kernel', 'date', 'Retrieval_dp_o2a', 'Retrieval_dust_height', 'Retrieval_aod_water', 'Retrieval_s32', 'Retrieval_chi2_sco2', 'Retrieval_aod_dust', 'Retrieval_albedo_slope_wco2', 'Retrieval_aod_bc', 'Retrieval_aod_strataer', 'Retrieval_aod_sea

# Reading the contents from the WEB page and retrieving only the links
### Libraries: BeautifulSoup to read webElements
- Collecting the lists and using the links to open datasets using <b>NETCDF</b>

## WEB PAGE:
- Here, we can pass the user input for <b>Instrument and Year</b>
* url= 'https://oco2.gesdisc.eosdis.nasa.gov/opendap/'+str(instrument)+'/'+ str(year)+'/contents.html'

In [6]:
import requests

import urllib3
from bs4 import BeautifulSoup

In [7]:
url= 'https://oco2.gesdisc.eosdis.nasa.gov/opendap/OCO2_L2_Lite_FP.10r/2020/contents.html'

In [11]:
# Pass the following parameters for this test
# Instrument: current testing(Pass) -> OCO2_L2_Lite_FP.10r
# year: 2020

instrument= input('Enter Instrument Version: ')
year=input("Enter Year: ")

Enter Instrument Version: OCO2_L2_Lite_FP.10r
Enter Year: 2020


# READ contents from the URL:
* By passing the parameters from the USER input

In [12]:
my_url= 'https://oco2.gesdisc.eosdis.nasa.gov/opendap/'+str(instrument)+'/'+ str(year)+'/contents.html'

In [13]:
my_url

'https://oco2.gesdisc.eosdis.nasa.gov/opendap/OCO2_L2_Lite_FP.10r/2020/contents.html'

In [14]:
# Get the content from the webpage
reqs= requests.get(my_url)

# selecting the lxml parser
soup= BeautifulSoup(reqs.text, 'lxml')



In [15]:
# HTML contents
type(soup)

bs4.BeautifulSoup

# Filtering: to Get contents from the tag 
* "\<a>" only, which lists the contents
### Cleaning and saving the LINKS for the datasets only

In [16]:
# total links
oco2_links= []

for link in soup.find_all('a'):
    print(link.get('href'))
    oco2_links.append(link.get('href'))

#
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.html
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.ddx
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.dds
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.das
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.info
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.html
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.rdf
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200101_B10206Ar_200728183348s.nc4
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.xml
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.html
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.ddx
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.dds
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.das
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.info
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.html
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.rdf
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.covjson
/opendap/viewers/vi

oco2_LtCO2_200318_B10206Ar_210917175426s.nc4.html
oco2_LtCO2_200318_B10206Ar_210917175426s.nc4.rdf
oco2_LtCO2_200318_B10206Ar_210917175426s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200318_B10206Ar_210917175426s.nc4
oco2_LtCO2_200318_B10206Ar_210917175426s.nc4.xml
oco2_LtCO2_200319_B10206Ar_210917175553s.nc4.html
oco2_LtCO2_200319_B10206Ar_210917175553s.nc4.ddx
oco2_LtCO2_200319_B10206Ar_210917175553s.nc4.dds
oco2_LtCO2_200319_B10206Ar_210917175553s.nc4.das
oco2_LtCO2_200319_B10206Ar_210917175553s.nc4.info
oco2_LtCO2_200319_B10206Ar_210917175553s.nc4.html
oco2_LtCO2_200319_B10206Ar_210917175553s.nc4.rdf
oco2_LtCO2_200319_B10206Ar_210917175553s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200319_B10206Ar_210917175553s.nc4
oco2_LtCO2_200319_B10206Ar_210917175553s.nc4.xml
oco2_LtCO2_200320_B10206Ar_210917175945s.nc4.html
oco2_LtCO2_200320_B10206Ar_2109171759

oco2_LtCO2_200702_B10206Ar_210920045208s.nc4.html
oco2_LtCO2_200702_B10206Ar_210920045208s.nc4.ddx
oco2_LtCO2_200702_B10206Ar_210920045208s.nc4.dds
oco2_LtCO2_200702_B10206Ar_210920045208s.nc4.das
oco2_LtCO2_200702_B10206Ar_210920045208s.nc4.info
oco2_LtCO2_200702_B10206Ar_210920045208s.nc4.html
oco2_LtCO2_200702_B10206Ar_210920045208s.nc4.rdf
oco2_LtCO2_200702_B10206Ar_210920045208s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200702_B10206Ar_210920045208s.nc4
oco2_LtCO2_200702_B10206Ar_210920045208s.nc4.xml
oco2_LtCO2_200703_B10206Ar_210920045226s.nc4.html
oco2_LtCO2_200703_B10206Ar_210920045226s.nc4.ddx
oco2_LtCO2_200703_B10206Ar_210920045226s.nc4.dds
oco2_LtCO2_200703_B10206Ar_210920045226s.nc4.das
oco2_LtCO2_200703_B10206Ar_210920045226s.nc4.info
oco2_LtCO2_200703_B10206Ar_210920045226s.nc4.html
oco2_LtCO2_200703_B10206Ar_210920045226s.nc4.rdf
oco2_LtCO2_200703_B10206Ar_210920045226s.nc4.covjson
/opendap/viewers/view

oco2_LtCO2_200719_B10206Ar_210920053239s.nc4.das
oco2_LtCO2_200719_B10206Ar_210920053239s.nc4.info
oco2_LtCO2_200719_B10206Ar_210920053239s.nc4.html
oco2_LtCO2_200719_B10206Ar_210920053239s.nc4.rdf
oco2_LtCO2_200719_B10206Ar_210920053239s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200719_B10206Ar_210920053239s.nc4
oco2_LtCO2_200719_B10206Ar_210920053239s.nc4.xml
oco2_LtCO2_200720_B10206Ar_210920053632s.nc4.html
oco2_LtCO2_200720_B10206Ar_210920053632s.nc4.ddx
oco2_LtCO2_200720_B10206Ar_210920053632s.nc4.dds
oco2_LtCO2_200720_B10206Ar_210920053632s.nc4.das
oco2_LtCO2_200720_B10206Ar_210920053632s.nc4.info
oco2_LtCO2_200720_B10206Ar_210920053632s.nc4.html
oco2_LtCO2_200720_B10206Ar_210920053632s.nc4.rdf
oco2_LtCO2_200720_B10206Ar_210920053632s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200720_B10206Ar_210920053632s.nc4
oco2_LtCO2_200720_B10206Ar_2109200536

oco2_LtCO2_200826_B10206Ar_210920192051s.nc4.html
oco2_LtCO2_200826_B10206Ar_210920192051s.nc4.rdf
oco2_LtCO2_200826_B10206Ar_210920192051s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200826_B10206Ar_210920192051s.nc4
oco2_LtCO2_200826_B10206Ar_210920192051s.nc4.xml
oco2_LtCO2_200827_B10206Ar_210920192328s.nc4.html
oco2_LtCO2_200827_B10206Ar_210920192328s.nc4.ddx
oco2_LtCO2_200827_B10206Ar_210920192328s.nc4.dds
oco2_LtCO2_200827_B10206Ar_210920192328s.nc4.das
oco2_LtCO2_200827_B10206Ar_210920192328s.nc4.info
oco2_LtCO2_200827_B10206Ar_210920192328s.nc4.html
oco2_LtCO2_200827_B10206Ar_210920192328s.nc4.rdf
oco2_LtCO2_200827_B10206Ar_210920192328s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200827_B10206Ar_210920192328s.nc4
oco2_LtCO2_200827_B10206Ar_210920192328s.nc4.xml
oco2_LtCO2_200828_B10206Ar_210920192601s.nc4.html
oco2_LtCO2_200828_B10206Ar_2109201926

oco2_LtCO2_201104_B10206Ar_210921221726s.nc4.xml
oco2_LtCO2_201105_B10206Ar_210921221909s.nc4.html
oco2_LtCO2_201105_B10206Ar_210921221909s.nc4.ddx
oco2_LtCO2_201105_B10206Ar_210921221909s.nc4.dds
oco2_LtCO2_201105_B10206Ar_210921221909s.nc4.das
oco2_LtCO2_201105_B10206Ar_210921221909s.nc4.info
oco2_LtCO2_201105_B10206Ar_210921221909s.nc4.html
oco2_LtCO2_201105_B10206Ar_210921221909s.nc4.rdf
oco2_LtCO2_201105_B10206Ar_210921221909s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_201105_B10206Ar_210921221909s.nc4
oco2_LtCO2_201105_B10206Ar_210921221909s.nc4.xml
oco2_LtCO2_201106_B10206Ar_210921222243s.nc4.html
oco2_LtCO2_201106_B10206Ar_210921222243s.nc4.ddx
oco2_LtCO2_201106_B10206Ar_210921222243s.nc4.dds
oco2_LtCO2_201106_B10206Ar_210921222243s.nc4.das
oco2_LtCO2_201106_B10206Ar_210921222243s.nc4.info
oco2_LtCO2_201106_B10206Ar_210921222243s.nc4.html
oco2_LtCO2_201106_B10206Ar_210921222243s.nc4.rdf
oco2_LtCO2_201106_B10206A

# Get the links ending with 'html' only
### Rest of the links have different purpose
### NOTE: another important link ending with 'info' gives information on Product


In [17]:
# storing the html links
dataset_links=[]

for k in range(0, len(oco2_links)):
    if oco2_links[k].endswith(".html"):
        #print(oco2_links[k])
        
        # Strip the 'html' from the links 
        dataset_links.append(oco2_links[k].strip('.html'))

In [18]:
# to avoid duplicate records
p=0
complete_oco2_links=[]

for i in range(0, len(dataset_links)):
    try:
        complete_oco2_links.append(dataset_links[i+p])
        p+=1
    # Ignoring the Out of Index error
    except IndexError as e:
        continue

In [19]:
# TESTING: for duplicate records, output: half
len(dataset_links),len(complete_oco2_links)

(712, 356)

# Other Important links:
* Information on the product

# Using the lists to Retrieve datasets
* Using <b>netCDF</b> library to get the data

In [20]:
# CHECK: JAN to DEC dates on the filenames

complete_oco2_links[:5], print('****') ,complete_oco2_links[-4:]

****


(['oco2_LtCO2_200101_B10206Ar_200728183348s.nc4',
  'oco2_LtCO2_200102_B10206Ar_200728203252s.nc4',
  'oco2_LtCO2_200103_B10206Ar_200728203534s.nc4',
  'oco2_LtCO2_200108_B10206Ar_200728203546s.nc4',
  'oco2_LtCO2_200109_B10206Ar_200728203551s.nc4'],
 None,
 ['oco2_LtCO2_201228_B10206Ar_210922003931s.nc4',
  'oco2_LtCO2_201229_B10206Ar_210922003948s.nc4',
  'oco2_LtCO2_201230_B10206Ar_210922004428s.nc4',
  'oco2_LtCO2_201231_B10206Ar_210922004512s.nc4'])

### TEST
* Attaching the full link + dataset
* my_url= 'https://oco2.gesdisc.eosdis.nasa.gov/opendap/'+str(instrument)+'/'+ str(year)+ complete_oco2_links[0]

In [21]:
complete_oco2_links[0]

'oco2_LtCO2_200101_B10206Ar_200728183348s.nc4'

In [22]:
first_element_oco2= nc.Dataset('https://oco2.gesdisc.eosdis.nasa.gov/opendap/'+str(instrument)+'/'+ str(year)+'/'+complete_oco2_links[2])
first_element_oco2.variables.keys()

dict_keys(['xco2_apriori', 'file_index', 'xco2_qf_simple_bitflag', 'pressure_levels', 'xco2', 'time', 'pressure_weight', 'Preprocessors_co2_ratio', 'Preprocessors_max_declocking_sco2', 'Preprocessors_max_declocking_o2a', 'Preprocessors_xco2_strong_idp', 'Preprocessors_max_declocking_wco2', 'Preprocessors_dp_abp', 'Preprocessors_co2_ratio_offset_per_footprint', 'Preprocessors_h2o_ratio', 'Preprocessors_xco2_weak_idp', 'Preprocessors_h2o_ratio_offset_per_footprint', 'solar_zenith_angle', 'longitude', 'xco2_qf_bitflag', 'latitude', 'sensor_zenith_angle', 'Meteorology_psurf_apriori_o2a', 'Meteorology_psurf_apriori_wco2', 'Meteorology_psurf_apriori_sco2', 'Meteorology_windspeed_u_met', 'Meteorology_windspeed_v_met', 'xco2_quality_flag', 'xco2_averaging_kernel', 'date', 'Retrieval_dp_o2a', 'Retrieval_dust_height', 'Retrieval_aod_water', 'Retrieval_s32', 'Retrieval_chi2_sco2', 'Retrieval_aod_dust', 'Retrieval_albedo_slope_wco2', 'Retrieval_aod_bc', 'Retrieval_aod_strataer', 'Retrieval_aod_sea

contents of URL written to path
