# Downloading GPM-IMERG Data

In [16]:
"""==============================================================================

 Title          :HadUK_download.ipynb
 Description    :Download HadUK-Grid data from CEDA Archive 
 Author         :LF Velasquez - MA
 Date           :July 02 2021
 Version        :1.0
 Usage          :HadUK_download.ipynb
 Notes          : 
                - The user must have a CEDA account
 python version :3.8.5

=============================================================================="""



**Importing Modules**

In [14]:
# from http.cookiejar import CookieJar
# from urllib.parse import urlencode

# import urllib.request as urllib2
import calendar
from datetime import datetime
from pathlib import Path

import requests
import wget
from dateutil.relativedelta import relativedelta

**Adding data directory**

In [2]:
# Directory that every download cell below writes its NetCDF files into.
# NOTE(review): the recorded output of the CEH-GEAR cell shows files landing in
# .../search_data/gear_ceh_uk, so this value was presumably edited between runs --
# confirm the intended directory before doing a Restart & Run All.
FILEPATH = Path('/mnt/d/MRes_dataset/search_data/haduk_cedac_uk/') # local download folder (absolute path, machine-specific)


# uncomment below to check if it is the right path
# !ls {FILEPATH}

**Functions**

In [3]:
def data_download(data_url, file_2save):
    """Download ``data_url`` and write the response body to ``file_2save``.

    Failures are reported with a printed message rather than raised, so a
    loop over many URLs keeps going after one bad request.

    Parameters
    ----------
    data_url : str
        URL to fetch with ``requests.get``.
    file_2save : str or pathlib.Path
        Destination file; it is opened in binary write mode and overwritten.

    Returns
    -------
    None
    """
    result = requests.get(data_url)

    # Only an HTTP error status is reported as "error code"; previously a bare
    # ``except`` also labelled unrelated failures (e.g. a missing output
    # directory) as an HTTP error and swallowed KeyboardInterrupt.
    try:
        result.raise_for_status()
    except requests.exceptions.HTTPError:
        print('requests.get() returned an error code: {}'.format(str(result.status_code)))
        return

    # ``with`` guarantees the handle is closed even if the write fails.
    try:
        with open(file_2save, 'wb') as f:
            f.write(result.content)
    except OSError as err:
        print('could not write {}: {}'.format(file_2save, err))
        return

    print('contents of URL written to: {} '.format(file_2save))

**Set CEDA Archive information**

In [25]:
# # Credential

# # CEDA Archive
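# NOTE: never store real credentials in the notebook; read them from the environment.
# Any credentials that were previously committed here should be treated as exposed and rotated.
# username = os.environ["CEDA_USERNAME"]
# password = os.environ["CEDA_PASSWORD"]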

# # Create password manager for 401 response
# password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
# password_manager.add_password(None, "https://data.ceda.ac.uk/", username, password) #Earthdata url
# # password_manager.add_password(None, "https://catalogue.ceh.ac.uk", username, password) #CEH data store url

# # Create cookie jar to store cookies. This will avoid having to authenticate user everytime data is requested
# cookie_jar = CookieJar()

# # Install handlers
# opener = urllib2.build_opener(
#     urllib2.HTTPBasicAuthHandler(password_manager),
#     #urllib2.HTTPHandler(debuglevel=1),    # Uncomment these two lines to see
#     #urllib2.HTTPSHandler(debuglevel=1),   # details of the requests/responses
#     urllib2.HTTPCookieProcessor(cookie_jar))
# urllib2.install_opener(opener)


# Working with CEDA Archive data
**Create list with dates**

In [23]:
# Build one datetime per calendar month, ready to be formatted into the download URLs

# Period covered: 1 January 2000 through 31 December 2019
year = 2000
begin_year = datetime(year, 1, 1)
end_year = datetime(2019, 12, 31)

# Walk forward one month at a time, collecting the first day of every month
dates_lst = []
next_year = begin_year  # despite the name, this cursor advances by one month per step
while not next_year > end_year:
    dates_lst.append(next_year)
    next_year += relativedelta(months=1)


**Working with the date list to request the data**


In [24]:
# Request the HadUK-Grid 5 km daily rainfall files, one NetCDF file per calendar month.
# Examples of the URLs being built:
#   https://dap.ceda.ac.uk/badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.0.2.1/5km/rainfall/day/v20200731/rainfall_hadukgrid_uk_5km_day_20000101-20000131.nc
#   https://dap.ceda.ac.uk/badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.0.2.1/5km/rainfall/day/v20200731/rainfall_hadukgrid_uk_5km_day_20191201-20191231.nc
url_p1 = 'https://dap.ceda.ac.uk/badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.0.2.1/5km/rainfall/day/v20200731/'
url_p2 = 'rainfall_hadukgrid_uk_5km_day_'

for fecha in dates_lst:
    # The last day of the month closes the date range embedded in the file name
    lst_day = calendar.monthrange(fecha.year, fecha.month)[1]

    # <prefix><first day as YYYYMMDD>-<YYYYMM><last day>.nc ; the remote file and
    # the local copy share the same name
    FILENAME = '{}{:%Y%m%d}-{:%Y%m}{}.nc'.format(url_p2, fecha, fecha, lst_day)
    url = url_p1 + FILENAME
    print(url)

    # Destination on the local drive, then run the request
    SAVEFILE = Path(FILEPATH / FILENAME)
    data_download(url, SAVEFILE)

    # NOTE(review): this break stops the loop after the first month, so only one
    # file is ever requested -- presumably a testing leftover; confirm before removing.
    break

print(
    '--------------',
    '--------------',
    'Check files here: {}'.format(FILEPATH),
    '--------------',
    '--------------',
    'This is the URL to check',
    sep='\n',
)

https://dap.ceda.ac.uk/badc/ukmo-hadobs/data/insitu/MOHC/HadOBS/HadUK-Grid/v1.0.2.1/5km/rainfall/day/v20200731/rainfall_hadukgrid_uk_5km_day_20000101-20000131.nc
requests.get() returned an error code: 500
--------------
--------------
Check files here: /mnt/d/MRes_dataset/search_data/haduk_cedac_uk
--------------
--------------
This is the URL to check


# Working with CEH datastore
**Create list of years** (*The daily data is stored by year*)

In [15]:
# Build the list of years (as four-digit strings) used to name the yearly files.
#
# `datetime` in this notebook is the class brought in by
# `from datetime import datetime`, so the bounds are built with datetime(...);
# calling datetime.date(2000, 1, 1) under that import raises a TypeError.
begin_year = datetime(2000, 1, 1)
end_year = datetime(2017, 12, 31)

# One entry per year, both ends inclusive
year_lst = [str(yr) for yr in range(begin_year.year, end_year.year + 1)]
print(year_lst)

['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017']


**Working with the date list to request the data**

In [18]:
# Request the CEH-GEAR daily rainfall files, which are stored as one NetCDF file per year
url_p1 = 'https://catalogue.ceh.ac.uk/datastore/eidchub/ee9ab43d-a4fe-4e73-afd5-cd4fc4c82556/GB/daily/CEH_GEAR_daily_GB_'

for year in year_lst:
    # Remote address of this year's file
    url = f'{url_p1}{year}.nc'
    print(url)

    # Local destination; the file keeps its remote name
    # NOTE(review): FILEPATH is shared with the HadUK-Grid download cell -- confirm it
    # points at the CEH-GEAR folder before running this cell.
    FILENAME = f'CEH_GEAR_daily_GB_{year}.nc'
    SAVEFILE = Path(FILEPATH / FILENAME)

    data_download(url, SAVEFILE)

    # NOTE(review): this break stops the loop after the first year, so only one
    # file is ever requested -- presumably a testing leftover; confirm before removing.
    break

print(
    '--------------',
    '--------------',
    'Check files here: {}'.format(FILEPATH),
    '--------------',
    '--------------',
    'This is the URL to check',
    sep='\n',
)

https://catalogue.ceh.ac.uk/datastore/eidchub/ee9ab43d-a4fe-4e73-afd5-cd4fc4c82556/GB/daily/CEH_GEAR_daily_GB_2000.nc
contents of URL written to: /mnt/d/MRes_dataset/search_data/gear_ceh_uk/CEH_GEAR_daily_GB_2000.nc 
--------------
--------------
Check files here: /mnt/d/MRes_dataset/search_data/gear_ceh_uk
--------------
--------------
This is the URL to check


## Data download check

- The code below will only need to be run after doing the download of the data
- The code checks the files downloaded against the files that are expected based on the years
- If a file has not been downloaded, the code will try to download that file again


In [25]:
# Compare the files expected from dates_lst with the files found under FILEPATH,
# and request again any file that is missing.
#
# NOTE(review): the name pattern, the '.nc4' glob and the URL below are all for
# GPM-IMERG daily files, while the download cells in this notebook fetch '.nc'
# files from CEDA / CEH -- confirm which dataset this check is meant for.

# Expected file names, e.g. '3B-DAY.MS.MRG.3IMERG.20000604-S000000-E235959.V06.nc4.nc4'
part1 = '3B-DAY.MS.MRG.3IMERG.'
part2 = '-S000000-E235959.V06.nc4.nc4'
name_lst = [part1 + f"{date:%Y%m%d}" + part2 for date in dates_lst]

# Names of the files already on disk (searched recursively). `.name` is used so
# the result does not depend on how many directories deep FILEPATH is.
file_lst = [pf.name for pf in FILEPATH.glob('**/*.nc4')]

# name_lst is the reference list: whatever it has that the folder lacks is missing
missing = sorted(set(name_lst) - set(file_lst))
print ('There are {} missing files in the folder \n'.format(len(missing)))

if missing:
    print ('Trying to download the files again')

    url_p1 = 'https://gpm1.gesdisc.eosdis.nasa.gov/opendap/GPM_L3/GPM_3IMERGDF.06/'
    # OPeNDAP subset (variables and index ranges) appended to every request
    url_p3 = '?precipitationCal[0:0][1709:1819][1389:1509],time_bnds[0:0][0:1],time,lon[1709:1819],lat[1389:1509]'

    for file_name in missing:
        # The fifth dot-separated field starts with the date as YYYYMMDD
        date_stamp = file_name.split('.')[4]
        year = date_stamp[:4]
        month = date_stamp[4:6]

        # Remote files are organised as <year>/<month>/<file name>
        url = url_p1 + year + '/' + month + '/' + file_name + url_p3
        print(url)

        # Create file to save data in local drive
        FILENAME = file_name
        SAVEFILE = Path(FILEPATH / FILENAME)

        # data_download is the only download helper defined in this notebook (the
        # previously called data_nasa is not defined anywhere in it).
        # NOTE(review): this server presumably requires authentication, which
        # data_download does not provide -- confirm.
        data_download(url, SAVEFILE)

    print ('--------------')
    print ('--------------')
    print ('Check files here: {}'.format(FILEPATH ))
    print ('--------------')
    print ('--------------')
else:
    print ('--------------')
    print ('--------------')
    print ('All the files have been downloaded!')
    print ('--------------')
    print ('--------------')
    


There are 0 missing files in the folder 

Trying to download the files again
--------------
--------------
All the files have been downloaded!
--------------
--------------
