# Exercise 1: Generate a Map

In [3]:
# Connection settings for the FTP server: host name plus the account used to
# log in (user "anonymous" with an empty password).
server, user, passwd = "opendata.dwd.de", "anonymous", ""

### FTP Directory Definition and Station Description Filename Pattern

In [16]:
# Main path for climate data on the website.
climate_data_dir_website = "/climate_environment/CDC/observations_germany/climate/"

# Sub-path of the historical hourly precipitation data below that main path.
topic_dir = "/hourly/precipitation/historical/"

# File-name ending shared by all station description files.
station_desc_file_pattern = "_Beschreibung_Stationen.txt"

# Complete remote directory as on the website. Both parts carry a slash at the
# joint, so the resulting path contains "//" at that position.
website_path = "".join([climate_data_dir_website, topic_dir])

### Local Directories

In [17]:
# Local directory for the downloaded data: the fixed local DWD folder followed
# by the same sub-path that is used on the website.
local_ts_dir = "{}{}".format("../data/original/DWD/", topic_dir)

# The station description file is stored in that same directory.
local_station_file_path = local_ts_dir

In [18]:
# Show the local directory that will receive the station description file.
print(local_station_file_path)

../data/original/DWD//hourly/precipitation/historical/


In [19]:
# Create the local directory trees before anything is downloaded.
import os

# exist_ok=True: no error is raised when the directory is already present.
os.makedirs(name=local_station_file_path, exist_ok=True)
os.makedirs(name=local_ts_dir, exist_ok=True)

### FTP Connect

In [20]:
# Open the FTP session and log in with the configured account.
import ftplib

ftp = ftplib.FTP(host=server)
result = ftp.login(user, passwd)

# Print the reply the server sent to the login request.
print(result)

230 Login successful.


In [21]:
# Checking whether the connection is still open (not having reached a timeout yet)
# Changing into "." asks the server for a reply without moving anywhere;
# the reply is kept in `ret`.
ret = ftp.cwd(".")

### FTP Grab File Function

In [22]:
def grabFile(ftpfilename, localfilename, ftp_conn=None):
    """Download one file from the FTP server into a local file.

    Parameters
    ----------
    ftpfilename : str
        Path of the file on the server; sent as ``"RETR " + ftpfilename``.
    localfilename : str
        Local path that is opened for binary writing (an existing file is
        overwritten).
    ftp_conn : object, optional
        Connection to use. Any object offering ``cwd`` and ``retrbinary``
        works. When omitted, the notebook-level ``ftp`` session is used.

    Returns
    -------
    None

    Notes
    -----
    ``ftplib.error_perm``, ``ftplib.error_temp`` and ``ConnectionAbortedError``
    are reported with a printed message instead of being raised. If one of
    them interrupts a transfer after the local file was opened, the incomplete
    local file is removed so it cannot be mistaken for a finished download.
    Any other exception propagates to the caller.
    """
    import os  # local import keeps the function independent of other cells

    conn = ftp if ftp_conn is None else ftp_conn
    file_opened = False

    try:
        # A dummy action to check the connection and to provoke an exception
        # before the local file is touched.
        conn.cwd(".")

        # The context manager closes the file even when the transfer fails.
        with open(localfilename, 'wb') as localfile:
            file_opened = True
            conn.retrbinary('RETR ' + ftpfilename, localfile.write, 1024)
        return

    except ftplib.error_perm:
        print("FTP ERROR. Operation not permitted. File not found?")

    except ftplib.error_temp:
        print("FTP ERROR. Timeout.")

    except ConnectionAbortedError:
        print("FTP ERROR. Connection aborted.")

    # Only reached after a reported failure: drop the empty or truncated file,
    # but never delete a file this call did not open itself.
    if file_opened and os.path.exists(localfilename):
        os.remove(localfilename)



### Generate Pandas Dataframe from FTP Directory Listing

In [23]:
import pandas as pd
import os

# generate a pandas dataframe from a FTP directory listing. 
def gen_df_from_ftp_dir_listing(ftp, ftpdir):
    """Build a pandas DataFrame from the ``LIST`` output of an FTP directory.

    Parameters
    ----------
    ftp : object
        Open FTP connection; only its ``retrlines`` method is used.
    ftpdir : str
        Remote directory; sent as ``"LIST " + ftpdir``.

    Returns
    -------
    pandas.DataFrame or None
        One row per parsed listing line with the columns ``station_id``,
        ``name``, ``ext``, ``size`` and ``type``. ``station_id`` is the
        integer found in the third ``_``-separated part of a ``.zip`` file
        name and ``-1`` for every other entry. ``None`` is returned (after
        printing a message) when the listing fails or is empty.

    Notes
    -----
    Each line is split on whitespace into nine fields, with the size expected
    in the fifth and the name in the ninth, instead of relying on fixed
    character columns. Lines that do not have this shape are skipped.
    """
    import ftplib  # local import: the function does not depend on other cells

    lines = []
    try:
        # issue the command LIST in the FTP connection
        ftp.retrlines("LIST " + ftpdir, lines.append)
    except ftplib.all_errors:
        # Connection-level failures only; programming errors and
        # KeyboardInterrupt are no longer swallowed.
        print("Error: ftp.retrlines() failed. ftp timeout? Reconnect!")
        return

    if len(lines) == 0:
        print("Error: ftp dir is empty")
        return

    flist = []
    for line in lines:
        # maxsplit=8 keeps a file name containing blanks in one piece.
        fields = line.split(None, 8)
        if len(fields) < 9 or not fields[4].isdigit():
            continue  # not a regular entry line (e.g. a "total ..." summary)

        ftype = fields[0][0:1]
        fsize = int(fields[4])
        fname = fields[8]
        fext = os.path.splitext(fname)[-1]

        station_id = -1
        if fext == ".zip":
            try:
                station_id = int(fname.split("_")[2])
            except (IndexError, ValueError):
                # A zip file whose name carries no station number.
                station_id = -1

        flist.append([station_id, fname, fext, fsize, ftype])

    df_ftp_dir = pd.DataFrame(flist, columns=["station_id", "name", "ext", "size", "type"])
    return df_ftp_dir

In [25]:
# Generate a pandas dataframe from the FTP directory listing
# of the remote directory `website_path`, using the open session `ftp`.
df_ftp_dir = gen_df_from_ftp_dir_listing(ftp, website_path)

In [26]:
# Preview the first ten rows of the directory listing.
df_ftp_dir.head(10)

Unnamed: 0,station_id,name,ext,size,type
0,-1,BESCHREIBUNG_obsgermany_climate_hourly_precipi...,.pdf,71445,-
1,-1,DESCRIPTION_obsgermany_climate_hourly_precipit...,.pdf,69716,-
2,-1,RR_Stundenwerte_Beschreibung_Stationen.txt,.txt,209079,-
3,3,stundenwerte_RR_00003_19950901_20110401_hist.zip,.zip,419265,-
4,20,stundenwerte_RR_00020_20040814_20191231_hist.zip,.zip,407378,-
5,44,stundenwerte_RR_00044_20070401_20191231_hist.zip,.zip,320516,-
6,53,stundenwerte_RR_00053_20051001_20191231_hist.zip,.zip,361931,-
7,71,stundenwerte_RR_00071_20041022_20191231_hist.zip,.zip,402880,-
8,73,stundenwerte_RR_00073_20070401_20191231_hist.zip,.zip,333070,-
9,78,stundenwerte_RR_00078_20041101_20191231_hist.zip,.zip,384729,-


### Download the Station Description File

In [27]:
# Pick the station description file from the directory listing.
# The pattern is a plain piece of a file name, not a regular expression, so it
# is matched literally (regex=False); otherwise its "." would match any character.
# The first matching name is used.
station_fname = df_ftp_dir[df_ftp_dir['name'].str.contains(station_desc_file_pattern, regex=False)]["name"].values[0]
print(station_fname)

RR_Stundenwerte_Beschreibung_Stationen.txt


In [28]:
# Announce the transfer, then download the station description file into the
# local station directory under the same file name.
print("grab file: {}\nfrom ftp dir: {}".format(station_fname, website_path))
grabFile(
    ftpfilename=website_path + station_fname,
    localfilename=local_station_file_path + station_fname,
)

grab file: RR_Stundenwerte_Beschreibung_Stationen.txt
from ftp dir: /climate_environment/CDC/observations_germany/climate//hourly/precipitation/historical/


In [29]:
# extracting column names since they are in German, to convert to English names.
# We have to use codecs because of difficulties with character encoding (German Umlaute)
import codecs

def station_desc_txt_to_csv(txtfile, csvfile):
    """Convert a fixed-width station description file into a CSV file.

    Parameters
    ----------
    txtfile : str
        Path of the station description text file. Its first line holds the
        whitespace-separated (German) column names; the first two lines are
        skipped when the table itself is read.
    csvfile : str
        Path of the CSV file to write, with ``;`` as field separator.

    Returns
    -------
    pandas.DataFrame
        The parsed table with English column names, indexed by the first
        column; ``date_from`` and ``date_to`` are passed to ``parse_dates``.

    Notes
    -----
    Header names without an entry in the translation dictionary are kept
    unchanged instead of raising ``KeyError``.
    """
    # Only the header line is needed here. The context manager closes the
    # file even if reading or decoding fails.
    with codecs.open(txtfile, "r", "utf-8") as file:
        header_line = file.readline()
    colnames_de = header_line.split()

    # German-English dictionary
    translate = \
    {'Stations_id':'station_id',
     'von_datum':'date_from',
     'bis_datum':'date_to',
     'Stationshoehe':'altitude',
     'geoBreite': 'latitude',
     'geoLaenge': 'longitude',
     'Stationsname':'name',
     'Bundesland':'state'}

    # Fall back to the original header for columns the dictionary does not know.
    colnames_en = [translate.get(h, h) for h in colnames_de]

    # Skip the first two rows and set the column names.
    df_temp = pd.read_fwf(txtfile,skiprows=2,names=colnames_en, parse_dates=["date_from","date_to"],index_col = 0)

    # write CSV file with field separator semicolon
    df_temp.to_csv(csvfile, sep = ";")
    return df_temp

In [30]:
# Extracting name of the file
# os.path.splitext separates the extension; element 0 is the part before it.
basename = os.path.splitext(station_fname)[0]
# Bare expression so the notebook displays the value.
basename

'RR_Stundenwerte_Beschreibung_Stationen'

In [31]:
# Convert the downloaded station description text file into a CSV file with
# the same base name in the same directory, and keep the parsed table.
df_stations = station_desc_txt_to_csv(
    txtfile=local_station_file_path + station_fname,
    csvfile=local_station_file_path + basename + ".csv",
)

# Display the formatted station description table.
df_stations

Unnamed: 0_level_0,date_from,date_to,altitude,latitude,longitude,name,state
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,1995-09-01,2011-04-01,202,50.7827,6.0941,Aachen,Nordrhein-Westfalen
20,2004-08-14,2021-03-28,432,48.9220,9.9129,Abtsgmünd-Untergröningen,Baden-Württemberg
44,2007-04-01,2021-03-28,44,52.9336,8.2370,Großenkneten,Niedersachsen
53,2005-10-01,2021-03-28,60,52.5850,13.5634,Ahrensfelde,Brandenburg
71,2004-10-22,2020-01-01,759,48.2156,8.9784,Albstadt-Badkap,Baden-Württemberg
...,...,...,...,...,...,...,...
15478,2015-02-01,2021-03-28,515,48.2095,10.0654,Dietenheim,Baden-Württemberg
15490,2016-12-01,2021-03-28,298,50.8053,8.3382,Eschenburg-Eibelshausen,Hessen
15512,2016-09-01,2021-03-28,66,53.0859,10.1611,Rehlingen-Ehlbeck,Niedersachsen
15514,2017-11-01,2021-03-28,104,49.3766,8.6177,Heidelberg-Kurpfalzhof,Baden-Württemberg
