# Create, enrich, and clean station metadata from `ghcnd-stations.txt`, `ghcnd-countries.txt`, and `GeoInformation_Country.csv`

In [0]:
import urllib.request
import numpy as np
import pandas as pd

URL_station = "https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt"

# Download the station list and keep every line as right-stripped text.
with urllib.request.urlopen(URL_station) as file:
    lines = [raw.decode('utf-8').rstrip() for raw in file]

# The file is read as fixed-width text: each field is cut out by character
# position, and padding is stripped from every field except the ID.
ID_list = []
lat_list = []
lon_list = []
state_list = []
station_name_list = []
for row in lines:
    ID_list.append(row[:11])
    lat_list.append(row[12:20].lstrip())
    lon_list.append(row[21:30].lstrip())
    state_list.append(row[38:40].lstrip())
    station_name_list.append(row[41:71].strip())

# One row per station; all parsed fields are kept as strings.
metadata_station = pd.DataFrame(
    dict(
        ID=ID_list,
        Latitude=lat_list,
        Longitude=lon_list,
        State=state_list,
        StationName=station_name_list,
    )
)

# An empty state field means "no state": mark it as missing.
metadata_station['State'] = metadata_station['State'].replace('', np.nan)
# The first two characters of the station ID are used as the country join key.
metadata_station['FIPS'] = metadata_station['ID'].str.slice(stop=2)

In [0]:
URL_countries = "https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-countries.txt"

# Download the country list; every line is split once, at its first space,
# into [code, name].
with urllib.request.urlopen(URL_countries) as file:
    lines = [raw.decode('utf-8').rstrip().split(" ", 1) for raw in file]

Country_code = [entry[0] for entry in lines]
Country = [entry[1] for entry in lines]

# Lookup table from country code (column 'FIPS') to country name.
metadata_countries = pd.DataFrame(dict(FIPS=Country_code, Country=Country))

In [0]:
# Load the per-country geographic attributes and discard the 'Country'
# column (presumably to avoid a duplicate 'Country' column once this table
# is merged with the country list).
# NOTE(review): read_csv's default NA parsing turns the literal text "NA"
# into NaN. If this file stores a code such as Namibia's ISO-2 "NA", it would
# end up as 'No Record' after the later fillna - confirm against the CSV and
# consider keep_default_na=False.
geoinfo = pd.read_csv('GeoInformation_Country.csv').drop(columns='Country')

In [0]:
# Attach the country name to every station (inner join on 'FIPS'), then
# left-join the per-country geographic attributes so that stations whose
# code has no geo record are kept with missing values.
middle_metedata = pd.merge(metadata_station, metadata_countries, on='FIPS')
final_metadata = pd.merge(middle_metedata, geoinfo, on='FIPS', how='left')

# Codes whose Region is null after the join, in order of first appearance.
missing_data_fips_list = final_metadata.loc[final_metadata['Region'].isnull(), 'FIPS'].unique()
missing_data_continent_list = ['Asia', 'Atlantic Ocean', 'Oceania', 'Europe', 'Americas', 'Europe', 'Americas']
missing_data_region_list = ['South West Asia', 'South Atlantic Ocean', 'North Pacific Ocean', 'South East Europe', 'North America', 'South East Europe', 'South America']

# The hand-written values above are paired with the codes purely by position,
# and zip() silently stops at the shortest input. If the downloaded data no
# longer yields exactly as many codes as there are hand-written entries, the
# pairing cannot be trusted, so stop instead of mislabelling countries.
if not (len(missing_data_fips_list) == len(missing_data_continent_list) == len(missing_data_region_list)):
    raise ValueError(
        "Expected {} FIPS codes without a Region but found {}: {}. "
        "Update missing_data_continent_list / missing_data_region_list to match.".format(
            len(missing_data_continent_list),
            len(missing_data_fips_list),
            list(missing_data_fips_list),
        )
    )

new_dict = {
    fips: [continent, region]
    for fips, continent, region in zip(
        missing_data_fips_list, missing_data_continent_list, missing_data_region_list
    )
}

# Fill in Continent and Region for every station of each affected code.
for fips, (continent, region) in new_dict.items():
    rows = final_metadata['FIPS'] == fips
    final_metadata.loc[rows, 'Continent'] = continent
    final_metadata.loc[rows, 'Region'] = region

# Stations without a state code get an explicit placeholder.
final_metadata.loc[final_metadata['State'].isna(), 'State'] = 'Out of States'
# Manual patch of the capital for code 'AE'.
final_metadata.loc[final_metadata['FIPS'] == 'AE', 'Capital'] = 'Abu Dhabi'
final_metadata.rename({'ISO (2)' : 'ISO-2', 'ISO (3)' : 'ISO-3', 'ISO (No)' : 'ISO-No'}, axis = 1, inplace = True)
# Every value that is still missing becomes the literal string 'No Record'.
final_metadata.fillna('No Record', inplace = True)

# Sanity check: per-column count of remaining missing values.
final_metadata.isna().sum()

ID             0
Latitude       0
Longitude      0
State          0
StationName    0
FIPS           0
Country        0
Continent      0
Region         0
Capital        0
ISO-2          0
ISO-3          0
ISO-No         0
Internet       0
Note           0
dtype: int64

In [0]:
# Write the cleaned station metadata to disk without the DataFrame index.
final_metadata.to_csv(path_or_buf='station_metadata.csv', index=False)