In [13]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common import action_chains, keys
import time
from os import listdir
from os.path import isfile, join
import os
import shutil
import gzip
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [14]:
def create_station_info_dict(line):
    """Turn one whitespace-separated station record into a dict.

    Fields are picked by position after splitting ``line`` on whitespace:
    0 -> WMO id (int), 1 -> locality (str), 2 -> latitude (float),
    4 -> longitude (float), 6 -> height (int). Positions 3 and 5 are ignored.

    Raises:
        IndexError: if the line has fewer than seven fields.
        ValueError: if a numeric field cannot be converted.
    """
    fields = line.split()

    record = {}
    record['wmo_id'] = int(fields[0])
    record['locality'] = fields[1]
    record['latitude'] = float(fields[2])
    record['longitude'] = float(fields[4])
    record['height'] = int(fields[6])
    return record

In [26]:
# Read the raw station list; the file is cp1251-encoded text with one record per line.
with open('/mnt/HARD/MinMax94/data/data_all/CSV/wmo_russia_list.txt', encoding='cp1251') as f:
    content = list(f)

# Parse every line into a record, then index the resulting table by WMO id.
stations_info_list = list(map(create_station_info_dict, content))
stations_info = pd.DataFrame(stations_info_list).set_index('wmo_id')
stations_info.head()

Unnamed: 0_level_0,height,latitude,locality,longitude
wmo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20046,21,80.6,"Им.Э.Т.Кренкеля,ГМО",58.0
20069,10,79.5,Остров_Визе,76.98
20087,7,79.55,Голомянный,90.62
20107,73,78.07,Баренцбург,14.25
20289,9,77.2,Русский,96.4


In [87]:
def extract_gzip(file_path):
    """Decompress a gzip archive next to itself as ``<stem>.csv`` and delete the archive.

    The output name is the part of the archive's file name before its first
    dot, plus ``.csv`` (``20046.01.01.2012.csv.gz`` becomes ``20046.csv``).
    It is written into the same directory as the archive.

    Args:
        file_path: path to the gzip-compressed file.

    Returns:
        None.
    """
    # Read the whole archive before creating the output, so a corrupt archive
    # never leaves a partial .csv behind.
    with gzip.open(file_path, 'rb') as archive:
        data = archive.read()

    # Rebuild only the final path component; a plain str.replace on the whole
    # path would also rewrite directory names that contain the file name.
    directory, filename = os.path.split(file_path)
    extracted_file_path = os.path.join(directory, filename.split('.')[0] + '.csv')

    with open(extracted_file_path, 'wb') as extracted:
        extracted.write(data)

    # Removing the archive is best-effort: the extracted file is already in place.
    try:
        os.remove(file_path)
    except OSError:
        pass

def load_station_rp5(wmo_id, driver, start, end, target_directory, download_timeout=300):
    """Download and extract the rp5.ru archive of one station through the browser.

    Opens an rp5.ru weather-archive page, types ``wmo_id`` into the station
    field, picks the suggested station, fills in the date range on the
    download tab, requests a GZ archive, waits for the browser to save it into
    ``target_directory`` and extracts it with ``extract_gzip``.

    Args:
        wmo_id: station identifier typed into the ``wmo_id`` field.
        driver: Selenium WebDriver used to drive the page.
        start: start date string typed into the first date field.
        end: end date string typed into the second date field.
        target_directory: directory the browser saves downloads into.
        download_timeout: seconds to wait for the downloaded file to appear;
            ``None`` waits indefinitely.

    Returns:
        None. Problems are reported with a printed message.
    """
    driver.get("https://rp5.ru/%D0%90%D1%80%D1%85%D0%B8%D0%B2_%D0%BF%D0%BE%D0%B3%D0%BE%D0%B4%D1%8B_%D0%B2_%D0%9C%D0%B0%D0%BB%D0%BE%D1%8F%D1%80%D0%BE%D1%81%D0%BB%D0%B0%D0%B2%D1%86%D0%B5")
    action = action_chains.ActionChains(driver)

    wmo_id_field = driver.find_element(By.ID, 'wmo_id')
    wmo_id_field.clear()
    wmo_id_field.send_keys(str(wmo_id))

    # The station is chosen by clicking the highlighted entry of the
    # autocomplete dropdown and confirming with ENTER.
    xpath_of_dropdown = "//ul[@style='max-height: 180px; overflow: auto;']//li[@class='ac_even ac_over']"
    try:
        # wait until the dropdown entry becomes clickable
        wmo_id_drop_down_list = WebDriverWait(driver, 6).until(
            EC.element_to_be_clickable((By.XPATH, xpath_of_dropdown)))
        wmo_id_drop_down_list.click()
        action.send_keys(keys.Keys.ENTER)
        action.perform()
    except Exception:
        # `Exception` (not a bare except) so Ctrl+C still interrupts a long batch.
        print('Warning: No station with such ID in rp5.ru')
        return None

    download_archive_button = driver.find_element(By.ID, 'tabSynopDLoad')
    download_archive_button.click()

    start_date_field = driver.find_element(By.ID, 'calender_dload')
    start_date_field.clear()
    start_date_field.send_keys(start)

    end_date_field = driver.find_element(By.ID, 'calender_dload2')
    end_date_field.clear()
    end_date_field.send_keys(end)

    # NOTE(review): 'format2' / 'coding2' are presumably the CSV format and the
    # text-encoding radio buttons -- confirm against the page markup.
    driver.execute_script("document.getElementById('format2').click();")
    driver.execute_script("document.getElementById('coding2').click();")

    create_gz_button = driver.find_element(By.XPATH, "//*[contains(text(), 'Выбрать в файл GZ (архив)')]")
    create_gz_button.click()

    download_ref = WebDriverWait(driver, 15).until(
        EC.element_to_be_clickable((By.LINK_TEXT, "Скачать")))
    download_ref.click()
    # The saved file is expected to carry the last path component of the link.
    filename = download_ref.get_attribute('href').split('/')[-1]
    downloaded_path = join(target_directory, filename)

    # Poll until the file shows up, but give up after `download_timeout`
    # seconds so a failed download cannot hang the whole batch.
    deadline = None if download_timeout is None else time.monotonic() + download_timeout
    while not os.path.exists(downloaded_path):
        if deadline is not None and time.monotonic() >= deadline:
            break
        time.sleep(0.1)

    # Once the archive is on disk, replace it with the extracted .csv.
    if os.path.isfile(downloaded_path):
        extract_gzip(downloaded_path)
    else:
        print("Data from WMO station #{0} can't be downloaded".format(wmo_id))

    driver.stop_client()
    

    
def load_stations_list_rp5(wmo_id_list, start='01.01.2012', end='01.01.2017', 
              target_directory=None, verbose=True):
    """Download rp5.ru archives for many stations, skipping those already on disk.

    A station counts as already downloaded when ``target_directory`` contains a
    ``.csv`` file whose name, up to the first dot, is that station's integer id.

    Args:
        wmo_id_list: iterable of station ids to download.
        start: start date string passed on to ``load_station_rp5``.
        end: end date string passed on to ``load_station_rp5``.
        target_directory: directory Chrome downloads into and where extracted
            ``.csv`` files are looked up. Required, despite the ``None`` default.
        verbose: print progress for every station when true.

    Raises:
        ValueError: if ``target_directory`` is not given.
    """
    # Without a directory neither the "already downloaded" scan nor the wait
    # for the downloaded file can work, so fail before launching a browser.
    if not target_directory:
        raise ValueError('target_directory must be set to the download directory')

    # Collect the ids of stations that already have an extracted file.
    # Other .csv files whose prefix is not an integer are simply ignored.
    rp5_loaded_stations_ids = set()
    for file_name in listdir(target_directory):
        if not (file_name.endswith('.csv') and isfile(join(target_directory, file_name))):
            continue
        try:
            rp5_loaded_stations_ids.add(int(file_name.split('.')[0]))
        except ValueError:
            continue

    # Point Chrome's download directory at target_directory.
    chromeOptions = webdriver.ChromeOptions()
    prefs = {"download.default_directory": target_directory}
    chromeOptions.add_experimental_option("prefs", prefs)

    # NOTE(review): newer Selenium releases spell this keyword `options=`;
    # switch if the installed version rejects `chrome_options`.
    driver = webdriver.Chrome(chrome_options=chromeOptions)

    try:
        for wmo_id in wmo_id_list:
            # download the station only if its file is not in the directory yet
            if wmo_id not in rp5_loaded_stations_ids:
                if verbose:
                    print('Downloading', wmo_id)

                load_station_rp5(wmo_id, driver, start, end, target_directory)
                # Recorded even when the download failed, so a repeated id in
                # wmo_id_list is not attempted twice in the same run.
                rp5_loaded_stations_ids.add(wmo_id)
            else:
                if verbose:
                    print('In directory', wmo_id)

            if verbose:
                print('----------------------------')
    finally:
        # Always close the browser, even if a download raised.
        driver.quit()

def parse_coordinates_data(data_content_string):
    """Extract latitude, longitude and height from a station description string.

    The string must contain at least two ``<degrees>° <minutes>'`` pairs, where
    minutes may carry a decimal fraction. The first pair is taken as latitude
    and the second as longitude, both converted to decimal degrees. The string
    must end with ``<number> м``; that number is taken as the height.

    Args:
        data_content_string: text holding the coordinates and the height.

    Returns:
        dict with float ``latitude`` and ``longitude`` and int ``height``.

    Raises:
        IndexError: if fewer than two coordinate pairs or no trailing height
            are found.
    """
    # One pass over the string; the groups capture degrees and minutes directly.
    # The optional fraction is written as a real group: a character class such
    # as `[.[0-9]*` would also accept stray '[' characters and extra digits.
    coordinate_pairs = re.findall(r"([0-9]{1,3})° ([0-9]{1,2}(?:\.[0-9]*)?)'",
                                  data_content_string)

    # NOTE(review): no hemisphere marker is read, so both values come out
    # non-negative -- confirm the source never has southern/western stations.
    lat_grades, lat_minutes = coordinate_pairs[0]
    latitude = float(lat_grades) + float(lat_minutes) / 60

    lon_grades, lon_minutes = coordinate_pairs[1]
    longitude = float(lon_grades) + float(lon_minutes) / 60

    height = int(re.findall("[0-9]{1,4} м$", data_content_string)[0].split()[0])

    return {"latitude": latitude, "longitude": longitude, "height": height}

def load_one_station_info_rp5(wmo_id, driver):
    """Scrape coordinates, locality and timezone of one station from rp5.ru.

    Opens an rp5.ru weather-archive page, types ``wmo_id`` into the station
    field, picks the suggested station, opens the download tab and reads the
    station metadata from the page.

    Args:
        wmo_id: station identifier typed into the ``wmo_id`` field.
        driver: Selenium WebDriver used to drive the page.

    Returns:
        dict with ``latitude``, ``longitude``, ``height``, ``wmo_id``,
        ``locality`` and ``timezone``, or None when the station does not show
        up in the dropdown.
    """
    driver.get("https://rp5.ru/%D0%90%D1%80%D1%85%D0%B8%D0%B2_%D0%BF%D0%BE%D0%B3%D0%BE%D0%B4%D1%8B_%D0%B2_%D0%9C%D0%B0%D0%BB%D0%BE%D1%8F%D1%80%D0%BE%D1%81%D0%BB%D0%B0%D0%B2%D1%86%D0%B5")
    action = action_chains.ActionChains(driver)

    wmo_id_field = driver.find_element(By.ID, 'wmo_id')
    wmo_id_field.clear()
    wmo_id_field.send_keys(str(wmo_id))

    # The station is chosen by clicking the highlighted autocomplete entry.
    xpath_of_dropdown = "//ul[@style='max-height: 180px; overflow: auto;']//li[@class='ac_even ac_over']"

    try:
        # wait until the dropdown entry becomes clickable
        wmo_id_drop_down_list = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, xpath_of_dropdown)))
        wmo_id_drop_down_list.click()
    except Exception:
        # `Exception` (not a bare except) so Ctrl+C still interrupts a long batch.
        print('Warning: No station with ID #{0} in rp5.ru'.format(wmo_id))
        return None
    else:
        print('Station #{0} is downloaded'.format(wmo_id))

    download_archive_button = driver.find_element(By.ID, 'tabSynopDLoad')
    download_archive_button.click()

    # coordinates (lat, lon, height) are parsed from the 'data-content'
    # attribute of the 'mapa' element
    data_content = driver.find_element(By.ID, 'mapa')
    station_info = parse_coordinates_data(data_content.get_attribute('data-content'))
    station_info['wmo_id'] = wmo_id

    # locality (for instance: в Перми)
    station_info['locality'] = data_content.get_attribute('data-town-alt')

    # timezone is read from the value of the 'f_time_zone_add' input
    # NOTE(review): presumably an offset from UTC in whole hours -- confirm.
    timezone_element_xpath = "//td[@class='download']//input[@name='f_time_zone_add']"
    timezone_element = driver.find_element(By.XPATH, timezone_element_xpath)
    station_info['timezone'] = int(timezone_element.get_attribute('value'))
    driver.stop_client()
    return station_info
    
def load_station_info_list_rp5(wmo_id_list):
    """Collect rp5.ru metadata for every station id using one Chrome session.

    Args:
        wmo_id_list: iterable of station ids.

    Returns:
        list with one entry per id, in input order: whatever
        ``load_one_station_info_rp5`` returned for that id.
    """
    driver = webdriver.Chrome()
    try:
        return [load_one_station_info_rp5(wmo_id, driver) for wmo_id in wmo_id_list]
    finally:
        # Close the browser even when a lookup raises, so no Chrome process is left behind.
        driver.quit()

## Loading information about WMO stations from rp5 (longitude, latitude, height, timezone)

In [None]:
# Look up every station id present in the index of the station table.
wmo_id_list = stations_info.index
station_info = load_station_info_list_rp5(wmo_id_list=wmo_id_list)

In [110]:
# Column order of the exported table.
cols = ['wmo_id', 'locality', 'latitude', 'longitude', 'height', 'timezone']

# Drop falsy entries (stations for which no info was returned), round the
# coordinates to two decimals and keep the columns in the order above.
station_info_df = (
    pd.DataFrame(list(filter(None, station_info)))
    .assign(
        latitude=lambda frame: frame['latitude'].round(2),
        longitude=lambda frame: frame['longitude'].round(2),
    )
    .loc[:, cols]
)
station_info_df.to_csv('/mnt/HARD/MinMax94/data/data_all/CSV/stations_def_rp5.csv', index=False)

## Loading archive data for WMO stations from rp5

In [None]:
# Directory the archives are downloaded into and extracted in.
target_directory = "/mnt/HARD/MinMax94/data/data_all/CSV/RP5"
# Only the stations from position 949 of the index onward are requested.
wmo_id_list = stations_info.index[949:]

load_stations_list_rp5(wmo_id_list, target_directory=target_directory, verbose=True)