# **downloadMeteoData.ipynb**

Author: Zhixian Yang

Email: [yangzhx28@mail2.sysu.edu.cn](mailto:yangzhx28@mail2.sysu.edu.cn) or [yimu01439@gmail.com](mailto:yimu01439@gmail.com)

GitHub: [https://github.com/koar-create](https://github.com/koar-create)

Date created: August 2nd, 2023

Last modified: August 10th, 2023

<br><br>

---

<br><br>

## **Description**
None.

In [9]:
import os, sys, time, glob
import pytz
import requests
import platform
import numpy as np, pandas as pd
from datetime import datetime


def import_station_info(dirname=None):
    """Load station numbers, provinces and station names from the address workbook.

    Parameters
    ----------
    dirname : str, optional
        Directory that contains 'stations.xlsx'. Falls back to the current
        working directory when omitted.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sites, provinces, stations)`` taken from the '区站号', '省份' and
        '站名' columns of the '开放站点' sheet.

    Raises
    ------
    SystemExit
        With exit status 1 when the workbook does not exist.
    """
    if dirname is None:
        dirname = os.getcwd()
    addr_filename = 'stations.xlsx'  # another one is 'China_SURF_Station.xlsx'
    addr_path = os.path.join(dirname, addr_filename)
    if not os.path.exists(addr_path):
        print(f"{addr_filename} does not exist, run Reorganizing_addr_sheet.py first.")
        # non-zero status so that schedulers/shells see the run as failed
        sys.exit(1)
    df_addr = pd.read_excel(addr_path, sheet_name='开放站点')
    sites = df_addr['区站号'].values
    provinces = df_addr['省份'].values
    stations = df_addr['站名'].values
    return sites, provinces, stations

def _latest_record_on_disk(site, data_dirname, tz):
    """Return the newest end timestamp among a site's saved CSV files, or None if there are none."""
    # Files are named '<start>-<end>.<site>.csv' (e.g. '23080113-23080212.59488.csv').
    # The leading dot in the pattern keeps station numbers that merely END with
    # `site` from being picked up.
    paths = glob.glob(os.path.join(data_dirname, f"*.{site}.csv"))
    if not paths:
        return None
    end_stamps = [
        datetime.strptime(os.path.basename(path).split('.')[0].split('-')[-1], '%y%m%d%H')
        for path in paths
    ]
    return tz.localize(max(end_stamps))


def synchronize(sites=None, dirname=None):
    """Bring 'final_record_date.csv' in line with the CSV files found in '<dirname>/data'.

    If the record file does not exist it is created with one row per site. The
    date is the newest file end time on disk, or 2023-08-01 00:00 (Asia/Shanghai)
    when the site has no files yet. Otherwise the file is read, and every site
    whose files on disk are newer than the recorded date is updated; the file is
    rewritten only if something changed.

    Parameters
    ----------
    sites : iterable
        Station numbers to check (required).
    dirname : str, optional
        Working directory holding 'final_record_date.csv' and the 'data'
        sub-directory. Falls back to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The record table. 'number' is a regular column when the file was just
        created or rewritten, and the index when no update was needed. Entries
        of 'final record date' that were not touched may still be the strings
        read from the CSV.
    """
    if dirname is None:
        dirname = os.getcwd()
    data_dirname = os.path.join(dirname, 'data')
    record_path = os.path.join(dirname, 'final_record_date.csv')
    date_col = 'final record date'
    china_timezone = pytz.timezone('Asia/Shanghai')
    # pytz zones must be attached with localize(); passing them through
    # `tzinfo=` silently yields the historical LMT offset (+08:06).
    default_date = china_timezone.localize(datetime(2023, 8, 1, 0))

    # create 'final_record_date.csv'
    if not os.path.exists(record_path):
        final_records = []
        for site in sites:
            latest = _latest_record_on_disk(site, data_dirname, china_timezone)
            final_records.append(default_date if latest is None else latest)
        df_update = pd.DataFrame({'number': sites, date_col: final_records})
        df_update.to_csv(record_path, encoding='utf-8', index=False)
        print('Successfully create final_record_date.csv!')
        return df_update

    # read and update 'final_record_date.csv'
    df_update = pd.read_csv(record_path)
    df_update.set_index('number', inplace=True)

    change = False
    for site in sites:
        latest = _latest_record_on_disk(site, data_dirname, china_timezone)

        # a station added after the record file was created: register it
        if site not in df_update.index:
            df_update.loc[site, date_col] = default_date if latest is None else latest
            change = True
            continue
        if latest is None:
            continue

        recorded = df_update.loc[site, date_col]
        if isinstance(recorded, str):
            # values read back from the CSV end with a 6-character UTC offset ('+08:00')
            recorded = china_timezone.localize(datetime.strptime(recorded[:-6], '%Y-%m-%d %H:%M:%S'))
            df_update.loc[site, date_col] = recorded
        if latest > recorded:
            df_update.loc[site, date_col] = latest
            change = True

    if change:
        df_update.reset_index(inplace=True)
        df_update.to_csv(record_path, encoding='utf-8', index=False)
        print('Successfully update final_record_date.csv to the latest state!')
    else:
        print('No updates required for final_record_date.csv.')
    return df_update

def _tidy_station_table(df, site):
    """Prefix a scraped hourly table with the station number and split '时次' into date-part columns."""
    # assign new column, reorder columns, rename visibility, remove '时次'
    df['number'] = site
    df = df.loc[:, ['number'] + list(df.columns[:-1])]
    if '能见度' in df.columns:
        df = df.rename(columns={'能见度': '10分钟平均能见度'})
    if '时次' in df.columns:
        # NOTE(review): the last 6 characters are dropped before parsing —
        # presumably a trailing UTC-offset suffix; confirm against the site's format.
        stamps = [datetime.strptime(text[:-6], '%Y-%m-%d %H:%M') for text in df['时次']]
        df['year'] = [stamp.year for stamp in stamps]
        df['month'] = [stamp.month for stamp in stamps]
        df['day'] = [stamp.day for stamp in stamps]
        df['hour'] = [stamp.hour for stamp in stamps]
        for label in ['number', 'year', 'month', 'day', 'hour']:
            df[label] = df[label].astype(np.int64)
        df = df.drop(columns=['时次'])
        df = df.sort_values(by=['number', 'year', 'month', 'day', 'hour'],
                            ascending=[True, True, True, True, True])
        df = df.reset_index(drop=True)
    return df


def daily_auto_download(df_update=None, interval=12, mode='stable', dirname=None):
    """Download today's hourly table for every station whose data is older than `interval` hours.

    For each station listed in 'stations.xlsx' the page
    ``https://q-weather.info/weather/<site>/today/`` is fetched, its first HTML
    table is tidied and saved as '<start>-<end>.<site>.csv' in '<dirname>/data'.
    Network errors are retried every 10 seconds until the request succeeds; any
    error while parsing or saving aborts the program.

    Parameters
    ----------
    df_update : pandas.DataFrame
        Record table as returned by `synchronize` (required). It is modified in
        place: 'number' becomes the index and string dates are replaced by
        timezone-aware datetimes.
    interval : int or float
        Minimum age, in hours, of a station's last record before it is
        downloaded again.
    mode : str
        Currently unused.
    dirname : str, optional
        Working directory holding 'stations.xlsx' and the 'data' sub-directory.
        Falls back to the current working directory.

    Raises
    ------
    SystemExit
        With exit status 1 when a downloaded page cannot be parsed or saved.
    """
    import io  # local import: only needed to wrap the HTML payload for pandas

    if dirname is None:
        dirname = os.getcwd()
    data_dirname = os.path.join(dirname, 'data')
    os.makedirs(data_dirname, exist_ok=True)
    sites, provinces, stations = import_station_info(dirname=dirname)
    # widths used to pad province/station names with full-width spaces in the log line
    len_p = max((len(i) for i in provinces), default=0)
    len_s = max((len(ii) for ii in stations), default=0)
    elapsed_seconds = []
    china_timezone = pytz.timezone('Asia/Shanghai')
    date_col = 'final record date'
    if 'number' in df_update.columns:
        df_update.set_index('number', inplace=True)

    # request headers do not depend on the station, so build them once
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Connection': 'keep-alive', 'Host': 'q-weather.info', 'Referer': 'https://www.google.com/', 'Sec-Ch-Ua-Platform': "Windows", 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'cross-site', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}

    for site, province, station in zip(sites, provinces, stations):
        url = f"https://q-weather.info/weather/{site}/today/"

        # skip updated data
        datetime_now = datetime.now(china_timezone)
        last_record = df_update.loc[site, date_col]
        if isinstance(last_record, str):
            # values read back from the CSV end with a 6-character UTC offset ('+08:00')
            last_record = china_timezone.localize(datetime.strptime(last_record[:-6], '%Y-%m-%d %H:%M:%S'))
            df_update.loc[site, date_col] = last_record
        elapsed = (datetime_now - last_record).total_seconds()
        elapsed_seconds.append(elapsed)
        if elapsed <= interval * 3600:
            continue

        # Fetch the page, retrying on network/HTTP errors. The `continue` is
        # essential: without it a failed request would fall through to the
        # parsing step with a missing or stale `response`.
        while True:
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"!!!!Error while fetching the webpage: {e}", "\n", f"~~~~[{site}]")
                time.sleep(10)
                continue
            break

        try:
            # pandas >= 2.1 deprecates literal HTML input; a file-like wrapper works everywhere
            df = pd.read_html(io.BytesIO(response.content))[0]
            df = _tidy_station_table(df, site)

            # save sheet as csv file; after sorting, first/last rows are the earliest/latest hours
            first, last = df.iloc[0], df.iloc[-1]
            start_date = datetime(int(first['year']), int(first['month']), int(first['day']), int(first['hour'])).strftime('%y%m%d%H')
            end_date = datetime(int(last['year']), int(last['month']), int(last['day']), int(last['hour'])).strftime('%y%m%d%H')
            filename = f"{start_date}-{end_date}.{site}.csv"
            df.to_csv(os.path.join(data_dirname, filename), index=False, encoding='utf-8')
            print(f"--{province}{chr(12288) * (len_p - len(province))}, {station}{chr(12288) * (len_s - len(station))}. Saved as {filename}")
        except Exception as e:
            print(f"!!!!An error occurred: {e} \n [{site}]")
            sys.exit(1)

        # sleep randomly (about 0.1 s on average) to avoid hammering the server
        sleep_time = max(np.abs(0.1 + 0.1 * np.random.randn(1)[0]), 0)
        time.sleep(sleep_time)

    if elapsed_seconds:
        print(f"The last update was made {(max(elapsed_seconds)/3600):.2f} hours ago.")
    

# Resolve the working directory that holds stations.xlsx, final_record_date.csv and data/.
if platform.system() == 'Windows':
    dirname = os.path.join("D:\\Documents", "A-threads", "less important ones", "thread2308-4_try_to_purchase_chinese_station_api")
else:
    # Linux and every other platform (e.g. macOS) use the current directory;
    # previously `dirname` stayed undefined outside Linux/Windows.
    dirname = os.getcwd()

# 1) load the station list, 2) reconcile the record file with the files on disk,
# 3) download stations older than 8 hours, 4) reconcile again to record the new files.
sites, provinces, stations = import_station_info(dirname=dirname)
df_update = synchronize(sites=sites, dirname=dirname)
daily_auto_download(df_update=df_update, interval=8, mode='stable', dirname=dirname)
_ = synchronize(sites=sites, dirname=dirname)

No updates required for final_record_date.csv.
The last update was made 2.76 hours ago.
No updates required for final_record_date.csv.
