In [6]:
import os
# from sklearn.model_selection import train_test_split
from tqdm import tqdm
tqdm.pandas()
import numpy as np
import pandas as pd

In [7]:
# Load the '#'-separated station registry. The file has no header row, so the
# column names are supplied here; 'data_type' and 'ws_id' are dropped right
# after parsing.
stations_info = (
    pd.read_csv(
        'static/stations_info.txt',
        sep='#',
        header=None,
        names=[
            'city', 'href', 'data_type', 'script_start_date', 'station_id',
            'country', 'ws_id', 'lat', 'lon',
        ],
    )
    .drop(columns=['data_type', 'ws_id'])
)

In [8]:
# Render the loaded station table as this cell's output.
stations_info

Unnamed: 0,city,href,script_start_date,station_id,country,lat,lon
0,Архангельск,https://rp5.ru/Архив_погоды_в_Архангельске,2022-12-09,22550,Россия,64.533333,40.566667
1,Санкт-Петербург,https://rp5.ru/Архив_погоды_в_Санкт-Петербурге,2022-12-09,26063,Россия,59.9691,30.2841
2,Петрозаводск,https://rp5.ru/Архив_погоды_в_Петрозаводске,2022-12-09,22820,Россия,61.783333,34.35
3,Курск,"https://rp5.ru/Архив_погоды_в_Курске,_Поповке",2022-12-09,34009,Россия,51.7667,36.1667
4,Воронеж,https://rp5.ru/Архив_погоды_в_Воронеже,2022-12-09,34123,Россия,51.666667,39.183333
5,Тамбов,https://rp5.ru/Архив_погоды_в_Тамбове,2022-12-09,27947,Россия,52.733333,41.466667
6,Брянск,https://rp5.ru/Архив_погоды_в_Брянске_(аэропорт),2022-12-09,26898,Россия,53.2142,34.1764


In [9]:
# Persist the station table with ';' as the field separator, i.e. the same
# format the final export cell of this notebook uses for this very path.
# Writing it comma-separated here left the file in a different format
# depending on how far the notebook had been executed.
stations_info.to_csv('static/station_info.csv', sep=';')

In [10]:
def to_min_interval(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Re-sample ``df`` onto a regular time grid, newest timestamp first.

    The grid step is the smallest positive gap between consecutive distinct
    timestamps found in ``col_name``. The grid spans from the latest timestamp
    down to the earliest one. Grid points that have no matching row in ``df``
    appear in the result with missing values in every other column; rows whose
    timestamp does not fall on the grid are not part of the result.

    Args:
        df: Frame holding a datetime column ``col_name``. Rows may be in any
            order. When several rows share a timestamp only the first one is
            kept, because ``reindex`` cannot work on duplicate labels.
        col_name: Name of the datetime column that defines the grid.

    Returns:
        A new frame with ``col_name`` as the first column, sorted descending.
        If ``df`` holds fewer than two distinct timestamps no step can be
        inferred and the (de-duplicated) rows are returned unchanged.
    """
    indexed = df.set_index(col_name)
    # reindex() raises on duplicate labels, so keep one row per timestamp.
    indexed = indexed[~indexed.index.duplicated(keep='first')]

    # Sorting before diff() makes the gaps independent of the input row order;
    # NaT and non-positive gaps are excluded so the step is strictly positive.
    gaps = indexed.index.to_series().sort_values().diff()
    positive_gaps = gaps[gaps > pd.Timedelta(0)]
    if positive_gaps.empty:
        return indexed.reset_index()

    grid = pd.date_range(
        start=indexed.index.min(),
        end=indexed.index.max(),
        freq=positive_gaps.min(),
        name=col_name,
    )[::-1]  # newest first
    return indexed.reindex(grid, fill_value=None).reset_index()

In [13]:
def prepare_data(station_info: pd.DataFrame,
                 data_dir: str = 'static',
                 hour_step: int = 3) -> pd.DataFrame:
    """Build one weather table out of the per-station archive files.

    For every value in ``station_info['station_id']`` all files inside
    ``<data_dir>/<station_id>/`` are read as ';'-separated CSV (the first six
    lines of each file are skipped). The first column is renamed to
    ``local_time`` and parsed with the format ``'%d.%m.%Y %H:%M'``. Only rows
    whose hour is a multiple of ``hour_step`` are kept, the rows are put on a
    regular time grid by ``to_min_interval`` and a ``station_id`` column is
    added. The number of rows added by the gridding step is printed per
    station and in total.

    Args:
        station_info: Frame with a ``station_id`` column.
        data_dir: Folder that contains one sub-folder per station id.
        hour_step: Keep only observations whose hour is divisible by this.

    Returns:
        The concatenation of all per-station frames with a fresh integer index.

    Raises:
        ValueError: If ``hour_step`` is smaller than 1 or a station folder
            contains no files.
    """
    if hour_step < 1:
        raise ValueError(f'hour_step must be >= 1, got {hour_step}')

    # Rows inserted by to_min_interval, keyed by station id.
    sample_spaces = {}
    full_data = []
    # Iterate the column itself: iterrows() does not preserve dtypes, so an
    # integer id could be upcast and corrupt the folder name.
    for station_id in station_info['station_id']:
        station_dir = f'{data_dir}/{station_id}'
        station_data = []
        # os.listdir() returns entries in arbitrary order; sort for a
        # deterministic read order.
        for file in sorted(os.listdir(station_dir)):
            frame = pd.read_csv(f'{station_dir}/{file}', delimiter=';', skiprows=6, index_col=False)
            frame = frame.rename(columns={frame.columns[0]: 'local_time'})
            frame['local_time'] = pd.to_datetime(frame['local_time'], format='%d.%m.%Y %H:%M')
            station_data.append(frame)

        if not station_data:
            raise ValueError(f'No data files found in {station_dir}')

        full_station_data = pd.concat(station_data, ignore_index=True).sort_values('local_time', ascending=False)

        # Keep only observations taken at hours divisible by hour_step.
        on_cadence = full_station_data['local_time'].dt.hour % hour_step == 0
        full_station_data = full_station_data[on_cadence]

        before_extends = full_station_data.shape[0]
        full_station_data = to_min_interval(full_station_data, 'local_time')
        after_extends = full_station_data.shape[0]
        sample_spaces[station_id] = after_extends - before_extends

        full_station_data['station_id'] = station_id

        full_data.append(full_station_data)

    weather_dataset = pd.concat(full_data, ignore_index=True)

    print(f'Total spaces in samples: {sum(sample_spaces.values())}')
    print('Details:')
    print(sample_spaces)

    return weather_dataset

In [14]:
# Build the combined weather table for every station listed in stations_info;
# prepare_data also prints how many rows were inserted per station.
dataset = prepare_data(station_info=stations_info)

Total spaces in samples: 1866
Details:
{22550: 197, 26063: 209, 22820: 198, 34009: 250, 34123: 423, 27947: 268, 26898: 321}


In [15]:
# Export both tables as ';'-separated CSV files. No `index` argument is
# passed, so the DataFrame index is written as well.
stations_info.to_csv(path_or_buf='static/station_info.csv', sep=';')
dataset.to_csv(path_or_buf='static/weather_dataset.csv', sep=';')