Importing libraries

In [1]:
from urllib import request 
from datetime import datetime
import pandas as pd
import re
import os

### Task 1: Save all data into files {to folder "DATA"}

In [2]:
def request_province_data(proovince_id = 1):
    """Download the NOAA weekly "Mean" time series (1981-2023) for one province of Ukraine.

    Parameters
    ----------
    proovince_id : int or str, default 1
        Value placed in the ``provinceID`` query parameter of the NOAA URL.
        The misspelled parameter name is kept so keyword callers keep working.

    Returns
    -------
    str
        The response text starting at the CSV header line
        ``year,week, SMN,SMT,VCI,TCI, VHI``, with every HTML tag removed.

    Raises
    ------
    ValueError
        If the response does not contain the expected CSV header line.
    """
    csv_header = "year,week, SMN,SMT,VCI,TCI, VHI"
    url = "https://www.star.nesdis.noaa.gov/smcd/emb/vci/VH/get_TS_admin.php?country=UKR&provinceID="+str(proovince_id)+"&year1=1981&year2=2023&type=Mean"
    # The context manager closes the HTTP response even if read()/decode() fails.
    with request.urlopen(url) as response:
        page = response.read().decode('utf-8')
    start_line = page.find(csv_header)
    if start_line == -1:
        # str.find returns -1 on a miss; slicing with it would silently keep
        # only the last character of the page and produce a junk "CSV".
        raise ValueError(
            "CSV header not found in NOAA response for provinceID=" + str(proovince_id)
        )
    # Strip the HTML tags that wrap the CSV payload.
    return re.sub(r'<.*?>', '', page[start_line:])
def write_province_data_into_file(str_csv_data_provinceID, folder_name="DATA"):
    """Write one province's CSV text to a timestamped file inside ``folder_name``.

    Parameters
    ----------
    str_csv_data_provinceID : tuple
        ``(data, province_id)`` where ``data`` is the CSV text to write and
        ``province_id`` is embedded in the file name.
    folder_name : str, default "DATA"
        Existing directory that receives the file.

    Returns
    -------
    None
        When the argument is not a tuple, a usage message is printed and
        nothing is written.

    The file is named
    ``NOAA_data_province_ID=<id>_time=<dd-mm-YYYY_HH_MM_SS>.csv``.
    """
    # Validate before unpacking; otherwise a non-tuple argument fails on the
    # unpack and the usage message below is never shown.
    if not isinstance(str_csv_data_provinceID, tuple):
        print("function write_province_data_into_file()\n       tuple of data and provinceID is required \n        for example (data, 1),\n        where data - str of csv and 1-noaa ukr region number of Cherkasy")
        return
    data, num = str_csv_data_provinceID
    timestamp = datetime.now().strftime("%d-%m-%Y_%H_%M_%S")
    file_name = "NOAA_data_province_ID="+str(num)+"_time="+timestamp+".csv"
    # os.path.join uses the platform separator (a literal "\\" is only a
    # separator on Windows); the with-block guarantees the handle is closed.
    with open(os.path.join(folder_name, file_name), 'w', encoding='utf-8') as csv_file:
        csv_file.write(data)
def save_all_province_datas(folder_name="DATA"):
    """Download the series for province IDs 1 through 27 and save each as a CSV.

    The target folder is created when it does not exist yet. After the last
    file is written, a confirmation and the elapsed wall-clock time are printed.

    Parameters
    ----------
    folder_name : str, default "DATA"
        Directory that receives one CSV file per province.
    """
    started_at = datetime.now()
    os.makedirs(folder_name, exist_ok=True)

    for province_id in range(1, 28):
        csv_text = request_province_data(province_id)
        # The ID is passed as a string, paired with the downloaded text.
        write_province_data_into_file((csv_text, str(province_id)), folder_name=folder_name)

    print("Files are saved!")
    print("Saving process time: ", datetime.now()-started_at)

### Task 2: Load all saved data into a structure of DataFrames (a dict)

In [3]:
def get_id_from_name(str_name):
    """Extract the integer province ID from a saved CSV file name.

    The ID is the run of digits between ``ID=`` and ``_time``, e.g.
    ``NOAA_data_province_ID=12_time=01-01-2023_10_00_00.csv`` gives ``12``.

    Parameters
    ----------
    str_name : str
        File name to parse.

    Returns
    -------
    int or None
        The parsed ID, or ``None`` (after printing a warning) when the name
        does not contain an ``ID=<digits>_time`` fragment.
    """
    # \d+ instead of a greedy .* so the capture cannot run past the first
    # "_time" or pick up non-numeric text that int() would reject.
    match = re.search(r"ID=(\d+)_time", str_name)
    if match is None:
        # Covers both a missing "ID=" and a missing "_time"; the latter used
        # to fail with AttributeError on None.group.
        print("Corrupted name given get_id_from_name() ...\n     in ... get_df_from_files()")
        return None
    return int(match.group(1))

from settings import header

def get_df_from_files(folder_name="DATA"):
    """Load every saved CSV in ``folder_name`` into a dict of DataFrames.

    Parameters
    ----------
    folder_name : str, default "DATA"
        Directory containing the files written by the download step.

    Returns
    -------
    dict
        Maps the province ID parsed from each file name (via
        ``get_id_from_name``) to its DataFrame. If several files carry the
        same ID, the one listed last by ``os.listdir`` wins.

    Only the seven data columns are read, and their names are then replaced
    with ``header`` imported from ``settings``.
    """
    saved_csvs = dict()
    for file_name in os.listdir(folder_name):
        if(".csv" in file_name):
            # os.path.join uses the platform separator; a literal "\\" only
            # resolves to a path inside the folder on Windows.
            csv_path = os.path.join(folder_name, file_name)
            # The ' SMN' / ' VHI' names keep the leading space found in the
            # raw header line.
            df = pd.read_csv(csv_path, usecols = ['year', 'week', ' SMN', 'SMT', 'VCI', 'TCI', ' VHI'])
            df.columns = header
            saved_csvs[get_id_from_name(file_name)] = df
    return(saved_csvs)

executing:

In [4]:
folder_name="DATA"
# Download only when the folder is missing or empty; otherwise reuse the CSVs
# already on disk. os.path.isdir also works when folder_name is a nested path,
# which a membership test against os.listdir() of the working directory cannot.
if(not os.path.isdir(folder_name) or len(os.listdir(folder_name)) == 0):
    save_all_province_datas(folder_name=folder_name)
# Read from the same folder that was just checked/populated.
dfs = get_df_from_files(folder_name)

In [5]:
# Sanity check: column names of the frame under key 1, then every key that was loaded.
print(dfs[1].columns, dfs.keys(), sep="\n")

Index(['Year', 'Week', 'SMN', 'SMT', 'VCI', 'TCI', 'VHI'], dtype='object')
dict_keys([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1, 20, 21, 22, 23, 24, 25, 26, 27, 2, 3, 4, 5, 6, 7, 8, 9])


In [6]:
# Display the frame stored under key 1; at this point the keys are the
# province IDs parsed from the CSV file names.
dfs[1]

Unnamed: 0,Year,Week,SMN,SMT,VCI,TCI,VHI
0,1982,1,0.053,260.31,45.01,39.46,42.23
1,1982,2,0.054,262.29,46.83,31.75,39.29
2,1982,3,0.055,263.82,48.13,27.24,37.68
3,1982,4,0.053,265.33,46.09,23.91,35.00
4,1982,5,0.050,265.66,41.46,26.65,34.06
...,...,...,...,...,...,...,...
2179,2023,48,-1.000,-1.00,-1.00,-1.00,-1.00
2180,2023,49,-1.000,-1.00,-1.00,-1.00,-1.00
2181,2023,50,-1.000,-1.00,-1.00,-1.00,-1.00
2182,2023,51,-1.000,-1.00,-1.00,-1.00,-1.00


### Task 3: Change index names

In [7]:
from settings import change_map

def change_indices(dict_to_change, change_layout):
    """Build a new dict whose keys are the old keys translated through ``change_layout``.

    Parameters
    ----------
    dict_to_change : dict
        Source mapping; it is not modified and its values are reused as-is.
    change_layout : dict
        Maps every key of ``dict_to_change`` to its replacement key.

    Returns
    -------
    dict or None
        The re-keyed dict, or ``None`` when ``change_layout`` is not exactly
        of type ``dict``.

    Raises
    ------
    KeyError
        If a key of ``dict_to_change`` is missing from ``change_layout``.
    """
    if type(change_layout) is dict:
        return {
            change_layout[old_key]: dict_to_change[old_key]
            for old_key in dict_to_change
        }
    return None

In [8]:
# Re-key the loaded frames through change_map (imported from settings).
# NOTE: this rebinds `dfs`, so re-running the cell applies the mapping a second
# time to the already-remapped keys; re-run the loading cell first.
dfs = change_indices(dfs, change_map)

#### Handling missing data:

In [9]:
# Drop every row whose VHI is at or below -0.9 (the raw files mark missing
# weeks with -1, as in the trailing 2023 rows displayed earlier).
# Iterate over the dict's own keys: walking range(begin, len(dfs)+begin) only
# works when the keys happen to be consecutive integers and raises KeyError
# otherwise. list(dfs) snapshots the keys before values are reassigned.
for df_ind in list(dfs):
    dfs[df_ind] = dfs[df_ind].drop(
        dfs[df_ind].loc[dfs[df_ind]["VHI"]<=-0.9].index)

In [10]:
# Display the frame now stored under key 1, i.e. after the keys were remapped
# and the rows with VHI <= -0.9 were dropped.
dfs[1]

Unnamed: 0,Year,Week,SMN,SMT,VCI,TCI,VHI
0,1982,1,0.068,263.59,63.47,28.34,45.90
1,1982,2,0.074,265.78,67.62,23.05,45.34
2,1982,3,0.076,267.19,69.37,20.40,44.88
3,1982,4,0.075,268.57,65.26,17.93,41.60
4,1982,5,0.072,269.24,58.58,20.00,39.29
...,...,...,...,...,...,...,...
2166,2023,35,0.387,298.34,67.57,15.85,41.71
2167,2023,36,0.370,297.67,68.23,9.60,38.91
2168,2023,37,0.354,296.68,70.60,5.31,37.95
2169,2023,38,0.338,295.54,71.58,2.38,36.98


### Task 4: Statistical functions

In [11]:
def VHI_extr(df):
    """Return the Year/VHI columns of ``df`` together with the VHI extremes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``Year`` and ``VHI`` columns.

    Returns
    -------
    tuple
        ``(vhi_df, min_vhi, max_vhi)`` where ``vhi_df`` is ``df[['Year', 'VHI']]``.
        NaN entries are skipped when computing the extremes; for an empty
        frame both extremes are NaN.
    """
    vhi_df = df[['Year','VHI']]
    vhi = vhi_df['VHI']
    # Series.min/max are vectorised and skip NaN; the builtin min()/max()
    # give order-dependent answers as soon as one NaN is present.
    return(vhi_df, vhi.min(), vhi.max())

In [12]:
# Year/VHI slice plus the smallest and largest VHI for the frame under key 1.
vhi_1, minv, maxv = VHI_extr(dfs[1])
print("min_vhi: ", minv, "\n","max_vhi: ", maxv)
# Last expression of the cell: renders the Year/VHI frame.
vhi_1

min_vhi:  11.25 
 max_vhi:  82.64


Unnamed: 0,Year,VHI
0,1982,45.90
1,1982,45.34
2,1982,44.88
3,1982,41.60
4,1982,39.29
...,...,...
2166,2023,41.71
2167,2023,38.91
2168,2023,37.95
2169,2023,36.98


In [13]:
def VHI_drought_above(df, min_VHI, max_VHI=40):
    """Select the rows whose VHI lies strictly between ``min_VHI`` and ``max_VHI``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``Year`` and ``VHI`` columns.
    min_VHI : float
        Exclusive lower bound.
    max_VHI : float, default 40
        Exclusive upper bound; the default is the value that used to be
        hard-coded, so existing two-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The ``Year`` and ``VHI`` columns of the matching rows, re-indexed
        from 0.
    """
    in_range = (min_VHI < df["VHI"]) & (df["VHI"] < max_VHI)
    return df.loc[in_range, ["Year", "VHI"]].reset_index(drop=True)

In [14]:
# Rows of the frame under key 1 with 30 < VHI < 40 (40 is the function's upper bound).
cond_VHI = VHI_drought_above(dfs[1], 30)
cond_VHI

Unnamed: 0,Year,VHI
0,1982,39.29
1,1982,37.65
2,1982,35.03
3,1982,34.46
4,1982,35.04
...,...,...
437,2022,35.79
438,2023,38.91
439,2023,37.95
440,2023,36.98


In [15]:
def moderate_VHI_below(df, max_VHI, min_VHI=60):
    """Select the rows whose VHI lies strictly between ``min_VHI`` and ``max_VHI``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``Year`` and ``VHI`` columns.
    max_VHI : float
        Exclusive upper bound.
    min_VHI : float, default 60
        Exclusive lower bound; the default is the value that used to be
        hard-coded, so existing two-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The ``Year`` and ``VHI`` columns of the matching rows, re-indexed
        from 0.
    """
    in_range = (min_VHI < df["VHI"]) & (df["VHI"] < max_VHI)
    return df.loc[in_range, ["Year", "VHI"]].reset_index(drop=True)

In [16]:
# Rows of the frame under key 1 with 60 < VHI < 80 (60 is the function's lower bound).
moderate_VHI = moderate_VHI_below(dfs[1], 80)
moderate_VHI

Unnamed: 0,Year,VHI
0,1984,63.44
1,1984,68.14
2,1984,71.68
3,1984,73.35
4,1984,74.32
...,...,...
305,2021,64.17
306,2021,62.16
307,2023,60.75
308,2023,61.39
