# Easily use dataset

A `make_dataset.py` file ships alongside the data. The module makes it easier to build a time-series dataframe that can be used to train models.  
Its function takes the raw data and organizes all of it under a single, shared date_time index.

In [1]:
import os
# Folder of the Kaggle input dataset (path is relative to the notebook's working directory).
directory = '../input/hourly-weather-surface-brazil-southeast-region/'
# Switch the working directory to the dataset folder. This must happen before the
# import below; presumably make_dataset.py lives in that folder and is found through
# the current directory -- TODO confirm. Relative file names used later
# (e.g. 'southeast.csv') are resolved against this directory as well.
os.chdir(directory)
import make_dataset as mk

## Make dataset Function

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Column names of the raw table in Portuguese (lower-cased, plus station metadata columns).
# NOTE(review): not referenced by any function visible in this notebook.
renamed_columns = ['data','hora','precipitacao total,horario (mm)','pressao atmosferica ao nivel da estacao (mb)','pressao atmosferica max. na hora ant. (aut) (mb)','pressao atmosferica min. na hora ant. (aut) (mb)','radiation (kj/m2)','temperatura do ar - bulbo seco (°c)','temperatura do ponto de orvalho (°c)','temperatura maxima na hora ant. (aut) (°c)','temperatura minima na hora ant. (aut) (°c)','temperatura orvalho max. na hora ant. (aut) (°c)','temperatura orvalho min. na hora ant. (aut) (°c)','umidade rel. max. na hora ant. (aut) (%)','umidade rel. min. na hora ant. (aut) (%)','umidade relativa do ar, horaria (%)','vento direcao horaria (gr) (° (gr))','vento rajada maxima (m/s)','vento velocidade horaria (m/s)','region','state','station','station_code','latitude','longitude','height']
# English counterpart of the list above (appears to be parallel, entry by entry).
# NOTE(review): two entries are still (partly) Portuguese:
# 'pressao atmosferica ao nivel da estacao (mb)' and 'wind rajada maxima (m/s)'.
# Not referenced by any function visible in this notebook.
renamed_columns_en = ['date','hour','total precipitation (mm)','pressao atmosferica ao nivel da estacao (mb)','atmospheric pressure max. in the previous hour (mb)','atmospheric pressure min. in the previous hour (mb)','radiation (kj/m2)','air temperature - dry bulb (°c)','dew point temperature (°c)','max. temperature in the previous hour (°c)','min. temperature in the previous hour (°c)','dew temperature max. in the previous hour (°c)','dew temperature min. in the previous hour (°c)','relative humidity max. in the previous hour (%)','relative humidity min. in the previous hour (%)','air relative humidity (%)','wind direction (° (gr))','wind rajada maxima (m/s)','wind speed (m/s)','region','state','station','station_code','latitude','longitude','height']
# Short column names that process_raw assigns positionally to the raw table
# (after its 'index' column has been dropped), so the order here must match
# the column order of the raw data.
abbreviation = ['date','hour','prcp', 'stp', 'smax', 'smin','gbrd','temp','dewp','tmax','tmin','dmax','dmin','hmax','hmin','hmdy','wdct', 'gust', 'wdsp', 'regi','prov','wsnm','inme','lat','lon','elvt']

def process_raw(df, pt_br = False):
    """Turn the raw multi-station table into one wide, time-indexed table.

    Steps:
      1. drop the raw 'index' column;
      2. rename the remaining columns positionally with ``abbreviation``;
      3. combine the 'date' and 'hour' columns into a 'date_time' column;
      4. drop the columns that are not used for modelling;
      5. build one sub-table per station code with ``by_code`` and place
         them side by side (aligned on 'date_time'), so every station
         keeps its own set of columns.

    The input dataframe is left untouched: every step works on a new
    object instead of modifying ``df`` in place.

    Args:
        df: raw dataframe; it must contain an 'index' column followed by
            exactly as many columns as there are names in ``abbreviation``.
        pt_br: currently unused; kept so existing callers keep working.

    Returns:
        A dataframe indexed by 'date_time' whose columns are named
        '<station_code>_<abbreviation>'.

    Raises:
        ValueError: if ``df`` contains no station at all (nothing to
            concatenate).
    """
    # drop() without inplace returns a new frame, so the caller's data
    # (possibly a filtered slice of a bigger frame) is never modified.
    df = df.drop(columns=['index'])
    df.columns = abbreviation
    df['date_time'] = pd.to_datetime(df['date'] + ' ' + df['hour'])
    columns_to_drop = ['date', 'hour', 'regi', 'prov', 'wsnm', 'lat', 'lon', 'elvt', 'gbrd']
    df = df.drop(columns=columns_to_drop)

    # One frame per station, joined column-wise in a single concat call
    # (stations keep the order in which they first appear in the data).
    station_frames = [by_code(df, code) for code in df['inme'].unique()]
    return pd.concat(station_frames, axis=1)

def by_code(df, station_code):
    """Extract the rows of one station and prefix its columns with the station code.

    Args:
        df: dataframe containing at least an 'inme' column (station code)
            and a 'date_time' column.
        station_code: value of 'inme' selecting the station to keep.

    Returns:
        A new dataframe indexed by 'date_time', without the 'inme' and
        'date_time' columns, whose remaining columns are renamed to
        '<station_code>_<original name>'. ``df`` itself is not modified.
    """
    # Select 'date_time' by name rather than by position, so the result does
    # not depend on 'date_time' being the last column of ``df``.
    df_station = df.loc[df['inme'] == station_code].set_index('date_time')
    df_station = df_station.drop(columns=['inme'])
    df_station.columns = [f"{station_code}_{column}" for column in df_station.columns]
    return df_station

def clean_na(df, na_value = -9999):
    """Replace the missing-value marker and forward-fill the gaps.

    Args:
        df: dataframe to clean; it needs at least two columns because the
            second column decides where the usable data starts.
        na_value: marker that stands for a missing measurement.

    Returns:
        A new dataframe in which ``na_value`` has been replaced by NaN,
        every row before the first valid value of the second column has
        been removed, and remaining gaps are filled with the previous
        valid value of the same column. Leading gaps in other columns stay
        NaN because there is nothing to carry forward. ``df`` itself is
        not modified.
    """
    # np.nan is the spelling that exists in every NumPy version
    # (the np.NaN alias was removed in NumPy 2.0).
    df = df.replace(to_replace=na_value, value=np.nan)
    first_valid = df.iloc[:, 1].first_valid_index()
    # .loc slices by index label; a plain df[label:] would be interpreted
    # positionally when the label is an integer. If the column holds no
    # valid value at all, first_valid is None and nothing is trimmed.
    df = df.loc[first_valid:]
    # ffill() replaces the deprecated fillna(method='ffill').
    return df.ffill()

def make_dataset(stations, start_date, df):
    """Build the cleaned, time-indexed dataset for the requested stations.

    Args:
        stations: station codes to keep (list, tuple or set). A single
            code given as a plain string is treated as a one-element list.
        start_date: earliest date to keep (inclusive); it is compared
            directly with the values of the raw 'Data' column.
        df: raw dataframe containing at least the 'Data' and
            'station_code' columns plus everything ``process_raw`` needs.

    Returns:
        The dataframe produced by ``process_raw`` followed by ``clean_na``,
        with the 'date_time' index moved back into a regular column.
    """
    if isinstance(stations, str):
        stations = [stations]
    # Vectorised membership test instead of a Python-level apply/lambda.
    selected = (df['Data'] >= start_date) & df['station_code'].isin(stations)
    # Explicit copy: the later steps must never write into a slice of the
    # caller's dataframe.
    df = df.loc[selected].copy()
    df = process_raw(df)
    df = clean_na(df)
    return df.reset_index()

In [3]:
# Station codes to keep (matched against the raw 'station_code' column).
stations = ['A612']
# Earliest date to keep, inclusive (compared with the raw 'Data' column).
start_date = '2006-11-01'
# Build the dataset with the make_dataset function defined in this notebook.
# 'southeast.csv' is a relative path, so it is read from the current working directory.
df2 = make_dataset(stations, start_date, pd.read_csv('southeast.csv'))
# Disabled alternative: the same call through the imported make_dataset module.
# df1 = mk.make_dataset(stations, start_date, pd.read_csv('southeast.csv'))
# df1=df2
# NOTE(review): df_raw is not assigned anywhere else in the visible cells;
# presumably this clears a leftover reference -- TODO confirm.
df_raw = None

In [4]:
df2

Unnamed: 0,date_time,A612_prcp,A612_stp,A612_smax,A612_smin,A612_temp,A612_dewp,A612_tmax,A612_tmin,A612_dmax,A612_dmin,A612_hmax,A612_hmin,A612_hmdy,A612_wdct,A612_gust,A612_wdsp
0,2006-11-01 00:00:00,0.0,1013.4,1013.4,1012.6,24.8,21.9,24.9,24.7,21.9,21.5,84.0,81.0,84.0,3.0,6.7,2.1
1,2006-11-01 01:00:00,0.0,1013.9,1013.9,1013.4,24.8,21.7,24.8,24.7,21.9,21.7,84.0,83.0,83.0,11.0,8.0,3.3
2,2006-11-01 02:00:00,0.0,1013.9,1014.0,1013.8,24.6,21.3,24.9,24.6,21.7,21.2,83.0,80.0,82.0,360.0,8.0,2.7
3,2006-11-01 03:00:00,0.0,1013.4,1013.9,1013.4,24.3,21.5,24.6,24.2,21.6,21.3,85.0,82.0,84.0,5.0,6.8,2.8
4,2006-11-01 04:00:00,1.0,1012.8,1013.4,1012.8,23.3,22.0,24.4,23.3,22.0,21.5,92.0,84.0,92.0,344.0,8.2,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127075,2021-04-30 19:00:00,0.0,1020.3,1020.3,1020.1,24.8,16.7,25.9,24.8,17.2,15.7,61.0,54.0,61.0,151.0,8.4,2.1
127076,2021-04-30 20:00:00,0.0,1020.6,1020.6,1020.3,23.6,17.3,24.8,23.5,17.5,16.6,69.0,61.0,68.0,182.0,6.1,1.6
127077,2021-04-30 21:00:00,0.0,1020.7,1020.7,1020.5,22.8,16.8,23.7,22.7,17.3,16.8,70.0,67.0,69.0,188.0,5.8,1.4
127078,2021-04-30 22:00:00,0.0,1021.0,1021.1,1020.7,23.1,16.9,23.1,22.4,17.3,16.8,72.0,68.0,68.0,176.0,5.2,1.4
