# Dependencies

In [1]:
import pandas as pd 

# Processing scheme 
1. Set timestamp as index
2. Remove gust, phenomenon (other), pressure (station), and clouds columns
3. Process Visibility 
4. Process Wind Speed 
5. Process Wind Direction
6. Process Phenomenon <br>
    a) Sometimes there are multiple phenomena going on at once; thus, a special method for one-hot encoding phenomena is needed 
7. Remove rows with missing values 

#### 1. Set timestamp as index 
#### 2. Remove gust, phenomenon (other), pressure (station), and clouds columns 

In [13]:
PATH = '../TF/data/delhi_weather_data' # Replace with path to weather csv 

def read_weather_csv(path):
    """
    Load the weather CSV found at `path`, keeping only the columns used by the
    later processing steps.

    Parameters
    ----------
    path : str or path-like
        Location of the weather CSV file.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the 'Timestamp' column and containing the other
        columns named in COLUMNS; any further column in the file is not loaded.
    """
    INDEX_COL = 'Timestamp'
    COLUMNS = ['Timestamp', 'Temperature', 'Pressure', 'Humidity', 'Wind Direction', 'Wind Speed', 
           'Phenomenon','Visibility', 'Dewpoint',]
    # Read from the `path` argument (not the module-level PATH) so that the
    # function honours whatever file the caller asked for.
    return pd.read_csv(
        path, 
        index_col=INDEX_COL, 
        usecols=COLUMNS,
    )

#### 3. Process visibility 
The value "less than 0.05" is replaced with 0.0.

In [10]:
def process_visibility(srs):
    """
    Convert the visibility column to numeric values.

    The text 'less than 0.05' is replaced with 0.0, then the series is passed
    to pd.to_numeric.

    Parameters
    ----------
    srs : pandas.Series
        Visibility values.

    Returns
    -------
    pandas.Series
        The numeric series when every value can be parsed; otherwise the
        series with only the 'less than 0.05' replacement applied.
    """
    cleaned = srs.replace('less than 0.05', 0.0)
    try:
        return pd.to_numeric(cleaned)
    except (ValueError, TypeError):
        # Explicit equivalent of the deprecated errors='ignore' option:
        # when parsing fails, hand back the input instead of raising.
        return cleaned

#### 4. Process wind speed
a) Wind speed values are changed from strings to floats in meters per second, e.g. the value "Fresh breeze (10 m/s)" is converted to 10.0 <br>
b) The value "Calm" is replaced with 0.0. 

In [9]:
def process_wind_speed(srs):
    """
    Convert wind speed descriptions to numbers.

    The value 'Calm' is first rewritten as 'Calm (0 m/s)' so that it goes
    through the same pattern as every other value. Text of the form
    '<words> (<digits> m/s)' is then reduced to its digits and the result is
    converted with pd.to_numeric.

    Parameters
    ----------
    srs : pandas.Series
        Wind speed strings.

    Returns
    -------
    pandas.Series
        Numeric wind speeds.

    Raises
    ------
    ValueError
        Raised by pd.to_numeric when a value is left non-numeric after the
        substitution (i.e. it did not match the pattern).
    """
    # Raw string: the regex is unchanged, but \w, \s, \( and \d are no longer
    # invalid escape sequences inside an ordinary string literal.
    pattern = r'[\w\s]+\((?P<speed>\d+)\sm/s\)'
    with_calm = srs.replace('Calm', 'Calm (0 m/s)')
    digits_only = with_calm.str.replace(pattern, lambda m: m.group('speed'), regex=True)
    return pd.to_numeric(digits_only)

#### 5. Process Wind Direction
a) Wind direction values are changed to abbreviated forms e.g. "west-northwest" to "WNW", "Calm" to "C", and "Variable" to "V". This is not really necessary due to the next encoding step; however, one might find it helpful in isolation. <br>
b) Wind direction values are one-hot encoded. 

In [8]:
def merge_with_dummies(df, dummies, column=None):
    """
    General use function for merging a dataframe with a new dummies dataframe made from its column. 

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend.
    dummies : pandas.DataFrame
        Indicator columns, matched to `df` through the index.
    column : str, optional
        When given, every dummy column is prefixed with '<column>: ' and
        `column` itself is dropped from the result.

    Returns
    -------
    pandas.DataFrame
        `df` with the dummy columns appended.
    """
    if column is not None:
        dummies = dummies.add_prefix(prefix=f'{column}: ')

    if df.index.equals(dummies.index) and not df.index.is_unique:
        # An index-on-index merge pairs each row with every row carrying the
        # same label, so repeated labels would multiply rows. The two indexes
        # are identical here, so the frames can be placed side by side.
        merged = pd.concat([df, dummies], axis=1)
    else:
        merged = df.merge(dummies, right_index=True, left_index=True)

    if column is None:
        return merged
    return merged.drop(columns=column)

In [7]:
def abbreviate_wind_direction(srs):
    """
    Shorten wind direction text to the upper-cased initials of its keywords.

    Every occurrence of north, south, east, west, Calm or Variable is
    collected in order of appearance; the first letter of each is upper-cased
    and the letters are joined (e.g. 'west-northwest' gives 'WNW'). Entries
    for which the keyword search does not produce a list are passed through
    unchanged.
    """
    def _initials(found):
        # Non-list results (e.g. missing values) are returned as they are.
        if not isinstance(found, list):
            return found
        return ''.join(word[0].upper() for word in found)

    keywords = srs.str.findall('(north|south|east|west|Calm|Variable)')
    return keywords.apply(_initials)


def process_wind_direction(srs, sparse=True):
    """
    One-hot encode wind directions.

    The values are first shortened with abbreviate_wind_direction and the
    result is passed to pd.get_dummies; `sparse` is forwarded to get_dummies.
    """
    abbreviated = abbreviate_wind_direction(srs=srs)
    return pd.get_dummies(abbreviated, sparse=sparse)

#### 6. Process phenomenon
Sometimes there are multiple phenomena going on at once; thus, a special one-hot encoding function, which allows for more than one column to be nonzero, is needed for processing the Phenomenon column. 

In [6]:
def process_phenomenon(srs):
    """
    One-hot encode the phenomenon column, allowing several phenomena per row.

    Values are lower-cased and then split on ', '; each distinct piece gets
    its own indicator column.
    """
    lowered = srs.str.lower()
    return lowered.str.get_dummies(sep=', ')

### Complete processing function 
(with optional missing values removal)

In [11]:
def process_weather(path, dropna=True):
    """
    Run every processing step on the weather CSV at `path`.

    Steps, in order: read the CSV, convert 'Visibility' and 'Wind Speed',
    replace 'Wind Direction' and 'Phenomenon' with their one-hot columns, and,
    when `dropna` is True, drop every row that still holds a missing value.
    """
    weather = read_weather_csv(path=path)
    weather['Visibility'] = process_visibility(weather['Visibility'])
    weather['Wind Speed'] = process_wind_speed(weather['Wind Speed'])

    # Each categorical column is swapped for its dummy columns, in this order.
    encoders = (
        ('Wind Direction', process_wind_direction),
        ('Phenomenon', process_phenomenon),
    )
    for name, encode in encoders:
        weather = merge_with_dummies(
            df=weather,
            dummies=encode(weather[name]),
            column=name,
        )

    if dropna is True:
        weather = weather.dropna(how='any')
    return weather

## Example

In [16]:
# Run the complete processing function on the CSV located at PATH.
df = process_weather(PATH)
# Bare expression on the last line of the cell: the notebook renders the processed frame.
df

Unnamed: 0_level_0,Temperature,Pressure,Humidity,Wind Speed,Visibility,Dewpoint,Wind Direction: C,Wind Direction: E,Wind Direction: ENE,Wind Direction: ESE,...,Phenomenon: light rain,Phenomenon: light thunderstorm,Phenomenon: mist,Phenomenon: rain,Phenomenon: shower(s),Phenomenon: small hail and/or snow pellets,Phenomenon: smoke,Phenomenon: thunderstorm,Phenomenon: thunderstorm in the vicinity,Phenomenon: widespread dust
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-08-08 00:00:00,27.0,750.8,89.0,1.0,2.5,25.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2018-08-08 00:30:00,27.0,750.8,94.0,2.0,2.5,26.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2018-08-08 01:30:00,27.0,750.1,94.0,2.0,2.5,26.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2018-08-08 02:00:00,27.0,750.1,94.0,2.0,2.5,26.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2018-08-08 02:30:00,27.0,750.1,94.0,2.0,2.5,26.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-30 22:00:00,14.0,762.0,67.0,2.0,2.2,8.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-30 22:30:00,14.0,762.0,77.0,2.0,2.2,10.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2020-01-30 23:00:00,14.0,762.0,82.0,2.0,2.2,11.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2020-01-30 23:30:00,14.0,762.0,82.0,2.0,2.2,11.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
