# Project 4: West Nile Virus prediction
#### Ivan Tan, Andy Chan, Jeremy Tan, Gan Hong Yee DSI-17 Singapore
#### 15 October 2020

#  4a Engineer Weather data

## Contents
- [Add rolling average weather data for 14, 30, 90 days](#Add-rolling-average-weather-data-for-14,-30,-90-days)
- [Export](#Export)

## Setup
### Libraries

In [2]:
import pandas as pd
import numpy as np

# convert to unix timestamp
import datetime as dt
import datetime
import time

### Load Data

In [2]:
train_data = pd.read_csv('../datasets/cleaned_datasets/combined_train_cleaned.csv')
test_data = pd.read_csv('../datasets/cleaned_datasets/combined_test_cleaned.csv')

## Add rolling average weather data for 14, 30, 90 days

#### Create a dataframe of important weather features

In [3]:
weather_data = pd.read_csv('../datasets/cleaned_datasets/weather_cleaned.csv')
weather_data = weather_data[['Date', 'Station', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'PrecipTotal']]
weather_data['Date'] = pd.to_datetime(weather_data['Date'])
weather_data.set_index(['Date'], inplace=True)
weather_data

Unnamed: 0_level_0,Station,Tmax,Tmin,Tavg,DewPoint,WetBulb,PrecipTotal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2007-05-01,1,83,50,67,51,56.0,0.00
2007-05-01,2,84,52,68,51,57.0,0.00
2007-05-02,1,59,42,51,42,47.0,0.00
2007-05-02,2,60,43,52,42,47.0,0.00
2007-05-03,1,66,46,56,40,48.0,0.00
...,...,...,...,...,...,...,...
2014-10-29,2,49,40,45,34,42.0,0.00
2014-10-30,1,51,32,42,34,40.0,0.00
2014-10-30,2,53,37,45,35,42.0,0.05
2014-10-31,1,47,33,40,25,33.0,0.03


#### Functions to calculate rolling mean of weather features

In [1]:
'''
Description:
helper functions for generating rolling average weather data
'''
# get df of weather by station
def station(n):
    return weather_data.groupby('Station').get_group(n)

# get rolling mean of weather by rolling days and station
def rolling_mean(stn, n_days):
    return station(stn).rolling(n_days).mean()[n_days-1:]

# get the rolled means of a particular day
def get_roll_day(stn:int, n_days:int, date:object):
    return rolling_mean(stn, n_days).loc[date]

# get a rolled value within a particular day
# usage: get_roll_val(2, 30, dt.datetime(2007,6,3), 'Tmax')
# output: 77.03333333333333 
# mean 30 day rolling Tmax of station2 Tmax feature
def get_roll_val(stn:int, n_days:int, date:object, feature:str)->float:
    return get_roll_day(stn, n_days, date)[feature]


def get_formatted_roll_day(stn:int, n_days:int, date:object, should_fake_label:bool=False, fake_label:int=None):
    '''
    Description:
     get a dataframe of rolled values on a particular day, with formatted labels
     should_fake_label: set True if we are to use fallback values
     fake_label: day to fake. ie.fake_label=30>> we can set a 29d rolled Tmax to be Tmax_30d instead of Tmax_29d
    '''
    d = pd.DataFrame(rolling_mean(stn, n_days).drop(columns=['Station']).loc[date]).T
    d.rename(columns={
        'Tmax' : f"Tmax_{n_days if not should_fake_label else fake_label}d",
        'Tmin' : f"Tmin_{n_days if not should_fake_label else fake_label}d",
        'Tavg' : f"Tavg_{n_days if not should_fake_label else fake_label}d",
        'DewPoint' : f"DewPoint_{n_days if not should_fake_label else fake_label}d",
        'WetBulb' : f"WetBulb_{n_days if not should_fake_label else fake_label}d",
        'PrecipTotal' : f"PrecipTotal_{n_days if not should_fake_label else fake_label}d",
    }, inplace=True)
        
    return d

In [2]:
def generate_rolling_avg_df(df, n_days:int, fallback_n_days:int):
    '''
    Description:
    Takes the date of the datum and matches it to the corresponding weather datum, then finds the n rolling mean.
    Returns a combined dataframe with the new features. example: Tmax_30d 
    ---
    Params:

    - df: (pandas.DataFrame) dataframe to populate
    - n_days: (int) number of days to roll
    - fallback_n_days: (int) number of days to roll if there are insufficient prior days

    ---
    Returns:

    - (pandas.DataFrame) df with all the new features
    '''
    # tmp df to add rolled weather data
    roll = pd.DataFrame()
    
    # access each row in the dataset
    for i, r in df.iterrows():
        try:
            # get all the rolled weather data for that day and add it back to the temp df
            roll = pd.concat([
                roll, 
                get_formatted_roll_day(
                    r['Station'],
                    n_days,
                    dt.datetime(int(r['Year']), int(r['Month']), int(r['Day']))
                )
            ])

        except: 
            # if there are insufficient prior days to roll, use a fallback value to compute
            roll = pd.concat([
                roll, 
                get_formatted_roll_day(
                    r['Station'],
                    fallback_n_days,
                    dt.datetime(int(r['Year']), int(r['Month']), int(r['Day'])),
                    should_fake_label=True,
                    fake_label=n_days,
                )
            ])
    # reset the index to match original df
    roll.reset_index(drop=True, inplace=True)
    # combine the original df and the rolled weather data
    return pd.concat([df, roll], axis=1, sort=False)

#### Computing rolling average weather features

In [6]:
%%time
train_data = generate_rolling_avg_df(train_data, n_days=14, fallback_n_days=14)
train_data = generate_rolling_avg_df(train_data, n_days=30, fallback_n_days=29)
train_data = generate_rolling_avg_df(train_data, n_days=90, fallback_n_days=1)

test_data = generate_rolling_avg_df(test_data, n_days=14, fallback_n_days=14)
test_data = generate_rolling_avg_df(test_data, n_days=30, fallback_n_days=29)
test_data = generate_rolling_avg_df(test_data, n_days=90, fallback_n_days=1)

Wall time: 22min 34s


#### Observe resultant dataframes

In [7]:
train_data.tail()

Unnamed: 0.1,Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,...,Tavg_30d,DewPoint_30d,WetBulb_30d,PrecipTotal_30d,Tmax_90d,Tmin_90d,Tavg_90d,DewPoint_90d,WetBulb_90d,PrecipTotal_90d
8605,8605,2013-09-26,"South Stony Island Avenue, Chicago, IL, USA",CULEX PIPIENS,10,S STONY ISLAND AVE,T138,"1000 S STONY ISLAND AVE, Chicago, IL",41.726465,-87.585413,...,70.633333,56.4,61.9,0.084333,81.211111,64.455556,73.111111,58.255556,64.011111,0.062111
8606,8606,2013-09-26,"South Stony Island Avenue, Chicago, IL, USA",CULEX PIPIENS/RESTUANS,10,S STONY ISLAND AVE,T138,"1000 S STONY ISLAND AVE, Chicago, IL",41.726465,-87.585413,...,70.633333,56.4,61.9,0.084333,81.211111,64.455556,73.111111,58.255556,64.011111,0.062111
8607,8607,2013-09-26,"South Vincennes Avenue, Chicago, IL, USA",CULEX PIPIENS/RESTUANS,10,S VINCENNES,T089,"1000 S VINCENNES, Chicago, IL",41.723195,-87.64997,...,70.633333,56.4,61.9,0.084333,81.211111,64.455556,73.111111,58.255556,64.011111,0.062111
8608,8608,2013-09-26,"University of Illinois at Chicago, 1100 South ...",CULEX PIPIENS/RESTUANS,11,S ASHLAND AVE,T090,"1100 S ASHLAND AVE, Chicago, IL",41.868077,-87.666901,...,70.633333,56.4,61.9,0.084333,81.211111,64.455556,73.111111,58.255556,64.011111,0.062111
8609,8609,2013-09-26,"West Garfield Boulevard, Chicago, IL, USA",CULEX PIPIENS/RESTUANS,90,W GARFIELD BLVD,T226,"9000 W GARFIELD BLVD, Chicago, IL",41.793818,-87.654234,...,70.633333,56.4,61.9,0.084333,81.211111,64.455556,73.111111,58.255556,64.011111,0.062111


In [8]:
test_data.tail()

Unnamed: 0.1,Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,...,Tavg_30d,DewPoint_30d,WetBulb_30d,PrecipTotal_30d,Tmax_90d,Tmin_90d,Tavg_90d,DewPoint_90d,WetBulb_90d,PrecipTotal_90d
116288,116288,116289,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX SALINARIUS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,...,65.566667,53.766667,58.733333,0.115667,79.677778,62.788889,71.488889,59.0,63.855556,0.2
116289,116289,116290,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX TERRITANS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,...,65.566667,53.766667,58.733333,0.115667,79.677778,62.788889,71.488889,59.0,63.855556,0.2
116290,116290,116291,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX TARSALIS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,...,65.566667,53.766667,58.733333,0.115667,79.677778,62.788889,71.488889,59.0,63.855556,0.2
116291,116291,116292,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",UNSPECIFIED CULEX,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,...,65.566667,53.766667,58.733333,0.115667,79.677778,62.788889,71.488889,59.0,63.855556,0.2
116292,116292,116293,2014-10-02,"2100 North Cannon Drive, Chicago, IL 60614, USA",CULEX ERRATICUS,21,N CANNON DR,T054C,"2100 N CANNON DR, Chicago, IL",41.925652,...,65.566667,53.766667,58.733333,0.115667,79.677778,62.788889,71.488889,59.0,63.855556,0.2


In [9]:
train_data.isnull().sum()

Unnamed: 0                0
Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
WnvPresent                0
NumMosquitos              0
Day                       0
Month                     0
Year                      0
Station                   0
Tmax                      0
Tmin                      0
Tavg                      0
DewPoint                  0
WetBulb                   0
Heat                      0
Cool                      0
Sunrise                   0
Sunset                    0
CodeSum                   0
PrecipTotal               0
StnPressure               0
SeaLevel                  0
ResultSpeed               0
ResultDir                 0
AvgSpeed                  0
Week                      0
geometry                  0
Tmax_14d            

In [10]:
test_data.isnull().sum()

Unnamed: 0                0
Id                        0
Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
Week                      0
Year                      0
Month                     0
Day                       0
Station                   0
Tmax                      0
Tmin                      0
Tavg                      0
DewPoint                  0
WetBulb                   0
Heat                      0
Cool                      0
Sunrise                   0
Sunset                    0
CodeSum                   0
PrecipTotal               0
StnPressure               0
SeaLevel                  0
ResultSpeed               0
ResultDir                 0
AvgSpeed                  0
Tmax_14d                  0
Tmin_14d                  0
Tavg_14d            

## Export

In [12]:
train_data.to_csv('../datasets/cleaned_datasets//train_w_rolled_weather.csv',index=False)
test_data.to_csv('../datasets/cleaned_datasets/test_w_rolled_weather.csv',index=False)

**Next notebook: [04b_Feature_Engineering](4b_Feature_Engineering.ipynb)**