# 01A Data Cleansing Weather

## Imports

In [167]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
%matplotlib inline

## Read Data

In [168]:
# Load the raw weather file; this frame is only used to build the CodeSum indicators.
weather_csv_path = '../west_nile/west_nile/input/weather.csv'
weather_df = pd.read_csv(weather_csv_path)

## One Hot Encode CodeSum Data

In [169]:
# Collect every distinct whitespace-separated code that occurs in any CodeSum
# entry, mapping each one to a zero-filled column as long as the frame.
row_count = weather_df.shape[0]
codes = {}
for codesum_value in weather_df.CodeSum.unique():
    for token in codesum_value.split():
        codes[token] = [0] * row_count

# Append the zero-filled indicator columns, then switch on the codes present
# in each row. The enumerate() position is used as the row label, so this
# relies on the default 0..n-1 index that read_csv produced.
weather_df = weather_df.join(pd.DataFrame(codes))
for row_label, codesum_value in enumerate(weather_df.CodeSum):
    for token in codesum_value.split():
        weather_df.at[row_label, token] = 1

# Collapse to one row per Date: a code counts as present on a date if any
# row for that date has it set.
code_columns = list(codes.keys())
codes_df = weather_df.groupby('Date', as_index=True)[code_columns].max()

## Export codes_df

In [170]:
# Persist the per-date code indicators as a pickle for later use.
with open('../Cleansed_Data/codes_df.pkl', mode='wb') as codes_pickle:
    pickle.dump(codes_df, codes_pickle)

----------

# New Weather Processing

## Import weather data

In [171]:
# Re-read the raw file into a fresh frame; `weather_df` above has already been
# widened with the CodeSum indicator columns.
raw_weather_path = '../west_nile/west_nile/input/weather.csv'
weather = pd.read_csv(raw_weather_path)

## Impute missing Tavg Values and Convert column to Int

In [172]:
# Row labels where Tavg holds the 'M' placeholder instead of a number.
missing_Tavg = weather.index[weather['Tavg'] == 'M']

# Fill those rows with the rounded midpoint of that row's Tmax and Tmin.
tavg_midpoint = (weather.loc[missing_Tavg, 'Tmax'] + weather.loc[missing_Tavg, 'Tmin']) / 2
weather.loc[missing_Tavg, 'Tavg'] = tavg_midpoint.round()

# With the placeholders gone the whole column can be cast to int.
weather['Tavg'] = weather['Tavg'].astype(int)

## Impute missing AvgSpeed and Convert column to float

In [173]:
# AvgSpeed is read as text because missing readings are stored as 'M'.
# Turn the placeholder into NaN and parse the rest as numbers *before* taking
# the median: calling .median() on a column of numeric strings raises on
# current pandas, and the chained `weather.AvgSpeed.replace(..., inplace=True)`
# form does not reliably write back to `weather`.
avg_speed_numeric = pd.to_numeric(weather['AvgSpeed'].replace('M', np.nan))

# Median of the observed (non-missing) wind speeds, used as the fill value.
median_wind = avg_speed_numeric.median()

# Assign the imputed float column back explicitly.
weather['AvgSpeed'] = avg_speed_numeric.fillna(median_wind).astype(float)

## Create Temperature Columns in Celsius

In [174]:
def celsius(x):
    """Convert a single Fahrenheit temperature to Celsius.

    Parameters
    ----------
    x : number
        Temperature in degrees Fahrenheit.

    Returns
    -------
    float
        The same temperature in degrees Celsius.
    """
    degrees_above_freezing = x - 32
    return float(degrees_above_freezing * 5.0 / 9.0)

In [175]:
# Add a Celsius twin for each Fahrenheit temperature column (same order as
# before: average, minimum, maximum).
for fahrenheit_col, celsius_col in (('Tavg', 'TavgC'), ('Tmin', 'TminC'), ('Tmax', 'TmaxC')):
    weather[celsius_col] = weather[fahrenheit_col].apply(celsius)

## Calculate Relative Humidity

In [176]:
# Relative humidity (percent) from dew point and average temperature, both
# converted to Celsius first. Each term is exp(17.625 * T / (243.04 + T));
# the constants match the Magnus-type approximation, and the result is the
# ratio of the dew-point term to the air-temperature term.
dew_point_c = weather.DewPoint.apply(celsius)
avg_temp_c = weather.Tavg.apply(celsius)

dew_point_term = np.exp((17.625 * dew_point_c) / (243.04 + dew_point_c))
avg_temp_term = np.exp((17.625 * avg_temp_c) / (243.04 + avg_temp_c))

weather['relative_humidity'] = 100 * (dew_point_term / avg_temp_term)

## Convert DewPoint to Celsius

In [177]:
# Celsius version of the dew point, computed element by element with celsius().
weather['DewPointC'] = weather['DewPoint'].map(celsius)

## Add rain boolean

- TS THUNDERSTORM
- GR HAIL
- RA RAIN
- DZ DRIZZLE
- SH SHOWER

In [178]:
def filter_codesum(x, codes=None):
    """Return 1 if the CodeSum string mentions any of the given codes, else 0.

    The previous version returned from inside the loop on the first
    iteration, so only the first code in the list was ever checked. Every
    code is now tested.

    Parameters
    ----------
    x : str
        A CodeSum entry, e.g. ``'TSRA BR'``. Matching is by substring, so a
        combined token such as ``'TSRA'`` matches both ``'TS'`` and ``'RA'``.
    codes : iterable of str, optional
        Codes to look for. When omitted, the notebook-level ``rain`` list is
        used (looked up at call time, so it may be defined in a later cell
        as long as it exists before the function is called).

    Returns
    -------
    int
        1 if at least one code occurs in ``x``, otherwise 0.
    """
    if codes is None:
        codes = rain
    return int(any(code in x for code in codes))

In [179]:
# CodeSum codes treated as "rain" by filter_codesum.
rain = [
    'RA',  # rain
    'DZ',  # drizzle
    'SH',  # shower
    'TS',  # thunderstorm
    'GR',  # hail
]

In [180]:
# Flag each observation using filter_codesum applied to its CodeSum text.
rain_flags = weather['CodeSum'].apply(filter_codesum)
weather['rain'] = rain_flags

## Add Ideal Environment boolean

In [181]:
# Flag rows whose conditions fall inside the "ideal environment" window:
# average temperature 10-35 C, relative humidity 40-95 %, wind speed <= 6.
# All bounds are inclusive.
env_temp_ok = (weather['TavgC'] >= 10) & (weather['TavgC'] <= 35)
env_humidity_ok = (weather['relative_humidity'] >= 40) & (weather['relative_humidity'] <= 95)
env_wind_ok = weather['AvgSpeed'] <= 6

# Assign the whole column in one step (1.0 where all conditions hold, 0.0
# otherwise). This replaces the "set 1 via .loc, then fillna(0, inplace=True)
# on an attribute" pattern, which leaves NaNs behind when the in-place call
# acts on a copy and fails outright if the column was never created.
weather['ideal_environment'] = (env_temp_ok & env_humidity_ok & env_wind_ok).astype(float)

## Add Ideal Feeding Conditions boolean

In [182]:
# Flag rows whose conditions fall inside the "ideal feeding" window:
# average temperature 15-30 C, relative humidity 8-72 %, wind speed <= 6.
# All bounds are inclusive.
# NOTE(review): the lower humidity bound of 8 is far below the 40 used for
# ideal_environment — confirm it is intentional and not a typo.
feed_temp_ok = (weather['TavgC'] >= 15) & (weather['TavgC'] <= 30)
feed_humidity_ok = (weather['relative_humidity'] >= 8) & (weather['relative_humidity'] <= 72)
feed_wind_ok = weather['AvgSpeed'] <= 6

# Assign the whole column in one step (1.0 where all conditions hold, 0.0
# otherwise). This replaces the "set 1 via .loc, then fillna(0, inplace=True)
# on an attribute" pattern, which leaves NaNs behind when the in-place call
# acts on a copy and fails outright if the column was never created.
weather['ideal_feeding'] = (feed_temp_ok & feed_humidity_ok & feed_wind_ok).astype(float)

## Average values across stations (one row per Date, dropping the Station dimension)

In [183]:
# Columns to average over all rows that share a Date.
station_avg_columns = ['TavgC', 'TmaxC', 'TminC', 'DewPointC', 'relative_humidity',
                       'rain', 'ideal_environment', 'ideal_feeding']

# Select the columns with a list: indexing a GroupBy with a bare
# comma-separated tuple of names is rejected by current pandas.
# The result is indexed by Date.
new_weather_df = weather.groupby('Date')[station_avg_columns].mean()

## Calculate Photoperiod

In [184]:
# Keep only Station 1 rows and the three columns needed for the photoperiod.
# (Presumably only station 1 carries usable sunrise/sunset values — verify
# against the raw file.)
# A single .loc selection plus .copy() makes `sun` an independent frame, so
# the column assignments in the following cells do not act on a slice of
# `weather` (avoids SettingWithCopyWarning).
sun = weather.loc[weather['Station'] == 1, ['Date', 'Sunset', 'Sunrise']].copy()

In [185]:
# Sunset and Sunrise are HHMM clock values. Subtracting them directly as
# integers is wrong whenever the sunset minutes are smaller than the sunrise
# minutes, because the subtraction borrows 100 instead of 60
# (e.g. 1917 - 0421 = 1496, i.e. "14 h 96 min" instead of 14 h 56 min).
# Convert both to minutes since midnight, subtract, then re-encode the
# duration as HHMM so `sundiff` keeps its hours*100 + minutes layout.
sunset_hhmm = sun['Sunset'].astype(int)
sunrise_hhmm = sun['Sunrise'].astype(int)

sunset_minutes = (sunset_hhmm // 100) * 60 + sunset_hhmm % 100
sunrise_minutes = (sunrise_hhmm // 100) * 60 + sunrise_hhmm % 100
daylight_minutes = sunset_minutes - sunrise_minutes

sun['sundiff'] = (daylight_minutes // 60) * 100 + daylight_minutes % 60

In [186]:
# Photoperiod in decimal hours from the HHMM-style `sundiff` value:
# the hundreds part is taken as hours and the last two digits as minutes.
# Integer arithmetic is used instead of slicing the string form, which
# misreads any value below 1000 (e.g. 930 would be split as "93" and "30").
# For four-digit values the result is identical to the string-slicing version.
sun['photoperiod'] = sun['sundiff'] // 100 + (sun['sundiff'] % 100) / 60

In [187]:
# Use the Date values as the row index (the Date column itself is kept for now).
sun.index = sun['Date']

In [188]:
# Drop the helper columns so only the remaining column(s), i.e. photoperiod,
# stay on the Date-indexed frame.
helper_columns = ['Date', 'Sunset', 'Sunrise', 'sundiff']
sun = sun.drop(labels=helper_columns, axis='columns')

## Join Photoperiod with new_weather_df

In [189]:
# Copy the index values into a regular 'Date' column.
new_weather_df = new_weather_df.assign(Date=new_weather_df.index)

In [190]:
# Attach the photoperiod by joining on the index: both frames are indexed by
# Date (new_weather_df from groupby('Date'), sun from `sun.index = sun.Date`).
# Joining with on='Date' is ambiguous here because 'Date' is both the index
# name and a column of new_weather_df, which current pandas rejects with a
# ValueError. The index-on-index join gives the same rows.
new_weather_df = new_weather_df.join(sun)

In [191]:
# Remove the 'Date' column again; the dates remain available as the index.
new_weather_df = new_weather_df.drop(labels=['Date'], axis='columns')

## Export new_weather_df

In [192]:
# Persist the per-date weather features as a pickle for later use.
with open('../Cleansed_Data/new_weather_df.pkl', mode='wb') as weather_pickle:
    pickle.dump(new_weather_df, weather_pickle)