In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [29]:
def read_data(path="data/weather_data.csv"):
    """Load the weather observations CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, default "data/weather_data.csv"
        Location of the CSV file to read. The file must contain a ``day``
        column, which is parsed as datetimes.

    Returns
    -------
    pandas.DataFrame
        The file contents; empty cells in the CSV come back as missing values.
    """
    return pd.read_csv(path, parse_dates=["day"])

# To conform a DataFrame to a new index, use df.reindex(new_index)

In [30]:
# Load the weather data into `df` and display it.
df = read_data()
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [31]:
# Show the dtype of each column.
df.dtypes

day            datetime64[ns]
temperature           float64
windspeed             float64
event                  object
dtype: object

In [32]:
# Count the missing values in each column of the dataframe
df.isna().sum()

day            0
temperature    4
windspeed      4
event          2
dtype: int64

#### Then split the data into train and test sets, and fill in the missing values of each set separately

#### but for now, we will just learn the methods required

## Handling Missing Values

### 2. Filling rows with missing data - fillna()

* fillna()
* fillna(method='ffill') (forward fill; deprecated in newer pandas in favour of ffill())
* fillna(dict)

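axis=0 or axis='index' --> fill along the index: values propagate down each column, from row to row, 
axis=1 or axis='columns' --> fill along the columns: values propagate across each row, from column to column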

In [36]:
# Fill each column with its own replacement value and assign the result back.
# Calling fillna(inplace=True) on df[col] is chained in-place assignment: it
# raises a FutureWarning in pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is enabled, so the explicit assignment is the reliable form.
df["event"] = df["event"].fillna("missing")
# mean() skips missing values, so this is the mean of the observed windspeeds.
df["windspeed"] = df["windspeed"].fillna(df["windspeed"].mean())
df["temperature"] = df["temperature"].fillna(30.0)

In [37]:
# 'event' is stored with the generic object dtype; convert it to the
# dedicated "string" dtype, then display the frame.

df['event'] = df['event'].astype("string")
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,30.0,9.0,Sunny
2,2017-01-05,28.0,8.4,Snow
3,2017-01-06,30.0,7.0,missing
4,2017-01-07,32.0,8.4,Rain
5,2017-01-08,30.0,8.4,Sunny
6,2017-01-09,30.0,8.4,missing
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [38]:
# Re-count the missing values per column to confirm the fills took effect
df.isna().sum()

day            0
temperature    0
windspeed      0
event          0
dtype: int64

In [40]:
# Reload the data from disk into `df2` and display it.
df2 = read_data()
df2

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [42]:
# fillna(dict): map each column name to the value that replaces its missing
# entries. The windspeed mean is taken from df2 itself rather than from the
# separately modified `df`, so this cell does not depend on another frame's
# state (mean() skips missing values).
nan_map = {
    'windspeed': df2['windspeed'].mean(),
    'temperature': 30.0,
    'event': 'missing'
}
# Assign the filled copy back instead of mutating in place.
df2 = df2.fillna(nan_map)

df2

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,30.0,9.0,Sunny
2,2017-01-05,28.0,8.4,Snow
3,2017-01-06,30.0,7.0,missing
4,2017-01-07,32.0,8.4,Rain
5,2017-01-08,30.0,8.4,Sunny
6,2017-01-09,30.0,8.4,missing
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [43]:
# Reload the data from disk into `df3` and display it.
df3 = read_data()
df3

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [45]:
# Forward fill: each missing cell takes the last valid value above it in the
# same column (the default axis=0 / axis='index' propagates values down the rows).
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method` argument
# is deprecated in recent pandas releases.
df3 = df3.ffill()
df3

# axis='columns' would instead fill across each row, from left to right

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,32.0,9.0,Sunny
2,2017-01-05,28.0,9.0,Snow
3,2017-01-06,28.0,7.0,Snow
4,2017-01-07,32.0,7.0,Rain
5,2017-01-08,32.0,7.0,Sunny
6,2017-01-09,32.0,7.0,Sunny
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [46]:
# Number of rows in the dataframe
df.shape[0]

9