### Q7a)

In [16]:
# Imports
import pandas as pd
from matplotlib.pyplot import xcorr

In [17]:
daily_rainfall_data = pd.read_csv('../../datasets/daily_rainfall_brandywine.csv')
daily_rainfall_data.head(5)

Unnamed: 0,year,month,day,Amount_mm
0,1961.0,1.0,1.0,1.0
1,1961.0,1.0,2.0,14.0
2,1961.0,1.0,3.0,8.6
3,1961.0,1.0,5.0,7.6
4,1961.0,1.0,6.0,2.2


### Check for missing:
* any missing data
* any erroneous data (e.g. negative, or large values which are clearly incorrect)
* any samples which are too small or not matching in size if they are to be compared against other samples


In [18]:
# using isnull() function
daily_rainfall_data.isnull().head(5)

Unnamed: 0,year,month,day,Amount_mm
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False


In [19]:
# Any missing values?
daily_rainfall_data.isnull().values.any()

True

In [20]:
print(daily_rainfall_data.isnull().values.sum())

1336


#### Remove missing values

In [21]:
daily_rainfall_data = daily_rainfall_data.dropna(how='all')
print(daily_rainfall_data.isnull().values.sum())

0


In [22]:
daily_rainfall_data = daily_rainfall_data.astype({"year":"int", "month":"int", "day":"int"})
daily_rainfall_data.head(5)

Unnamed: 0,year,month,day,Amount_mm
0,1961,1,1,1.0
1,1961,1,2,14.0
2,1961,1,3,8.6
3,1961,1,5,7.6
4,1961,1,6,2.2


### Original data copy

In [23]:
daily_rainfall_data_copy = daily_rainfall_data.copy()

### Transform the data to time series format

In [24]:
daily_rainfall_data_copy['dateInt'] = daily_rainfall_data_copy['year'].astype(str) + daily_rainfall_data_copy['month'].astype(str).str.zfill(2)+ daily_rainfall_data_copy['day'].astype(str).str.zfill(2)
daily_rainfall_data_copy['Date'] = pd.to_datetime(daily_rainfall_data_copy['dateInt'], format='%Y%m%d')
daily_rainfall_data_copy.head(5)

Unnamed: 0,year,month,day,Amount_mm,dateInt,Date
0,1961,1,1,1.0,19610101,1961-01-01
1,1961,1,2,14.0,19610102,1961-01-02
2,1961,1,3,8.6,19610103,1961-01-03
3,1961,1,5,7.6,19610105,1961-01-05
4,1961,1,6,2.2,19610106,1961-01-06


### Daily Annual Series

In [25]:
# daily_rainfall_data_copy = daily_rainfall_data_copy.rename(columns={'Date': 'daily'})
daily_rainfall_data_series =  daily_rainfall_data_copy.filter(['Date', 'Amount_mm'], axis=1)
daily_rainfall_data_series.set_index('Date').resample('D').sum()
daily_rainfall_data_series.head(5)

Unnamed: 0,Date,Amount_mm
0,1961-01-01,1.0
1,1961-01-02,14.0
2,1961-01-03,8.6
3,1961-01-05,7.6
4,1961-01-06,2.2


### Monthly Annual Series

In [26]:
monthly_rainfall_data_series =  daily_rainfall_data_copy.filter(['Date', 'Amount_mm'], axis=1)
monthly_rainfall_data_series = monthly_rainfall_data_series.set_index('Date').resample('M').sum()
monthly_rainfall_data_series.head(5)

Unnamed: 0_level_0,Amount_mm
Date,Unnamed: 1_level_1
1961-01-31,79.2
1961-02-28,27.7
1961-03-31,13.9
1961-04-30,55.2
1961-05-31,38.2


### Annual Time Series

In [27]:
annual_rainfall_data_series =  daily_rainfall_data_copy.filter(['Date', 'Amount_mm'], axis=1)
annual_rainfall_data_series = annual_rainfall_data_series.set_index('Date').resample('Y').sum()
annual_rainfall_data_series.head(5)

Unnamed: 0_level_0,Amount_mm
Date,Unnamed: 1_level_1
1961-12-31,615.2
1962-12-31,518.8
1963-12-31,710.8
1964-12-31,458.0
1965-12-31,803.1


### Correlation for specific daily, monthly and yearly series.

In [28]:
#daily_rainfall_data_series = daily_rainfall_data_series.squeeze(axis=0)
daily_rainfall_data_series['Amount_mm'].squeeze().autocorr(lag=5)

0.022291581239872887

In [29]:
#monthly_rainfall_data_series = monthly_rainfall_data_series.squeeze(axis=0)
monthly_rainfall_data_series['Amount_mm'].squeeze().autocorr()

0.04378834871914533

In [30]:
#monthly_rainfall_data_series = monthly_rainfall_data_series.squeeze(axis=0)
annual_rainfall_data_series['Amount_mm'].squeeze().autocorr(lag=7)

0.018757197187676256