In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# What is Time Series Data?

Important to consider progression of time
> - Is the temporal information a key focus of the data?  

## Examples

- Stock prices
- Temperature over the year
- Atmospheric changes over the course of decades

## Loading in time series

In [None]:
# Load the raw CSV and preview the first rows
df = pd.read_csv("min_temp.csv")
display(df.head(10))
# DataFrame.info() prints its summary itself and returns None, so call it
# directly; wrapping it in display() would also render a stray "None".
df.info()

## Make data readable as a datetime

In [None]:
# Parse the day/month/two-digit-year strings into real timestamps, then make
# the parsed column the index so the frame is keyed by time.
df = (
    df
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%d/%m/%y'))
    .set_index('Date')
)

In [None]:
# Re-inspect the frame after the date column became the index
display(df.head(10))
# info() prints its own report and returns None; calling it bare avoids
# display() rendering an extra "None" underneath the summary.
df.info()

## Slicing time series data

In [None]:
# Partial-string slice on the DatetimeIndex: every row from the start of 1990
# onward. .loc makes it explicit that rows are selected by index label.
after_1990 = df.loc['1990':]
display(after_1990.head())

## Follow-up: Why should we make the date as the index?

# Resampling

Converting the time series into a particular frequency

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.resample.html
https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling

## Downsampling

- resample at a lower rate
- may lose information
- more computationally efficient

### Example

In [None]:
# Downsample the readings into month-start ('MS') bins, then collapse each
# bin to its mean so there is one averaged value per calendar month.
monthly = df.resample(rule='MS')
month_mean = monthly.mean()

In [None]:
# Preview the first ten monthly averages
month_mean.head(10)

## Upsampling

- resample at a higher rate
- should keep information

### Example

In [None]:
# Upsample to a 12-hour grid. asfreq() only carries over values at timestamps
# that already existed, so every newly created slot is left as NaN.
# Lowercase 'h' is the hour alias; uppercase 'H' is deprecated in pandas >= 2.2.
bidaily = df.resample('12h').asfreq()
bidaily.head(10)

In [None]:
# Upsample to a 12-hour grid and forward-fill: each new slot repeats the most
# recent known value (this is a carry-forward, not an interpolation), so no
# blanks remain.
# Lowercase 'h' is the hour alias; uppercase 'H' is deprecated in pandas >= 2.2.
bidaily = df.resample('12h').ffill()
bidaily.head(10)

In [None]:
# Upsample all the way to hourly, forward-filling each hour with the latest
# known value.
# Lowercase 'h' is the hour alias; uppercase 'H' is deprecated in pandas >= 2.2.
hourly = df.resample('1h').ffill()
hourly.head(30)

# Visualizing Time Series

## Showing Changes Over Time

Can identify patterns and trends with visualizations

In [None]:
# New York Stock Exchange average monthly returns [1961-1966] from curriculum
nyse = pd.read_csv("NYSE_monthly.csv")
col_name = 'Month'
nyse[col_name] = pd.to_datetime(nyse[col_name])
# Rebind the result instead of using inplace=True: same outcome, but the
# frame's lineage stays explicit.
nyse = nyse.set_index(col_name)

display(nyse.head(10))
# info() prints its summary and returns None, so it is called directly rather
# than through display(), which would also render "None".
nyse.info()

### Line Plot

In [None]:
# Line plot of the monthly returns. pyplot is already imported as `plt` in the
# notebook's first cell, so the redundant mid-cell re-import was dropped.
nyse.plot(figsize = (16,6))
plt.show()

### Dot Plot

In [None]:
# Same series drawn as unconnected markers (style "o"). pyplot is already
# imported as `plt` in the notebook's first cell, so no re-import is needed.
nyse.plot(figsize = (16,6), style = "o")
plt.show()

### Question time: Dot vs Line Plots

Note the difference between this and the line plot

When would you want a dot vs a line plot?

### Grouping Plots

What if we wanted to look at year-to-year (e.g., temperature throughout many years)

Couple options to choose from

### Example all separated annual (from curriculum)

In [None]:
# Bucket the rows by calendar year ('A' is the annual, year-end alias).
# NOTE(review): pandas >= 2.2 spells this alias 'YE'; confirm the target version.
annual_grouper = pd.Grouper(freq='A')
year_groups = nyse.groupby(annual_grouper)

In [None]:
# Collect each year's values as a flat array keyed by the year number, then
# build the wide frame (one column per year) in a single constructor call.
yearly_columns = {}
for yr, group in year_groups:
    yearly_columns[yr.year] = group.values.ravel()
nyse_annual = pd.DataFrame(yearly_columns)

# One stacked subplot per year
nyse_annual.plot(subplots=True, legend=True, figsize=(13, 8))
plt.show()

### Example all together annual (from curriculum)

In [None]:
# Overlay every yearly column on one shared set of axes (subplots=False) so
# the years can be compared directly; the legend identifies each year.
nyse_annual.plot(figsize = (15,5), subplots=False, legend=True)
plt.show()

## Showing Distributions

Sometimes the distribution of the values is important.

What are some reasons?

- Checking for normality (for stat testing)
- First check on raw & transformed data

### Histogram

In [None]:
# Histogram of the monthly returns using the default binning
nyse.hist(figsize = (10,6))
plt.show()

In [None]:
# Re-draw with an explicit bin count (7) so the overall shape is easier to
# judge against a normal (bell-shaped) distribution
nyse.hist(figsize = (10,6), bins = 7)
plt.show()

### Density

In [None]:
# Kernel density estimate: a smoothed view of the returns' distribution
nyse.plot.kde(figsize=(15, 10))
plt.show()

### Box Plot

- Shows distribution over time
- Can help show outliers
- Seasonal trends

#### Example

In [None]:
# Generate a box-and-whiskers plot for the nyse_annual dataframe
# (one box per year column)
nyse_annual.boxplot(figsize = (12,7))
plt.show()

### Heat Maps

Use color to show patterns throughout a time period for data

#### Example

In [None]:
# Transpose so each row is a year and each column is a position within that
# year (presumably the month, since the source data is monthly)
year_matrix = nyse_annual.T
# Render the matrix as a colour grid using the reversed Spectral colormap
plt.matshow(year_matrix, interpolation=None, aspect='auto', cmap=plt.cm.Spectral_r)
plt.show()

# Types of Trends

## Stationary

### Definition:
> images from [https://www.analyticsvidhya.com/blog/2015/12/complete-tutorial-time-series-modeling/](https://www.analyticsvidhya.com/blog/2015/12/complete-tutorial-time-series-modeling/)

- The series' **mean** is **not** a function of time
![https://www.analyticsvidhya.com/wp-content/uploads/2015/02/Mean_nonstationary.png](images/Mean_nonstationary.png)
- The series' **variance** is **not** a function of time (homoscedasticity)
![https://www.analyticsvidhya.com/wp-content/uploads/2015/02/Var_nonstationary.png](images/Var_nonstationary.png)
- The series' **covariance** is **not** a function of time
![https://www.analyticsvidhya.com/wp-content/uploads/2015/02/Cov_nonstationary.png](images/Cov_nonstationary.png)

### No Trend

In [None]:
# Plot the NYSE returns on their own axes and label them through the Axes
# object that pandas returns.
data = nyse
ax = data.plot(figsize=(12, 6), linewidth=2, fontsize=14)
ax.set_xlabel(col_name, fontsize=20)
ax.set_ylabel("Monthly NYSE returns", fontsize=20)
ax.set_ylim(-0.15, 0.15);

## Linear Trend

### Upward

![](https://github.com/learn-co-students/dsc-3-25-05-types-of-trends-online-ds-sp-000/raw/master/index_files/index_15_0.png)

### Downward

![](https://github.com/learn-co-students/dsc-3-25-05-types-of-trends-online-ds-sp-000/raw/master/index_files/index_19_0.png)

## Exponential

![](https://github.com/learn-co-students/dsc-3-25-05-types-of-trends-online-ds-sp-000/raw/master/index_files/index_22_0.png)

## Periodic

![](https://github.com/learn-co-students/dsc-3-25-05-types-of-trends-online-ds-sp-000/raw/master/index_files/index_25_0.png)

![](https://github.com/learn-co-students/dsc-3-25-05-types-of-trends-online-ds-sp-000/raw/master/index_files/index_30_0.png)

# Assessing Trends 

In [None]:
# Synthetic monthly sales: seeded random noise added to a stepwise baseline
# that rises faster over time, plus a constant offset of 6.
# NOTE(review): freq="M" is the month-end alias; pandas >= 2.2 prefers "ME".
years = pd.date_range('2012-01', periods=72, freq="M")
index = pd.DatetimeIndex(years)

np.random.seed(3456)
sales = np.random.randint(-4, high=4, size=72)

# Baseline level, and how many consecutive months each level is held
# (the run lengths sum to 72, matching the index length).
step_levels = [0, 1, 3, 7, 11, 18, 26, 36]
step_lengths = [9, 11, 12, 11, 10, 9, 5, 5]
bigger = np.repeat(step_levels, step_lengths)

data = pd.Series(sales + bigger + 6, index=index)
ts = data

fig = plt.figure(figsize=(12, 6))
plt.plot(data)
plt.xlabel("month", fontsize=16)
plt.ylabel("monthly sales", fontsize=16)
plt.show()

## Rolling Statistics

Take the average of a number of past data points (over a time period)

### Example

In [None]:
# Trailing (non-centered) window of 8 observations, reused for both statistics
trailing_window = ts.rolling(window=8, center=False)
rolmean = trailing_window.mean()
rolstd = trailing_window.std()

# Overlay the raw series with its rolling mean and rolling standard deviation
fig = plt.figure(figsize=(12, 7))
orig = plt.plot(ts, color='blue', label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label='Rolling Std')
plt.title('Rolling Mean & Standard Deviation')
plt.legend(loc='best')
plt.show(block=False)

## Dickey-Fuller Test

Statistical test for stationarity; the null hypothesis $H_0$ is that the time series is **non-stationary** (it has a unit root), so a small p-value is evidence *for* stationarity

Doc Resource: http://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.adfuller.html

### Code Example

In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test. Its null hypothesis is a unit root
# (non-stationarity), so a small p-value is evidence the series is stationary.
dftest = adfuller(ts)

# Label the first four entries of the result tuple, then append one row per
# critical value from the dict stored at position 4.
summary_labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
dfoutput = pd.Series(dftest[:4], index=summary_labels)
for key, value in dftest[4].items():
    dfoutput['Critical Value (%s)'%key] = value

# Raw result tuple, for comparison with the labelled summary
print(dftest)

In [None]:
# Labelled summary of the Dickey-Fuller result built in the previous cell
print(dfoutput)

# Removing Trends

# Time Series Decomposition