# <font color="red"> 1 Introduction </font>

### Welcome to the Course

In [1]:
# Time series in the field of Data Science
    # Time series are a fundamental way to store and analyze many types of data
    # Financial, weather and device data are all best handled as time series

#######################################
# Define WD & Reading data with Pandas
#######################################
# Import relevant libraries
import pandas as pd
import os

# Show our current working directory and list its content.
# Only the last expression of a cell is echoed, so these are printed explicitly.
print(os.getcwd())   # current working directory
print(os.listdir())  # content of the working directory

# Define the path to the directory and file of interest
ch2_co2_levels = os.path.join('data', 'ch2_co2_levels.csv')
#ch2_co2_levels = os.path.join('..', 'data2', 'ch2_co2_levels.csv') #if data is 1 level up

# Load the file as a dataframe
df = pd.read_csv(ch2_co2_levels)


# Preview data with Pandas
print(df.head())       # head() with no argument ...
print(df.head(n=5))    # ... and head(n=5) are shown side by side for comparison
print(df.tail(n=5))

# Check data types with Pandas
print(df.dtypes)


# Working with dates
#####################
# To work with time series data in pandas, your date columns need to be of the datetime64 type.
# An unparseable entry makes to_datetime raise by default; the error is caught and
# displayed here so that the demonstration does not abort the rest of the cell.
try:
    pd.to_datetime(['2009/07/31', 'test'])
except ValueError as err:
    print(f'ValueError: {err}')

# With errors='coerce' the unparseable entry becomes NaT instead of raising.
pd.to_datetime(['2009/07/31', 'test'], errors='coerce')

    datestamp    co2
0  1958-03-29  316.1
1  1958-04-05  317.3
2  1958-04-12  317.6
3  1958-04-19  317.5
4  1958-04-26  316.4


### Plot your first time series

In [None]:
# The Matplotlib library
# In Python, matplotlib is an extensive package used to plot data
# The pyplot submodule of matplotlib is traditionally imported using the plt alias
import matplotlib.pyplot as plt
import pandas as pd

# Plotting time series data
# The frame previewed in the introduction has the columns 'datestamp' and 'co2',
# so 'datestamp' is the column to use as the index. The guard keeps the cell
# re-runnable: once 'datestamp' has become the index it is no longer a column.
if 'datestamp' in df.columns:
    df['datestamp'] = pd.to_datetime(df['datestamp'])  # datetime64 index for date-aware plotting
    df = df.set_index('datestamp')
df.plot()
plt.show()


# Adding style to your plots
plt.style.use('fivethirtyeight')
df.plot()
plt.show()

# Matplotlib style sheets
print(plt.style.available)

# Describing your graphs with labels
ax = df.plot(color='blue')

ax.set_xlabel('Date')
ax.set_ylabel('The values of my Y axis')
ax.set_title('The title of my plot')
plt.show()

# Figure size, linewidth, linestyle and fontsize
ax = df.plot(figsize=(12, 5), fontsize=12,
             linewidth=3, linestyle='--')
ax.set_xlabel('Date', fontsize=16)
ax.set_ylabel('The values of my Y axis', fontsize=16)
ax.set_title('The title of my plot', fontsize=16)
plt.show()

### Customise your time series plot

In [None]:
# NOTE(review): `discoveries` is not assigned in any cell visible in this notebook;
# it has to be loaded before this cell runs. It is sliced with date strings below,
# so it presumably carries a DatetimeIndex -- confirm.
import matplotlib.pyplot as plt

# Slicing time series data
# Only the last expression of a cell is echoed, so each slice is printed explicitly.
print(discoveries['1960':'1970'])
print(discoveries['1950-01':'1950-12'])
print(discoveries['1960-01-01':'1960-01-15'])

# Plotting subset of your time series data
plt.style.use('fivethirtyeight')
df_subset = discoveries['1960':'1970']

ax = df_subset.plot(color='blue', fontsize=14)

# Adding markers
# The markers are drawn BEFORE plt.show(): once the figure has been shown,
# further drawing on `ax` is not rendered in the displayed output.
ax.axvline(x='1969-01-01',
           color='red',
           linestyle='--')

ax.axhline(y=100,
           color='green',
           linestyle='--')
plt.show()

# Using markers: the full code
ax = discoveries.plot(color='blue')
ax.set_xlabel('Date')
ax.set_ylabel('Number of great discoveries')
ax.axvline('1969-01-01', color='red', linestyle='--')
ax.axhline(4, color='green', linestyle='--')

# Highlighting regions of interest (drawn on the same axes as the markers above)
ax.axvspan('1964-01-01', '1968-01-01',
           color='red', alpha=0.5)

ax.axhspan(8, 6, color='green',
           alpha=0.2)
plt.show()

# Highlighting regions of interest: the full code
ax = discoveries.plot(color='blue')
ax.set_xlabel('Date')
ax.set_ylabel('Number of great discoveries')

ax.axvspan('1964-01-01', '1968-01-01', color='red', alpha=0.3)
ax.axhspan(8, 6, color='green', alpha=0.3)
plt.show()

# <font color="red"> 2 Summary Statistics and Diagnostics for Time Series Data </font>

### 2.1 Clean your time series data

In [None]:
#####################################
# Notes - Dealing with Missing values
#####################################
# Noise in data can include things like outliers, misformatted data points and missing values.
# It is therefore important to carefully process and clean your data.

# Import pandas and plotting modules
import pandas as pd
import matplotlib.pyplot as plt

# Checking your data for missing values
# Finding missing values in a DataFrame
# Check for null
print(df.isnull())
# Check for not null
print(df.notnull())

# Counting missing values in a DataFrame
# Number of missing values in each column
print(df.isnull().sum())

# Replacing missing values in a DataFrame
# "Backfill" replaces each missing value with the NEXT valid observation;
# "forward fill" (df.ffill()) would use the previous one instead.
# df.bfill() replaces the deprecated fillna(method='bfill') spelling.
df = df.bfill()
print(df)

##############################
# Notes - Find missing values
##############################
# `co2_levels` is not assigned in any cell visible in this notebook. The frame
# loaded from ch2_co2_levels.csv (`df`, with its 'co2' column) is the only
# candidate, so it is aliased here.
# NOTE(review): confirm that `co2_levels` is meant to be this frame.
co2_levels = df

# Display first seven rows of co2_levels
print(co2_levels.head(n=7))

### 2.2 Plot Aggregates of your data

In [None]:
######################################
# Aggregating and plotting your data
######################################

# The moving average model
###########################
# Mean over a sliding window of 52 consecutive observations
co2_levels_mean = co2_levels.rolling(window=52).mean()
ax = co2_levels_mean.plot()
ax.set_xlabel("Date")
ax.set_ylabel("The values of my Y axis")
ax.set_title("52 weeks rolling mean of my time series")
plt.show()


# Computing aggregate values of your time series
#################################################
# Printed explicitly: a bare expression in the middle of a cell displays nothing.
print(co2_levels.index)
print(co2_levels.index.month)
print(co2_levels.index.year)


# Plotting aggregate values of your time series
################################################
# Group the observations by the month of their index and average each group
index_month = co2_levels.index.month
co2_levels_by_month = co2_levels.groupby(index_month).mean()
co2_levels_by_month.plot()
plt.show()

### 2.3 Summarize the values in your time series data

In [None]:
####################################
# Summarizing your time series data
####################################

# Numerical summary of the data
################################
print(df.describe())


# Boxplot of the data
######################
ax1 = df.boxplot()
ax1.set(
    xlabel='Your first boxplot',
    ylabel='Values of your data',
    title='Boxplot values of your data',
)
plt.show()


# Histogram of the data, using 100 bins
########################################
ax2 = df.plot.hist(bins=100)
ax2.set(
    xlabel='Your first histogram',
    ylabel='Frequency of values in your data',
    title='Histogram of your data with 100 bins',
)
plt.show()


# Density plot of the data
###########################
ax3 = df.plot.density(linewidth=2)
ax3.set(
    xlabel='Your first density plot',
    ylabel='Density values of your data',
    title='Density plot of your data',
)
plt.show()

# <font color="red"> 3 Seasonality, Trend and Noise </font>

### 3.1 Autocorrelation and Partial autocorrelation

In [None]:
# Autocorrelation and partial autocorrelation in time series data
# (plotted with the tsaplots helpers from statsmodels)
import matplotlib.pyplot as plt
from statsmodels.graphics import tsaplots

# First the autocorrelation plot, then the partial autocorrelation plot,
# each over 40 lags of the 'co2' column. `fig` ends up holding the second figure.
for draw_correlogram in (tsaplots.plot_acf, tsaplots.plot_pacf):
    fig = draw_correlogram(co2_levels['co2'], lags=40)
    plt.show()


### 3.2 Seasonality, trend and noise in time series data

In [None]:
# Properties of time series

# Time series decomposition
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Default figure size for the plots below (set through pyplot's rcParams
# rather than through the discouraged pylab namespace).
plt.rcParams['figure.figsize'] = 11, 9
decomposition = sm.tsa.seasonal_decompose(co2_levels['co2'])
fig = decomposition.plot()
plt.show()


# Extracting components from time series decomposition
print(dir(decomposition))
print(decomposition.seasonal)


# Seasonality component in time series
decomp_seasonal = decomposition.seasonal
ax = decomp_seasonal.plot(figsize=(14, 2))
ax.set_xlabel('Date')
ax.set_ylabel('Seasonality of time series')
ax.set_title('Seasonal values of the time series')
plt.show()

# Trend component in time series
decomp_trend = decomposition.trend
ax = decomp_trend.plot(figsize=(14, 2))
ax.set_xlabel('Date')
ax.set_ylabel('Trend of time series')
ax.set_title('Trend values of the time series')
plt.show()

# Noise component in time series
# The result object is named `decomposition`; there is no `decomp` variable.
decomp_resid = decomposition.resid
ax = decomp_resid.plot(figsize=(14, 2))
ax.set_xlabel('Date')
ax.set_ylabel('Residual of time series')
ax.set_title('Residual values of the time series')
plt.show()

### 3.3 A review on what you have learned so far

# <font color="red"> 4 Work with Multiple Time Series </font>

### 4.1 Working with more than one time series

In [None]:
# Working with multiple time series
import pandas as pd
import matplotlib.pyplot as plt

meat = pd.read_csv("meat.csv")
print(meat.head(5))
# NOTE(review): the file is read without parsing or indexing a date column, so the
# plots below use whatever index read_csv produced -- confirm whether a date
# column should be set as the index first.

# Summarizing and plotting multiple time series
# The frame just loaded (`meat`) is plotted; `df` is the single-series CO2 frame
# from the earlier chapters.
plt.style.use('fivethirtyeight')
ax = meat.plot(figsize=(12, 4), fontsize=14)
plt.show()

# Area Chart
ax = meat.plot.area(figsize=(12, 4), fontsize=14)
plt.show()

### 4.2 Plot multiple time series

In [None]:
# The plots in this cell use `meat` (the multi-series frame loaded in 4.1): the
# axis label and the 2x4 facet grid describe that data, not the CO2 frame `df`.

# The colormap argument
ax = meat.plot(colormap='Dark2', figsize=(14, 7))
ax.set_xlabel('Date')
ax.set_ylabel('Production Volume (in tons)')
plt.show()


# Enhancing your plot with information
ax = meat.plot(colormap='Dark2', figsize=(14, 7))
df_summary = meat.describe()


# Specify values of cells in the table
ax.table(cellText=df_summary.values,
        # Specify width of the table: one width per column of the summary,
        # since the summary is what fills the cells
        colWidths=[0.3]*len(df_summary.columns),
        # Specify row labels
        rowLabels=df_summary.index,
        # Specify column labels
        colLabels=df_summary.columns,
        # Specify location of the table
        loc='top')
plt.show()


# Facet plots
meat.plot(subplots=True,
          linewidth=0.5,
          layout=(2, 4),
          figsize=(16, 10),
          sharex=False,
          sharey=False)
plt.show()

### 4.3 Find relationships between multiple time series

In [None]:
# Compute correlations
# Import from the public scipy.stats namespace; scipy.stats.stats is a
# deprecated private module path.
from scipy.stats import kendalltau, pearsonr, spearmanr
import seaborn as sns

x = [1, 2, 4, 7]
y = [1, 3, 4, 8]
# Printed explicitly: only the last expression of a cell is echoed.
print(pearsonr(x, y))
print(spearmanr(x, y))
print(kendalltau(x, y))


# Computing Correlation Matrices with Pandas
corr_p = meat[['beef', 'veal','turkey']].corr(method='pearson')
print(corr_p)

corr_s = meat[['beef', 'veal','turkey']].corr(method='spearman')
print(corr_s)

# Computing the correlation matrix of all series
# Restrict to numeric columns first: corr() raises on non-numeric columns in
# pandas >= 2.0, and this is a no-op when every column is already numeric.
corr_mat = meat.select_dtypes(include='number').corr(method='pearson')


# Heatmap
sns.heatmap(corr_mat)


# ClusterMap
sns.clustermap(corr_mat)

# <font color="red"> 5 Work with Multiple Time Series </font>

In [None]:
### 5.1 Apply your knowledge to a new dataset

In [None]:
### 5.2 Beyond summary statistics

In [None]:
# Facet plots of the jobs dataset, laid out on a 4x4 grid with a shared x axis
jobs.plot(
    subplots=True,
    layout=(4, 4),
    figsize=(20, 16),
    sharex=True,
    sharey=False,
)
plt.show()

# Annotating events in the jobs dataset: one dashed black vertical line per date
ax = jobs.plot(figsize=(20, 14), colormap='Dark2')
for event_date in ('2008-01-01', '2009-01-01'):
    ax.axvline(event_date, color='black', linestyle='--')


# Taking seasonal average in the jobs dataset
print(jobs.index)

# Average every column over all observations that fall in the same month
index_month = jobs.index.month
jobs_by_month = jobs.groupby(index_month).mean()
print(jobs_by_month)


# Monthly averages in the jobs dataset, with the legend anchored at the right edge of the axes
ax = jobs_by_month.plot(figsize=(12, 5), colormap='Dark2')
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

### 5.3 Decompose time series data

In [None]:
########################
# Python dictionaries
########################
# Initialize a Python dictionary
my_dict = {}

# Add a key and value to your dictionary
my_dict['your_key'] = 'your_value'

# Add a second key and value to your dictionary
my_dict['your_second_key'] = 'your_second_value'

# Print out your dictionary
print(my_dict)


#############################################################
# Decomposing multiple time series with Python dictionaries
#############################################################
# Import the statsmodels library
import statsmodels.api as sm
# Initialize a dictionary (this replaces the demo dictionary above)
my_dict = {}
# Extract the names of the time series.
# They are taken from `jobs`, the frame that is indexed by these names in the
# loop below (not from `df`).
ts_names = jobs.columns
print(ts_names)


# Run time series decomposition: one decomposition result per column, keyed by column name
for ts in ts_names:
    ts_decomposition = sm.tsa.seasonal_decompose(jobs[ts])
    my_dict[ts] = ts_decomposition


# Extract decomposition components of multiple time series
# Initialize a new dictionary
my_dict_trend = {}
# Extract the trend component of every decomposition
for ts in ts_names:
    my_dict_trend[ts] = my_dict[ts].trend
# Convert to a DataFrame with one column per time series
trend_df = pd.DataFrame.from_dict(my_dict_trend)
print(trend_df)

### 5.4 Compute correlations between time series

In [None]:
# Trends in Jobs data
print(trend_df)

# Plotting a clustermap of the jobs correlation matrix
# Get the Spearman correlation matrix of the trend_df DataFrame
trend_corr = trend_df.corr(method='spearman')
# Customize the clustermap of the trend_corr correlation matrix
fig = sns.clustermap(trend_corr, annot=True, linewidth=0.4)
# The object returned by clustermap exposes the heatmap axes as `ax_heatmap`;
# keep the y tick labels horizontal and turn the x tick labels vertical.
plt.setp(fig.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
plt.setp(fig.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
plt.show()