# MNLE Time Series

In [2]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colormaps  # Use colormaps from Matplotlib
from pandas.plotting import lag_plot
from pandas.plotting import autocorrelation_plot
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
import plotly.express as px
from IPython.display import clear_output
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
# In order to access data on Google Drive, you need to mount the drive to access its content
# from google.colab import drive
# drive.mount('/content/drive')

# Load some time series data and do some plotting

In [3]:
# Pandas is used to read each csv file and store its data in a DataFrame.
# All files are expected in DATA_DIR. It defaults to the notebook's working
# directory; change it here when the files live elsewhere (e.g. a Datasets
# folder on a mounted Google Drive).
DATA_DIR = Path('.')

# Read Time Series files:

# Airline passengers
airline_df = pd.read_csv(DATA_DIR / 'airline-passengers.csv')

# Daily minimum temperatures Melbourne Australia 1981-1990
mintemp_df = pd.read_csv(DATA_DIR / 'daily-minimum-temperatures.csv')

# Stock prices
stock_price_df = pd.read_csv(DATA_DIR / 'stocks.csv')
# Stock volumes
stock_vol_df = pd.read_csv(DATA_DIR / 'stock_volume.csv')

# Further datasets, not loaded in this notebook:
# Daily total female births
# Shampoo sales
# Sunspots

In [4]:
# Print Daily minimum temperatures Melbourne Australia 1981-1990
print(mintemp_df)

            Date  Temp
0     1981-01-01  20.7
1     1981-01-02  17.9
2     1981-01-03  18.8
3     1981-01-04  14.6
4     1981-01-05  15.8
...          ...   ...
3645  1990-12-27  14.0
3646  1990-12-28  13.6
3647  1990-12-29  13.5
3648  1990-12-30  15.7
3649  1990-12-31  13.0

[3650 rows x 2 columns]


In [None]:
# Ensure Date is in datetime format.
# Later cells read this column through the .dt accessor (year, day of year),
# which requires a datetime dtype.
mintemp_df['Date'] = pd.to_datetime(mintemp_df['Date'])

In [None]:
# Line plot of the daily minimum temperatures, Melbourne Australia 1981-1990
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(mintemp_df['Date'], mintemp_df['Temp'])
ax.set_xlabel('Date')
ax.set_ylabel('Temp')
ax.set_title('Daily minimal temperature - Melbourne Australia')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better visibility
plt.show()

In [None]:
# Dot plot of the daily minimum temperatures, Melbourne Australia 1981-1990
fig, ax = plt.subplots(figsize=(12, 6))
# Small, mostly transparent black dots so dense regions show up darker
ax.scatter(mintemp_df['Date'], mintemp_df['Temp'], color='black', s=15, alpha=0.20)
ax.set_xlabel('Date')
ax.set_ylabel('Temp')
ax.set_title('Daily minimal temperature - Melbourne Australia')
plt.xticks(rotation=45, ha='right')  # Rotate labels for better visibility
plt.show()

In [None]:
# Plot Daily minimum temperatures Melbourne Australia 1981-1990 as one line plot per year, stacked vertically
# The day-of-year and year columns are added to a derived frame, so the shared
# mintemp_df is not modified by this cell
mintemp_by_day_df = mintemp_df.assign(
    DayOfYear=mintemp_df['Date'].dt.dayofyear,
    Year=mintemp_df['Date'].dt.year,
)

# Group the data by year
grouped = mintemp_by_day_df.groupby('Year')

# Create subplots, one for each year, with reduced height for each plot.
# squeeze=False always returns a 2-D array of axes (also for a single year),
# which is flattened so it can be iterated together with the groups.
num_years = grouped.ngroups
fig, axes = plt.subplots(num_years, 1, figsize=(12, num_years * 1), sharex=True, sharey=True, squeeze=False)
axes = axes.ravel()

# Access the 'tab10' colormap
colors = colormaps['tab10']  # Access colormap directly

# Loop through each group and create a line plot
for i, ((year, group), ax) in enumerate(zip(grouped, axes)):
    # Pick the i-th color of the colormap, wrapping around after colors.N entries.
    # Normalizing with i / (num_years - 1) would divide by zero for a single year.
    ax.plot(group['DayOfYear'], group['Temp'], label=str(year), color=colors(i % colors.N))
    ax.set_title(f"Year: {year}", fontsize=10)
    ax.set_ylabel("Temp (°C)", fontsize=8)
    ax.tick_params(axis='both', labelsize=8)

# Add a shared x-axis label
plt.xlabel("Day of the Year", fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
# Plot a heatmap showing seasonality (each year) of daily minimum temperatures Melbourne Australia 1981-1990
# Year and day of year are derived from the 'Date' column on a temporary frame,
# so no extra columns are added to the shared mintemp_df.
# The pivot table has one row per year and one column per day of the year.
heatmap_data = (
    mintemp_df
    .assign(Year=mintemp_df['Date'].dt.year, Day_of_Year=mintemp_df['Date'].dt.dayofyear)
    .pivot_table(index='Year', columns='Day_of_Year', values='Temp', aggfunc='mean')
)

# Create the heatmap using Seaborn
plt.figure(figsize=(15, 8))
sns.heatmap(heatmap_data, cmap='coolwarm', cbar_kws={'label': 'Temp'})
plt.title('Temperature Heatmap - Melbourne Australia')
plt.xlabel('Day of Year')
plt.ylabel('Year')
plt.show()

In [None]:
# Lag plot: every value at timestamp t is plotted against the value at timestamp t + lag

lagvalue = 1
ax = lag_plot(mintemp_df['Temp'], lag=lagvalue)
# Customize plot labels and title
ax.set_title(f'Lag Plot (lag={lagvalue}) daily minimal temperature - Melbourne Australia 1981-1990')
ax.set_xlabel('Value(t)')
ax.set_ylabel(f'Value(t + {lagvalue})')

# Clear the output in Google Colab
clear_output(wait=True)

# Show the plot
plt.show()

In [None]:
# Calculate the correlation value temperatures
# So we calculate the correlation between all daily temperatures at timestamp t
# compared with all daily temperatures at timestamp t + lag

def calc_corr(lagvalue, series=None):
  """Return the correlation between a series and a copy of itself shifted by `lagvalue` rows.

  Parameters:
    lagvalue: number of rows the lagged copy is shifted by.
    series: pandas Series to analyse; defaults to mintemp_df['Temp'].

  Returns:
    The correlation coefficient between the series and its lagged copy.
  """
  if series is None:
    series = mintemp_df['Temp']

  # Shift a separate Series instead of writing a 'Temp_lagged' column into the
  # shared mintemp_df, so calling this function leaves the data untouched.
  lagged = series.shift(lagvalue)

  # Series.corr excludes missing values, so the NaN entries introduced by the
  # shift are left out without an explicit dropna().
  return series.corr(lagged)

# Print the correlation value
# print(f"Correlation between daily temperature and lagged temperature (lag = {lag_value}): {correlation}")

In [None]:
# Show a sequence of lag plots for increasing lag values.
# Each plot replaces the previous one and shows the lag and its correlation r in the title.

max_lags = 365 # set the maximum lag value to plot, 365 would be a whole year
step_lags = 7 # set the step size for the lag values, 7 would be a week

for lagvalue in range(1, max_lags, step_lags):
    ax = lag_plot(mintemp_df['Temp'], lag=lagvalue)
    # Customize plot labels and title
    r_value = round(calc_corr(lagvalue), 2)
    ax.set_title(f'Lag Plot\nlag={lagvalue}\nr={r_value}\ndaily minimal temperature - Melbourne Australia 1981-1990')
    ax.set_xlabel('Value(t)')
    ax.set_ylabel(f'Value(t + {lagvalue})')

    # Clear the output in Google Colab
    clear_output(wait=True)

    # Show the plot
    plt.show()

In [None]:
# Correlation quantifies the strength and type of the relationship between observations and their lags.
# Calculated against lag values of the same time series, it is called autocorrelation.
# The autocorrelation plot shows the lag value on the x-axis and the correlation coefficient on the y-axis.

ax = autocorrelation_plot(mintemp_df['Temp'])
# Customize plot labels and title
ax.set_title('Autocorrelation Plot daily minimal temperature - Melbourne Australia 1981-1990')
ax.set_xlabel('Lag')
ax.set_ylabel('Correlation coefficient')

# Customize the x-axis to show multiples of 365
step = 365  # Step size for labels
max_lag = len(mintemp_df['Temp'])  # Maximum lag value
ticks = range(0, max_lag, step)  # Generate ticks at multiples of 365
plt.xticks(ticks, labels=list(map(str, ticks)))  # Set ticks and labels

# Show the plot
plt.show()

In [None]:
print(airline_df)

In [None]:
# Plot Airline Passengers
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(airline_df['Month'], airline_df['Passengers'])
ax.set_xlabel('Month')
ax.set_ylabel('Passengers')
ax.set_title('Monthly airline passengers')
# Label only every 6th month to keep the x-axis readable
ticks = airline_df['Month'][::6]
plt.xticks(ticks=ticks, labels=ticks, rotation=45, ha='right')  # Set custom ticks and rotate
plt.show()

In [None]:
# Ensure Month is in datetime format and use it as the index.
# The guard makes the cell safe to re-run: after the first run 'Month' is the
# index and no longer a column, so an unguarded second run would raise a KeyError.
if 'Month' in airline_df.columns:
    airline_df['Month'] = pd.to_datetime(airline_df['Month'])
    # Set 'Month' as the index
    airline_df = airline_df.set_index('Month')

In [None]:
# decompose airline passengers dataset
result = seasonal_decompose(airline_df['Passengers'], model='multiplicative')

# Plot the original time series, trend, seasonal, and residual components,
# one panel per component, stacked from top to bottom
plt.figure(figsize=(12, 8))

# (data, legend label, panel title) for each of the four panels
panels = [
    (airline_df['Passengers'], 'Original', 'Monthly airline passengers'),
    (result.trend, 'Trend', 'Trend Component'),
    (result.seasonal, 'Seasonal', 'Seasonal Component'),
    (result.resid, 'Residual', 'Residual Component'),
]

for panel_position, (panel_series, panel_label, panel_title) in enumerate(panels, start=1):
    plt.subplot(4, 1, panel_position)
    plt.plot(panel_series, label=panel_label)
    plt.legend(loc='upper left')
    plt.title(panel_title)

plt.tight_layout()
plt.show()

# Predict with stock data

In [None]:
# print daily stock prices and stock volume of 9 stocks
print('Stock price data\n')
print(stock_price_df)
print('Stock volume data\n')
print(stock_vol_df)

In [None]:
# Function to plot interactive plots using Plotly Express
def interactive_plot(df, title):
  """Show an interactive Plotly chart with one trace per data column of `df`.

  The 'Date' column supplies the x values; every column after the first one
  is added as its own trace, named after the column.
  """
  figure = px.line(title=title)
  x_values = df['Date']
  for column in df.columns[1:]:
    figure.add_scatter(x=x_values, y=df[column], name=column)
  figure.show()

In [None]:
# plot interactive chart for stock price data
interactive_plot(stock_price_df, 'Stock Prices')

In [None]:
# plot interactive chart for stock volume data
interactive_plot(stock_vol_df, 'Stock Volumes')

In [None]:
# plot interactive chart for stock volume data without S&P 500
interactive_plot(stock_vol_df.drop('sp500', axis='columns'), 'Stock Volumes without S&P 500')

# Predict Tesla stock prices

In [None]:
stock_to_predict = 'TSLA'

In [None]:
# Function to concatenate the date, stock price, and volume in one dataframe
def individual_stock(price_df, vol_df, name):
    """Combine the date, price and volume of one stock into a single DataFrame.

    The result has the columns 'Date' (taken from price_df), 'Close' (column
    `name` of price_df) and 'Volume' (column `name` of vol_df).
    """
    stock_columns = {
        'Date': price_df['Date'],
        'Close': price_df[name],
        'Volume': vol_df[name],
    }
    return pd.DataFrame(stock_columns)

In [None]:
# Function to return the input/output (target) data for AI/ML Model
# Note that our goal is to predict the future stock price
# Target stock price today will be the stock price 1 working day in the future
def trading_window(data, n=1):
  """Return a copy of `data` with a 'Target' column: the 'Close' value n rows ahead.

  Parameters:
    data: DataFrame with a 'Close' column. The rows are presumably in
      chronological order, one per trading day (not checked here).
    n: how many rows ahead the target lies; defaults to a 1 day window.

  Returns:
    A new DataFrame. The last n rows have a null 'Target' because there is
    no later row to take the value from.
  """
  # Work on a copy so the caller's DataFrame is not changed as a side effect
  result = data.copy()

  # Create a column containing the prices n rows in the future
  result['Target'] = result['Close'].shift(-n)

  # return the new dataset
  return result

In [None]:
# Let's test the functions and get individual stock prices and volumes
price_volume_df = individual_stock(stock_price_df, stock_vol_df, stock_to_predict)
price_volume_df

In [None]:
# Add the target data to the dataframe
price_volume_target_df = trading_window(price_volume_df)
price_volume_target_df

In [None]:
# Remove the last row: its Target is null because there is no next-day price for it
price_volume_target_df = price_volume_target_df.dropna()
price_volume_target_df

In [None]:
# Remove Date column
full_df = price_volume_target_df.drop(columns = ['Date'])
X = full_df.drop(columns = ['Target'])
y = full_df['Target']
full_df

In [None]:
# The autocorrelation plot will show the lag value on the x-axis and the correlation coefficient value on the y-axis

autocorrelation_plot(full_df['Target'])
# Customize plot labels and title
plt.title(f'Autocorrelation Plot {stock_to_predict} stock')
plt.xlabel('Lag')
plt.ylabel('Correlation coefficient')

# Show the plot
plt.show()

In [None]:
# Chronological split (no shuffling): the first 70% of the rows are used for
# training, the last 30% for testing
split1 = int(len(full_df) * 0.7)
full_df_train, full_df_test = full_df.iloc[:split1], full_df.iloc[split1:]

In [None]:
full_df_train

In [None]:
# Scale the data

# Create a MinMaxScaler object
scaler = MinMaxScaler()

# Fit the scaler on the training data only, then transform the training data with it
scaler.fit(full_df_train)
full_df_train_scaled = pd.DataFrame(scaler.transform(full_df_train), columns=full_df_train.columns)

# Transform the test set using the scaler fitted on the training data
full_df_test_scaled = pd.DataFrame(scaler.transform(full_df_test), columns=full_df_test.columns)

In [None]:
full_df_train_scaled

In [None]:

# Separate the scaled inputs (Close, Volume) from the scaled output label (Target)
X_train, y_train = full_df_train_scaled[['Close', 'Volume']], full_df_train_scaled['Target']
X_test, y_test = full_df_test_scaled[['Close', 'Volume']], full_df_test_scaled['Target']

In [None]:
print("X : ", X.shape)
print("X_train : ", X_train.shape)
print("X_test : ", X_test.shape)

In [None]:
X_train

In [None]:
# Define a data plotting function

def show_plot(data, title):
  """Draw `data` as thin lines on a new 13x5 inch figure with the given title and a grid."""
  _, axis = plt.subplots(figsize=(13, 5))
  axis.plot(data, linewidth=1)
  axis.set_title(title)
  axis.grid()

show_plot(X_train, 'Scaled training data')
show_plot(X_test, 'Scaled test data')

In [None]:
# Perform a linear regression
regression_model = LinearRegression()


# Perform Ridge regression
#regression_model = Ridge()

# Perform fit
regression_model.fit(X_train, y_train)

In [None]:
# r2 score
train_preds = regression_model.predict(X_train)
test_preds = regression_model.predict(X_test)

train_accuracy = r2_score(y_train, train_preds)
test_accuracy = r2_score(y_test, test_preds)

print("Train R^2 Accuracy:", train_accuracy)
print("Test R^2 Accuracy:", test_accuracy)

In [None]:
train_preds

In [None]:
# Put the actual (scaled) labels and the predicted labels side by side in a dataframe,
# one for the training set and one for the test set

pred_train_df = pd.DataFrame({'Actual': y_train, 'Predicted': train_preds})
pred_test_df = pd.DataFrame({'Actual': y_test, 'Predicted': test_preds})

In [None]:
# Show plots

show_plot(pred_train_df, 'Scaled prediction training data')
show_plot(pred_test_df, 'Scaled prediction test data')

In [None]:
# Scale the predictions back to the original units with the fitted scaler.
# The scaler was fitted on 3 columns (Close, Volume, Target), so the inverse scaling
# needs a 3 column input too: the predictions take the place of the Target column.

# Take X_train and X_test dataframes and append train_preds and test_preds to them
X_train_preds = X_train.copy()
X_test_preds = X_test.copy()
X_train_preds['train_preds'] = train_preds
X_test_preds['test_preds'] = test_preds

# Rescale the predictions.
# The rescaled frames reuse the row labels of the unscaled train/test frames, so
# they line up with the actual values when both are concatenated column-wise
# later, even when the original index is not a gap-free 0..n-1 range.
X_train_preds_rescaled = pd.DataFrame(
    scaler.inverse_transform(X_train_preds),
    columns=X_train_preds.columns,
    index=full_df_train.index,
)
X_test_preds_rescaled = pd.DataFrame(
    scaler.inverse_transform(X_test_preds),
    columns=X_test_preds.columns,
    index=full_df_test.index,
)

In [None]:
X_test_preds_rescaled

In [None]:
full_df_test

In [None]:
# Add the actual values: put the unscaled Target next to the rescaled predictions,
# then drop the Close and Volume columns

pred_rescaled_train_df = (
    pd.concat([full_df_train['Target'], X_train_preds_rescaled], axis=1)
    .drop(columns=['Close', 'Volume'])
)
pred_rescaled_test_df = (
    pd.concat([full_df_test['Target'], X_test_preds_rescaled], axis=1)
    .drop(columns=['Close', 'Volume'])
)

In [None]:
pred_rescaled_test_df

In [None]:
# interactive plot

pred_df = pd.concat([pred_rescaled_train_df, pred_rescaled_test_df], axis=0, ignore_index=True)
pred_df.insert(0, 'Date', price_volume_target_df['Date'].values)
interactive_plot(pred_df, 'Original vs Prediction')