# Intro

A downloaded stock from yf has date as an index , and no date column.
After saving the dataframe and loading it again. The index has transformed into a "Date" column and the index transformed into a numerical index.

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import yfinance as yf
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA

In [2]:
cwd = os.getcwd()

In [3]:
# Directory path
directory = "aex_data/"

# List files and directories in the directory
files_and_directories = os.listdir(directory)

# Print the list
print("Files and directories in", directory, ":")
for item in files_and_directories:
    print(item)


Files and directories in aex_data/ :
.ipynb_checkpoints
ABN_data_2015-11-20_2024-04-12.csv
ABN_data_2024-04-12_NaT_testing_data.csv
ADYEN_data_2018-06-13_2024-04-11.csv
ADYEN_data_2018-06-13_2024-04-12.csv
AD_data_2008-10-24_2024-04-11.csv
AD_data_2008-10-24_2024-04-12.csv
AGN_data_1995-03-27_2024-04-11.csv
AGN_data_1995-03-27_2024-04-12.csv
AKZA_data_1995-03-27_2024-04-11.csv
AKZA_data_1995-03-27_2024-04-12.csv
ASML_data_1998-07-20_2024-04-11.csv
ASML_data_1998-07-20_2024-04-12.csv
ASM_data_1999-01-06_2024-04-11.csv
ASM_data_1999-01-06_2024-04-12.csv
ASRNL_data_2016-06-10_2024-04-11.csv
ASRNL_data_2016-06-10_2024-04-12.csv
BESI_data_1998-07-20_2024-04-11.csv
BESI_data_1998-07-20_2024-04-12.csv
HEIA_data_1995-03-27_2024-04-11.csv
HEIA_data_1995-03-27_2024-04-12.csv
IMCD_data_2014-06-27_2024-04-11.csv
IMCD_data_2014-06-27_2024-04-12.csv
INGA_data_1995-03-27_2024-04-11.csv
INGA_data_1995-03-27_2024-04-12.csv
KPN_data_1995-03-27_2024-04-11.csv
KPN_data_1995-03-27_2024-04-12.csv
MT_data_20

In [4]:
designated_stock_path = "aex_data/ABN_data_2015-11-20_2024-04-12.csv"
df_ABN = pd.read_csv(designated_stock_path)

In [5]:
df_ABN

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2015-11-20,18.180000,18.430000,18.000000,18.350000,11.638737,38392898
1,2015-11-23,18.450001,18.700001,18.215000,18.610001,11.803646,3352514
2,2015-11-24,18.700001,18.799999,18.370001,18.799999,11.924155,4871901
3,2015-11-25,18.850000,19.500000,18.770000,19.450001,12.336428,4802607
4,2015-11-26,19.480000,19.670000,19.410000,19.430000,12.323740,1648481
...,...,...,...,...,...,...,...
2144,2024-04-08,16.395000,16.504999,16.290001,16.430000,16.430000,2483292
2145,2024-04-09,16.415001,16.490000,16.235001,16.260000,16.260000,2267871
2146,2024-04-10,16.395000,16.580000,16.305000,16.385000,16.385000,3472390
2147,2024-04-11,16.355000,16.450001,15.965000,16.075001,16.075001,2642112


- Trading Days: Stock markets are not open on weekends and holidays, so there are typically fewer than 365 trading days in a year. In 2022, for example, there were approximately 252 trading days on the Amsterdam Stock Exchange.

- Data Availability: Sometimes data providers like Yahoo Finance may not have data available for all trading days, especially for older data or for less liquid stocks.

- Corporate Actions: There might be corporate actions such as stock splits, dividends, or other adjustments that could affect the number of rows in the dataset. These actions can lead to gaps in the data.

- Data Quality: Occasionally, there may be errors or missing data in the dataset, leading to fewer rows than expected.

In [6]:
class stockPredictor:
    def __init__(self, ticker_symbol, stock_data):
        self.stock_data = stock_data
        self.stock_data_dateIndexed = stock_data
        self.stock_data_dateIndexed.index = pd.to_datetime(stock_data["Date"], format='%Y-%m-%d')
        self.start_date = stock_data.index.min()
        self.end_date = stock_data.index.max()
        self.decomposition = None
        self.testing_data = None
        self.ticker_symbol = ticker_symbol
        self.aex_exchange_symbol = ticker_symbol + ".AS"
        
    def describe_stock_data(self):
        print(self.stock_data.head(3))
        print(self.stock_data.shape)
        print(self.stock_data.describe())
        print(self.stock_data.dtypes)
        print(self.stock_data.dtypes.unique())
    
    def describe_columns(self):
        print("""
        Open - Opening price of the stock at the beginning of the trading day
        High - Highest value that the stock reached on that day
        Low - Lowest value that the stock reached on that day
        Close - Closing value that the stock reached on that day
        Adj Close - The adjusted closing price is the closing price of the stock adjusted
        to include any corporate actions that occurred before the next trading day's open.
        Volume - Number of stocks traded that day
        """)
    
    def handleMissingValues(self):
        self.stock_data_dateIndexed = self.stock_data_dateIndexed.reindex(pd.date_range(start=self.stock_data_dateIndexed.index.min(), end=self.stock_data_dateIndexed.index.max(), freq='D'))

        # Handle missing values (replace NaN with appropriate method)
        self.stock_data_dateIndexed = self.stock_data_dateIndexed.fillna(method='bfill')  # Forward fill missing values

        # Set the frequency of the index
        self.stock_data_dateIndexed.index.freq = 'D'  # Set the frequency to daily
        
    def check_and_batch_new_data(self):
        # Directory to save the newbatch
        stock_directory = "aex_data/"
        output_directory = os.path.join(cwd, stock_directory)
        # Load the existing data
        existing_data = self.stock_data

        # Get the last date used in the existing data
        last_date_used_existing = existing_data["Date"].max()
        
        # Fetch historical data for the stock
        new_stock_data = yf.download(self.aex_exchange_symbol, period='max')
        
        new_stock_data['Date'] = new_stock_data.index

        # Filter the new data for dates after the last date used in the existing data
        new_data_batch = new_stock_data[new_stock_data["Date"] > last_date_used_existing]

        # Get the last date used in the existing data
        last_date_used_new = new_data_batch["Date"].max()
        
        # Save the dataframe to CSV
        file_path = os.path.join(output_directory, f"{self.ticker_symbol}_data_{last_date_used_existing}_{last_date_used_new}_testing_data.csv")
        new_data_batch.to_csv(file_path, mode= 'w', index=False)
        print (f'Dataframe has been written to {file_path} with overwriting permissions')
        
        # Create testing data
        self.testing_data = new_data_batch

    def timeSeriesSeasonality(self):
        stock_dates = self.stock_data_dateIndexed["Date"]
        stock_prices_close = self.stock_data_dateIndexed["Close"]
        
        # stock_prices_close_numericalindices will keep the original index
        #stock_prices_close_numericalindices = stock_prices_close.copy()

        # Convert the index to a DatetimeIndex for stock_prices_close_dateindices
        #stock_prices_close_dateindices = stock_prices_close.copy()
        #stock_prices_close_dateindices.index = pd.to_datetime(stock_dates, format='%Y-%m-%d')
        #print(stock_prices_close)
        #print(stock_prices_close_numericalindices)
        
        # Plot the time series data
        plt.figure(figsize=(10, 6))
        plt.plot(stock_prices_close)
        plt.title('Stock Prices Over Time')
        plt.xlabel('Time')
        plt.ylabel('Price')
        plt.show()

        # ACF plot
        plt.figure(figsize=(10, 6))
        sm.graphics.tsa.plot_acf(stock_prices_close, lags=20)
        plt.title('Autocorrelation Function (ACF) Plot')
        plt.xlabel('Lag')
        plt.ylabel('Autocorrelation')
        plt.show()

        # PACF plot
        plt.figure(figsize=(10, 6))
        sm.graphics.tsa.plot_pacf(stock_prices_close, lags=20)
        plt.title('Partial Autocorrelation Function (PACF) Plot')
        plt.xlabel('Lag')
        plt.ylabel('Partial Autocorrelation')
        plt.show()

        # Seasonal decomposition
        decomposition = sm.tsa.seasonal_decompose(stock_prices_close, model='additive')
        self.decomposition = decomposition
        print(type(decomposition))
        fig, axes = plt.subplots(4, 1, figsize=(10, 8))
        axes[0].plot(stock_prices_close, label='Original')
        axes[0].legend(loc='upper left')
        axes[1].plot(decomposition.trend, label='Trend')
        axes[1].legend(loc='upper left')
        axes[2].plot(decomposition.seasonal, label='Seasonal')
        axes[2].legend(loc='upper left')
        axes[3].plot(decomposition.resid, label='Residual')
        axes[3].legend(loc='upper left')
        plt.tight_layout()
        plt.show()


    def timeSeriesArima(self):
        # Load stock market data 

        # Assuming 'Close' prices are in a column named 'Close'
        # For simplicity, we'll work with a single stock's closing prices

        stock_prices_close = self.stock_data["Close"]

        # Fit ARIMA model
        model = ARIMA(stock_prices_close, order=(1, 1, 1))  # Example order, replace with appropriate values
        model_fit = model.fit()

        # Summary of the model
        print(model_fit.summary())

        # Forecast future values
        forecast = model_fit.forecast(steps=5)  # Example steps for forecasting 5 future time points
        print("Forecasted values:", forecast)
        
    def data_tsa_visualisation(self):
        stock_dates = self.stock_data_dateIndexed["Date"]
        stock_prices_close = self.stock_data_dateIndexed["Close"]
        
        # Plotting histogram of stock prices
        plt.hist(stock_prices_close, bins=20, color='blue', edgecolor='black')
        plt.title('Distribution of Stock Prices')
        plt.xlabel('Price')
        plt.ylabel('Frequency')
        plt.show()

        # Plotting time series of stock prices
        plt.figure(figsize=(10, 6))
        plt.plot(stock_dates.values, stock_prices_close.values, color='green')
        plt.title('Stock Prices Over Time')
        plt.xlabel('Date')
        plt.ylabel('Price')
        plt.show()
        
        # Calculate correlation matrix
        correlation_matrix = self.stock_data_dateIndexed.select_dtypes(include='number').corr()

        # Plotting heatmap of correlations
        plt.figure(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
        plt.title('Correlation Heatmap')
        plt.show()
        
    def residuals_analysis(self):

        # Assuming you already have the decomposed components stored in `decomposition`
        # Access the residuals from the decomposition
        residuals = self.decomposition.resid
        print(f"Total null values in the Dataframe - residuals: {residuals.isnull().sum().sum()}")
        if residuals.isnull().sum().sum() > 0:
            residuals = residuals.dropna(axis=0, how='any')
        
        if residuals.isin([np.inf, -np.inf]).any().any():
            print("DataFrame contains infinite values")
        else:
            print("DataFrame contains -NO- infinite values")

        # Plotting the residuals
        plt.figure(figsize=(10, 6))
        plt.plot(residuals)
        plt.title('Residuals of Time Series Decomposition')
        plt.xlabel('Time')
        plt.ylabel('Residual')
        plt.show()

        # Autocorrelation plot of residuals
        plt.figure(figsize=(10, 6))
        sm.graphics.tsa.plot_acf(residuals, lags=20)
        plt.title('Autocorrelation Function (ACF) of Residuals')
        plt.xlabel('Lag')
        plt.ylabel('Autocorrelation')
        plt.show()

        # Histogram of residuals
        plt.figure(figsize=(10, 6))
        plt.hist(residuals, bins=20, density=True)
        plt.title('Histogram of Residuals')
        plt.xlabel('Residual')
        plt.ylabel('Density')
        plt.show()

        # Perform statistical tests for stationarity or autocorrelation on residuals
        # For example:
        # Perform Augmented Dickey-Fuller test for stationarity
        adf_test_result = sm.tsa.stattools.adfuller(residuals)
        print("ADF test statistic:", adf_test_result[0])
        print("p-value:", adf_test_result[1])
        print("Critical values:", adf_test_result[4])






In [7]:
ticker_symbol = "ABN"
sp_ABN = stockPredictor(ticker_symbol, df_ABN)
#sp_ABN.describe_stock_data()

In [8]:
#sp_ABN.describe_columns()

In [9]:
sp_ABN.handleMissingValues()

In [10]:
#sp_ABN.stock_data_dateIndexed

In [11]:
#sp_ABN.timeSeriesSeasonality()

In [12]:
#sp_ABN.timeSeriesArima()

- AIC (Akaike Information Criterion): A measure of the relative quality of a statistical model, with a lower AIC indicating a better-fitting model. It's used for model selection among a set of candidate models.

- BIC (Bayesian Information Criterion): Similar to AIC, but it penalizes the number of parameters more heavily, favoring simpler models.

In [13]:
#sp_ABN.data_tsa_visualisation()

In [14]:
#sp_ABN.residuals_analysis()

In [15]:
sp_ABN.check_and_batch_new_data()

[*********************100%%**********************]  1 of 1 completed

Dataframe has been written to C:\Users\hugos\Desktop\PersonalProject\predictiveForecasting\aex_data/ABN_data_2024-04-12_NaT_testing_data.csv with overwriting permissions



