In [46]:
"""
``autoreload`` is an IPython extension that reloads modules
automatically before executing the line of code typed.
"""

# Register the autoreload extension with this kernel. Re-running this cell in
# a live kernel prints an "already loaded" notice, which is harmless.
%load_ext autoreload
# Mode 2: reload all imported modules before each execution, so edits to
# project ``.py`` files are picked up without restarting the kernel.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
import pandas as pd
import numpy as np
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

In [48]:
# Notebooks have no ``__file__``, so the project root is located relative to
# the kernel's working directory instead: two levels above it. This is the same
# directory the previous ``Path('__file__').resolve().parents[2]`` expression
# produced (the quoted '__file__' was only a dummy file name under the cwd).
# NOTE(review): assumes the kernel is started from the notebook's own folder.
PROJECT_ROOT = Path.cwd().resolve().parent.parent

# Untouched source CSVs, and the cleaned/derived CSVs built from them.
RAW_DATA_PATH = PROJECT_ROOT / 'data' / '01_raw'
PROCESSED_DATA_PATH = PROJECT_ROOT / 'data' / '02_processed'

In [49]:
# Notebook-wide matplotlib defaults: sans-serif text and a 15x8 inch canvas.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'figure.figsize': [15, 8],
})

In [50]:
def read_and_format_csv(subfolder_path: Path, raw_path: bool = True) -> pd.DataFrame:
    """Load one stock-price CSV and normalise its column dtypes.

    Args:
        subfolder_path: Location of the CSV relative to the chosen data
            directory, e.g. ``'Energy/XOM.csv'`` (a ``str`` works as well as
            a ``Path``).
        raw_path: When True (default) the file is looked up under
            ``RAW_DATA_PATH``; otherwise under ``PROCESSED_DATA_PATH``.

    Returns:
        The CSV contents with
        * ``Open``, ``High``, ``Low``, ``Close``, ``Adj Close`` as ``float32``,
        * ``stock_name`` as string,
        * ``Volume`` as ``int64``,
        * ``Date`` as timezone-aware datetimes in ``America/New_York``.

    Raises:
        FileNotFoundError: If the CSV does not exist.
        KeyError: If any of the columns listed above is missing.
    """
    base_dir = RAW_DATA_PATH if raw_path else PROCESSED_DATA_PATH
    df = pd.read_csv(base_dir / subfolder_path)

    price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']
    df = df.astype({
        **{col: 'float32' for col in price_columns},
        'stock_name': 'str',
        # int64, not int32: daily share volume can exceed 2**31 - 1 for
        # heavily traded tickers, and a narrower cast would overflow.
        'Volume': 'int64',
    })

    # Parse as UTC first so mixed/explicit offsets collapse to one timeline,
    # then express the timestamps in exchange-local (New York) time.
    # NOTE(review): timestamps stored as midnight UTC end up on the previous
    # calendar day in New York time - confirm this is intended.
    df['Date'] = (
        pd.to_datetime(df['Date'], format='ISO8601', utc=True)
        .dt.tz_convert('America/New_York')
    )
    return df

In [51]:
# Load the XOM price history from the raw data folder (raw_path defaults to
# True) and preview the first five rows.
df = read_and_format_csv('Energy/XOM.csv')
df.head(5)

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,stock_name
0,1962-01-01 19:00:00-05:00,0.092624,1.578125,1.589844,1.578125,0.0,902400,XOM
1,1962-01-02 19:00:00-05:00,0.094,1.601563,1.601563,1.578125,0.0,1200000,XOM
2,1962-01-03 19:00:00-05:00,0.094229,1.605469,1.613281,1.597656,0.0,1088000,XOM
3,1962-01-04 19:00:00-05:00,0.092166,1.570313,1.613281,1.566406,0.0,1222400,XOM
4,1962-01-07 19:00:00-05:00,0.091937,1.566406,1.582031,1.546875,0.0,1388800,XOM


In [53]:
import yfinance as yf

NOTE: all prices are in USD.

In [54]:
# Download the MSFT price history from Yahoo Finance.
# ``period`` and ``auto_adjust`` are pinned explicitly so the result keeps the
# full history and the separate 'Adj Close' column shown below.
# NOTE(review): yfinance's defaults for these arguments appear to differ
# between releases - confirm against the installed version.
dataset = yf.download('MSFT', period='max', auto_adjust=False).reset_index()
# The downloaded frame may carry two-level column labels; keep only the first
# level so columns are plain names such as 'Close' and 'Volume'.
dataset.columns = dataset.columns.get_level_values(0)
dataset['Stock label'] = 'MSFT'
dataset.tail()

[*********************100%***********************]  1 of 1 completed


Price,Date,Adj Close,Close,High,Low,Open,Volume,Stock label
9743,2024-11-08 00:00:00+00:00,422.540009,422.540009,426.5,421.779999,425.320007,16891400,MSFT
9744,2024-11-11 00:00:00+00:00,418.01001,418.01001,424.809998,416.0,422.519989,24503300,MSFT
9745,2024-11-12 00:00:00+00:00,423.029999,423.029999,424.440002,417.200012,418.25,19401200,MSFT
9746,2024-11-13 00:00:00+00:00,425.200012,425.200012,429.329987,418.209991,421.640015,21502200,MSFT
9747,2024-11-14 00:00:00+00:00,426.890015,426.890015,428.170013,420.0,425.0,30213400,MSFT


In [55]:
# Data-quality gate: stop here if any cell of ``df`` is missing.
# ``assert`` is a statement, so it is written without call-style parentheses
# (wrapping condition and message in parentheses would create an always-true
# tuple).
assert df.isna().sum().sum() == 0, 'df contains missing values'

In [56]:
# for label in ['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits']:
#     f = plt.figure()
#     f.gca().set_title(label=label)
#     sns.histplot(df[label], bins=30)

In [2]:
import pandas as pd
from pathlib import Path

In [3]:
# Notebooks have no ``__file__``; locate the project root as two levels above
# the kernel's working directory (the same directory the previous
# ``Path('__file__').resolve().parents[2]`` expression produced) and point at
# the ARIMA order-search results stored under ``models/``.
# NOTE(review): assumes the kernel is started from the notebook's own folder.
INP = Path.cwd().resolve().parent.parent / 'models' / 'arima_order_aic_bic.csv'
INP

WindowsPath('C:/Users/Lyndon/Documents/University/CS3AM/coursework/CS3AM-COURSEWORK/models/arima_order_aic_bic.csv')

In [4]:
# Read the saved (p, q) grid with its AIC/BIC scores; pandas accepts the
# Path object directly.
df = pd.read_csv(INP)
df

Unnamed: 0,p,q,aic,bic
0,0,0,-47901.533077,-47886.908547
1,0,1,-47902.836705,-47880.899910
2,0,2,-47907.275680,-47878.026620
3,0,3,-47917.206046,-47880.644721
4,0,4,-47922.514515,-47878.640925
...,...,...,...,...
724,26,22,-47718.180635,-47352.567386
725,26,23,-47874.818635,-47501.893120
726,26,24,-47870.448517,-47490.210738
727,26,25,-47873.064450,-47485.514406


In [5]:
# Rank the candidate orders from lowest to highest AIC, using BIC to break
# ties (ascending order is the sort_values default).
df.sort_values(['aic', 'bic'])

Unnamed: 0,p,q,aic,bic
39,1,12,-47928.255018,-47818.571043
378,14,0,-47928.093784,-47811.097544
324,12,0,-47927.741448,-47825.369738
12,0,12,-47927.658823,-47825.287113
13,0,13,-47927.310546,-47817.626572
...,...,...,...,...
724,26,22,-47718.180635,-47352.567386
310,11,13,-47716.486629,-47526.367740
337,12,13,-47571.977913,-47374.546758
647,23,26,-46866.560731,-46493.635217
