In [None]:
# Data Management
import pandas as pd
import numpy as np
from ta import add_all_ta_features
from pandas_datareader.data import DataReader
import yfinance as yf

# Statistics
from statsmodels.tsa.stattools import adfuller

# Unsupervised Machine Learning
from sklearn.decomposition import PCA

# Supervised Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

# Graphing
import matplotlib.pyplot as plt

In [None]:
# Silence noisy library warnings so the cell outputs stay readable
import warnings

for ignored_category in (FutureWarning, RuntimeWarning):
    warnings.simplefilter(action="ignore", category=ignored_category)

# Turn off pandas' chained-assignment warning (the one that recommends .loc/.iloc-style assignment)
pd.options.mode.chained_assignment = None

### Initial Data Extraction


In [None]:
# TIMEFRAME
# Date bounds passed to yfinance as `start` / `end`, and the ticker symbol to download
start_date = "2017-01-01"
end_date = "2022-06-01"
symbol = "^VIX"

# FETCH USING YAHOO FINANCE
# Ask for daily bars through `interval`. `period` is a look-back window, not a bar
# size; it is redundant once start/end are supplied, so it is no longer passed.
tickerData = yf.Ticker(symbol)
df = tickerData.history(interval="1d", start=start_date, end=end_date)

# Fail here with a clear message instead of somewhere further down the notebook
if df.empty:
    raise ValueError(f"No price data returned for {symbol} between {start_date} and {end_date}")

# REFINE COLUMNS
# Keep only the OHLCV columns. .copy() yields an independent frame so that later
# column assignments do not write into a slice of the downloaded data.
df = df[["High", "Low", "Open", "Close", "Volume"]].copy()


## Feature Engineering

In [None]:
# Add the `ta` library's technical-analysis feature columns to the price frame.
# The mapping tells the library which column plays each OHLCV role.
ohlcv_columns = {
    "open": "Open",
    "high": "High",
    "low": "Low",
    "close": "Close",
    "volume": "Volume",
}
df = add_all_ta_features(df, fillna=True, **ohlcv_columns)

n_rows, n_cols = df.shape
print(f"Rows (dates): {n_rows}\nColumns (technical analysis): {n_cols}")

In [None]:
# Classify every column of df with an Augmented Dickey-Fuller (ADF) test.
#   non_stationaries       -> columns that fail the stationarity criteria below
#   columns_with_constants -> columns holding a single repeated value (not tested)
non_stationaries = []
columns_with_constants = []

for col in df.columns:
    series = df[col]

    # A column with at most one distinct value is recorded and skipped
    if series.nunique() <= 1:
        columns_with_constants.append(col)
        print(f"Skipping constant column: {col}")
        continue

    # Result tuple: [0] is the test statistic, [1] the p-value,
    # [4] a dict of critical values keyed by significance level
    adf_result = adfuller(series.values)
    adf_statistic = adf_result[0]
    p_value = adf_result[1]
    critical_value_1pct = adf_result[4]["1%"]

    # True when the statistic lies below the 1% critical value
    t_test = adf_statistic < critical_value_1pct

    # Stationary only if the p-value is not above 0.05 AND the statistic check passes;
    # everything else is flagged as non-stationary
    if p_value > 0.05 or not t_test:
        non_stationaries.append(col)

print(f"Non-Stationaries found:  {len(non_stationaries)}")


In [None]:
# Build the stationary data set on a copy so that df itself stays untouched
df_stationary = df.copy()

# Replace each column flagged as non-stationary with its row-over-row percent change
pct_changes = df_stationary[non_stationaries].pct_change()
df_stationary[non_stationaries] = pct_changes

# Drop the first row: pct_change has no previous row to compare it against
df_stationary = df_stationary.iloc[1:]

In [None]:
# Remove every column that contains at least one NaN.
# na_list holds the labels of those columns.
na_list = df_stationary.columns[df_stationary.isna().any().to_numpy()]

# Rebind to the frame returned by drop instead of using inplace=True, so the
# frame produced by the earlier iloc slice is never mutated in place
df_stationary = df_stationary.drop(columns=na_list)

In [None]:
# Swap positive and negative infinity for 0 across the whole frame
infinite_values = [np.inf, -np.inf]
df_stationary = df_stationary.replace(to_replace=infinite_values, value=0)

# Report the resulting dimensions, then display the frame as the cell output
print(df_stationary.shape)
df_stationary