In [38]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm 
import pandas as pd
from yahoo_fin.stock_info import get_data

## Scrape data from Yahoo

In [228]:
# Download the ^VIX history and keep only the 2014-01-02 .. 2021-09-20 window.
dfY = get_data('^VIX')['2014-01-02':'2021-09-20']
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.0 no longer accepts. Reassigning also avoids mutating a slice in place.
dfY = dfY.drop(columns=['open', 'high', 'low', 'close', 'volume', 'ticker'])
# Move the date index into a regular column so it can be used as a join key.
dfY = dfY.reset_index()
# Exactly two columns must remain here: the former index and the one price
# column that was not dropped (presumably the adjusted close -- confirm).
dfY.columns = ['Trade Date', 'VIX']

dfY.head()

Unnamed: 0,Trade Date,VIX
0,2014-01-02,14.23
1,2014-01-03,13.76
2,2014-01-06,13.55
3,2014-01-07,12.92
4,2014-01-08,12.87


## Scrape data from CBOE

In [2]:
# Fetch the CBOE historical-data index page.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly instead of parsing an HTTP error page.
page = requests.get(
    'https://www.cboe.com/us/futures/market_statistics/historical_data/',
    timeout=30,
)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

# Every <li class="mbn"> element is collected; later cells read the <a> inside each one.
# find_all is the current name of the legacy findAll alias.
dfs = soup.find_all('li', {'class': 'mbn'})

In [45]:
# Parallel accumulators: one entry per CSV row across all downloaded files.
TradeDates, Name, Settle = [], [], []

for item in dfs:
    label = item.a.text
    # Skip entries whose label has digits at offsets 3-4 after the first '+'
    # (presumably the weekly contracts -- confirm against the page).
    if label.split('+')[1][3:5].isnumeric():
        continue

    href = item.a['href']
    # Skip files whose fourth path segment starts with the year 2013.
    if href.split('/')[3].split('-')[0] == '2013':
        continue

    contract = pd.read_csv('https://www.cboe.com/us/futures/market_statistics/historical_data/' + href)
    TradeDates += list(contract['Trade Date'].values)
    Settle += list(contract['Settle'].values)
    # The two characters at label[-11:-9] are stored as the contract name for every row.
    Name += [label[-11:-9]] * contract.shape[0]

dataframe = pd.DataFrame(
    list(zip(TradeDates, Name, Settle)),
    columns=['Trade Date', 'Name', 'Settle'],
)


In [46]:
# Preview the last rows of the combined futures table.
dataframe.tail()

Unnamed: 0,Trade Date,Name,Settle
18227,2014-12-11,Z4,19.1
18228,2014-12-12,Z4,19.6
18229,2014-12-15,Z4,19.6
18230,2014-12-16,Z4,23.1
18231,2014-12-17,Z4,24.09


## Data Preparation

In [49]:
# Parse the trade dates so they can be matched against the VIX dates in the
# later join (presumably datetime64 on that side -- confirm).
# errors='coerce' turns unparseable entries into NaT; the previous
# errors='ignore' is deprecated and silently left the whole column as strings
# whenever a single value failed to parse.
dataframe['Trade Date'] = pd.to_datetime(dataframe['Trade Date'], errors='coerce')

In [40]:
# Placeholder frame: ten empty columns named 'feature 1' .. 'feature 10', no rows.
feat = [f'feature {n}' for n in range(1, 11)]
data = pd.DataFrame(columns=feat)
data

Unnamed: 0,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,feature 8,feature 9,feature 10


In [211]:
# Attach the VIX value to each futures row by trade date, then concatenate with
# the empty placeholder frame so the result also carries the feature columns.
new = pd.concat([dataframe.join(dfY.set_index('Trade Date'), on='Trade Date'), data])

# Keep only rows that have a VIX value, ordered from newest to oldest trade date.
new = new.loc[new['VIX'].notna()].sort_values(by='Trade Date', ascending=False)

new.head()

Unnamed: 0,Trade Date,Name,Settle,VIX,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,feature 8,feature 9,feature 10
2425,2021-09-20,Z1,24.8125,25.709999,,,,,,,,,,
310,2021-09-20,M2,25.825,25.709999,,,,,,,,,,
102,2021-09-20,F2,25.4788,25.709999,,,,,,,,,,
2151,2021-09-20,V1,24.4629,25.709999,,,,,,,,,,
309,2021-09-20,K2,25.825,25.709999,,,,,,,,,,


In [212]:
# Month letter -> month number (1..12), in the order F, G, H, J, K, M, N, Q, U, V, X, Z.
dicMonths = dict(zip('FGHJKMNQUVXZ', range(1, 13)))
# Last digit of the year -> full year, for the years 2014 through 2022 only.
dicYears = {str(year % 10): year for year in range(2014, 2023)}


def turnNameToDate(name):
    """Convert a two-character contract code such as 'Z4' into a 'YYYY-M' string.

    The first character is looked up in ``dicMonths`` and the second in
    ``dicYears``; the month is not zero-padded (e.g. 'U1' -> '2021-9').

    Raises:
        KeyError: if either character is missing from its lookup table.
        IndexError: if ``name`` has fewer than two characters.
    """
    month_code = name[0]
    year_digit = name[1]
    month_number = dicMonths[month_code]
    year_number = dicYears[year_digit]
    return f'{year_number}-{month_number}'
    
# Translate each contract code in 'Name' into a 'YYYY-M' string.
new['dateOfFile'] = new['Name'].apply(turnNameToDate)

In [213]:
def _year_month(trade_date):
    """Format a trade date as 'YYYY-M' (month not zero-padded)."""
    stamp = pd.to_datetime(trade_date)
    return str(stamp.year) + '-' + str(stamp.month)


# Same 'YYYY-M' layout as 'dateOfFile', so the two columns can be compared directly.
new['ToCompareWith'] = new['Trade Date'].apply(_year_month)

In [214]:
# Keep only the rows whose trade year-month equals the year-month decoded from the contract name.
new = new.loc[new['ToCompareWith'].eq(new['dateOfFile'])]
new.shape

(1194, 16)

In [215]:
# Use the trade date as the row index from here on.
new = new.set_index('Trade Date')
new.head()

Unnamed: 0_level_0,Name,Settle,VIX,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,feature 8,feature 9,feature 10,dateOfFile,ToCompareWith
Trade Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-09-15,U1,19.79,18.18,,,,,,,,,,,2021-9,2021-9
2021-09-14,U1,19.715,19.459999,,,,,,,,,,,2021-9,2021-9
2021-09-13,U1,19.298,19.370001,,,,,,,,,,,2021-9,2021-9
2021-09-10,U1,20.6249,20.950001,,,,,,,,,,,2021-9,2021-9
2021-09-09,U1,19.3716,18.799999,,,,,,,,,,,2021-9,2021-9


In [216]:
# Distinct contract codes, in reverse order of first appearance.
uniqueMonthsYears = list(reversed(new['Name'].unique()))
# The first feature is the settle price of the row itself.
new['feature 1'] = new['Settle']

In [229]:
# Fill 'feature 2' .. 'feature 10': for each row, feature i+1 receives the
# 'Settle' of the row whose index equals this row's date shifted by i months.
# When no row exists for the shifted date the feature is left unchanged.
# NOTE(review): the value comes from a row dated i months *later* than the
# current row -- confirm that using later-dated settles is intended.
for date in new.index:
    base_date = pd.to_datetime(date)  # loop-invariant for the inner loop

    for i in range(1, 10):
        feature_date = base_date + pd.DateOffset(months=i)

        # Compare against the 'YYYY-MM-DD' text of the shifted date.
        matches = new.loc[new.index == str(feature_date).split()[0], 'Settle']
        if matches.empty:
            # Explicit "no data for that date" check instead of a bare except
            # that would also hide unrelated errors.
            continue

        # .iloc[0] is an explicit positional lookup; plain [0] on a
        # non-integer index relies on a deprecated fallback.
        new.at[date, f'feature {i+1}'] = matches.iloc[0]




In [226]:
# Drop the helper columns that were only needed to build the features.
new = new.drop(columns=['Name', 'Settle', 'dateOfFile', 'ToCompareWith'])
# An earlier cell already moved 'Trade Date' into the index, so an
# unconditional set_index here raises KeyError when the notebook is run top to
# bottom. Only set it if the column is still present.
if 'Trade Date' in new.columns:
    new = new.set_index('Trade Date')
new.sample(5)

Unnamed: 0_level_0,VIX,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,feature 8,feature 9,feature 10
Trade Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-04-13,41.169998,40.575,35.075,,31.825,23.225,,26.775,23.325,,22.675
2019-01-15,18.6,18.825,,13.475,12.575,16.925,,12.875,21.175,,13.625
2017-07-12,10.3,11.525,,11.875,10.575,,10.375,10.575,25.825,16.175,18.825
2016-11-07,18.709999,16.025,13.325,,12.575,12.625,14.025,,11.875,12.575,11.125
2016-12-16,12.2,12.725,,,11.925,,11.075,11.325,,12.95,


In [227]:
# Write the final table (index included) to resultFuture.csv in the working directory.
new.to_csv('resultFuture.csv')