In [1]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm 
import pandas as pd
from yahoo_fin.stock_info import get_data

## Scrape data from Yahoo

In [58]:
# Daily VIX index values for the study window, reshaped into a two-column
# frame ['Trade Date', 'VIX'].
# The six named columns are dropped; whatever single column remains
# (presumably the adjusted close -- confirm against yahoo_fin) becomes 'VIX'.
# `columns=` is used instead of a positional axis argument, which recent
# pandas versions no longer accept, and the chain avoids mutating a slice
# in place.
dfY = (
    get_data('^VIX')['2014-01-02':'2021-09-20']
    .drop(columns=['open', 'high', 'low', 'close', 'volume', 'ticker'])
    .reset_index()
)
dfY.columns = ['Trade Date', 'VIX']

dfY.head()

Unnamed: 0,Trade Date,VIX
0,2014-01-02,14.23
1,2014-01-03,13.76
2,2014-01-06,13.55
3,2014-01-07,12.92
4,2014-01-08,12.87


## Scrape data from CBOE

In [199]:
# Download the CBOE historical-data index page and collect every
# <li class="mbn"> element; each one wraps a link to a per-contract CSV.
# A timeout keeps the cell from hanging forever, and raise_for_status()
# stops an HTTP error page from being parsed into an empty result.
page = requests.get(
    'https://www.cboe.com/us/futures/market_statistics/historical_data/',
    timeout=30,
)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

# find_all is the current name of the deprecated findAll alias
dfs = soup.find_all('li', {'class': 'mbn'})

366

In [None]:
# Futures month letter -> calendar month number.
dicMonths = {'F':1,'G':2,'H':3,'J':4,'K':5,'M':6,'N':7,'Q':8,'U':9,'V':10,'X':11,'Z':12}
# Single year digit -> calendar year.
# NOTE(review): one digit cannot tell decades apart (e.g. 2013 vs 2023) -- confirm
# the page never lists contracts outside 2013-2022.
dicYears = {'2':2022, '1':2021, '0':2020, '9':2019, '8':2018, '7':2017, '6':2016, '5':2015, '4':2014, '3':2013}


def _contract_sort_key(code):
    """Return (year, month) for a contract code.

    The first character of `code` is looked up as a month letter and the
    second as a year digit; a KeyError is raised for unknown characters.
    """
    return dicYears[code[1]], dicMonths[code[0]]


# Work on the reversed list. The sort below is stable, so this reversal
# decides the relative order of entries that share the same (year, month).
dfs = dfs[::-1]

# contract code = second whitespace-separated token of each link text
dff = [df.a.text.strip().split()[1] for df in dfs]

# Sort positions rather than looking codes up with list.index(): index()
# always returns the FIRST match, so entries with the same code would be
# duplicated or dropped (and the lookup is quadratic).
idx = sorted(range(len(dff)), key=lambda pos: _contract_sort_key(dff[pos]))
sorted_dff = [dff[pos] for pos in idx]

#get the sorted dataframes
dfs = [dfs[pos] for pos in idx]

In [200]:
# Discard the weekly entries (link text of the form VX + VXT01 ...) and
# every entry whose href carries the year 2013.
kept_entries = []
for entry in dfs:
    # characters 3-4 after the '+' are digits only for the weekly entries
    if entry.a.text.split('+')[1][3:5].isnumeric():
        continue
    # fourth path segment of the href starts with the year
    if entry.a['href'].split('/')[3].split('-')[0] == '2013':
        continue
    kept_entries.append(entry)
dfs = kept_entries
len(dfs)

102

In [202]:
# Stitch the per-contract CSVs into one continuous ['Futures', 'Settle']
# series indexed by 'Trade Date'.
settle_cols = ['Futures', 'Settle']
newdata = pd.DataFrame(columns=settle_cols)

for entry in dfs:
    # each list item links to one contract's CSV, relative to the index page
    link = entry.a['href']
    contract = pd.read_csv('https://www.cboe.com/us/futures/market_statistics/historical_data/' + link)
    contract = contract.set_index('Trade Date')

    if len(newdata) > 0:
        # Drop the last accumulated row and resume from that same date in
        # the next contract, so the hand-over date comes from the new one.
        newdata = pd.concat([newdata[:-1], contract.loc[last:, settle_cols]])
    else:
        # first contract: take its whole history
        newdata = contract[settle_cols]

    # date at which the next contract takes over
    last = newdata.index[-1]
            

In [203]:
# Keep an independent snapshot of the stitched series. A plain assignment
# would only alias the same object, so any later in-place change to
# `newdata` would silently alter the backup as well.
save = newdata.copy()
newdata.sample(5)

Unnamed: 0_level_0,Futures,Settle
Trade Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-12-04,Z (Dec 2019),15.325
2020-04-30,K (May 2020),33.975
2014-11-17,X (Nov 2014),14.35
2019-08-29,U (Sep 2019),18.475
2020-07-27,Q (Aug 2020),27.525


## Data Preparation

In [208]:
# Names of the ten feature columns, 'feature 1' through 'feature 10',
# and an empty frame that carries them as its columns.
feat = [f'feature {number}' for number in range(1, 11)]
data = pd.DataFrame(columns=feat)
data

Unnamed: 0,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,feature 8,feature 9,feature 10


In [205]:
# Turn the 'Trade Date' index back into an ordinary column.
# Guarded so that re-running the cell does not reset the index a second
# time and add a stray 'index' column.
if 'Trade Date' not in newdata.columns:
    newdata = newdata.reset_index()
newdata.head(2)

Unnamed: 0,Trade Date,Futures,Settle
0,2013-04-22,F (Jan 2014),0.0
1,2013-04-23,F (Jan 2014),0.0


In [206]:
# Parse the trade dates so they can be joined against the VIX frame.
# errors='coerce' turns an unparseable value into NaT for that row only;
# errors='ignore' (deprecated in recent pandas) would instead hand the whole
# column back as unparsed strings on a single bad value.
newdata['Trade Date'] = pd.to_datetime(newdata['Trade Date'], errors='coerce')

In [209]:
# Attach the VIX value of each trade date to the futures series
# (DataFrame.join keeps every row of `newdata`).
new = newdata.join(dfY.set_index('Trade Date'), on='Trade Date')

# Append the still-empty feature columns. reindex creates them filled with
# NaN and leaves the existing rows untouched; this replaces concatenating an
# empty frame, whose dtype handling is deprecated in recent pandas.
extra_cols = [col for col in data.columns if col not in new.columns]
new = new.reindex(columns=[*new.columns, *extra_cols])

# keep only the dates for which a VIX value exists, newest first
new = new[new['VIX'].notna()]
new = new.sort_values(ascending=False, by='Trade Date')

new.head()

Unnamed: 0,Trade Date,Futures,Settle,VIX,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,feature 8,feature 9,feature 10
2121,2021-09-20,V (Oct 2021),24.4629,25.709999,,,,,,,,,,
2120,2021-09-17,V (Oct 2021),21.9377,20.809999,,,,,,,,,,
2119,2021-09-16,V (Oct 2021),20.5556,18.690001,,,,,,,,,,
2118,2021-09-15,V (Oct 2021),20.8379,18.18,,,,,,,,,,
2117,2021-09-14,U (Sep 2021),19.715,19.459999,,,,,,,,,,


In [210]:
# Use the trade date as the row index from here on.
new = new.set_index('Trade Date')
new.head()

Unnamed: 0_level_0,Futures,Settle,VIX,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,feature 8,feature 9,feature 10
Trade Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-09-20,V (Oct 2021),24.4629,25.709999,,,,,,,,,,
2021-09-17,V (Oct 2021),21.9377,20.809999,,,,,,,,,,
2021-09-16,V (Oct 2021),20.5556,18.690001,,,,,,,,,,
2021-09-15,V (Oct 2021),20.8379,18.18,,,,,,,,,,
2021-09-14,U (Sep 2021),19.715,19.459999,,,,,,,,,,


In [211]:
# 'feature 1' is the same-day settlement value.
new = new.assign(**{'feature 1': new['Settle']})

In [None]:
# fill the features from 'feature 2' to 'feature 10'
# 'feature k+1' of a row is the 'Settle' value found exactly k calendar
# months after that row's trade date, or NaN when no row has that date.
#
# This is one vectorised lookup per offset instead of a Python loop over
# every row. The old loop rescanned the whole index for each lookup, used
# positional [0] on a date-indexed Series, and hid every error behind a
# bare `except: pass`.
trade_dates = pd.to_datetime(new.index)

# Settle keyed by trade date; when a date occurs more than once the first
# occurrence wins, matching the previous "take element 0" behaviour.
settle_lookup = pd.Series(new['Settle'].to_numpy(), index=trade_dates)
settle_lookup = settle_lookup[~settle_lookup.index.duplicated(keep='first')]

for i in range(1, 10):
    # target date = trade date + i months, truncated to midnight as before
    target_dates = (trade_dates + pd.DateOffset(months=i)).normalize()
    # reindex yields NaN for target dates that are not trading days in `new`
    new[f'feature {i+1}'] = settle_lookup.reindex(target_dates).to_numpy()

In [227]:
# The raw contract label and settlement price are no longer needed once the
# feature columns exist.
new = new.drop(columns=['Futures', 'Settle'])
new.head(5)

Unnamed: 0_level_0,VIX,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,feature 8,feature 9,feature 10
Trade Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-09-20,25.709999,24.4629,,,,,,,,,
2021-09-17,20.809999,21.9377,,,,,,,,,
2021-09-16,18.690001,20.5556,,,,,,,,,
2021-09-15,18.18,20.8379,,,,,,,,,
2021-09-14,19.459999,19.715,,,,,,,,,


In [224]:
# Write the final table (index included) to a CSV file in the working directory.
new.to_csv(path_or_buf='resultat.csv')