In [162]:
import pandas as pd
import numpy as np
import seaborn as sns
import requests
import pandas as pd
from pandas_datareader import data 
import matplotlib.pyplot as plt

sns.set(style='darkgrid', context='talk', palette='Dark2')

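Example of web scraping: downloading one stock's price history (the variable below is named `tesla`, but the ticker actually requested is MSFT)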

In [209]:
# Date window for the download, as ISO yyyy-mm-dd strings. These module-level
# names are also read by close_print() and volume_print() further down.
start_date = '2000-01-01'
end_date = '2022-10-31'
# NOTE(review): the variable is called `tesla` but the ticker requested is
# 'MSFT' — confirm which of the two was intended.
tesla = data.DataReader('MSFT', 'yahoo', start_date, end_date)
tesla

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-03,59.312500,56.000000,58.687500,58.281250,53228400.0,36.555374
2000-01-04,58.562500,56.125000,56.781250,56.312500,54119000.0,35.320549
2000-01-05,58.187500,54.687500,55.562500,56.906250,64059600.0,35.692940
2000-01-06,56.937500,54.187500,56.093750,55.000000,54976600.0,34.497295
2000-01-07,56.125000,53.656250,54.312500,55.718750,62013600.0,34.948116
...,...,...,...,...,...,...
2022-10-25,251.039993,245.830002,247.259995,250.660004,34775500.0,250.660004
2022-10-26,238.300003,230.059998,231.169998,231.320007,82543200.0,231.320007
2022-10-27,233.690002,225.779999,231.039993,226.750000,40424600.0,226.750000
2022-10-28,236.600006,226.050003,226.240005,235.869995,40647700.0,235.869995


Once the appropriate tools are imported, getting market data from a free online source such as Yahoo Finance is straightforward. The `pandas_datareader` package provides simple remote data access to Yahoo Finance, so the download is as simple as:

In [199]:
def web_scrapper(tickers=('MSFT',), start_date='2000-01-01', end_date='2022-10-31'):
    """Download daily Yahoo Finance data and align it to every business day.

    Parameters
    ----------
    tickers : str or iterable of str, default ('MSFT',)
        Symbols to download. A single string is treated as one symbol.
        To fetch every constituent, pass the 'Symbol' column of
        "CSV/constituents.csv", e.g.
        ``web_scrapper(pd.read_csv("CSV/constituents.csv")["Symbol"])``.
    start_date, end_date : str
        Inclusive date window, 'YYYY-MM-DD'. Defaults cover 2000-01-01 to
        2022-10-31.

    Returns
    -------
    tuple
        ``(df, close, volume)``: the full downloaded frame, its 'Close'
        columns and its 'Volume' columns, each reindexed to all business
        days in the window and forward-filled.
    """
    # A bare string would otherwise be split into characters by list().
    symbols = [tickers] if isinstance(tickers, str) else list(tickers)

    # Use pandas_datareader.data.DataReader to load the desired data.
    df = data.DataReader(symbols, 'yahoo', start_date, end_date)

    # Every weekday should be present in the result. Weekdays with no quote
    # (e.g. public holidays) are filled with the latest available value.
    all_weekdays = pd.date_range(start=start_date, end=end_date, freq='B')  # B = business day frequency

    def _align(frame):
        # Reindex onto the business-day calendar, then carry the last value forward.
        return frame.reindex(all_weekdays).ffill()

    close = _align(df['Close'])
    volume = _align(df['Volume'])
    df = _align(df)

    # To cache the download, uncomment the lines below. Note that index=False
    # discards the date index from the saved files.
    # df.to_csv("CSV/sp500.csv",index=False)
    # close.to_csv("CSV/sp500-close.csv",index=False)
    # volume.to_csv("CSV/sp500-volume.csv", index=False)

    return df, close, volume


In [202]:
# web_scrapper() returns a (df, close, volume) tuple, so despite its name
# `dates` holds all three frames; dates[0] is the full reindexed frame.
dates = web_scrapper()

In [211]:
dates[0].index.values

array(['2000-01-03T00:00:00.000000000', '2000-01-04T00:00:00.000000000',
       '2000-01-05T00:00:00.000000000', ...,
       '2022-10-27T00:00:00.000000000', '2022-10-28T00:00:00.000000000',
       '2022-10-31T00:00:00.000000000'], dtype='datetime64[ns]')

In [166]:
# Load the three cached CSV exports in one pass: the full frame, the closing
# prices and the traded volumes.
sp500, close, volume = map(
    pd.read_csv,
    ("CSV/sp500.csv", "CSV/sp500-close.csv", "CSV/sp500-volume.csv"),
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [184]:
sp500

Unnamed: 0,Adj Close,Adj Close.1,Adj Close.2,Adj Close.3,Adj Close.4,Adj Close.5,Adj Close.6,Adj Close.7,Adj Close.8,Adj Close.9,...,Volume.495,Volume.496,Volume.497,Volume.498,Volume.499,Volume.500,Volume.501,Volume.502,Volume.503,Volume.504
0,MMM,AOS,ABT,ABBV,ABMD,ACN,ATVI,ADM,ADBE,AAP,...,BF.B,CERN,DISCA,DISCK,INFO,KSU,PBCT,VIAC,WLTW,XLNX
1,26.166324615478516,2.3924336433410645,9.262777328491211,,18.25,,1.2144209146499634,6.6519775390625,16.274673461914062,,...,,,,,,,,,,
2,25.126611709594727,2.3579587936401367,8.998124122619629,,17.8125,,1.1774803400039673,6.582683563232422,14.90939712524414,,...,,,,,,,,,,
3,25.854413986206055,2.351064443588257,8.981582641601562,,18.0,,1.1820976734161377,6.478750228881836,15.204174995422363,,...,,,,,,,,,,
4,27.933847427368164,2.316591501235962,9.295857429504395,,18.03125,,1.1590102910995483,6.513394355773926,15.328289985656738,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5952,118.5,51.798843,97.949997,149.820007,264.290009,280.609985,72.5,91.139999,323.790009,181.399994,...,,,,,,,,,,
5953,122.699997,51.649712,98.419998,152.509995,268.25,279.869995,72.480003,93.589996,320.480011,181.809998,...,,,,,,,,,,
5954,122.800003,51.600002,96.93,153.5,267.579987,278.839996,72.5,94.129997,318.649994,188.080002,...,,,,,,,,,,
5955,126.599998,54.380001,99.489998,147.610001,258.059998,287.779999,72.849998,94.879997,325.679993,190.669998,...,,,,,,,,,,


In [213]:
# Attach the business-day dates as a column, taken from the index of the frame
# returned by web_scrapper(). Assumes both frames have the same number of rows.
# NOTE(review): this adds a non-numeric column to `close`; later cells sample
# from close.columns and call pct_change / np.log on the whole frame —
# consider using the dates as the index instead. Confirm intended cell order.
close["Date"] = dates[0].index.values

In [214]:
close

Unnamed: 0,MMM,AOS,ABT,ABMD,ACN,ATVI,ADM,ADBE,AAP,AMD,...,WDC,WY,WHR,WMB,XEL,YUM,ZBRA,ZBH,ZION,Date
0,47.187500,3.614583,15.711531,18.250000,,1.369792,10.884354,16.390625,,15.500000,...,3.937500,69.875000,61.812500,23.198961,19.000000,6.706057,25.027779,,55.500000,2000-01-03
1,45.312500,3.562500,15.262630,17.812500,,1.328125,10.770975,15.015625,,14.625000,...,4.687500,67.250000,59.437500,22.684469,19.437500,6.571262,24.666668,,52.812500,2000-01-04
2,46.625000,3.552083,15.234574,18.000000,,1.333333,10.600907,15.312500,,15.000000,...,4.375000,70.812500,60.312500,24.087631,20.187500,6.604960,25.138889,,52.750000,2000-01-05
3,50.375000,3.500000,15.767643,18.031250,,1.307292,10.657596,15.437500,,16.000000,...,4.562500,74.125000,60.812500,24.602125,20.000000,6.548796,23.777779,,53.500000,2000-01-06
4,51.375000,3.645833,15.935981,17.937500,,1.343750,10.827664,16.187500,,16.250000,...,5.625000,71.687500,61.000000,25.256933,20.000000,6.402768,23.513889,,53.625000,2000-01-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5951,118.500000,52.099998,97.949997,264.290009,280.609985,72.500000,91.139999,323.790009,181.399994,61.470001,...,35.290001,30.910000,137.000000,31.850000,62.400002,111.239998,274.579987,110.730003,51.509998,2022-10-25
5952,122.699997,51.950001,98.419998,268.250000,279.869995,72.480003,93.589996,320.480011,181.809998,59.730000,...,35.279999,30.959999,134.740005,32.150002,62.570000,112.269997,273.179993,112.269997,50.400002,2022-10-26
5953,122.800003,51.900002,96.930000,267.579987,278.839996,72.500000,94.129997,318.649994,188.080002,58.599998,...,34.340000,31.010000,135.029999,32.500000,63.250000,113.910004,275.290009,111.559998,50.380001,2022-10-27
5954,126.599998,54.380001,99.489998,258.059998,287.779999,72.849998,94.879997,325.679993,190.669998,62.009998,...,35.500000,31.100000,139.910004,32.669998,65.370003,117.879997,288.000000,113.470001,51.540001,2022-10-28


In [168]:
sp500

Unnamed: 0,Adj Close,Adj Close.1,Adj Close.2,Adj Close.3,Adj Close.4,Adj Close.5,Adj Close.6,Adj Close.7,Adj Close.8,Adj Close.9,...,Volume.495,Volume.496,Volume.497,Volume.498,Volume.499,Volume.500,Volume.501,Volume.502,Volume.503,Volume.504
0,MMM,AOS,ABT,ABBV,ABMD,ACN,ATVI,ADM,ADBE,AAP,...,BF.B,CERN,DISCA,DISCK,INFO,KSU,PBCT,VIAC,WLTW,XLNX
1,26.166324615478516,2.3924336433410645,9.262777328491211,,18.25,,1.2144209146499634,6.6519775390625,16.274673461914062,,...,,,,,,,,,,
2,25.126611709594727,2.3579587936401367,8.998124122619629,,17.8125,,1.1774803400039673,6.582683563232422,14.90939712524414,,...,,,,,,,,,,
3,25.854413986206055,2.351064443588257,8.981582641601562,,18.0,,1.1820976734161377,6.478750228881836,15.204174995422363,,...,,,,,,,,,,
4,27.933847427368164,2.316591501235962,9.295857429504395,,18.03125,,1.1590102910995483,6.513394355773926,15.328289985656738,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5952,118.5,51.798843,97.949997,149.820007,264.290009,280.609985,72.5,91.139999,323.790009,181.399994,...,,,,,,,,,,
5953,122.699997,51.649712,98.419998,152.509995,268.25,279.869995,72.480003,93.589996,320.480011,181.809998,...,,,,,,,,,,
5954,122.800003,51.600002,96.93,153.5,267.579987,278.839996,72.5,94.129997,318.649994,188.080002,...,,,,,,,,,,
5955,126.599998,54.380001,99.489998,147.610001,258.059998,287.779999,72.849998,94.879997,325.679993,190.669998,...,,,,,,,,,,


In [169]:
# Row 0 of this column holds a ticker symbol ('MMM') rather than a price, so
# the Series comes back with dtype object instead of float.
# NOTE(review): looks like CSV/sp500.csv was written with a two-row header;
# reading it with header=[0, 1] would presumably avoid this — confirm.
sp500['Close']

0              MMM
1          47.1875
2          45.3125
3           46.625
4           50.375
           ...    
5952         118.5
5953    122.699997
5954    122.800003
5955    126.599998
5956    125.790001
Name: Close, Length: 5957, dtype: object

#### Preparing the Data

Let us assume we are interested in working with the Close prices, which have already been adjusted by Yahoo Finance to account for stock splits. We want to make sure that all weekdays are included in our dataset, which is very often desirable for quantitative trading strategies.

Of course, some of the weekdays might be public holidays in which case no price will be available. For this reason, we will fill the missing prices with the latest available prices:

Let's build a function that drops every column whose fraction of null values exceeds a threshold (10% in the code below)

In [170]:
def drop_null_columns(data: pd.DataFrame, threshold: float = 0.1) -> pd.DataFrame:
    """Return a copy of ``data`` without its mostly-empty columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    threshold : float, default 0.1
        Maximum tolerated fraction of nulls per column. Columns whose null
        fraction is strictly greater than this value are dropped.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the columns at or below the threshold.
    """
    # Mean of the boolean null mask is the per-column null fraction.
    nulls_fraction = data.isna().mean()
    drop_columns_list = list(nulls_fraction[nulls_fraction > threshold].index)
    return data.drop(columns=drop_columns_list)

In [171]:
close.isna().sum()

MMM        0
AOS        0
ABT        0
ABBV    3392
ABMD       0
        ... 
KSU     5956
PBCT    5956
VIAC    5956
WLTW    5956
XLNX    5956
Length: 505, dtype: int64

In [172]:
close = drop_null_columns(close)

In [173]:
volume = drop_null_columns(volume)

In [174]:
close.isna().sum()

MMM       0
AOS       0
ABT       0
ABMD      0
ACN     403
       ... 
XEL       0
YUM       0
ZBRA      0
ZBH     407
ZION      0
Length: 383, dtype: int64

#### Now we are going to create 100 random portfolios from the close and the volume data
#### Each portfolio is a random sample of 5 tickers drawn from the whole population

In [175]:
close

Unnamed: 0,MMM,AOS,ABT,ABMD,ACN,ATVI,ADM,ADBE,AAP,AMD,...,WST,WDC,WY,WHR,WMB,XEL,YUM,ZBRA,ZBH,ZION
0,47.187500,3.614583,15.711531,18.250000,,1.369792,10.884354,16.390625,,15.500000,...,7.562500,3.937500,69.875000,61.812500,23.198961,19.000000,6.706057,25.027779,,55.500000
1,45.312500,3.562500,15.262630,17.812500,,1.328125,10.770975,15.015625,,14.625000,...,7.640625,4.687500,67.250000,59.437500,22.684469,19.437500,6.571262,24.666668,,52.812500
2,46.625000,3.552083,15.234574,18.000000,,1.333333,10.600907,15.312500,,15.000000,...,7.609375,4.375000,70.812500,60.312500,24.087631,20.187500,6.604960,25.138889,,52.750000
3,50.375000,3.500000,15.767643,18.031250,,1.307292,10.657596,15.437500,,16.000000,...,7.531250,4.562500,74.125000,60.812500,24.602125,20.000000,6.548796,23.777779,,53.500000
4,51.375000,3.645833,15.935981,17.937500,,1.343750,10.827664,16.187500,,16.250000,...,7.609375,5.625000,71.687500,61.000000,25.256933,20.000000,6.402768,23.513889,,53.625000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5951,118.500000,52.099998,97.949997,264.290009,280.609985,72.500000,91.139999,323.790009,181.399994,61.470001,...,249.210007,35.290001,30.910000,137.000000,31.850000,62.400002,111.239998,274.579987,110.730003,51.509998
5952,122.699997,51.950001,98.419998,268.250000,279.869995,72.480003,93.589996,320.480011,181.809998,59.730000,...,254.399994,35.279999,30.959999,134.740005,32.150002,62.570000,112.269997,273.179993,112.269997,50.400002
5953,122.800003,51.900002,96.930000,267.579987,278.839996,72.500000,94.129997,318.649994,188.080002,58.599998,...,221.229996,34.340000,31.010000,135.029999,32.500000,63.250000,113.910004,275.290009,111.559998,50.380001
5954,126.599998,54.380001,99.489998,258.059998,287.779999,72.849998,94.879997,325.679993,190.669998,62.009998,...,225.509995,35.500000,31.100000,139.910004,32.669998,65.370003,117.879997,288.000000,113.470001,51.540001


In [176]:
def random_portfolios(data, n_portfolios=100, portfolio_size=5, seed=None):
    """Build random portfolios by sampling columns of ``data`` without replacement.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with one column per ticker.
    n_portfolios : int, default 100
        Number of portfolios to generate.
    portfolio_size : int, default 5
        Number of distinct columns in each portfolio.
    seed : int or None, default None
        Seed for the random generator. Pass a value to get the same
        portfolios on every run; None keeps the draw non-deterministic.

    Returns
    -------
    list of pandas.DataFrame
        ``n_portfolios`` frames, each holding ``portfolio_size`` columns.

    Raises
    ------
    ValueError
        If ``portfolio_size`` exceeds the number of columns in ``data``.
    """
    import random

    # A private generator keeps seeding from touching the global random state.
    rng = random.Random(seed)
    tickers_data = list(data.columns.values)
    return [
        data[rng.sample(tickers_data, portfolio_size)]
        for _ in range(n_portfolios)
    ]

In [177]:
list_of_portfolios = random_portfolios(close)

In [178]:
list_of_portfolios

[            GLW         ADI         TGT          GL         BLK
 0     40.604168   45.093750   36.031250   12.583333   16.625000
 1     36.666668   42.812500   34.468750   12.333333   15.875000
 2     35.833332   43.437500   33.687500   12.333333   15.750000
 3     35.000000   42.250000   32.093750   12.305556   16.000000
 4     38.958332   43.437500   33.750000   12.555556   15.875000
 ...         ...         ...         ...         ...         ...
 5951  31.730000  146.369995  165.800003  114.639999  635.940002
 5952  32.389999  141.380005  166.770004  114.360001  637.799988
 5953  32.110001  140.679993  166.899994  111.910004  648.979980
 5954  32.490002  144.880005  167.520004  113.709999  663.750000
 5955  32.169998  142.619995  164.250000  115.519997  645.909973
 
 [5956 rows x 5 columns],
              TXN        AES         MMM      GPS         JPM
 0      51.437500  36.250000   47.187500  43.1250   48.583332
 1      49.250000  34.812500   45.312500  41.5625   47.250000
 2    

In [180]:
# Latest closing price of the first ticker in the first portfolio. The column
# is picked by position because the portfolios are sampled at random, so a
# hard-coded ticker such as 'GLW' is not guaranteed to be present on a re-run.
list_of_portfolios[0].iloc[:, 0].iloc[-1]

32.16999816894531

In [182]:
# Absolute price change (last close minus first close) of the first ticker in
# the first portfolio. The column is picked by position because the portfolios
# are sampled at random, so a hard-coded ticker such as 'GLW' is not guaranteed
# to be present on a re-run.
first_ticker_prices = list_of_portfolios[0].iloc[:, 0]
stock_performance = first_ticker_prices.iloc[-1] - first_ticker_prices.iloc[0]

In [183]:
stock_performance

-8.43416976928711

In [None]:
list(close.columns.values)[0]

'MMM'

In [None]:
def close_print(stock_symbol):
    """Plot closing prices with 20- and 100-day rolling means, one figure per ticker.

    Downloads the data from Yahoo Finance for the window given by the
    module-level ``start_date`` and ``end_date``, reindexes it to every
    business day (forward-filling gaps) and writes the resulting closing
    prices to "CSV/close.csv".

    Parameters
    ----------
    stock_symbol : str or iterable of str
        Ticker symbol(s) to download and plot. A single string is treated
        as one symbol.
    """
    # A bare string would otherwise be iterated character by character below.
    symbols = [stock_symbol] if isinstance(stock_symbol, str) else list(stock_symbol)

    df = data.DataReader(symbols, 'yahoo', start_date, end_date)
    all_weekdays = pd.date_range(start=start_date, end=end_date, freq='B')  # B stands for business day frequency
    close = df['Close'].reindex(all_weekdays).ffill()

    for symbol in symbols:
        # Closing-price time series of this ticker, indexed by date.
        ticker = close.loc[:, symbol]

        # 20- and 100-day moving averages of the closing prices.
        short_rolling_ticker = ticker.rolling(window=20).mean()
        long_rolling_ticker = ticker.rolling(window=100).mean()

        fig, ax = plt.subplots(figsize=(18, 11))
        ax.set_title(symbol)

        ax.plot(ticker.index, ticker, label=symbol)
        ax.plot(short_rolling_ticker.index, short_rolling_ticker, label='20 days rolling')
        ax.plot(long_rolling_ticker.index, long_rolling_ticker, label='100 days rolling')

        ax.set_xlabel('Date')
        ax.set_ylabel('Adjusted closing price ($)')
        ax.legend()

    # Persist the aligned closing prices, keeping the date index.
    close.to_csv("CSV/close.csv", index=True)


In [None]:
# close_print(['AAPL'])

In [None]:
def volume_print(stock_symbol):
    """Plot traded volume with 30- and 100-day rolling means, one figure per ticker.

    Downloads the data from Yahoo Finance for the window given by the
    module-level ``start_date`` and ``end_date``, reindexes it to every
    business day (forward-filling gaps) and writes the resulting volumes
    to "CSV/volume.csv".

    Parameters
    ----------
    stock_symbol : str or iterable of str
        Ticker symbol(s) to download and plot. A single string is treated
        as one symbol.
    """
    # A bare string would otherwise be iterated character by character below.
    symbols = [stock_symbol] if isinstance(stock_symbol, str) else list(stock_symbol)

    df = data.DataReader(symbols, 'yahoo', start_date, end_date)
    all_weekdays = pd.date_range(start=start_date, end=end_date, freq='B')  # B stands for business day frequency
    volume = df['Volume'].reindex(all_weekdays).ffill()

    for symbol in symbols:
        # Volume time series of this ticker, indexed by date.
        ticker = volume.loc[:, symbol]

        # 30- and 100-day moving averages of the traded volume.
        short_rolling_ticker = ticker.rolling(window=30).mean()
        long_rolling_ticker = ticker.rolling(window=100).mean()

        fig, ax = plt.subplots(figsize=(18, 11))
        ax.set_title(symbol)

        ax.plot(ticker.index, ticker, label=symbol)
        ax.plot(short_rolling_ticker.index, short_rolling_ticker, label='30 days rolling')
        ax.plot(long_rolling_ticker.index, long_rolling_ticker, label='100 days rolling')

        ax.set_xlabel('Date')
        ax.set_ylabel('Stock Volume')
        ax.legend()

    # Persist the aligned volumes, keeping the date index.
    volume.to_csv("CSV/volume.csv", index=True)


In [None]:
# volume_print(['AAPL'])

In [None]:
# Relative returns
# Simple (relative) returns: fractional change of each value versus the
# previous row.
returns = close.pct_change(periods=1)
returns.head()

Unnamed: 0,MMM,AOS,ABT,ABMD,ACN,ATVI,ADM,ADBE,AAP,AMD,...,WDC,WY,WHR,WMB,WYNN,XEL,YUM,ZBRA,ZBH,ZION
0,,,,,,,,,,,...,,,,,,,,,,
1,-0.039735,-0.014409,-0.028571,-0.023973,,-0.030418,-0.010417,-0.083889,,-0.056452,...,0.190476,-0.037567,-0.038423,-0.022177,,0.023026,-0.020101,-0.014428,,-0.048423
2,0.028966,-0.002924,-0.001838,0.010526,,0.003921,-0.015789,0.019771,,0.025641,...,-0.066667,0.052974,0.014721,0.061856,,0.038585,0.005128,0.019144,,-0.001183
3,0.080429,-0.014663,0.034991,0.001736,,-0.019531,0.005347,0.008163,,0.066667,...,0.042857,0.046778,0.00829,0.021359,,-0.009288,-0.008503,-0.054144,,0.014218
4,0.019851,0.041667,0.010676,-0.005199,,0.027888,0.015958,0.048583,,0.015625,...,0.232877,-0.032884,0.003083,0.026616,,0.0,-0.022298,-0.011098,,0.002336


In [None]:
# Log returns - First the logarithm of the prices is taken and the the difference of consecutive (log) observations
# Log returns: take the natural logarithm of the prices first, then the
# difference between consecutive rows.
log_prices = np.log(close)
log_returns = log_prices.diff()
log_returns.head()

Unnamed: 0,MMM,AOS,ABT,ABMD,ACN,ATVI,ADM,ADBE,AAP,AMD,...,WDC,WY,WHR,WMB,WYNN,XEL,YUM,ZBRA,ZBH,ZION
0,,,,,,,,,,,...,,,,,,,,,,
1,-0.040546,-0.014514,-0.028987,-0.024265,,-0.030891,-0.010471,-0.087618,,-0.058108,...,0.174353,-0.038291,-0.03918,-0.022427,,0.022765,-0.020305,-0.014533,,-0.049635
2,0.028554,-0.002928,-0.00184,0.010471,,0.003914,-0.015915,0.019578,,0.025318,...,-0.068993,0.051619,0.014614,0.060018,,0.037859,0.005115,0.018963,,-0.001184
3,0.077358,-0.014771,0.034392,0.001735,,-0.019724,0.005333,0.00813,,0.064539,...,0.041964,0.045717,0.008256,0.021134,,-0.009331,-0.00854,-0.055665,,0.014118
4,0.019657,0.040822,0.01062,-0.005213,,0.027506,0.015832,0.04744,,0.015504,...,0.20935,-0.033436,0.003079,0.026268,,0.0,-0.022551,-0.01116,,0.002334


In [None]:
# fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(18,13))

# for c in log_returns:
#     ax1.plot(log_returns.index, log_returns[c].cumsum(), label=str(c))

# ax1.set_ylabel('Cumulative log returns')
# ax1.legend(loc='best')

# for c in log_returns:
#     ax2.plot(log_returns.index, 100*(np.exp(log_returns[c].cumsum()) - 1), label=str(c))

# ax2.set_ylabel('Total relative returns (%)')
# ax2.legend(loc='best')

# plt.show()
