In [3]:
import pandas as pd   
import numpy as np
import quandl
import datetime
import matplotlib.pyplot as plt   
%matplotlib inline 

In [4]:
#pd.describe_option()

### Notebook set up

In [5]:
pd.set_option('display.notebook_repr_html', False)   
pd.set_option('display.max_columns', 12)   
pd.set_option('display.max_rows', 8)

In [6]:
#s =quandl.get("CHRIS/CBOE_VX1")
s = pd.read_csv('CHRIS-CBOE_VX1.csv', squeeze = True, parse_dates = ['Trade Date'])

In [7]:
del s['Prev. Day Open Interest']
del s['EFP']
del s['Change']

In [8]:
s.rename(columns={'Total Volume': 'Volume'}, inplace=True)

In [9]:
s.head()

  Trade Date   Open   High    Low  Close  Settle  Volume
0 2017-02-23  13.20  13.78  13.18  13.52  13.525  118392
1 2017-02-22  13.20  13.45  13.00  13.21  13.225   99759
2 2017-02-21  13.25  13.25  12.75  13.20  13.175  101726
3 2017-02-17  13.00  13.38  12.95  13.25  13.125  135622
4 2017-02-16  12.85  13.70  12.75  12.96  12.975  180398

#### Scalar lookup by label or location using .at[]  and .iat[] Scalar values can be looked up by label using .at[] by passing the row label and then the column name/value:


In [10]:
s.at[50,'Settle']    # [int. str]

13.025

In [11]:
s.iat[50,5]    #[int, int]  row index , column number

13.025

In [12]:
s.Volume < 10000 

0       False
1       False
2       False
3       False
        ...  
2937     True
2938     True
2939     True
2940     True
Name: Volume, dtype: bool

In [13]:
s[(s.Settle < 12) & (s.Settle > 0)][['Settle']]

      Settle
6     11.175
7     11.425
8     11.725
43    11.575
...      ...
666   11.950
676   11.740
2482  11.870
2483  11.670

[10 rows x 1 columns]

### Loading  Historical Data

In [15]:
from pandas_datareader import data, wb

#import pandas.io.data as web

start = datetime.datetime(2013, 4, 1)   

end = datetime.datetime(2013, 9, 30)

msft = web.DataReader("MSFT", 'yahoo', start, end)   

aapl = web.DataReader("AAPL", 'yahoo', start, end)

In [16]:
#msft.to_csv("msft.csv")   
#aapl.to_csv("msft.csv") 

### Reading saved files back to the Notebook

In [17]:
msft = pd.read_csv("msft.csv", index_col=0, parse_dates=True)   
aapl = pd.read_csv("aapl.csv", index_col=0, parse_dates=True) 

In [20]:
msft[:6]

                 Open       High        Low      Close    Volume  Adj Close
Date                                                                       
2013-04-01  28.639999  28.660000  28.360001  28.610001  29201100  25.666909
2013-04-02  28.590000  28.850000  28.520000  28.799999  28456500  25.837363
2013-04-03  28.750000  28.950001  28.540001  28.559999  35062800  25.622051
2013-04-04  28.389999  28.610001  28.270000  28.600000  45263200  25.657937
2013-04-05  28.219999  28.780001  28.110001  28.700001  50927300  25.747651
2013-04-08  28.730000  28.730000  28.469999  28.590000  34759500  25.648966

## join, concatenate, merge  DataFrames

In [21]:
glue_them_both = pd.concat([msft.head(3), aapl.head(3)])
glue_them_both

                  Open        High         Low       Close     Volume  \
Date                                                                    
2013-04-01   28.639999   28.660000   28.360001   28.610001   29201100   
2013-04-02   28.590000   28.850000   28.520000   28.799999   28456500   
2013-04-03   28.750000   28.950001   28.540001   28.559999   35062800   
2013-04-01  441.899994  443.700008  427.739990  428.910004   97433000   
2013-04-02  427.599987  438.139988  426.400013  429.789997  132379800   
2013-04-03  431.370007  437.279995  430.310009  431.989994   90804000   

            Adj Close  
Date                   
2013-04-01  25.666909  
2013-04-02  25.837363  
2013-04-03  25.622051  
2013-04-01  56.389605  
2013-04-02  56.505299  
2013-04-03  56.794537  

In [22]:
aaplA01 = aapl['2013-05'][['Adj Close']]
msftA01 = msft['2013-04'][['Adj Close']]

In [23]:
pd.concat([aaplA01.head(7), msftA01[0:7]])

            Adj Close
Date                 
2013-05-01  57.754283
2013-05-02  58.573352
2013-05-03  59.159719
2013-05-06  60.570408
...               ...
2013-04-04  25.657937
2013-04-05  25.747651
2013-04-08  25.648966
2013-04-09  26.564040

[14 rows x 1 columns]

In [24]:
# Making a MultiIndex with search by keys
closes = pd.concat([msftA01, aaplA01],  keys=[5, 10])

In [25]:
closes

               Adj Close
   Date                 
5  2013-04-01  25.666909
   2013-04-02  25.837363
   2013-04-03  25.622051
   2013-04-04  25.657937
...                  ...
10 2013-05-28  58.421094
   2013-05-29  58.885613
   2013-05-30  59.763042
   2013-05-31  59.518212

[44 rows x 1 columns]

In [26]:
msftAV = msft[['Adj Close', 'Volume']]   
aaplAV = msft[['Adj Close', 'Volume']]  
pd.concat([msftAV, aaplAV])

            Adj Close    Volume
Date                           
2013-04-01  25.666909  29201100
2013-04-02  25.837363  28456500
2013-04-03  25.622051  35062800
2013-04-04  25.657937  45263200
...               ...       ...
2013-09-25  29.577193  28907500
2013-09-26  29.813740  28504000
2013-09-27  30.268634  55348000
2013-09-30  30.277730  39839500

[256 rows x 2 columns]

In [27]:
pd.concat([msftAV, aaplAV], join='inner')

            Adj Close    Volume
Date                           
2013-04-01  25.666909  29201100
2013-04-02  25.837363  28456500
2013-04-03  25.622051  35062800
2013-04-04  25.657937  45263200
...               ...       ...
2013-09-25  29.577193  28907500
2013-09-26  29.813740  28504000
2013-09-27  30.268634  55348000
2013-09-30  30.277730  39839500

[256 rows x 2 columns]

In [28]:
# Making a new index insted of an old one with ignore_index = True
pd.concat([msftAV[:3], aaplAV[:3]], ignore_index=True)

   Adj Close    Volume
0  25.666909  29201100
1  25.837363  28456500
2  25.622051  35062800
3  25.666909  29201100
4  25.837363  28456500
5  25.622051  35062800

In [27]:
mask = pd.merge(msftAV, aaplAV) 
mask

     Adj Close    Volume
0    25.666909  29201100
1    25.837363  28456500
2    25.622051  35062800
3    25.657937  45263200
..         ...       ...
124  29.577193  28907500
125  29.813740  28504000
126  30.268634  55348000
127  30.277730  39839500

[128 rows x 2 columns]


## Pivoting
####  The following command creates a DataFrame with this schema and populates the records:


In [29]:
msft.insert(0, 'Symbol', 'MSFT')
aapl.insert(0, 'Symbol', 'AAPL')

In [30]:
combined = pd.concat([msft, aapl]).sort_index()
p = combined.reset_index()

In [31]:
p.head()

        Date Symbol        Open        High         Low       Close  \
0 2013-04-01   MSFT   28.639999   28.660000   28.360001   28.610001   
1 2013-04-01   AAPL  441.899994  443.700008  427.739990  428.910004   
2 2013-04-02   AAPL  427.599987  438.139988  426.400013  429.789997   
3 2013-04-02   MSFT   28.590000   28.850000   28.520000   28.799999   
4 2013-04-03   MSFT   28.750000   28.950001   28.540001   28.559999   

      Volume  Adj Close  
0   29201100  25.666909  
1   97433000  56.389605  
2  132379800  56.505299  
3   28456500  25.837363  
4   35062800  25.622051  

In [32]:
closes = p.pivot(index='Date', columns='Symbol', values='Adj Close') 
closes

Symbol           AAPL       MSFT
Date                            
2013-04-01  56.389605  25.666909
2013-04-02  56.505299  25.837363
2013-04-03  56.794537  25.622051
2013-04-04  56.233152  25.657937
...               ...        ...
2013-09-25  64.147455  29.577193
2013-09-26  64.772236  29.813740
2013-09-27  64.309979  30.268634
2013-09-30  63.510679  30.277730

[128 rows x 2 columns]

In [33]:
closes.stack()

Date        Symbol
2013-04-01  AAPL      56.389605
            MSFT      25.666909
2013-04-02  AAPL      56.505299
            MSFT      25.837363
                        ...    
2013-09-27  AAPL      64.309979
            MSFT      30.268634
2013-09-30  AAPL      63.510679
            MSFT      30.277730
dtype: float64

In [34]:
closes.unstack()

Symbol  Date      
AAPL    2013-04-01    56.389605
        2013-04-02    56.505299
        2013-04-03    56.794537
        2013-04-04    56.233152
                        ...    
MSFT    2013-09-25    29.577193
        2013-09-26    29.813740
        2013-09-27    30.268634
        2013-09-30    30.277730
dtype: float64

#### Melting is the process of transforming a DataFrame into a format where each row represents a unique id-variable combination. 

In [35]:
melted = pd.melt(p, id_vars=['Date', 'Symbol'])   
melted[:5]

        Date Symbol variable       value
0 2013-04-01   MSFT     Open   28.639999
1 2013-04-01   AAPL     Open  441.899994
2 2013-04-02   AAPL     Open  427.599987
3 2013-04-02   MSFT     Open   28.590000
4 2013-04-03   MSFT     Open   28.750000

In [36]:
melted[(melted.Date=='2013-04-03') & (melted.Symbol=='MSFT')]

           Date Symbol   variable            value
4    2013-04-03   MSFT       Open        28.750000
260  2013-04-03   MSFT       High        28.950001
516  2013-04-03   MSFT        Low        28.540001
772  2013-04-03   MSFT      Close        28.559999
1028 2013-04-03   MSFT     Volume  35062800.000000
1284 2013-04-03   MSFT  Adj Close        25.622051

In [37]:
s4g = combined[['Symbol', 'Adj Close']].reset_index() 
s4g.insert(1, 'Year', pd.DatetimeIndex(s4g['Date']).year)

In [38]:
s4g.insert(2, 'Month',pd.DatetimeIndex(s4g['Date']).month) 

In [39]:
s4g.head()

        Date  Year  Month Symbol  Adj Close
0 2013-04-01  2013      4   MSFT  25.666909
1 2013-04-01  2013      4   AAPL  56.389605
2 2013-04-02  2013      4   AAPL  56.505299
3 2013-04-02  2013      4   MSFT  25.837363
4 2013-04-03  2013      4   MSFT  25.622051

In [40]:
s4g.groupby(['Symbol', 'Year', 'Month'],as_index=False).agg(np.mean)[:5]


  Symbol  Year  Month  Adj Close
0   AAPL  2013      4  55.187294
1   AAPL  2013      5  58.976977
2   AAPL  2013      6  56.316589
3   AAPL  2013      7  56.795571
4   AAPL  2013      8  64.496779

In [41]:
#DateTime range with freq day; can be 'M', 'A' year
mp = pd.period_range('1/1/2013', '12/31/2014', freq='D') 
mp

PeriodIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
             '2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
             '2013-01-09', '2013-01-10',
             ...
             '2014-12-22', '2014-12-23', '2014-12-24', '2014-12-25',
             '2014-12-26', '2014-12-27', '2014-12-28', '2014-12-29',
             '2014-12-30', '2014-12-31'],
            dtype='int64', length=730, freq='D')

#### The following command shifts the adjusted closing prices forward by 1 day:

In [42]:
mp.shift(1)

PeriodIndex(['2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05',
             '2013-01-06', '2013-01-07', '2013-01-08', '2013-01-09',
             '2013-01-10', '2013-01-11',
             ...
             '2014-12-23', '2014-12-24', '2014-12-25', '2014-12-26',
             '2014-12-27', '2014-12-28', '2014-12-29', '2014-12-30',
             '2014-12-31', '2015-01-01'],
            dtype='int64', length=730, freq='D')

In [43]:
shifted_backwards = mp.shift(-2)[:10] 
shifted_backwards

PeriodIndex(['2012-12-30', '2012-12-31', '2013-01-01', '2013-01-02',
             '2013-01-03', '2013-01-04', '2013-01-05', '2013-01-06',
             '2013-01-07', '2013-01-08'],
            dtype='int64', freq='D')

### the calculation of daily percentage changes from the previous day

In [44]:
pct_cng= msft['Adj Close']  / msft['Adj Close'].shift(1)-1
pct_cng.dropna()

Date
2013-04-02    0.006641
2013-04-03   -0.008333
2013-04-04    0.001401
2013-04-05    0.003497
                ...   
2013-09-25    0.001849
2013-09-26    0.007998
2013-09-27    0.015258
2013-09-30    0.000301
Name: Adj Close, dtype: float64

In [45]:
msft['Adj Close'].pct_change().dropna()

Date
2013-04-02    0.006641
2013-04-03   -0.008333
2013-04-04    0.001401
2013-04-05    0.003497
                ...   
2013-09-25    0.001849
2013-09-26    0.007998
2013-09-27    0.015258
2013-09-30    0.000301
Name: Adj Close, dtype: float64

In [46]:
msft_cum_ret = (1 + (msft['Adj Close'] / msft['Adj Close'].shift(1)-1)).cumprod() 
msft_cum_ret

Date
2013-04-01         NaN
2013-04-02    1.006641
2013-04-03    0.998252
2013-04-04    0.999650
                ...   
2013-09-25    1.152347
2013-09-26    1.161563
2013-09-27    1.179286
2013-09-30    1.179641
Name: Adj Close, dtype: float64

In [47]:
#Resampling can be either downsampling, where data is converted to wider frequency ranges (such as downsampling from day-to-day to month-to-month) 
#or upsampling, where data is converted to narrower time ranges. 
#Data for the associated labels are then calculated by a function provided to pandas instead  of simple filling

In [48]:
msft_monthly_cum_ret = msft_cum_ret.resample("W")
msft_monthly_cum_ret

Date
2013-04-07    1.001922
2013-04-14    1.022090
2013-04-21    1.013981
2013-04-28    1.096959
                ...   
2013-09-15    1.152064
2013-09-22    1.173119
2013-09-29    1.160783
2013-10-06    1.179641
Freq: W-SUN, Name: Adj Close, dtype: float64

In [49]:
msft_cum_ret.resample("M", how="std")

Date
2013-04-30    0.052469
2013-05-31    0.033876
2013-06-30    0.023484
2013-07-31    0.062526
2013-08-31    0.033603
2013-09-30    0.026043
Freq: M, Name: Adj Close, dtype: float64

In [50]:
 msft_cum_ret.resample("M", how="ohlc")

                open      high       low     close
Date                                              
2013-04-30  1.006641  1.156938  0.998252  1.156938
2013-05-31  1.143656  1.234743  1.141559  1.228407
2013-06-30  1.252694  1.255509  1.171034  1.215736
2013-07-31  1.209400  1.276628  1.104862  1.120701
2013-08-31  1.114718  1.231746  1.111550  1.183894
2013-09-30  1.130016  1.192401  1.104141  1.179641

In [51]:
sample = msft_cum_ret[1:3] 

In [52]:
by_hour = sample.resample("H")

In [53]:
by_hour

Date
2013-04-02 00:00:00    1.006641
2013-04-02 01:00:00         NaN
2013-04-02 02:00:00         NaN
2013-04-02 03:00:00         NaN
                         ...   
2013-04-02 21:00:00         NaN
2013-04-02 22:00:00         NaN
2013-04-02 23:00:00         NaN
2013-04-03 00:00:00    0.998252
Freq: H, Name: Adj Close, dtype: float64

In [54]:
# use Method .interpolate()   to shift and fii in NaN
by_hour.interpolate()

Date
2013-04-02 00:00:00    1.006641
2013-04-02 01:00:00    1.006291
2013-04-02 02:00:00    1.005942
2013-04-02 03:00:00    1.005592
                         ...   
2013-04-02 21:00:00    0.999301
2013-04-02 22:00:00    0.998951
2013-04-02 23:00:00    0.998602
2013-04-03 00:00:00    0.998252
Freq: H, Name: Adj Close, dtype: float64