### Getting and Viewing the data:

In [1]:
import boto3
import pandas as pd
from botocore import UNSIGNED
from botocore.config import Config
# botocore added from video comments
from io import StringIO
# ^ buffer in memory to use the read csv method

In [2]:
s3 = boto3.resource('s3', config=Config(signature_version=UNSIGNED))
# config part was added from video comments *
bucket = s3.Bucket('deutsche-boerse-xetra-pds')
# 'deutsche-boerse-xerta-pds' is the name of the bucket

A bucket is a container for objects. An object is a file and any metadata that describes that file. To store an object in Amazon S3, you create a bucket and then upload the object to the bucket. When the object is in the bucket, you can open it, download it, and move it. 

In [3]:
bucket_obj1 = bucket.objects.filter(Prefix='2021-03-15')
bucket_obj2 = bucket.objects.filter(Prefix='2021-03-16')
objects = [obj for obj in bucket_obj1] + [obj for obj in bucket_obj2]
# this creates an object for with all the sources we want to use

In [4]:
objects
# need to call 'objects' to see its output in Jupyter Notebook

[s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2021-03-15/2021-03-15_BINS_XETR00.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2021-03-15/2021-03-15_BINS_XETR01.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2021-03-15/2021-03-15_BINS_XETR02.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2021-03-15/2021-03-15_BINS_XETR03.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2021-03-15/2021-03-15_BINS_XETR04.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2021-03-15/2021-03-15_BINS_XETR05.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2021-03-15/2021-03-15_BINS_XETR06.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2021-03-15/2021-03-15_BINS_XETR07.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2021-03-15/2021-03-15_BINS_XETR08.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pd

In [5]:
# csv_obj = bucket.Object(key='2021-03-15/2021-03-15_BINS_XETR15.csv').get().get('Body').read().decode('utf-8')
# View just one object, key taken randomly from one of the files listed above

In [6]:
csv_obj_init = bucket.Object(key=objects[0].key).get().get('Body').read().decode('utf-8')
data = StringIO(csv_obj_init)
df_init = pd.read_csv(data, delimiter=',')
# This gets the first object so we can pass it to columns in df_all below

In [7]:
# This df is empty because this is stock data at midnight, and all of the stock markets are closed
# The fact that this happens to be an empty df is coincidental
# We just want to use the columns for df_all below so it doesn't matter that it is empty

df_init.columns
# Returns a list of all of the column names from this df

Index(['ISIN', 'Mnemonic', 'SecurityDesc', 'SecurityType', 'Currency',
       'SecurityID', 'Date', 'Time', 'StartPrice', 'MaxPrice', 'MinPrice',
       'EndPrice', 'TradedVolume', 'NumberOfTrades'],
      dtype='object')

In [8]:
df_all = pd.DataFrame(columns=df_init.columns)
# ^All DFs have to use the same columns
# We use the columns from the a df of the first object

for obj in objects:
    csv_obj = bucket.Object(key=obj.key).get().get('Body').read().decode('utf-8')
    data = StringIO(csv_obj)
    df = pd.read_csv(data, delimiter =',')
    df_all = pd.concat([df_all, df], ignore_index=True)
    # ^ I believe this code fixes the error caused the the code below
    # df_all = df_all.append(df, ignore_index=True)
# Convert the csv_object to a pandas df to read it more easily
# df_all combines dataframes into one
# for loop with 'key=obj.key' to view multiple objects at once 
# Returns a warnning that recommends using concat instead of append
# ^ ignore the warning for now

In [9]:
df_all

Unnamed: 0,ISIN,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades
0,AT0000A0E9W5,SANT,S+T AG (Z.REG.MK.Z.)O.N.,Common stock,EUR,2504159,2021-03-15,08:00,22.12,22.12,22.12,22.12,1527,1
1,DE000A0DJ6J9,S92,SMA SOLAR TECHNOL.AG,Common stock,EUR,2504287,2021-03-15,08:00,53.85,53.85,53.5,53.5,508,2
2,DE000A0D6554,NDX1,NORDEX SE O.N.,Common stock,EUR,2504290,2021-03-15,08:00,22.24,22.24,22.18,22.18,5270,4
3,DE000A0D9PT0,MTX,MTU AERO ENGINES NA O.N.,Common stock,EUR,2504297,2021-03-15,08:00,201.5,201.5,200.6,200.6,1744,17
4,DE000A0HN5C6,DWNI,DEUTSCHE WOHNEN SE INH,Common stock,EUR,2504314,2021-03-15,08:00,38.95,39.06,38.89,39.05,28662,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206907,DE000A0WMPJ6,AIXA,AIXTRON SE NA O.N.,Common stock,EUR,2504428,2021-03-16,16:42,19.235,19.235,19.235,19.235,211,1
206908,DE0007164600,SAP,SAP SE O.N.,Common stock,EUR,2505077,2021-03-16,16:43,103.02,103.02,103.02,103.02,20,1
206909,DE0007568578,F3C,SFC ENERGY AG,Common stock,EUR,2505109,2021-03-16,16:44,24.55,24.55,24.55,24.55,130,1
206910,DE0007664005,VOW,VOLKSWAGEN AG ST O.N.,Common stock,EUR,2505113,2021-03-16,16:44,266.6,266.6,266.6,266.6,500,1


In [10]:
columns = ['ISIN', 'Date', 'Time', 'StartPrice', 'MaxPrice', 'MinPrice', 'EndPrice', 'TradedVolume']
# ^ Create a list of relevant columns that we want to look at from our DF

df_all = df_all.loc[:, columns]

In [11]:
# data = StringIO(csv_obj)
# df = pd.read_csv(data, delimiter =',')
# Convert the csv_object to a pandas df to read it more easily
# Only use this when viewing one object, it is used in the for loop above to view multiple

In [12]:
df_all.dropna(inplace=True)
# ^ Drop NA values

In [13]:
df_all

Unnamed: 0,ISIN,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume
0,AT0000A0E9W5,2021-03-15,08:00,22.12,22.12,22.12,22.12,1527
1,DE000A0DJ6J9,2021-03-15,08:00,53.85,53.85,53.5,53.5,508
2,DE000A0D6554,2021-03-15,08:00,22.24,22.24,22.18,22.18,5270
3,DE000A0D9PT0,2021-03-15,08:00,201.5,201.5,200.6,200.6,1744
4,DE000A0HN5C6,2021-03-15,08:00,38.95,39.06,38.89,39.05,28662
...,...,...,...,...,...,...,...,...
206907,DE000A0WMPJ6,2021-03-16,16:42,19.235,19.235,19.235,19.235,211
206908,DE0007164600,2021-03-16,16:43,103.02,103.02,103.02,103.02,20
206909,DE0007568578,2021-03-16,16:44,24.55,24.55,24.55,24.55,130
206910,DE0007664005,2021-03-16,16:44,266.6,266.6,266.6,266.6,500


In [14]:
df_all.shape
# We can use .shape to see if dropna did anything, in this case it didn't

(206912, 8)

### Get opening price per ISIN and day:

In [15]:
df_all['opening_price'] = df_all.sort_values(by=['Time']).groupby(['ISIN','Date'])['StartPrice'].transform('first')
# Sort the data so we can see what we want more specifically

In [16]:
df_all[df_all['ISIN']=='AT0000A0E9W5']
# Here we get all of the opening prices of the specified ISIN
# The opening prices are put in a new column at the end of the df

Unnamed: 0,ISIN,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,opening_price
0,AT0000A0E9W5,2021-03-15,08:00,22.12,22.12,22.12,22.12,1527,22.12
4830,AT0000A0E9W5,2021-03-15,08:12,22.24,22.28,22.22,22.28,1914,22.12
5714,AT0000A0E9W5,2021-03-15,08:16,22.32,22.36,22.32,22.36,780,22.12
6336,AT0000A0E9W5,2021-03-15,08:19,22.32,22.32,22.3,22.3,568,22.12
6714,AT0000A0E9W5,2021-03-15,08:20,22.26,22.26,22.26,22.26,250,22.12
...,...,...,...,...,...,...,...,...,...
202503,AT0000A0E9W5,2021-03-16,16:26,22.6,22.6,22.6,22.6,1177,22.52
202789,AT0000A0E9W5,2021-03-16,16:27,22.62,22.62,22.56,22.56,1616,22.52
203058,AT0000A0E9W5,2021-03-16,16:28,22.58,22.58,22.58,22.58,15,22.52
203362,AT0000A0E9W5,2021-03-16,16:29,22.58,22.58,22.56,22.58,251,22.52


### Get closing price per ISIN and day

In [17]:
df_all['closing_price'] = df_all.sort_values(by=['Time']).groupby(['ISIN','Date'])['StartPrice'].transform('last')
# Shouldn't I change ['StartPrice'] to ['EndPrice'] ?

In [18]:
df_all[df_all['ISIN']=='AT0000A0E9W5']

Unnamed: 0,ISIN,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,opening_price,closing_price
0,AT0000A0E9W5,2021-03-15,08:00,22.12,22.12,22.12,22.12,1527,22.12,22.42
4830,AT0000A0E9W5,2021-03-15,08:12,22.24,22.28,22.22,22.28,1914,22.12,22.42
5714,AT0000A0E9W5,2021-03-15,08:16,22.32,22.36,22.32,22.36,780,22.12,22.42
6336,AT0000A0E9W5,2021-03-15,08:19,22.32,22.32,22.3,22.3,568,22.12,22.42
6714,AT0000A0E9W5,2021-03-15,08:20,22.26,22.26,22.26,22.26,250,22.12,22.42
...,...,...,...,...,...,...,...,...,...,...
202503,AT0000A0E9W5,2021-03-16,16:26,22.6,22.6,22.6,22.6,1177,22.52,22.56
202789,AT0000A0E9W5,2021-03-16,16:27,22.62,22.62,22.56,22.56,1616,22.52,22.56
203058,AT0000A0E9W5,2021-03-16,16:28,22.58,22.58,22.58,22.58,15,22.52,22.56
203362,AT0000A0E9W5,2021-03-16,16:29,22.58,22.58,22.56,22.58,251,22.52,22.56


### Aggregations

In [19]:
df_all = df_all.groupby(['ISIN', 'Date'], as_index=False).agg(
    opening_price_eur=('opening_price', 'min'),
    closing_price_eur=('closing_price', 'min'),
    minimum_price_eur=('MinPrice', 'min'),
    maximum_price_eur=('MaxPrice', 'max'),
    daily_traded_volume=('TradedVolume', 'sum')
)
# Leaving as_index True will cause ISIN and Date to be the index
# Grouping by min for this example doesn't make a difference but we 
# have to choose an aggregation

In [20]:
df_all
# This df has two outputs for each day for each ISIN

Unnamed: 0,ISIN,Date,opening_price_eur,closing_price_eur,minimum_price_eur,maximum_price_eur,daily_traded_volume
0,AT00000FACC2,2021-03-15,9.490,9.410,9.300,9.600,2392
1,AT00000FACC2,2021-03-16,9.570,9.330,9.330,9.570,2280
2,AT0000606306,2021-03-15,18.190,18.090,18.000,18.190,905
3,AT0000606306,2021-03-16,18.000,18.100,18.000,18.160,395
4,AT0000609607,2021-03-15,15.620,15.760,15.580,15.920,644
...,...,...,...,...,...,...,...
6040,XS2265369731,2021-03-15,10.186,10.151,10.151,10.220,0
6041,XS2265369731,2021-03-16,10.249,10.249,10.249,10.249,0
6042,XS2265370234,2021-03-15,19.789,20.054,19.789,20.054,50
6043,XS2284324667,2021-03-15,26.302,26.014,25.984,26.302,0


### Percent Change Prev Closing

In [21]:
df_all['prev_closing_price'] = df_all.sort_values(by=['Date']).groupby(['ISIN'])['closing_price_eur'].shift(1)
# This adds a new column at the end with the previous closing price

In [22]:
df_all

Unnamed: 0,ISIN,Date,opening_price_eur,closing_price_eur,minimum_price_eur,maximum_price_eur,daily_traded_volume,prev_closing_price
0,AT00000FACC2,2021-03-15,9.490,9.410,9.300,9.600,2392,
1,AT00000FACC2,2021-03-16,9.570,9.330,9.330,9.570,2280,9.410
2,AT0000606306,2021-03-15,18.190,18.090,18.000,18.190,905,
3,AT0000606306,2021-03-16,18.000,18.100,18.000,18.160,395,18.090
4,AT0000609607,2021-03-15,15.620,15.760,15.580,15.920,644,
...,...,...,...,...,...,...,...,...
6040,XS2265369731,2021-03-15,10.186,10.151,10.151,10.220,0,
6041,XS2265369731,2021-03-16,10.249,10.249,10.249,10.249,0,10.151
6042,XS2265370234,2021-03-15,19.789,20.054,19.789,20.054,50,
6043,XS2284324667,2021-03-15,26.302,26.014,25.984,26.302,0,


In [23]:
df_all['change_prev_closing_%'] = ((df_all['closing_price_eur'] - df_all['prev_closing_price']) / df_all['prev_closing_price'] * 100)
# This generates a change in price percentage column

In [24]:
df_all

Unnamed: 0,ISIN,Date,opening_price_eur,closing_price_eur,minimum_price_eur,maximum_price_eur,daily_traded_volume,prev_closing_price,change_prev_closing_%
0,AT00000FACC2,2021-03-15,9.490,9.410,9.300,9.600,2392,,
1,AT00000FACC2,2021-03-16,9.570,9.330,9.330,9.570,2280,9.410,-0.850159
2,AT0000606306,2021-03-15,18.190,18.090,18.000,18.190,905,,
3,AT0000606306,2021-03-16,18.000,18.100,18.000,18.160,395,18.090,0.055279
4,AT0000609607,2021-03-15,15.620,15.760,15.580,15.920,644,,
...,...,...,...,...,...,...,...,...,...
6040,XS2265369731,2021-03-15,10.186,10.151,10.151,10.220,0,,
6041,XS2265369731,2021-03-16,10.249,10.249,10.249,10.249,0,10.151,0.965422
6042,XS2265370234,2021-03-15,19.789,20.054,19.789,20.054,50,,
6043,XS2284324667,2021-03-15,26.302,26.014,25.984,26.302,0,,


In [25]:
df_all.drop(columns=['prev_closing_price'], inplace=True)
# We don't need the previous closing price column anymore so we drop it

In [26]:
df_all = df_all.round(decimals=2)
# This rounds all values to two decimal places

In [27]:
df_all

Unnamed: 0,ISIN,Date,opening_price_eur,closing_price_eur,minimum_price_eur,maximum_price_eur,daily_traded_volume,change_prev_closing_%
0,AT00000FACC2,2021-03-15,9.49,9.41,9.30,9.60,2392,
1,AT00000FACC2,2021-03-16,9.57,9.33,9.33,9.57,2280,-0.85
2,AT0000606306,2021-03-15,18.19,18.09,18.00,18.19,905,
3,AT0000606306,2021-03-16,18.00,18.10,18.00,18.16,395,0.06
4,AT0000609607,2021-03-15,15.62,15.76,15.58,15.92,644,
...,...,...,...,...,...,...,...,...
6040,XS2265369731,2021-03-15,10.19,10.15,10.15,10.22,0,
6041,XS2265369731,2021-03-16,10.25,10.25,10.25,10.25,0,0.97
6042,XS2265370234,2021-03-15,19.79,20.05,19.79,20.05,50,
6043,XS2284324667,2021-03-15,26.30,26.01,25.98,26.30,0,


### Next, we will add Date Arguments (in a separate Jupyter Notebook file because some of this code will be changed)