### Getting and Viewing the data:

Change Below:
- Also importing BytesIO from io

In [1]:
import boto3
import pandas as pd
from botocore import UNSIGNED
from botocore.config import Config
# botocore added from video comments
from io import StringIO, BytesIO
# ^ buffer in memory to use the read csv method
from datetime import datetime, timedelta

In [2]:
# Adding in the arguement date:
arg_date = '2022-02-09'
# The goal is to be able to take this date as an arguement and
# generate a report for everything since this date

In [3]:
# The date needs to be a datetime arguement of a string
# We need the data from the day before to generate the
# price change percentage, so we need the day before in datetime
arg_date_dt = datetime.strptime(arg_date, '%Y-%m-%d').date() - timedelta(days=1)

In [4]:
arg_date_dt

datetime.date(2022, 2, 8)

In [5]:
s3 = boto3.resource('s3', config=Config(signature_version=UNSIGNED))
# config part was added from video comments *
bucket = s3.Bucket('deutsche-boerse-xetra-pds')
# 'deutsche-boerse-xerta-pds' is the name of the bucket

A bucket is a container for objects. An object is a file and any metadata that describes that file. To store an object in Amazon S3, you create a bucket and then upload the object to the bucket. When the object is in the bucket, you can open it, download it, and move it. 

In [6]:
objects = [obj for obj in bucket.objects.all() if datetime.strptime(obj.key.split('/')[0], '%Y-%m-%d').date() >= arg_date_dt]
# Creates objects with the dates from the arg_date_dt and later
# Running this cell takes awhile

In [7]:
objects
# need to call 'objects' to see its output in Jupyter Notebook
# Here we can see that all objects since the specified arg_date are listed

[s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-02-08/2022-02-08_BINS_XETR00.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-02-08/2022-02-08_BINS_XETR01.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-02-08/2022-02-08_BINS_XETR02.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-02-08/2022-02-08_BINS_XETR03.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-02-08/2022-02-08_BINS_XETR04.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-02-08/2022-02-08_BINS_XETR05.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-02-08/2022-02-08_BINS_XETR06.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-02-08/2022-02-08_BINS_XETR07.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-02-08/2022-02-08_BINS_XETR08.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pd

In [8]:
# csv_obj = bucket.Object(key='2021-03-15/2021-03-15_BINS_XETR15.csv').get().get('Body').read().decode('utf-8')
# View just one object, key taken randomly from one of the files listed above

In [9]:
csv_obj_init = bucket.Object(key=objects[0].key).get().get('Body').read().decode('utf-8')
data = StringIO(csv_obj_init)
df_init = pd.read_csv(data, delimiter=',')
# This gets the first object so we can pass it to columns in df_all below

In [10]:
# This df is empty because this is stock data at midnight, and all of the stock markets are closed
# The fact that this happens to be an empty df is coincidental
# We just want to use the columns for df_all below so it doesn't matter that it is empty

df_init.columns
# Returns a list of all of the column names from this df

Index(['ISIN', 'Mnemonic', 'SecurityDesc', 'SecurityType', 'Currency',
       'SecurityID', 'Date', 'Time', 'StartPrice', 'MaxPrice', 'MinPrice',
       'EndPrice', 'TradedVolume', 'NumberOfTrades'],
      dtype='object')

In [11]:
df_all = pd.DataFrame(columns=df_init.columns)
# ^All DFs have to use the same columns
# We use the columns from the a df of the first object

for obj in objects:
    csv_obj = bucket.Object(key=obj.key).get().get('Body').read().decode('utf-8')
    data = StringIO(csv_obj)
    df = pd.read_csv(data, delimiter =',')
    df_all = pd.concat([df_all, df], ignore_index=True)
# Convert the csv_object to a pandas df to read it more easily
# df_all combines dataframes into one
# for loop with 'key=obj.key' to view multiple objects at once 
# Returns a warnning that recommends using concat instead of append

In [12]:
df_all

Unnamed: 0,ISIN,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades
0,AT0000A0E9W5,SANT,S+T AG O.N.,Common stock,EUR,2504159,2022-02-08,08:00,15.89,15.96,15.85,15.96,2066,3
1,DE000A0DJ6J9,S92,SMA SOLAR TECHNOL.AG,Common stock,EUR,2504287,2022-02-08,08:00,29.94,29.94,29.82,29.82,947,2
2,DE000A0D6554,NDX1,NORDEX SE O.N.,Common stock,EUR,2504290,2022-02-08,08:00,12.89,12.95,12.89,12.93,12650,33
3,DE000A0D9PT0,MTX,MTU AERO ENGINES NA O.N.,Common stock,EUR,2504297,2022-02-08,08:00,185.85,186.1,185.6,185.6,3994,14
4,DE000A0HN5C6,DWNI,DEUTSCHE WOHNEN SE INH,Common stock,EUR,2504314,2022-02-08,08:00,35.88,35.88,35.88,35.88,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433003,DE0006231004,IFX,INFINEON TECH.AG NA O.N.,Common stock,EUR,2505024,2022-02-11,16:43,33.38,33.38,33.38,33.38,1500,1
433004,DE0008404005,ALV,ALLIANZ SE NA O.N.,Common stock,EUR,2505133,2022-02-11,16:43,229.55,229.55,229.55,229.55,40,1
433005,DE000SHL1006,SHL,SIEMENS HEALTH.AG NA O.N.,Common stock,EUR,3058562,2022-02-11,16:43,56.58,56.58,56.58,56.58,100,1
433006,DE000WAF3001,WAF,SILTRONIC AG NA O.N.,Common stock,EUR,2504859,2022-02-11,16:44,112.0,112.0,112.0,112.0,180,1


In [13]:
columns = ['ISIN', 'Date', 'Time', 'StartPrice', 'MaxPrice', 'MinPrice', 'EndPrice', 'TradedVolume']
# ^ Create a list of relevant columns that we want to look at from our DF

df_all = df_all.loc[:, columns]

In [14]:
# data = StringIO(csv_obj)
# df = pd.read_csv(data, delimiter =',')
# Convert the csv_object to a pandas df to read it more easily
# Only use this when viewing one object, it is used in the for loop above to view multiple

In [15]:
df_all.dropna(inplace=True)
# ^ Drop NA values

In [16]:
df_all

Unnamed: 0,ISIN,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume
0,AT0000A0E9W5,2022-02-08,08:00,15.89,15.96,15.85,15.96,2066
1,DE000A0DJ6J9,2022-02-08,08:00,29.94,29.94,29.82,29.82,947
2,DE000A0D6554,2022-02-08,08:00,12.89,12.95,12.89,12.93,12650
3,DE000A0D9PT0,2022-02-08,08:00,185.85,186.1,185.6,185.6,3994
4,DE000A0HN5C6,2022-02-08,08:00,35.88,35.88,35.88,35.88,10
...,...,...,...,...,...,...,...,...
433003,DE0006231004,2022-02-11,16:43,33.38,33.38,33.38,33.38,1500
433004,DE0008404005,2022-02-11,16:43,229.55,229.55,229.55,229.55,40
433005,DE000SHL1006,2022-02-11,16:43,56.58,56.58,56.58,56.58,100
433006,DE000WAF3001,2022-02-11,16:44,112.0,112.0,112.0,112.0,180


In [17]:
df_all.shape
# We can use .shape to see if dropna did anything, in this case it didn't

(433008, 8)

### Get opening price per ISIN and day:

In [18]:
df_all['opening_price'] = df_all.sort_values(by=['Time']).groupby(['ISIN','Date'])['StartPrice'].transform('first')
# Sort the data so we can see what we want more specifically

In [19]:
df_all[df_all['ISIN']=='AT0000A0E9W5']
# Here we get all of the opening prices of the specified ISIN
# The opening prices are put in a new column at the end of the df

Unnamed: 0,ISIN,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,opening_price
0,AT0000A0E9W5,2022-02-08,08:00,15.89,15.96,15.85,15.96,2066,15.89
471,AT0000A0E9W5,2022-02-08,08:03,15.89,15.89,15.89,15.89,328,15.89
594,AT0000A0E9W5,2022-02-08,08:04,15.91,15.91,15.91,15.91,3,15.89
3141,AT0000A0E9W5,2022-02-08,08:06,15.98,15.98,15.98,15.98,2000,15.89
4557,AT0000A0E9W5,2022-02-08,08:13,15.92,15.92,15.88,15.88,966,15.89
...,...,...,...,...,...,...,...,...,...
427227,AT0000A0E9W5,2022-02-11,16:22,16.35,16.35,16.35,16.35,1353,15.99
428659,AT0000A0E9W5,2022-02-11,16:27,16.31,16.31,16.31,16.31,1408,15.99
428954,AT0000A0E9W5,2022-02-11,16:28,16.31,16.31,16.29,16.29,588,15.99
429248,AT0000A0E9W5,2022-02-11,16:29,16.32,16.32,16.32,16.32,648,15.99


### Get closing price per ISIN and day

In [20]:
df_all['closing_price'] = df_all.sort_values(by=['Time']).groupby(['ISIN','Date'])['StartPrice'].transform('last')
# Shouldn't I change ['StartPrice'] to ['EndPrice'] ?

In [21]:
df_all[df_all['ISIN']=='AT0000A0E9W5']

Unnamed: 0,ISIN,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,opening_price,closing_price
0,AT0000A0E9W5,2022-02-08,08:00,15.89,15.96,15.85,15.96,2066,15.89,15.7
471,AT0000A0E9W5,2022-02-08,08:03,15.89,15.89,15.89,15.89,328,15.89,15.7
594,AT0000A0E9W5,2022-02-08,08:04,15.91,15.91,15.91,15.91,3,15.89,15.7
3141,AT0000A0E9W5,2022-02-08,08:06,15.98,15.98,15.98,15.98,2000,15.89,15.7
4557,AT0000A0E9W5,2022-02-08,08:13,15.92,15.92,15.88,15.88,966,15.89,15.7
...,...,...,...,...,...,...,...,...,...,...
427227,AT0000A0E9W5,2022-02-11,16:22,16.35,16.35,16.35,16.35,1353,15.99,16.32
428659,AT0000A0E9W5,2022-02-11,16:27,16.31,16.31,16.31,16.31,1408,15.99,16.32
428954,AT0000A0E9W5,2022-02-11,16:28,16.31,16.31,16.29,16.29,588,15.99,16.32
429248,AT0000A0E9W5,2022-02-11,16:29,16.32,16.32,16.32,16.32,648,15.99,16.32


### Aggregations

In [22]:
df_all = df_all.groupby(['ISIN', 'Date'], as_index=False).agg(
    opening_price_eur=('opening_price', 'min'),
    closing_price_eur=('closing_price', 'min'),
    minimum_price_eur=('MinPrice', 'min'),
    maximum_price_eur=('MaxPrice', 'max'),
    daily_traded_volume=('TradedVolume', 'sum')
)
# Leaving as_index True will cause ISIN and Date to be the index
# Grouping by min for this example doesn't make a difference but we 
# have to choose an aggregation

In [23]:
df_all
# This df has two outputs for each day for each ISIN

Unnamed: 0,ISIN,Date,opening_price_eur,closing_price_eur,minimum_price_eur,maximum_price_eur,daily_traded_volume
0,AT000000STR1,2022-02-08,38.750,38.750,38.500,38.750,251
1,AT000000STR1,2022-02-09,39.000,38.600,38.500,39.050,905
2,AT000000STR1,2022-02-10,38.950,38.950,38.950,39.450,147
3,AT000000STR1,2022-02-11,39.150,39.250,38.650,39.600,914
4,AT00000FACC2,2022-02-08,9.100,9.170,9.100,9.300,3906
...,...,...,...,...,...,...,...
12879,XS2314660700,2022-02-11,20.428,20.284,20.256,20.754,219
12880,XS2376095068,2022-02-08,39.182,38.098,37.996,39.182,27
12881,XS2376095068,2022-02-09,38.350,38.304,38.178,38.350,0
12882,XS2376095068,2022-02-10,38.530,39.254,38.530,39.524,2233


### Percent Change Prev Closing

In [24]:
df_all['prev_closing_price'] = df_all.sort_values(by=['Date']).groupby(['ISIN'])['closing_price_eur'].shift(1)
# This adds a new column at the end with the previous closing price

In [25]:
df_all

Unnamed: 0,ISIN,Date,opening_price_eur,closing_price_eur,minimum_price_eur,maximum_price_eur,daily_traded_volume,prev_closing_price
0,AT000000STR1,2022-02-08,38.750,38.750,38.500,38.750,251,
1,AT000000STR1,2022-02-09,39.000,38.600,38.500,39.050,905,38.750
2,AT000000STR1,2022-02-10,38.950,38.950,38.950,39.450,147,38.600
3,AT000000STR1,2022-02-11,39.150,39.250,38.650,39.600,914,38.950
4,AT00000FACC2,2022-02-08,9.100,9.170,9.100,9.300,3906,
...,...,...,...,...,...,...,...,...
12879,XS2314660700,2022-02-11,20.428,20.284,20.256,20.754,219,20.694
12880,XS2376095068,2022-02-08,39.182,38.098,37.996,39.182,27,
12881,XS2376095068,2022-02-09,38.350,38.304,38.178,38.350,0,38.098
12882,XS2376095068,2022-02-10,38.530,39.254,38.530,39.524,2233,38.304


In [26]:
df_all['change_prev_closing_%'] = ((df_all['closing_price_eur'] - df_all['prev_closing_price']) / df_all['prev_closing_price'] * 100)
# This generates a change in price percentage column

In [27]:
df_all

Unnamed: 0,ISIN,Date,opening_price_eur,closing_price_eur,minimum_price_eur,maximum_price_eur,daily_traded_volume,prev_closing_price,change_prev_closing_%
0,AT000000STR1,2022-02-08,38.750,38.750,38.500,38.750,251,,
1,AT000000STR1,2022-02-09,39.000,38.600,38.500,39.050,905,38.750,-0.387097
2,AT000000STR1,2022-02-10,38.950,38.950,38.950,39.450,147,38.600,0.906736
3,AT000000STR1,2022-02-11,39.150,39.250,38.650,39.600,914,38.950,0.770218
4,AT00000FACC2,2022-02-08,9.100,9.170,9.100,9.300,3906,,
...,...,...,...,...,...,...,...,...,...
12879,XS2314660700,2022-02-11,20.428,20.284,20.256,20.754,219,20.694,-1.981251
12880,XS2376095068,2022-02-08,39.182,38.098,37.996,39.182,27,,
12881,XS2376095068,2022-02-09,38.350,38.304,38.178,38.350,0,38.098,0.540711
12882,XS2376095068,2022-02-10,38.530,39.254,38.530,39.524,2233,38.304,2.480159


In [28]:
df_all.drop(columns=['prev_closing_price'], inplace=True)
# We don't need the previous closing price column anymore so we drop it

In [29]:
df_all = df_all.round(decimals=2)
# This rounds all values to two decimal places

In [30]:
# We want to filter by all dates greater or equal to arg date here:
df_all = df_all[df_all.Date >= arg_date]

In [31]:
df_all

Unnamed: 0,ISIN,Date,opening_price_eur,closing_price_eur,minimum_price_eur,maximum_price_eur,daily_traded_volume,change_prev_closing_%
1,AT000000STR1,2022-02-09,39.00,38.60,38.50,39.05,905,-0.39
2,AT000000STR1,2022-02-10,38.95,38.95,38.95,39.45,147,0.91
3,AT000000STR1,2022-02-11,39.15,39.25,38.65,39.60,914,0.77
5,AT00000FACC2,2022-02-09,9.48,9.38,9.38,9.49,153,2.29
6,AT00000FACC2,2022-02-10,9.30,9.25,9.22,9.43,620,-1.39
...,...,...,...,...,...,...,...,...
12878,XS2314660700,2022-02-10,20.91,20.69,20.69,21.01,801,1.10
12879,XS2314660700,2022-02-11,20.43,20.28,20.26,20.75,219,-1.98
12881,XS2376095068,2022-02-09,38.35,38.30,38.18,38.35,0,0.54
12882,XS2376095068,2022-02-10,38.53,39.25,38.53,39.52,2233,2.48


## Save to S3:

In [32]:
key = 'xetra_daily_report_' + datetime.today().strftime('%Y%m%d_%H%M%S') + '.parquet'
# Need to create a key, we are using the name that we want to
# save the file as, which includes the current datetime

In [42]:
# Use bytesio to create an output buffer: 
# Output buffer: "An output buffer is a location in memory or cache where
# data ready to be seen is held until the display device is ready."

out_buffer = BytesIO()

df_all.to_parquet(out_buffer, index=False)

bucket_target = s3.Bucket('xetra-bucket-123')
# Make sure to use the bucket name that you set in S3

bucket_target.put_object(Body=out_buffer.getvalue(), Key=key)


s3.Object(bucket_name='xetra-bucket-123', key='xetra_daily_report_20220212_000301.parquet')

I ran into an error and found a solution through some googling.

First you have to go to your bucket permissions unblock all public access from your bucket. Then you have to replace your bucket policy (also in permissions) with the code below. After that, it should be able to work. You can change your permissions back to private except for the "Block public and cross-account access to buckets and objects through any public bucket or access point policies" option. That one has to stay public  for some reason.

I don't really know why this worked for me, but it did. If you know why it works please or what the original issue was please let me know!

Code:

        {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Action": [
                    "s3:PutObject",
                    "s3:PutObjectAcl",
                    "s3:GetObject",
                    "s3:GetObjectAcl",
                    "s3:DeleteObject"
                ],
                "Resource": [
                    "arn:aws:s3:::yourbucketnamehere",
                    "arn:aws:s3:::yourbucketnamehere/*"
                ],
                "Effect": "Allow",
                "Principal": "*"
            }
        ]
    }

### Reading the uploaded file:

In [43]:
for obj in bucket_target.objects.all():
    print(obj.key)
    
# Needed to give everyone access to read and write the ACL for this to work???
# I also created a new user while troubleshooting this, not sure if it did anything

xetra_daily_report_20220211_233010.parquet
xetra_daily_report_20220212_000301.parquet


In [46]:
prq_obj = bucket_target.Object(key='xetra_daily_report_20220212_000301.parquet').get().get('Body').read()

# Use object created above as key

data = BytesIO(prq_obj)
df_report = pd.read_parquet(data)

In [47]:
df_report
# This data should be the same as the data from df_all

Unnamed: 0,ISIN,Date,opening_price_eur,closing_price_eur,minimum_price_eur,maximum_price_eur,daily_traded_volume,change_prev_closing_%
0,AT000000STR1,2022-02-09,39.00,38.60,38.50,39.05,905,-0.39
1,AT000000STR1,2022-02-10,38.95,38.95,38.95,39.45,147,0.91
2,AT000000STR1,2022-02-11,39.15,39.25,38.65,39.60,914,0.77
3,AT00000FACC2,2022-02-09,9.48,9.38,9.38,9.49,153,2.29
4,AT00000FACC2,2022-02-10,9.30,9.25,9.22,9.43,620,-1.39
...,...,...,...,...,...,...,...,...
9658,XS2314660700,2022-02-10,20.91,20.69,20.69,21.01,801,1.10
9659,XS2314660700,2022-02-11,20.43,20.28,20.26,20.75,219,-1.98
9660,XS2376095068,2022-02-09,38.35,38.30,38.18,38.35,0,0.54
9661,XS2376095068,2022-02-10,38.53,39.25,38.53,39.52,2233,2.48


Next, we will refactor the code (and the Jupyter Notebook) to make it more pythonic, cleaner, and more readable.