### Getting and Viewing the data:

#### Change Below:
- Import datetime and timedelta

In [36]:
import boto3
import pandas as pd
from botocore import UNSIGNED
from botocore.config import Config
# botocore added from video comments
from io import StringIO
# ^ buffer in memory to use the read csv method
from datetime import datetime, timedelta

#### Change Below:

In [37]:
# Report start date, given as an ISO 'YYYY-MM-DD' string.
# The goal is to take this date as an argument and generate a report
# covering everything from this date onward.
arg_date = "2022-01-13"

#### Change Below:

In [38]:
# Parse the date string and step back one calendar day: the price-change
# percentage needs the previous day's data, so loading has to start
# one day before arg_date. The result is a datetime.date.
arg_date_dt = (datetime.strptime(arg_date, '%Y-%m-%d') - timedelta(days=1)).date()

#### Change Below:

In [39]:
# Display the lookback start date (one day before arg_date)
arg_date_dt

datetime.date(2022, 1, 12)

In [40]:
# Requests are sent unsigned; this config was added following a tip
# in the video comments.
unsigned_config = Config(signature_version=UNSIGNED)
s3 = boto3.resource('s3', config=unsigned_config)

# 'deutsche-boerse-xetra-pds' is the name of the bucket
bucket = s3.Bucket('deutsche-boerse-xetra-pds')

A bucket is a container for objects. An object is a file and any metadata that describes that file. To store an object in Amazon S3, you create a bucket and then upload the object to the bucket. When the object is in the bucket, you can open it, download it, and move it. 

#### Change Below:

In [41]:
# Select every object whose key starts with a date on or after arg_date_dt.
# Keys look like 'YYYY-MM-DD/<file>.csv' and S3 returns keys in
# lexicographic order, so passing the ISO start date as Marker makes the
# listing begin right before that day's first key instead of walking the
# whole bucket from its oldest object (which is what made this cell slow).
# The explicit date comparison is kept as a check on what comes back.
objects = [
    obj
    for obj in bucket.objects.filter(Marker=arg_date_dt.isoformat())
    if datetime.strptime(obj.key.split('/')[0], '%Y-%m-%d').date() >= arg_date_dt
]

In [42]:
objects
# A bare expression on the last line of a cell is what Jupyter displays
# The listing starts at arg_date_dt, i.e. one day before the specified arg_date

[s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-01-12/2022-01-12_BINS_XETR00.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-01-12/2022-01-12_BINS_XETR01.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-01-12/2022-01-12_BINS_XETR02.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-01-12/2022-01-12_BINS_XETR03.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-01-12/2022-01-12_BINS_XETR04.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-01-12/2022-01-12_BINS_XETR05.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-01-12/2022-01-12_BINS_XETR06.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-01-12/2022-01-12_BINS_XETR07.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pds', key='2022-01-12/2022-01-12_BINS_XETR08.csv'),
 s3.ObjectSummary(bucket_name='deutsche-boerse-xetra-pd

In [43]:
# csv_obj = bucket.Object(key='2021-03-15/2021-03-15_BINS_XETR15.csv').get().get('Body').read().decode('utf-8')
# View just one object, key taken randomly from one of the files listed above

In [44]:
# Download and parse the first selected object; its column names are
# used as the template for df_all below.
first_response = bucket.Object(key=objects[0].key).get()
csv_obj_init = first_response.get('Body').read().decode('utf-8')
data = StringIO(csv_obj_init)
df_init = pd.read_csv(data, sep=',')

In [45]:
# This df is empty because this file holds the stock data at midnight, when the stock markets are closed
# The fact that this happens to be an empty df is coincidental
# Only its column names are needed for df_all below, so it doesn't matter that it is empty

df_init.columns
# Returns a pandas Index holding all of the column names of this df

Index(['ISIN', 'Mnemonic', 'SecurityDesc', 'SecurityType', 'Currency',
       'SecurityID', 'Date', 'Time', 'StartPrice', 'MaxPrice', 'MinPrice',
       'EndPrice', 'TradedVolume', 'NumberOfTrades'],
      dtype='object')

In [46]:
# Download every selected CSV and combine them into one DataFrame.
# Each object is read into memory, wrapped in StringIO and parsed by pandas.
# The frames are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies all rows gathered so far on
# every iteration, which becomes very slow with this many files.
frames = []
for obj in objects:
    csv_obj = bucket.Object(key=obj.key).get().get('Body').read().decode('utf-8')
    data = StringIO(csv_obj)
    df = pd.read_csv(data, delimiter=',')
    frames.append(df)

if frames:
    df_all = pd.concat(frames, ignore_index=True)
else:
    # Nothing was selected: fall back to an empty frame that still has the
    # expected columns (taken from the df of the first object)
    df_all = pd.DataFrame(columns=df_init.columns)

In [47]:
# Inspect the combined frame built from all downloaded objects
df_all

Unnamed: 0,ISIN,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades
0,AT0000A0E9W5,SANT,S+T AG O.N.,Common stock,EUR,2504159,2022-01-12,08:00,14.69,14.7,14.56,14.7,7151,8
1,DE000A0DJ6J9,S92,SMA SOLAR TECHNOL.AG,Common stock,EUR,2504287,2022-01-12,08:00,36.4,36.5,36.38,36.38,2741,7
2,DE000A0D6554,NDX1,NORDEX SE O.N.,Common stock,EUR,2504290,2022-01-12,08:00,14.3,14.38,14.27,14.36,33226,57
3,DE000A0D9PT0,MTX,MTU AERO ENGINES NA O.N.,Common stock,EUR,2504297,2022-01-12,08:00,194.0,194.25,193.7,194.05,2458,24
4,DE000A0HN5C6,DWNI,DEUTSCHE WOHNEN SE INH,Common stock,EUR,2504314,2022-01-12,08:00,36.57,36.68,36.57,36.68,5416,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2736677,DE0006231004,IFX,INFINEON TECH.AG NA O.N.,Common stock,EUR,2505024,2022-02-11,16:43,33.38,33.38,33.38,33.38,1500,1
2736678,DE0008404005,ALV,ALLIANZ SE NA O.N.,Common stock,EUR,2505133,2022-02-11,16:43,229.55,229.55,229.55,229.55,40,1
2736679,DE000SHL1006,SHL,SIEMENS HEALTH.AG NA O.N.,Common stock,EUR,3058562,2022-02-11,16:43,56.58,56.58,56.58,56.58,100,1
2736680,DE000WAF3001,WAF,SILTRONIC AG NA O.N.,Common stock,EUR,2504859,2022-02-11,16:44,112.0,112.0,112.0,112.0,180,1


In [48]:
# Relevant columns to keep from the combined DF
columns = [
    'ISIN',
    'Date',
    'Time',
    'StartPrice',
    'MaxPrice',
    'MinPrice',
    'EndPrice',
    'TradedVolume',
]

# Restrict df_all to those columns (all rows kept)
df_all = df_all.loc[:, columns]

In [49]:
# data = StringIO(csv_obj)
# df = pd.read_csv(data, delimiter =',')
# Convert the csv_object to a pandas df to read it more easily
# Only use this when viewing one object, it is used in the for loop above to view multiple

In [50]:
# Remove every row that contains at least one NA value
df_all = df_all.dropna()

In [51]:
# Inspect the frame after column selection and NA removal
df_all

Unnamed: 0,ISIN,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume
0,AT0000A0E9W5,2022-01-12,08:00,14.69,14.7,14.56,14.7,7151
1,DE000A0DJ6J9,2022-01-12,08:00,36.4,36.5,36.38,36.38,2741
2,DE000A0D6554,2022-01-12,08:00,14.3,14.38,14.27,14.36,33226
3,DE000A0D9PT0,2022-01-12,08:00,194.0,194.25,193.7,194.05,2458
4,DE000A0HN5C6,2022-01-12,08:00,36.57,36.68,36.57,36.68,5416
...,...,...,...,...,...,...,...,...
2736677,DE0006231004,2022-02-11,16:43,33.38,33.38,33.38,33.38,1500
2736678,DE0008404005,2022-02-11,16:43,229.55,229.55,229.55,229.55,40
2736679,DE000SHL1006,2022-02-11,16:43,56.58,56.58,56.58,56.58,100
2736680,DE000WAF3001,2022-02-11,16:44,112.0,112.0,112.0,112.0,180


In [52]:
df_all.shape
# .shape returns (rows, columns); comparing the row count with the one from
# before dropna shows whether any rows were removed, in this case none were

(2736682, 8)

### Get opening price per ISIN and day:

In [53]:
# Opening price = StartPrice of the earliest row of each (ISIN, Date) group.
# Sorting by Time first makes 'first' pick the earliest row; transform
# broadcasts that value back onto every row of the group.
df_all['opening_price'] = (
    df_all
    .sort_values(by='Time')
    .groupby(['ISIN', 'Date'])['StartPrice']
    .transform('first')
)

In [54]:
# Show every row of one ISIN to check the opening prices;
# opening_price is the new column at the end of the df
df_all.loc[df_all['ISIN'] == 'AT0000A0E9W5']

Unnamed: 0,ISIN,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,opening_price
0,AT0000A0E9W5,2022-01-12,08:00,14.69,14.7,14.56,14.7,7151,14.69
655,AT0000A0E9W5,2022-01-12,08:04,14.6,14.72,14.6,14.65,1443,14.69
2967,AT0000A0E9W5,2022-01-12,08:05,14.58,14.58,14.58,14.58,101,14.69
3255,AT0000A0E9W5,2022-01-12,08:06,14.6,14.6,14.6,14.6,220,14.69
3569,AT0000A0E9W5,2022-01-12,08:07,14.58,14.58,14.57,14.58,564,14.69
...,...,...,...,...,...,...,...,...,...
2730901,AT0000A0E9W5,2022-02-11,16:22,16.35,16.35,16.35,16.35,1353,15.99
2732333,AT0000A0E9W5,2022-02-11,16:27,16.31,16.31,16.31,16.31,1408,15.99
2732628,AT0000A0E9W5,2022-02-11,16:28,16.31,16.31,16.29,16.29,588,15.99
2732922,AT0000A0E9W5,2022-02-11,16:29,16.32,16.32,16.32,16.32,648,15.99


### Get closing price per ISIN and day

In [None]:
# Closing price = EndPrice of the latest row of each (ISIN, Date) group.
# This has to read EndPrice: StartPrice of the last row is the price at
# which the final interval started, not the price at which the day closed.
# Sorting by Time first makes 'last' pick the latest row; transform
# broadcasts that value back onto every row of the group.
df_all['closing_price'] = (
    df_all
    .sort_values(by=['Time'])
    .groupby(['ISIN', 'Date'])['EndPrice']
    .transform('last')
)

In [None]:
# Show every row of one ISIN to check the new closing_price column
df_all.loc[df_all['ISIN'] == 'AT0000A0E9W5']

### Aggregations

In [None]:
# Collapse the rows into one row per (ISIN, Date).
# as_index=False keeps ISIN and Date as ordinary columns; leaving as_index
# at True would turn them into the index.
# opening_price and closing_price carry the same value on every row of a
# group, so 'min' makes no difference there, but an aggregation has to be chosen.
df_all = df_all.groupby(['ISIN', 'Date'], as_index=False).agg(
    opening_price_eur=pd.NamedAgg(column='opening_price', aggfunc='min'),
    closing_price_eur=pd.NamedAgg(column='closing_price', aggfunc='min'),
    minimum_price_eur=pd.NamedAgg(column='MinPrice', aggfunc='min'),
    maximum_price_eur=pd.NamedAgg(column='MaxPrice', aggfunc='max'),
    daily_traded_volume=pd.NamedAgg(column='TradedVolume', aggfunc='sum'),
)

In [None]:
df_all
# After grouping by ISIN and Date this df has one row per ISIN per day

### Percent Change Prev Closing

In [None]:
# Add the previous closing price as a new column at the end: within each
# ISIN, ordered by Date, every row receives the closing price of the row
# before it (the first row of each ISIN gets NaN).
df_all['prev_closing_price'] = (
    df_all
    .sort_values(by='Date')
    .groupby('ISIN')['closing_price_eur']
    .shift(periods=1)
)

In [None]:
# Check the newly added prev_closing_price column
df_all

In [None]:
# Percentage change of the closing price relative to the previous close:
# (closing - previous) / previous * 100
df_all['change_prev_closing_%'] = (
    df_all['closing_price_eur']
    .sub(df_all['prev_closing_price'])
    .div(df_all['prev_closing_price'])
    .mul(100)
)

In [None]:
# Check the newly added change_prev_closing_% column
df_all

In [None]:
# The previous closing price was only needed for the percentage change,
# so the column is dropped again
df_all = df_all.drop(columns=['prev_closing_price'])

In [None]:
# Round the numeric values to two decimal places
df_all = df_all.round(2)

#### Change Below:

In [None]:
# Keep only the rows dated on or after arg_date; this removes the extra
# day that was loaded just to provide a previous closing price.
# Date and arg_date are both 'YYYY-MM-DD' strings, so the string
# comparison orders them the same way the dates themselves would.
df_all = df_all.loc[df_all['Date'] >= arg_date]

In [None]:
# Inspect the final report frame (rows dated on or after arg_date only)
df_all

Next, we will save the transformed dataframe to S3