In [2]:
import pandas as pd
import time

# Dataset 1kk

In [12]:
# Benchmark step: numeric filter, keeping only the rows where Open > 70.
# The timer spans the CSV read, the filter, and the CSV write.
start = time.time()

# Read the price table and select rows in one chain; the callable passed to
# .loc builds the boolean mask from the frame that was just loaded.
sp500_stocks = (
    pd.read_csv('Datasets/1kk/sp500_stocks.csv')
    .loc[lambda frame: frame['Open'] > 70]
)

# Write the surviving rows; index=False keeps the row labels out of the file.
sp500_stocks.to_csv('Datasets/1kk/sp500_temp.csv', index=False)

end = time.time()

# Elapsed wall-clock seconds for the whole step.
print("Delta time: ", end - start)

Delta time:  3.7442679405212402


In [17]:
# Benchmark step: string filter, keeping only the rows where Symbol is 'AAPL'.
# The timer spans the CSV read, the filter, and the CSV write.
start = time.time()

sp500_stocks = pd.read_csv('Datasets/1kk/sp500_stocks.csv')

# Element-wise equality yields a boolean mask; rows with a missing Symbol
# compare as False and are dropped along with every non-AAPL row.
is_aapl = sp500_stocks['Symbol'].eq('AAPL')
sp500_stocks = sp500_stocks.loc[is_aapl]

# Write the matching rows; index=False keeps the row labels out of the file.
sp500_stocks.to_csv('Datasets/1kk/sp500_temp.csv', index=False)

end = time.time()

# Elapsed wall-clock seconds for the whole step.
print("Delta time: ", end - start)

Delta time:  1.0565924644470215


In [25]:
# Benchmark step: distinct values of the Symbol column.
# The timer spans the CSV read, the de-duplication, and the CSV write.
start = time.time()

sp500_stocks = pd.read_csv('Datasets/1kk/sp500_stocks.csv')

# duplicated() flags every repeat after the first occurrence, so negating it
# keeps exactly one row per symbol, in first-seen order. From here on
# sp500_stocks is a Series rather than a DataFrame.
symbol_column = sp500_stocks['Symbol']
sp500_stocks = symbol_column.loc[~symbol_column.duplicated()]

# Write the single-column result; index=False keeps the row labels out.
sp500_stocks.to_csv('Datasets/1kk/sp500_temp.csv', index=False)

end = time.time()

# Elapsed wall-clock seconds for the whole step.
print("Delta time: ", end - start)

Delta time:  1.05568265914917


In [14]:
# Benchmark step: sort the price table by Open in ascending order.
# The timer spans the CSV read, the sort, and the CSV write.
start = time.time()

sp500_stocks = pd.read_csv('Datasets/1kk/sp500_stocks.csv')

# The step is specified as an ascending sort, so ascending must be True
# (ascending=False would order the rows from highest Open to lowest).
sp500_stocks = sp500_stocks.sort_values(by='Open', ascending=True)

# Unlike the other steps this one writes sp500_sorted.csv, which the union
# step reads back as its second input.
sp500_stocks.to_csv('Datasets/1kk/sp500_sorted.csv', index=False)

end = time.time()

# Elapsed wall-clock seconds for the whole step.
print("Delta time: ", end - start)

Delta time:  7.643849849700928


In [30]:
# Benchmark step: union (row-wise concatenation) of the price table with its
# sorted copy. Requires sp500_sorted.csv, which the sort step writes.
# The timer spans both CSV reads, the concatenation, and the CSV write.
start = time.time()

sp500_stocks = pd.read_csv('Datasets/1kk/sp500_stocks.csv')
sp500_stocks_sorted = pd.read_csv('Datasets/1kk/sp500_sorted.csv')

# Stack the two frames vertically, unsorted rows first. Duplicates are kept
# and each frame retains its own row labels.
frames_to_union = [sp500_stocks, sp500_stocks_sorted]
sp500_stocks = pd.concat(frames_to_union)

# Write the combined rows; index=False keeps the row labels out of the file.
sp500_stocks.to_csv('Datasets/1kk/sp500_temp.csv', index=False)

end = time.time()

# Elapsed wall-clock seconds for the whole step.
print("Delta time: ", end - start)

Delta time:  15.093489408493042


In [35]:
# Benchmark step: group by Symbol and compute min, max, mean, sum and count
# of Volume for each symbol.
# The timer spans the CSV read, the aggregation, and the CSV write.
start = time.time()

sp500_stocks = pd.read_csv('Datasets/1kk/sp500_stocks.csv')

# Named aggregation: each keyword is an output column, each tuple is
# (source column, aggregation). reset_index() turns the Symbol group key
# back into an ordinary first column.
sp500_stocks = (
    sp500_stocks
    .groupby('Symbol')
    .agg(
        Volume_min=('Volume', 'min'),
        Volume_max=('Volume', 'max'),
        Volume_avg=('Volume', 'mean'),
        Volume_sum=('Volume', 'sum'),
        Count=('Volume', 'count'),
    )
    .reset_index()
)

# Write one row per symbol; index=False keeps the row labels out of the file.
sp500_stocks.to_csv('Datasets/1kk/sp500_temp.csv', index=False)

end = time.time()

# Elapsed wall-clock seconds for the whole step.
print("Delta time: ", end - start)

Delta time:  1.094360589981079


In [40]:
# Benchmark step: left join of the price table with the companies table on
# Symbol, keeping every column from both sides.
# The timer spans both CSV reads, the join, and the CSV write.
start = time.time()

sp500_stocks = pd.read_csv('Datasets/1kk/sp500_stocks.csv')
sp500_companies = pd.read_csv('Datasets/1kk/sp500_companies.csv')

# how='left' keeps every price row, including those whose Symbol has no match
# in the companies table; their company columns are filled with NaN.
sp500_stocks = sp500_stocks.merge(
    sp500_companies,
    how='left',
    on='Symbol',
)

# Write the joined rows; index=False keeps the row labels out of the file.
sp500_stocks.to_csv('Datasets/1kk/sp500_temp.csv', index=False)

end = time.time()

# Elapsed wall-clock seconds for the whole step.
print("Delta time: ", end - start)

Delta time:  35.938045501708984


In [8]:
# Benchmark step: inner join of the price table with the company list on
# Symbol.
# The timer spans both CSV reads, the join, and the CSV write.
start = time.time()

sp500_stocks = pd.read_csv('Datasets/1kk/sp500_stocks.csv')
sp500_company_list = pd.read_csv('Datasets/1kk/sp500_company_list.csv')

# how='inner' keeps only the price rows whose Symbol also appears in the
# company list.
sp500_stocks = sp500_stocks.merge(
    sp500_company_list,
    how='inner',
    on='Symbol',
)

# Write the joined rows; index=False keeps the row labels out of the file.
sp500_stocks.to_csv('Datasets/1kk/sp500_temp.csv', index=False)

end = time.time()

# Elapsed wall-clock seconds for the whole step.
print("Delta time: ", end - start)

Delta time:  4.055384874343872


In [19]:
# Benchmark step: element-wise arithmetic, adding four derived columns that
# combine High and Low with +, -, * and /.
# The timer spans the CSV read, the four calculations, and the CSV write.
start = time.time()

sp500_stocks = pd.read_csv('Datasets/1kk/sp500_stocks.csv')

high = sp500_stocks['High']
low = sp500_stocks['Low']

# assign() appends the new columns in keyword order, so the output keeps the
# Calculation1..Calculation4 ordering.
sp500_stocks = sp500_stocks.assign(
    Calculation1=high.add(low),
    Calculation2=high.sub(low),
    Calculation3=high.mul(low),
    Calculation4=high.div(low),
)

# Write the widened table; index=False keeps the row labels out of the file.
sp500_stocks.to_csv('Datasets/1kk/sp500_temp.csv', index=False)

end = time.time()

# Elapsed wall-clock seconds for the whole step.
print("Delta time: ", end - start)

Delta time:  11.496999502182007


# Pipeline

In [23]:
# Pipeline benchmark: date filter -> derived column -> left join with company
# attributes -> aggregation at five grouping levels -> union of the
# aggregates -> sort -> CSV write. The timer spans everything from the first
# read to the final write.
start = time.time()

# Only the price and companies tables are loaded: no step below uses any
# other input, and an unused read would inflate the measured time.
sp500_stocks = pd.read_csv('Datasets/1kk/sp500_stocks.csv')
sp500_companies = pd.read_csv('Datasets/1kk/sp500_companies.csv')

# Parse Date into datetimes so the cutoff comparison below is chronological.
sp500_stocks['Date'] = pd.to_datetime(sp500_stocks['Date'])

# Keep rows dated strictly after 2015-01-01, then derive
# OpenCloseDiff = Open - Close on the filtered rows.
sp500_stocks = (
    sp500_stocks
    .loc[lambda frame: frame['Date'] > '2015-01-01']
    .assign(OpenCloseDiff=lambda frame: frame['Open'] - frame['Close'])
)

# Left join with companies, taking only Exchange, Sector, Industry and
# Country (Symbol is the join key).
sp500_stocks = pd.merge(
    sp500_stocks,
    sp500_companies[['Symbol', 'Exchange', 'Sector', 'Industry', 'Country']],
    on='Symbol',
    how='left',
)

# Aggregate High max, Low min and OpenCloseDiff max once per grouping level.
# The list order fixes the row order of the union below.
aggregation_levels = ['Country', 'Symbol', 'Exchange', 'Sector', 'Industry']
aggregated_frames = [
    sp500_stocks
    .groupby(level)
    .agg(
        High_max=('High', 'max'),
        Low_min=('Low', 'min'),
        OpenCloseDiff_max=('OpenCloseDiff', 'max'),
    )
    # Give every level's group key the same label so the key becomes a common
    # "Name" column by name rather than by column position.
    .rename_axis('Name')
    .reset_index()
    # Tag each row with the level it was aggregated at.
    .assign(AggregationType=level)
    for level in aggregation_levels
]

# Union the per-level aggregates into one table with a fresh 0..n-1 index.
# Columns: Name, High_max, Low_min, OpenCloseDiff_max, AggregationType.
sp500_stocks = pd.concat(aggregated_frames, ignore_index=True)

# Sort ascending by AggregationType, then ascending by Name.
sp500_stocks = sp500_stocks.sort_values(by=['AggregationType', 'Name'], ascending=[True, True])

# Write the final table; index=False keeps the row labels out of the file.
sp500_stocks.to_csv('Datasets/1kk/sp500_temp.csv', index=False)

end = time.time()

# Elapsed wall-clock seconds for the whole pipeline.
print("Delta time: ", end - start)



Delta time:  1.4341228008270264
