In [1]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import datetime
import random
from itertools import count

In [2]:
df = pd.read_csv("AAPL-Updated.csv")

In [5]:
csv_file = "AAPL-Updated.csv"

window_len = 30   # Number of trading days in a window
dataset_size = 100
samples = [df.iloc[x:window_len] for x in np.random.randint(len(df), size=dataset_size)]

earliest_date = "01-02-2014"
latest_date = "01-18-2019"
datestr_format = "%m-%d-%Y"

e_datetime = datetime.datetime.strptime(earliest_date, "%m-%d-%Y").date()
l_datetime = datetime.datetime.strptime(latest_date, "%m-%d-%Y").date()

In [6]:
df[df["Time"] == "09:30"].head()

Unnamed: 0,Date,Time,Open,Volume,Percent-Change
0,01-02-2014,09:30,72.5,1577070,0.041396
391,01-03-2014,09:30,72.13,1156545,-0.02772
782,01-06-2014,09:30,70.13,2278719,0.028527
1173,01-07-2014,09:30,71.03,1463014,0.014081
1564,01-08-2014,09:30,70.32,1309121,-0.05685


In [7]:
r = random.random()
rdate = e_datetime + (l_datetime - e_datetime) * r
rdate_str = rdate.strftime("%m-%d-%Y")
print(rdate_str)

09-11-2016


In [8]:
rdate_str in df["Date"].values

False

In [10]:
rdate + datetime.timedelta(days=window_len)

datetime.date(2016, 10, 11)

In [11]:
rdate < l_datetime

True

In [12]:
date_range = []
for i in count():
    s = (rdate + datetime.timedelta(days=i)).strftime("%m-%d-%Y")
    if s in df["Date"].values:
        date_range.append(s)
    if len(date_range) >= 30:
        break
    
    
# date_range = [(rdate + datetime.timedelta(days=i)).strftime("%m-%d-%Y") for i in range(window_size) if (rdate + datetime.timedelta(days=i)).strftime("%m-%d-%Y") in df["Date"].values]
date_range

['09-12-2016',
 '09-13-2016',
 '09-14-2016',
 '09-15-2016',
 '09-16-2016',
 '09-19-2016',
 '09-20-2016',
 '09-21-2016',
 '09-22-2016',
 '09-23-2016',
 '09-26-2016',
 '09-27-2016',
 '09-28-2016',
 '09-29-2016',
 '09-30-2016',
 '10-03-2016',
 '10-04-2016',
 '10-05-2016',
 '10-06-2016',
 '10-07-2016',
 '10-10-2016',
 '10-11-2016',
 '10-12-2016',
 '10-13-2016',
 '10-14-2016',
 '10-17-2016',
 '10-18-2016',
 '10-19-2016',
 '10-20-2016',
 '10-21-2016']

In [13]:
df[df["Date"].isin(date_range)]

Unnamed: 0,Date,Time,Open,Volume,Percent-Change
264590,09-12-2016,09:30,99.02,1334100,-0.050469
264591,09-12-2016,09:31,99.32,411673,0.302969
264592,09-12-2016,09:32,99.62,452877,0.302054
264593,09-12-2016,09:33,99.70,296926,0.080305
264594,09-12-2016,09:34,99.86,288779,0.160481
264595,09-12-2016,09:35,99.99,714683,0.130182
264596,09-12-2016,09:36,100.32,487411,0.330033
264597,09-12-2016,09:37,100.51,485276,0.189394
264598,09-12-2016,09:38,100.47,252037,-0.039797
264599,09-12-2016,09:39,100.37,393114,-0.099532


In [16]:
class StockDataset(Dataset):
    
    def __init__(self, csv_file, first_datestr, last_datestr, datestr_format, window_len, dataset_size):
        
        self.df = pd.read_csv(csv_file)
        self.window_len = window_len
        self.size = dataset_size
        
        # Dataset as a list of randomly selected dataframes with continuous dates
        self.dataset = []
        
        # Generate batches of dataframes
        e_datetime = datetime.datetime.strptime(earliest_date, datestr_format).date()
        l_datetime = datetime.datetime.strptime(latest_date, datestr_format).date()
        
        for i in range(self.size):
            # Generate a single dataframe
            date_range = []
            while True:
                date_range.clear()
                rdate = e_datetime + (l_datetime - e_datetime) * random.random()    # A random date in the range
                for j in count():
                    s = (rdate + datetime.timedelta(days=j)).strftime(datestr_format)
                    print(s)
                    if s in df["Date"].values:
                        date_range.append(s)
                    if len(date_range) >= 30:
                        break
                # Check if last date in range 
                if datetime.datetime.strptime(date_range[-1], datestr_format).date() < l_datetime:
                    break
          
            self.dataset.append(self.df[self.df["Date"].isin(date_range)])       
            
        
    def __len__(self):
    # Return the size of the dataset
        return self.size
    
    def __getitem__(self, idx):
        return self.dataset[idx]

In [17]:
dataset = StockDataset(csv_file, earliest_date, latest_date, datestr_format, window_len, dataset_size)

06-07-2018
06-08-2018
06-09-2018
06-10-2018
06-11-2018
06-12-2018
06-13-2018
06-14-2018
06-15-2018
06-16-2018
06-17-2018
06-18-2018
06-19-2018
06-20-2018
06-21-2018
06-22-2018
06-23-2018
06-24-2018
06-25-2018
06-26-2018
06-27-2018
06-28-2018
06-29-2018
06-30-2018
07-01-2018
07-02-2018
07-03-2018
07-04-2018
07-05-2018
07-06-2018
07-07-2018
07-08-2018
07-09-2018
07-10-2018
07-11-2018
07-12-2018
07-13-2018
07-14-2018
07-15-2018
07-16-2018
07-17-2018
07-18-2018
07-19-2018
08-21-2017
08-22-2017
08-23-2017
08-24-2017
08-25-2017
08-26-2017
08-27-2017
08-28-2017
08-29-2017
08-30-2017
08-31-2017
09-01-2017
09-02-2017
09-03-2017
09-04-2017
09-05-2017
09-06-2017
09-07-2017
09-08-2017
09-09-2017
09-10-2017
09-11-2017
09-12-2017
09-13-2017
09-14-2017
09-15-2017
09-16-2017
09-17-2017
09-18-2017
09-19-2017
09-20-2017
09-21-2017
09-22-2017
09-23-2017
09-24-2017
09-25-2017
09-26-2017
09-27-2017
09-28-2017
09-29-2017
09-30-2017
10-01-2017
10-02-2017
09-27-2018
09-28-2018
09-29-2018
09-30-2018
10-01-2018

KeyboardInterrupt: 

In [None]:
dataset.dataset[1]