# RNN

In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

## 1. Load the data

In [2]:
# Load the raw price table from the CSV file into a DataFrame.
csv_path = 'data/appl_1980_2014.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first ten rows to see the columns and the row ordering.
df.head(n=10)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2014-07-08,96.27,96.8,93.92,95.35,65130000,95.35
1,2014-07-07,94.14,95.99,94.1,95.97,56305400,95.97
2,2014-07-03,93.67,94.1,93.2,94.03,22891800,94.03
3,2014-07-02,93.87,94.06,93.09,93.48,28420900,93.48
4,2014-07-01,93.52,94.07,93.13,93.52,38170200,93.52
5,2014-06-30,92.1,93.73,92.09,92.93,49482300,92.93
6,2014-06-27,90.82,92.0,90.77,91.98,64006800,91.98
7,2014-06-26,90.37,91.05,89.8,90.9,32595800,90.9
8,2014-06-25,90.21,90.7,89.65,90.36,36852200,90.36
9,2014-06-24,90.75,91.74,90.19,90.28,38988300,90.28


In [6]:
# Check the column dtypes: is Date actually stored as a datetime type?
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8465 entries, 0 to 8464
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       8465 non-null   object 
 1   Open       8465 non-null   float64
 2   High       8465 non-null   float64
 3   Low        8465 non-null   float64
 4   Close      8465 non-null   float64
 5   Volume     8465 non-null   int64  
 6   Adj Close  8465 non-null   float64
dtypes: float64(5), int64(1), object(1)
memory usage: 463.1+ KB


In [7]:
# The Date column was loaded as strings (dtype object); parse it into
# datetime64 values so that the .dt accessor can be used on it.
df['Date'] = pd.to_datetime(df['Date'])

In [8]:
# Re-check the dtypes to confirm that Date is now datetime64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8465 entries, 0 to 8464
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       8465 non-null   datetime64[ns]
 1   Open       8465 non-null   float64       
 2   High       8465 non-null   float64       
 3   Low        8465 non-null   float64       
 4   Close      8465 non-null   float64       
 5   Volume     8465 non-null   int64         
 6   Adj Close  8465 non-null   float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 463.1 KB


### Choose the time frame

In [9]:
# List the distinct calendar years present in the Date column.
df['Date'].dt.year.unique()

array([2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004,
       2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996, 1995, 1994, 1993,
       1992, 1991, 1990, 1989, 1988, 1987, 1986, 1985, 1984, 1983, 1982,
       1981, 1980])

In [10]:
# Keep only the rows dated 2010 through 2013: the years strictly after 2009
# and strictly before 2014.
date_years = df['Date'].dt.year
cond = date_years.between(2010, 2013)  # both bounds inclusive by default
df = df.loc[cond]
# Show which years remain after filtering.
df['Date'].dt.year.unique()

array([2013, 2012, 2011, 2010])

### Set date as index

In [13]:
# Inspect the current row index of the filtered frame.
df.index

Int64Index([ 129,  130,  131,  132,  133,  134,  135,  136,  137,  138,
            ...
            1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134],
           dtype='int64', length=1006)

Now the index is an integer series. For time series, the index should be a DatetimeIndex.

In [14]:
# Use the Date column as the index so rows are addressed by timestamp.
# The guard keeps this cell idempotent: once 'Date' has become the index it is
# no longer a column, and calling set_index('Date') a second time would raise
# a KeyError.
if df.index.name != 'Date':
    df = df.set_index('Date')
# NOTE(review): the index displayed below runs from newest to oldest; confirm
# that later cells sort it chronologically if they need ascending time order.
df.index

DatetimeIndex(['2013-12-31', '2013-12-30', '2013-12-27', '2013-12-26',
               '2013-12-24', '2013-12-23', '2013-12-20', '2013-12-19',
               '2013-12-18', '2013-12-17',
               ...
               '2010-01-15', '2010-01-14', '2010-01-13', '2010-01-12',
               '2010-01-11', '2010-01-08', '2010-01-07', '2010-01-06',
               '2010-01-05', '2010-01-04'],
              dtype='datetime64[ns]', name='Date', length=1006, freq=None)