# Time Series: Part 1

## Data and Imports

In [5]:
import pandas as pd

In [35]:
## Read sp500.csv (relative path, so it is looked up in the current working directory) into a DataFrame
sp500 = pd.read_csv('sp500.csv')

In [9]:
## List the column names to see what the file contains
sp500.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')

In [18]:
## Notice that Date has dtype object (plain strings), not a datetime type. We'll need to change that.
## The Non-Null Count column also shows that Date has one fewer non-null value than the other columns.

sp500.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    1257 non-null   object 
 1   Open    1258 non-null   float64
 2   High    1258 non-null   float64
 3   Low     1258 non-null   float64
 4   Close   1258 non-null   float64
 5   Volume  1258 non-null   int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 59.1+ KB


In [16]:
## Preview the first rows of the raw data
sp500.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,,4703.96,4740.74,4703.96,4725.79,2194630000
1,"Dec. 22, 2021",4650.36,4697.67,4645.53,4696.56,2439570000
2,"Dec. 21, 2021",4594.96,4651.14,4583.16,4649.23,2564370000
3,"Dec. 20, 2021",4587.9,4587.9,4531.1,4568.02,3395780000
4,"Dec. 17, 2021",4652.5,4666.7,4600.22,4620.64,5609780000


## Review Time Series

### Change To DateTime

In [37]:
## Let's convert Date to a datetime.
## It is also good practice to pass the format parameter: it speeds up parsing and ensures
## every date string is interpreted the same way.
## '%b. %d, %Y' matches strings such as "Dec. 22, 2021":
##   %b = abbreviated month name (followed by a literal ". "), %d = day of month, %Y = four-digit year
## Use this reference to find the codes needed for a format string:
## https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior

sp500['Date'] = pd.to_datetime(sp500['Date'], format='%b. %d, %Y')
sp500.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    1257 non-null   datetime64[ns]
 1   Open    1258 non-null   float64       
 2   High    1258 non-null   float64       
 3   Low     1258 non-null   float64       
 4   Close   1258 non-null   float64       
 5   Volume  1258 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(1)
memory usage: 59.1 KB


### Handle Missing Data

In [44]:
## Notice the missing data in Date: it has 1257 non-null values, while the frame has 1258 rows

sp500.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    1257 non-null   datetime64[ns]
 1   Open    1258 non-null   float64       
 2   High    1258 non-null   float64       
 3   Low     1258 non-null   float64       
 4   Close   1258 non-null   float64       
 5   Volume  1258 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(1)
memory usage: 59.1 KB


In [46]:
## Only one row is affected, so we can simply drop it.
## subset=['Date'] limits the null check to the Date column: a row is removed only when its
## Date is missing, never because of a gap in some other column.

sp500 = sp500.dropna(subset=['Date'])

In [50]:
## Now there is no missing data: every column reports the same non-null count as the number of rows
sp500.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1257 entries, 1 to 1257
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    1257 non-null   datetime64[ns]
 1   Open    1257 non-null   float64       
 2   High    1257 non-null   float64       
 3   Low     1257 non-null   float64       
 4   Close   1257 non-null   float64       
 5   Volume  1257 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(1)
memory usage: 68.7 KB


### Now let's find the start and end dates of the dataset

In [55]:
## Oldest (earliest) date in the dataset
sp500['Date'].min()

Timestamp('2016-12-27 00:00:00')

In [53]:
## Most recent (latest) date in the dataset
sp500['Date'].max()

Timestamp('2021-12-22 00:00:00')

In [57]:
## How much time does this dataset cover?
## Subtracting the earliest date from the latest one yields a Timedelta.

sp500['Date'].pipe(lambda dates: dates.max() - dates.min())

Timedelta('1821 days 00:00:00')

In [86]:
## What if we want to know the length in years?
## Dividing one Timedelta by another gives a plain number. A year is taken as 365 days here,
## so leap days are ignored and the result is an approximation.

sp500['Date'].pipe(lambda dates: dates.max() - dates.min()) / pd.Timedelta(days=365)

4.989041095890411

### Timestamp

In [89]:
## We can define a timestamp directly from a date string (the time part defaults to midnight):
pd.Timestamp('2021-01-01')

Timestamp('2021-01-01 00:00:00')

In [93]:
## ...and then compare it to dates within the dataset,
## e.g. how far the most recent date is past the start of 2021

sp500['Date'].max() - pd.Timestamp('2021-01-01')

Timedelta('355 days 00:00:00')

### Break the Date column down into year, month, day, and day_name

In [105]:
## Let's do year:
## The .dt accessor exposes the components of a datetime column; .year is stored in a new Year column

sp500['Year'] = sp500['Date'].dt.year
sp500['Year']

1       2021
2       2021
3       2021
4       2021
5       2021
        ... 
1253    2017
1254    2016
1255    2016
1256    2016
1257    2016
Name: Year, Length: 1257, dtype: int32

In [107]:
## Let's do month:
## .dt.month gives the month as a number (1-12)

sp500['Month'] = sp500['Date'].dt.month
sp500['Month']

1       12
2       12
3       12
4       12
5       12
        ..
1253     1
1254    12
1255    12
1256    12
1257    12
Name: Month, Length: 1257, dtype: int32

In [109]:
## Let's do day:
## .dt.day gives the day of the month

sp500['Day'] = sp500['Date'].dt.day
sp500['Day']

1       22
2       21
3       20
4       17
5       16
        ..
1253     3
1254    30
1255    29
1256    28
1257    27
Name: Day, Length: 1257, dtype: int32

In [111]:
## Let's do day_name:
## Unlike year/month/day, day_name() is a method (note the parentheses) and returns the weekday name as text

sp500['Day_name'] = sp500['Date'].dt.day_name()
sp500['Day_name']

1       Wednesday
2         Tuesday
3          Monday
4          Friday
5        Thursday
          ...    
1253      Tuesday
1254       Friday
1255     Thursday
1256    Wednesday
1257      Tuesday
Name: Day_name, Length: 1257, dtype: object

In [113]:
## Confirm that the Year, Month, Day and Day_name columns were added
sp500.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1257 entries, 1 to 1257
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      1257 non-null   datetime64[ns]
 1   Open      1257 non-null   float64       
 2   High      1257 non-null   float64       
 3   Low       1257 non-null   float64       
 4   Close     1257 non-null   float64       
 5   Volume    1257 non-null   int64         
 6   Year      1257 non-null   int32         
 7   Month     1257 non-null   int32         
 8   Day       1257 non-null   int32         
 9   Day_name  1257 non-null   object        
dtypes: datetime64[ns](1), float64(4), int32(3), int64(1), object(1)
memory usage: 93.3+ KB


In [115]:
## Preview the first rows, now including the new date-part columns
sp500.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Year,Month,Day,Day_name
1,2021-12-22,4650.36,4697.67,4645.53,4696.56,2439570000,2021,12,22,Wednesday
2,2021-12-21,4594.96,4651.14,4583.16,4649.23,2564370000,2021,12,21,Tuesday
3,2021-12-20,4587.9,4587.9,4531.1,4568.02,3395780000,2021,12,20,Monday
4,2021-12-17,4652.5,4666.7,4600.22,4620.64,5609780000,2021,12,17,Friday
5,2021-12-16,4719.13,4731.99,4651.89,4668.67,3592810000,2021,12,16,Thursday


## Pickle the New Dataset for the Next Lesson

In [118]:
## Save the DataFrame to sp500.pkl in the working directory.
## Pickling stores the frame as-is, so the column dtypes (e.g. Date as datetime64) are kept when it is loaded back.
sp500.to_pickle('sp500.pkl')