## Extracting Date & Time

In [2]:
import pandas as pd

In [3]:
# Load the raw air-quality measurements from the CSV file (path is relative to the working directory)
air_quality = pd.read_csv('air_quality.csv')

In [4]:
# Inspect the column names, non-null counts and dtypes of the freshly loaded frame
air_quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95685 entries, 0 to 95684
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date_time  95685 non-null  object 
 1   PM2.5      95685 non-null  float64
 2   PM10       95685 non-null  float64
 3   SO2        95685 non-null  float64
 4   NO2        95685 non-null  float64
 5   CO         95685 non-null  float64
 6   O3         95685 non-null  float64
 7   TEMP       95685 non-null  float64
 8   PRES       95685 non-null  float64
 9   DEWP       95685 non-null  float64
 10  RAIN       95685 non-null  float64
 11  wd         95685 non-null  object 
 12  WSPM       95685 non-null  float64
 13  station    95685 non-null  object 
dtypes: float64(11), object(3)
memory usage: 10.2+ MB


In [5]:
# We only really care about the date_time column here, since this lesson is about working with datetimes.
# Notice that date_time is labeled 'object' in the info() output above. That means it is currently
# stored as strings, which is not what we want.
air_quality.head()

Unnamed: 0,date_time,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
0,2013-03-01 00:00:00,9.0,9.0,3.0,17.0,300.0,89.0,-0.5,1024.5,-21.4,0.0,NNW,5.7,Dongsi
1,2013-03-01 01:00:00,4.0,4.0,3.0,16.0,300.0,88.0,-0.7,1025.1,-22.1,0.0,NW,3.9,Dongsi
2,2013-03-01 05:00:00,4.0,4.0,9.0,25.0,300.0,78.0,-2.4,1027.5,-21.3,0.0,NW,2.4,Dongsi
3,2013-03-01 06:00:00,5.0,5.0,10.0,29.0,400.0,67.0,-2.5,1028.2,-20.4,0.0,NW,2.2,Dongsi
4,2013-03-01 07:00:00,3.0,6.0,12.0,40.0,400.0,52.0,-1.4,1029.5,-20.4,0.0,NNW,3.0,Dongsi


In [6]:
# Because the date_time column is still an object (string) dtype, datetime methods won't work.
# Asking for the year fails with an AttributeError. The error is the point of this cell, so it is
# caught and printed rather than left uncaught: an uncaught exception would stop
# "Restart Kernel -> Run All" before the rest of the notebook executes.
try:
    air_quality['date_time'].dt.year
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: Can only use .dt accessor with datetimelike values

## Use the pandas `to_datetime` function to convert the column to datetime

In [7]:
# Parse the string timestamps and overwrite the column so that the .dt accessor becomes available
air_quality['date_time'] = pd.to_datetime(air_quality['date_time'])

In [8]:
# Re-check the dtypes: date_time should now be reported as datetime64[ns] instead of object
air_quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95685 entries, 0 to 95684
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date_time  95685 non-null  datetime64[ns]
 1   PM2.5      95685 non-null  float64       
 2   PM10       95685 non-null  float64       
 3   SO2        95685 non-null  float64       
 4   NO2        95685 non-null  float64       
 5   CO         95685 non-null  float64       
 6   O3         95685 non-null  float64       
 7   TEMP       95685 non-null  float64       
 8   PRES       95685 non-null  float64       
 9   DEWP       95685 non-null  float64       
 10  RAIN       95685 non-null  float64       
 11  wd         95685 non-null  object        
 12  WSPM       95685 non-null  float64       
 13  station    95685 non-null  object        
dtypes: datetime64[ns](1), float64(11), object(2)
memory usage: 10.2+ MB


## Now let's gather data from the date_time column

In [9]:
# Extract the calendar year from date_time into a new 'year' column
air_quality['year'] = air_quality['date_time'].dt.year

In [10]:
# Extract the month number from date_time into a new 'month' column
air_quality['month'] = air_quality['date_time'].dt.month

In [11]:
# Extract the day of the month from date_time into a new 'day' column
air_quality['day'] = air_quality['date_time'].dt.day

In [12]:
# Extract the hour of the day from date_time into a new 'hour' column
air_quality['hour'] = air_quality['date_time'].dt.hour

In [13]:
# Extract the calendar quarter from date_time into a new 'quarter' column
air_quality['quarter'] = air_quality['date_time'].dt.quarter

In [14]:
# Extract the day-of-week number (Monday = 0 ... Sunday = 6) from date_time into a new column
air_quality['day_of_week_num'] = air_quality['date_time'].dt.dayofweek

In [15]:
# Extract the weekday name from date_time into a new column.
# Note that day_name() is a method call (with parentheses), unlike attribute-style accessors such as .dt.year
air_quality['day_of_week_name'] = air_quality['date_time'].dt.day_name()

In [16]:
# Confirm that the derived calendar columns were added to the frame
air_quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95685 entries, 0 to 95684
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date_time         95685 non-null  datetime64[ns]
 1   PM2.5             95685 non-null  float64       
 2   PM10              95685 non-null  float64       
 3   SO2               95685 non-null  float64       
 4   NO2               95685 non-null  float64       
 5   CO                95685 non-null  float64       
 6   O3                95685 non-null  float64       
 7   TEMP              95685 non-null  float64       
 8   PRES              95685 non-null  float64       
 9   DEWP              95685 non-null  float64       
 10  RAIN              95685 non-null  float64       
 11  wd                95685 non-null  object        
 12  WSPM              95685 non-null  float64       
 13  station           95685 non-null  object        
 14  year              9568

In [17]:
# Show date_time next to the derived calendar columns to eyeball the extraction
air_quality[['date_time','year','month','day','hour','quarter','day_of_week_num','day_of_week_name']]

Unnamed: 0,date_time,year,month,day,hour,quarter,day_of_week_num,day_of_week_name
0,2013-03-01 00:00:00,2013,3,1,0,1,4,Friday
1,2013-03-01 01:00:00,2013,3,1,1,1,4,Friday
2,2013-03-01 05:00:00,2013,3,1,5,1,4,Friday
3,2013-03-01 06:00:00,2013,3,1,6,1,4,Friday
4,2013-03-01 07:00:00,2013,3,1,7,1,4,Friday
...,...,...,...,...,...,...,...,...
95680,2017-02-28 15:00:00,2017,2,28,15,1,1,Tuesday
95681,2017-02-28 16:00:00,2017,2,28,16,1,1,Tuesday
95682,2017-02-28 21:00:00,2017,2,28,21,1,1,Tuesday
95683,2017-02-28 22:00:00,2017,2,28,22,1,1,Tuesday


In [18]:
# Count rows per (number, name) pair to verify that each weekday number maps to exactly one name
air_quality[['day_of_week_num','day_of_week_name']].value_counts()

day_of_week_num  day_of_week_name
6                Sunday              13931
5                Saturday            13812
4                Friday              13782
3                Thursday            13633
0                Monday              13536
1                Tuesday             13523
2                Wednesday           13468
Name: count, dtype: int64

## How to get the difference between two datetimes

#### Example 1: Subtract max()-min()

In [19]:
# Latest timestamp in the dataset
air_quality['date_time'].max()

Timestamp('2017-02-28 23:00:00')

In [20]:
# Earliest timestamp in the dataset
air_quality['date_time'].min()

Timestamp('2013-03-01 00:00:00')

In [21]:
# Subtracting one Timestamp from another yields a Timedelta: the elapsed time between the two datetime values
air_quality['date_time'].max() - air_quality['date_time'].min()

Timedelta('1460 days 23:00:00')

In [22]:
# Keep the overall time span of the dataset (latest minus earliest timestamp) in a variable for reuse
date_time_range = air_quality['date_time'].max() - air_quality['date_time'].min()

In [23]:
# A Timedelta representing exactly one day
pd.Timedelta(days=1)

Timedelta('1 days 00:00:00')

In [24]:
# A 365-day Timedelta used as a rough stand-in for one year (leap days are ignored)
pd.Timedelta(days=365)

Timedelta('365 days 00:00:00')

In [25]:
# Dividing one Timedelta by another gives a plain float: here, the span expressed in days.
# The fractional part (.9583...) is the extra 23 hours in date_time_range (23 / 24 of a day).
date_time_range / pd.Timedelta(days=1)

1460.9583333333333

In [26]:
# Express the span in years by dividing by a 365-day Timedelta.
# This is an approximation because a 365-day "year" ignores leap days.
date_time_range / pd.Timedelta(days=365)

4.002625570776256

In [27]:
# The same calculation done by hand: the span in days divided by 365
1460.9583333333333 / 365

4.002625570776256

#### Example 2: Subtract a certain date from the date_time column

In [28]:
# Build a single reference Timestamp; with no time given it falls at midnight on 1 January 2022
pd.Timestamp('2022/01/01')

Timestamp('2022-01-01 00:00:00')

In [29]:
# Subtracting the whole column from one Timestamp is applied element-wise, giving one Timedelta per row
pd.Timestamp('2022/01/01') - air_quality['date_time']

0       3228 days 00:00:00
1       3227 days 23:00:00
2       3227 days 19:00:00
3       3227 days 18:00:00
4       3227 days 17:00:00
               ...        
95680   1767 days 09:00:00
95681   1767 days 08:00:00
95682   1767 days 03:00:00
95683   1767 days 02:00:00
95684   1767 days 01:00:00
Name: date_time, Length: 95685, dtype: timedelta64[ns]

In [30]:
# Store each row's time remaining until 1 January 2022 in a new timedelta column
air_quality['time_until_2022'] = pd.Timestamp('2022/01/01') - air_quality['date_time']

In [31]:
# Now express that duration as a (fractional) number of days for each row
air_quality['time_until_2022']/pd.Timedelta(days=1)

0        3228.000000
1        3227.958333
2        3227.791667
3        3227.750000
4        3227.708333
            ...     
95680    1767.375000
95681    1767.333333
95682    1767.125000
95683    1767.083333
95684    1767.041667
Name: time_until_2022, Length: 95685, dtype: float64

In [32]:
# Save the duration, measured in days, as a new float column
air_quality['time_until_2022_days'] = air_quality['time_until_2022'] / pd.Timedelta(days=1)

In [33]:
# Save the duration, measured in weeks, as a new float column
air_quality['time_until_2022_weeks'] = air_quality['time_until_2022'] / pd.Timedelta(weeks=1)

In [34]:
# Compare the raw timedelta with its day and week representations side by side
air_quality[['date_time','time_until_2022','time_until_2022_days','time_until_2022_weeks']]

Unnamed: 0,date_time,time_until_2022,time_until_2022_days,time_until_2022_weeks
0,2013-03-01 00:00:00,3228 days 00:00:00,3228.000000,461.142857
1,2013-03-01 01:00:00,3227 days 23:00:00,3227.958333,461.136905
2,2013-03-01 05:00:00,3227 days 19:00:00,3227.791667,461.113095
3,2013-03-01 06:00:00,3227 days 18:00:00,3227.750000,461.107143
4,2013-03-01 07:00:00,3227 days 17:00:00,3227.708333,461.101190
...,...,...,...,...
95680,2017-02-28 15:00:00,1767 days 09:00:00,1767.375000,252.482143
95681,2017-02-28 16:00:00,1767 days 08:00:00,1767.333333,252.476190
95682,2017-02-28 21:00:00,1767 days 03:00:00,1767.125000,252.446429
95683,2017-02-28 22:00:00,1767 days 02:00:00,1767.083333,252.440476


In [35]:
# Sanity check on the first row above: time_until_2022_days / 7 should equal time_until_2022_weeks
3228.0 / 7

461.14285714285717

## Compare datetimes

In [36]:
# Comparing the column against a single Timestamp is element-wise and yields a boolean Series
air_quality['date_time'] < pd.Timestamp('2016/01/01')

0         True
1         True
2         True
3         True
4         True
         ...  
95680    False
95681    False
95682    False
95683    False
95684    False
Name: date_time, Length: 95685, dtype: bool

#### This comparison allows us to analyse the data before and after specific dates

In [37]:
# Flag each row: True when it was recorded before 1 January 2016, False otherwise
air_quality['prior_2016_ind'] = air_quality['date_time'] < pd.Timestamp('2016/01/01')

In [38]:
# Show the timestamp next to its before-2016 indicator to verify the flag
air_quality[['date_time','prior_2016_ind']]

Unnamed: 0,date_time,prior_2016_ind
0,2013-03-01 00:00:00,True
1,2013-03-01 01:00:00,True
2,2013-03-01 05:00:00,True
3,2013-03-01 06:00:00,True
4,2013-03-01 07:00:00,True
...,...,...
95680,2017-02-28 15:00:00,False
95681,2017-02-28 16:00:00,False
95682,2017-02-28 21:00:00,False
95683,2017-02-28 22:00:00,False


## Everything We've Added

In [39]:
# List every column now on the frame: the original ones followed by those added in this lesson
air_quality.columns

Index(['date_time', 'PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3', 'TEMP', 'PRES',
       'DEWP', 'RAIN', 'wd', 'WSPM', 'station', 'year', 'month', 'day', 'hour',
       'quarter', 'day_of_week_num', 'day_of_week_name', 'time_until_2022',
       'time_until_2022_days', 'time_until_2022_weeks', 'prior_2016_ind'],
      dtype='object')

## Pickle This So I Can Use It in the Next Lesson

In [40]:
# Save the DataFrame, with all of the added columns, to aq1.pkl in the working directory
air_quality.to_pickle('aq1.pkl')