Time Series Preprocessing

# Introduction

## Read in libraries

In [22]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings                                
#Ignore every warning raised from here on (all categories, all modules).
#NOTE(review): presumably done to keep cell output uncluttered; it also hides
#deprecation and dtype warnings from pandas - confirm this is intended.
warnings.filterwarnings('ignore')

%matplotlib inline


## Update settings for notebook

In [23]:
#Raise the pandas display limits so wide/long frames are shown up to 500 columns and 500 rows
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 500)

#Apply the ggplot style sheet to all matplotlib figures
plt.style.use('ggplot')

## Read in Data

In [24]:
#Folder holding the input csv (02_Intermediate)
#NOTE(review): absolute, machine-specific path - the notebook will not run elsewhere without editing it
path = r'C:\Users\kishe\Documents\Data Science\Projects\Python Projects\In Progress\US-Accidents A Countrywide Traffic Accident Dataset\Data\02_Intermediate'

#Columns that should be parsed as timestamps while reading
dates = ['Start_Time', 'End_Time', 'Weather_Timestamp']

#Load the csv (first csv column becomes the index), then relabel the
#column named after the file as 'ID' (rename leaves other labels untouched)
df = pd.read_csv(
    path + '/2020_0126_Traffic_Raw_Data_Cleaned.csv',
    index_col=0,
    parse_dates=dates,
    low_memory=False,
).rename(columns={'2020_0126_Traffic_Raw_Data_Cleaned.csv': 'ID'})


## Preview data

In [25]:
#Report the (rows, columns) tuple of the loaded frame
print('Data shape: ', df.shape, sep=' ')

#Render the first five rows
display(df.head(n=5))

Data shape:  (2974336, 49)


Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,Street,Side,City,County,State,Zipcode,Timezone,Airport_Code,Weather_Timestamp,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Start_Weekday,Start_Day,Start_Month,Start_Year,End_Weekday,End_Day,End_Month,End_Year
0,A-1,MapQuest,Three,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,0.01,right lane blocked due to accident on i70 east...,I-70 E,R,Dayton,Montgomery,OH,45424,Eastern,KFFO,2016-02-08 05:58:00,36.9,91.0,29.68,10.0,Calm,8.298064,Light Rain,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Monday,8,2,2016,Monday,8,2,2016
1,A-2,MapQuest,Two,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,0.01,accident on brice rd at tussing rd expect delays,Brice Rd,L,Reynoldsburg,Franklin,OH,43068-3402,Eastern,KCMH,2016-02-08 05:51:00,37.9,100.0,29.65,10.0,Calm,8.298064,Light Rain,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,Monday,8,2,2016,Monday,8,2,2016
2,A-3,MapQuest,Two,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.01,accident on oh32 state route 32 westbound at d...,State Route 32,R,Williamsburg,Clermont,OH,45176,Eastern,KI69,2016-02-08 06:56:00,36.0,100.0,29.67,10.0,SW,3.5,Overcast,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,Monday,8,2,2016,Monday,8,2,2016
3,A-4,MapQuest,Three,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.01,accident on i75 southbound at exits 52 52b us3...,I-75 S,R,Dayton,Montgomery,OH,45417,Eastern,KDAY,2016-02-08 07:38:00,35.1,96.0,29.64,9.0,SW,4.6,Mostly Cloudy,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,Monday,8,2,2016,Monday,8,2,2016
4,A-5,MapQuest,Two,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.01,accident on mcewen rd at oh725 miamisburg cent...,Miamisburg Centerville Rd,R,Dayton,Montgomery,OH,45459,Eastern,KMGY,2016-02-08 07:53:00,36.0,89.0,29.65,6.0,SW,3.5,Mostly Cloudy,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,Monday,8,2,2016,Monday,8,2,2016


In [26]:
#List the dtype of every column underneath a heading line
print('Column data types are as follows:\n', df.dtypes, sep=' ')

Column data types are as follows:
 ID                               object
Source                           object
Severity                         object
Start_Time               datetime64[ns]
End_Time                 datetime64[ns]
Start_Lat                       float64
Start_Lng                       float64
Distance(mi)                    float64
Description                      object
Street                           object
Side                             object
City                             object
County                           object
State                            object
Zipcode                          object
Timezone                         object
Airport_Code                     object
Weather_Timestamp        datetime64[ns]
Temperature(F)                  float64
Humidity(%)                     float64
Pressure(in)                    float64
Visibility(mi)                  float64
Wind_Direction                   object
Wind_Speed(mph)                 float64
Weath

# Data Preprocessing

In [27]:
#Show only the first row as a reminder of the available columns
df.head(n=1)

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,Street,Side,City,County,State,Zipcode,Timezone,Airport_Code,Weather_Timestamp,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Start_Weekday,Start_Day,Start_Month,Start_Year,End_Weekday,End_Day,End_Month,End_Year
0,A-1,MapQuest,Three,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,0.01,right lane blocked due to accident on i70 east...,I-70 E,R,Dayton,Montgomery,OH,45424,Eastern,KFFO,2016-02-08 05:58:00,36.9,91.0,29.68,10.0,Calm,8.298064,Light Rain,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Monday,8,2,2016,Monday,8,2,2016


In [28]:
#Keep only the columns used for the time-series analysis and
#index the result by Start_Time
df = (
    df.loc[:, ['ID', 'Severity', 'Start_Time', 'Start_Weekday', 'Start_Day', 'Start_Month', 'Start_Year']]
      .set_index('Start_Time')
)

## Write to CSV

In [29]:
#Output folder for the processed data (03_Processed); this rebinds the `path` used for reading
path = r'C:\Users\kishe\Documents\Data Science\Projects\Python Projects\In Progress\US-Accidents A Countrywide Traffic Accident Dataset\Data\03_Processed'

#Save the reduced frame, index included, as a comma-separated file
df.to_csv(
    path_or_buf=path + '/2020_0128_Processed_Traffic_Data_for_TSA.csv',
    sep=',',
)