# Project 5: Optimizing Evacuation Routes using Real-Time Traffic Information

Song May, Michael Daugherty, Kelly Slatery | US-DSI-10 | 02.21.2020

In [1]:
# Imports
import numpy as np
import pandas as pd

## Prepare Data

In [2]:
# Load the cleaned training tweets from disk and report (rows, columns)
df = pd.read_csv(filepath_or_buffer='./data/train_data/clean_train_data.csv')
df.shape

(168441, 5)

In [3]:
# Preview the first rows of the freshly loaded frame to see which columns came in
df.head()

Unnamed: 0.1,Unnamed: 0,dates&time,user,tweet,category
0,4959,2020-02-06 22:03:12+00:00,DallasPD,DallasPD and dfrincidents are currently on loc...,1
1,5983,2019-08-30 21:16:20+00:00,DallasPD,Monday Sept on LaborDay Jack Evans Police Hd...,1
2,6062,2019-08-14 22:31:39+00:00,DallasPD,PIODPD is at the scene of a possible barricade...,1
3,6175,2019-07-13 22:04:27+00:00,DallasPD,PIODPD is on scene of an Officer Involved Shoo...,1
4,6177,2019-07-13 01:38:19+00:00,DallasPD,Major police incident in downtown Dallas Griff...,1


In [4]:
# Drop the unneeded 'Unnamed: 0' column (its values look like a leftover row
# index from an earlier export -- TODO confirm).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head()

Unnamed: 0,dates&time,user,tweet,category
0,2020-02-06 22:03:12+00:00,DallasPD,DallasPD and dfrincidents are currently on loc...,1
1,2019-08-30 21:16:20+00:00,DallasPD,Monday Sept on LaborDay Jack Evans Police Hd...,1
2,2019-08-14 22:31:39+00:00,DallasPD,PIODPD is at the scene of a possible barricade...,1
3,2019-07-13 22:04:27+00:00,DallasPD,PIODPD is on scene of an Officer Involved Shoo...,1
4,2019-07-13 01:38:19+00:00,DallasPD,Major police incident in downtown Dallas Griff...,1


In [5]:
# Relabel the target column as 'class'.
# NOTE(review): the lookup key 'category ' ends with a space. The displayed
# result shows the rename taking effect, so the raw CSV header presumably has
# that trailing space too -- confirm. rename() silently does nothing when the
# key is absent.
df.rename({'category ': 'class'}, axis='columns', inplace=True)
df.head()

Unnamed: 0,dates&time,user,tweet,class
0,2020-02-06 22:03:12+00:00,DallasPD,DallasPD and dfrincidents are currently on loc...,1
1,2019-08-30 21:16:20+00:00,DallasPD,Monday Sept on LaborDay Jack Evans Police Hd...,1
2,2019-08-14 22:31:39+00:00,DallasPD,PIODPD is at the scene of a possible barricade...,1
3,2019-07-13 22:04:27+00:00,DallasPD,PIODPD is on scene of an Officer Involved Shoo...,1
4,2019-07-13 01:38:19+00:00,DallasPD,Major police incident in downtown Dallas Griff...,1


In [6]:
# Count the missing values in each column
df.isna().sum()

dates&time       0
user             0
tweet         1730
class            0
dtype: int64

In [7]:
# Keep only the rows that actually have tweet text ('tweet' is the sole column
# reporting missing values in the null check)
df = df.loc[df['tweet'].notna()]

## Export Filtered Data

In [8]:
# Look at the original shape: (rows, columns) of df before any date filtering,
# used as the baseline when checking the filtered frame
df.shape

(166711, 4)

In [9]:
# Filter out the dates of Hurricane Harvey (tweets contained in our test data):
# keep a row only when its timestamp falls outside '2017-08-25'..'2017-09-02'.
# NOTE(review): 'dates&time' was read without date parsing, so this is
# presumably a lexicographic string comparison. Under that comparison a value
# such as '2017-09-02 10:00:00+00:00' sorts after '2017-09-02', so tweets
# stamped on Sept 2 itself are retained -- confirm that is the intended cutoff.
df_train = df.loc[~df['dates&time'].between('2017-08-25', '2017-09-02')]

In [10]:
# Look at the new shape (should have fewer rows than df, same number of columns,
# because the date filter only removes rows)
df_train.shape

(165204, 4)

In [11]:
# Write the filtered training set to CSV, leaving the row index out of the file
df_train.to_csv(path_or_buf='./data/final_train_data.csv', index=False)