# Project 5: Optimizing Evacuation Routes using Real-Time Traffic Information

Kelly Slatery | US-DSI-10 | 02.21.2020

# Data Collection

In [1]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import GetOldTweets3 as got
import time
import requests

## Attempt 1

In [17]:
# Credits (using GetOldTweets3 package): Martin Beck, 01.12.2020
# https://towardsdatascience.com/how-to-scrape-tweets-from-twitter-59287e20f0f1

# Set up variables for first Twitter pull
username = 'TxDOT'
since_date = '2016-01-01'
until_date = '2020-02-10'
max_tweets = 10000

# Create first query object
tweetCriteria = got.manager.TweetCriteria().setUsername(username).setSince(since_date).setUntil(until_date).setMaxTweets(max_tweets)

# Greate first list of all tweets
tweets = got.manager.TweetManager.getTweets(tweetCriteria)

# Create filtered list of tweet data
user_tweets = [[tweet.date, tweet.username, tweet.text, tweet.hashtags] for tweet in tweets]

# Transform list into the base tweet collection dataframe
all_tweets = pd.DataFrame(user_tweets)

# Export to dataframe
all_tweets.to_csv(f'./data/train_data/train_data_by_user/TxDOT_tweets', index=False)

In [18]:
# Look at first 5 tweets from 'TxDOT'
all_tweets.head()

Unnamed: 0,0,1,2,3
0,2020-02-07 22:20:09+00:00,TxDOT,"When you drink and drive in Texas, our officer...",#PlanWhileYouCan #EndTheStreakTX
1,2020-02-07 18:05:12+00:00,TxDOT,Want to help traffic instead of sitting in it?...,#TxDOTCareers
2,2020-02-07 01:10:04+00:00,TxDOT,Are you skilled in automotive work? We'd love ...,#TxDOTCareers
3,2020-02-07 00:15:25+00:00,TxDOT,"From a shout to a whisper, we can’t emphasize ...",#EndTheStreakTX #ClickItOrTicket
4,2020-02-06 22:14:22+00:00,TxDOT,We love seeing all the nice things y’all have ...,#txwx


In [19]:
# Look at last 5 tweets from 'TxDOT'
all_tweets.tail()

Unnamed: 0,0,1,2,3
4080,2016-01-04 00:08:24+00:00,TxDOT,I-35 Alert: 10 vehicle crash closed down all m...,#my35
4081,2016-01-01 16:20:21+00:00,TxDOT,Make a #resolution to start 2016 right. #PlanW...,#resolution #PlanWhileYouCan #FindASoberRide
4082,2016-01-01 13:30:35+00:00,TxDOT,"TxDOT offices will be closed today, Jan. 1 in ...",#NewYear
4083,2016-01-01 05:55:08+00:00,TxDOT,#HappyNewYear from #TxDOT! We hope everyone ha...,#HappyNewYear #TxDOT #NYE
4084,2016-01-01 00:10:13+00:00,TxDOT,Don't start the #NewYear w/ a DWI. #PlanWhileY...,#NewYear #PlanWhileYouCan #FindASoberRide #Hap...


In [2]:
# Define list of Twitter usernames to scrape, exclusing first Twitter user ('TxDOT')
# From: https://www.txdot.gov/driver/weather/txdot-twitter-feeds.html
twitter_users = ['TxDOTAbilene', 'TxDOTAmarillo', 'TxDOTAtlanta', 'TxDOTAustin', 
                 'TxDOTBeaumont', 'TxDOTBWD', 'TxDOTBryan', 'TxDOTChildress', 'TxDOT_CRP', 
                 'TxDOTDallas', 'TxDOTELP', 'TxDOTFortWorth', 'GalvestonFerry', 'TxDOTHouston', 
                 'HoustonTranstar', 'I35Travel', 'TxDOTLaredo', 'TxDOTLubbock', 'TxDOTLufkin', 
                 'TxDOTOdessa', 'TxDOTParis', 'TxDOTPharr', 'PortA_Ferry', 'TxDOTSanAngelo', 
                 'TxDOTSanAntonio', 'TexasHighways', 'TxDOTTyler', 'TxDOTWacoPIO', 'TXDOTWF', 
                 'TxDOTYoakum', 'ImproveMopac', 'ManorExpressway', 'DFWConnector', 'DriveMidtown', 
                 'Drive360South', 'LBJexpress', 'NTExpress', 'my290Houston']

In [3]:
# How many Twitters are we scraping?
len(twitter_users)

38

In [4]:
# Define a function to pull remaining users' tweets from the same time period
def update_tweets(base_df, user):
    
    # Set up variables for first Twitter pull
    username = user
    since_date = '2016-01-01'
    until_date = '2020-02-10'
    max_tweets = 10000

    # Create first query object
    tweetCriteria = got.manager.TweetCriteria().setUsername(username).setSince(since_date).setUntil(until_date).setMaxTweets(max_tweets)

    # Greate first list of all tweets
    tweets = got.manager.TweetManager.getTweets(tweetCriteria)

    # Create filtered list of tweet data
    user_tweets = [[tweet.date, tweet.username, tweet.text, tweet.hashtags] for tweet in tweets]

    # Transform list into the base tweet collection dataframe
    tweets_df = pd.DataFrame(user_tweets)
    
    # Concatenate new tweets with old tweets
    updated = pd.concat([base_df, tweets_df], axis=0, ignore_index=True, sort=True)
    
    # Export the new dataframe individually
    tweets_df.to_csv(f'./data/train_data/train_data_by_user/{user}_tweets', index=False)
    
    # Print progress update
    print(f'Done scraping. We added {len(user_tweets)} new tweets. Current df shape: {updated.shape}')
    
    # Return updated dataframe of all tweets
    return updated

In [8]:
# Get tweets from next 14 of the total 39 twitters users and add to the all_tweets dataframe

# Set up count
i = 0

# Loop through all twitter users to scrape and add to the all_tweets dataframe, according to Twitter API limits
for user in twitter_users:

    # Count twitters as they are scraped and get the initial time
    i += 1
    t0 = time.time()
    
    # Print progress update
    print(f'Twitter #{i} now scraping...')
    
    # Get the tweets and update the dataframe
    all_tweets = update_tweets(all_tweets, user)
    
    # Wait the the remainder of 15 minutes
    print(f'Time elapsed: {time.time() - t0}')
    time.sleep((15*60) - (time.time() - t0))
    

Twitter #1 now scraping...
Done scraping. We added 1196 new tweets. Current df shape: (5281, 4)

Time elapsed: 30.133867025375366
Twitter #2 now scraping...
Done scraping. We added 1757 new tweets. Current df shape: (7038, 4)

Time elapsed: 46.58091187477112
Twitter #3 now scraping...
Done scraping. We added 356 new tweets. Current df shape: (7394, 4)

Time elapsed: 10.286750793457031
Twitter #4 now scraping...
Done scraping. We added 5420 new tweets. Current df shape: (12814, 4)

Time elapsed: 144.51120901107788
Twitter #5 now scraping...
Done scraping. We added 2231 new tweets. Current df shape: (15045, 4)

Time elapsed: 52.47301006317139
Twitter #6 now scraping...
Done scraping. We added 586 new tweets. Current df shape: (15631, 4)

Time elapsed: 16.99058508872986
Twitter #7 now scraping...
Done scraping. We added 762 new tweets. Current df shape: (16393, 4)

Time elapsed: 19.221446752548218
Twitter #8 now scraping...
Done scraping. We added 1761 new tweets. Current df shape: (18154

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [52]:
# Export first round of data as one dataframe
all_tweets.to_csv('./data/train_data/DOT1_tweets.csv', index=False)

## Attempt 2

In [5]:
# Update list of Twitters still needed to be scraped
remaining_twitter_users = twitter_users[14:]
remaining_twitter_users[:3]

['HoustonTranstar', 'I35Travel', 'TxDOTLaredo']

In [6]:
# Set up variables for first Twitter pull of second attempt
username = 'HoustonTranstar'
since_date = '2016-01-01'
until_date = '2020-02-10'
max_tweets = 10000

# Create first query object
tweetCriteria = got.manager.TweetCriteria().setUsername(username).setSince(since_date).setUntil(until_date).setMaxTweets(max_tweets)

# Greate first list of all tweets
tweets = got.manager.TweetManager.getTweets(tweetCriteria)

# Create filtered list of tweet data
user_tweets = [[tweet.date, tweet.username, tweet.text, tweet.hashtags] for tweet in tweets]

# Transform list into the base tweet collection dataframe
all_tweets2 = pd.DataFrame(user_tweets)

In [7]:
# Look at first 5 tweets from 'HoustonTranstar'
all_tweets2.head()

Unnamed: 0,0,1,2,3
0,2020-02-07 22:17:07+00:00,houstontranstar,http://traffic.houstontranstar.org/layers/,
1,2020-02-07 17:48:31+00:00,houstontranstar,REMINDER: All mainlanes of I-610 West Loop (NB...,
2,2020-02-07 15:53:52+00:00,houstontranstar,http://traffic.houstontranstar.org/layers/,
3,2020-02-07 15:38:03+00:00,houstontranstar,http://traffic.houstontranstar.org/layers/,
4,2020-02-06 16:04:45+00:00,houstontranstar,More information related to this closure can b...,#HoustonTranStar


In [8]:
# Look at last 5 tweets from 'HoustonTranstar'
all_tweets2.tail()

Unnamed: 0,0,1,2,3
9995,2018-07-30 15:36:05+00:00,houstontranstar,IH-10 KATY Eastbound At PARK TEN - Heavy Truck...,
9996,2018-07-30 15:21:05+00:00,houstontranstar,IH-69 Eastex Freeway Southbound At FRANKLIN ST...,
9997,2018-07-30 15:09:05+00:00,houstontranstar,IH-69 Eastex Freeway Southbound At FRANKLIN ST...,
9998,2018-07-30 15:00:05+00:00,houstontranstar,NORTH SAM HOUSTON TOLLWAY Eastbound At NORTH S...,
9999,2018-07-30 14:54:05+00:00,houstontranstar,US-290 NORTHWEST Westbound After TELGE RD - St...,


In [11]:
# Continue scraping: Get tweets from next 9 of remaining 23 twitters users and add to the all_tweets dataframe

# Set up count
i = 0

# Loop through all remaining twitter users to scrape and add to the all_tweets dataframe, 
# according to Twitter API limits
for user in remaining_twitter_users[1:]:

    # Count twitters as they are scraped and get the initial time
    i += 1
    t0 = time.time()
    
    # Print progress update
    print(f'Twitter #{i} now scraping...')
    
    # Get the tweets and update the dataframe
    all_tweets2 = update_tweets(all_tweets2, user)
    
    # Wait the the remainder of 15 minutes
    print(f'Done. Time elapsed: {time.time() - t0}')
    print()
    time.sleep(15*60)
    

Twitter #1 now scraping...
Done scraping. We added 10000 new tweets. Current df shape: (20000, 4)
Done. Time elapsed: 247.74167275428772

Twitter #2 now scraping...
Done scraping. We added 2740 new tweets. Current df shape: (22740, 4)
Done. Time elapsed: 76.26691198348999

Twitter #3 now scraping...
Done scraping. We added 1305 new tweets. Current df shape: (24045, 4)
Done. Time elapsed: 38.40343976020813

Twitter #4 now scraping...
Done scraping. We added 1996 new tweets. Current df shape: (26041, 4)
Done. Time elapsed: 51.653087854385376

Twitter #5 now scraping...
Done scraping. We added 1904 new tweets. Current df shape: (27945, 4)
Done. Time elapsed: 50.5319139957428

Twitter #6 now scraping...
Done scraping. We added 739 new tweets. Current df shape: (28684, 4)
Done. Time elapsed: 23.571969032287598

Twitter #7 now scraping...
Done scraping. We added 1697 new tweets. Current df shape: (30381, 4)
Done. Time elapsed: 48.53655004501343

Twitter #8 now scraping...
Done scraping. We a

KeyboardInterrupt: 

In [12]:
# Export second round of data as one dataframe
all_tweets2.to_csv('./data/train_data/DOT2_tweets.csv', index=False)

# Combine and export data

In [2]:
# Import first round of data
tweets1 = pd.read_csv('./data/DOT1_tweets.csv')

In [3]:
# Import second round of data
tweets2 = pd.read_csv('./data/DOT2_tweets.csv')

In [4]:
# Combine all tweets from first 25 of 38 twitters
all_tweets_final = pd.concat([tweets1, tweets2], axis=0, ignore_index=True, sort=True)
all_tweets_final.shape

(80837, 8)

In [7]:
# Export full dataset of all tweets from first 25 of 38 twitters
all_tweets_final.to_csv('./data/train_data/DOT_full_tweets.csv', index=False)