# Project 5: Optimizing Evacuation Routes using Real-Time Traffic Information

Song May, Michael Daugherty, Kelly Slatery | US-DSI-10 | 02.21.2020

## Problem Statement

## Executive Summary

## Data Dictionary

## Contents

# Data Collection

In [1]:
# Imports
import os
import time

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import GetOldTweets3 as got
import requests

## Attempt 1

In [2]:
# Credits (using GetOldTweets3 package): Martin Beck, 01.12.2020
# https://towardsdatascience.com/how-to-scrape-tweets-from-twitter-59287e20f0f1

# Parameters for the first Twitter pull: account, date window, and tweet cap
username = 'TxDOT'
since_date, until_date = '2016-01-01', '2020-02-10'
max_tweets = 18000

# Build the first query object, one setter per line
tweetCriteria = (
    got.manager.TweetCriteria()
    .setUsername(username)
    .setSince(since_date)
    .setUntil(until_date)
    .setMaxTweets(max_tweets)
)

# Run the query to create the first list of all tweets
tweets = got.manager.TweetManager.getTweets(tweetCriteria)

# Keep only the date, username, text and hashtags of each tweet
user_tweets = [
    [item.date, item.username, item.text, item.hashtags]
    for item in tweets
]

# This dataframe is the base collection that later scrapes are appended to
all_tweets = pd.DataFrame(user_tweets)

In [3]:
# Preview the first 5 rows scraped from 'TxDOT'
pd.DataFrame(user_tweets).head(5)

Unnamed: 0,0,1,2,3
0,2020-02-07 22:20:09+00:00,TxDOT,"When you drink and drive in Texas, our officer...",#PlanWhileYouCan #EndTheStreakTX
1,2020-02-07 18:05:12+00:00,TxDOT,Want to help traffic instead of sitting in it?...,#TxDOTCareers
2,2020-02-07 01:10:04+00:00,TxDOT,Are you skilled in automotive work? We'd love ...,#TxDOTCareers
3,2020-02-07 00:15:25+00:00,TxDOT,"From a shout to a whisper, we can’t emphasize ...",#EndTheStreakTX #ClickItOrTicket
4,2020-02-06 22:14:22+00:00,TxDOT,We love seeing all the nice things y’all have ...,#txwx


In [4]:
# Preview the last 5 rows scraped from 'TxDOT'
pd.DataFrame(user_tweets).tail(5)

Unnamed: 0,0,1,2,3
4080,2016-01-04 00:08:24+00:00,TxDOT,I-35 Alert: 10 vehicle crash closed down all m...,#my35
4081,2016-01-01 16:20:21+00:00,TxDOT,Make a #resolution to start 2016 right. #PlanW...,#resolution #PlanWhileYouCan #FindASoberRide
4082,2016-01-01 13:30:35+00:00,TxDOT,"TxDOT offices will be closed today, Jan. 1 in ...",#NewYear
4083,2016-01-01 05:55:08+00:00,TxDOT,#HappyNewYear from #TxDOT! We hope everyone ha...,#HappyNewYear #TxDOT #NYE
4084,2016-01-01 00:10:13+00:00,TxDOT,Don't start the #NewYear w/ a DWI. #PlanWhileY...,#NewYear #PlanWhileYouCan #FindASoberRide #Hap...


In [5]:
# Twitter usernames still to scrape; the first account ('TxDOT') is excluded
# because it was already pulled above.
# From: https://www.txdot.gov/driver/weather/txdot-twitter-feeds.html
twitter_users = [
    'TxDOTAbilene',
    'TxDOTAmarillo',
    'TxDOTAtlanta',
    'TxDOTAustin',
    'TxDOTBeaumont',
    'TxDOTBWD',
    'TxDOTBryan',
    'TxDOTChildress',
    'TxDOT_CRP',
    'TxDOTDallas',
    'TxDOTELP',
    'TxDOTFortWorth',
    'GalvestonFerry',
    'TxDOTHouston',
    'HoustonTranstar',
    'I35Travel',
    'TxDOTLaredo',
    'TxDOTLubbock',
    'TxDOTLufkin',
    'TxDOTOdessa',
    'TxDOTParis',
    'TxDOTPharr',
    'PortA_Ferry',
    'TxDOTSanAngelo',
    'TxDOTSanAntonio',
    'TexasHighways',
    'TxDOTTyler',
    'TxDOTWacoPIO',
    'TXDOTWF',
    'TxDOTYoakum',
    'ImproveMopac',
    'ManorExpressway',
    'DFWConnector',
    'DriveMidtown',
    'Drive360South',
    'LBJexpress',
    'NTExpress',
    'my290Houston',
]

In [6]:
# Number of usernames queued for scraping
len(twitter_users)

38

In [2]:
# Define a function to pull remaining users' tweets from the same time period
def update_tweets(base_df, user, since_date='2016-01-01', until_date='2020-02-10', max_tweets=18000):
    """Scrape one user's tweets, save them to disk, and append them to ``base_df``.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Tweets collected so far. It is not modified; a new frame is returned.
    user : str
        Twitter username to scrape.
    since_date, until_date : str
        Date bounds handed to the query's ``setSince`` / ``setUntil``.
        The defaults reproduce the window used for the first 'TxDOT' pull.
    max_tweets : int
        Value handed to the query's ``setMaxTweets``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``base_df`` followed by the newly scraped rows, with a
        fresh 0..n-1 index.

    Notes
    -----
    The user's tweets are also written to ``./data/data_by_user/{user}_tweets``
    (CSV content, no file extension). The directory is created if missing so
    that a finished scrape is not lost to a failed export.
    """

    # Build the query object for this user
    tweetCriteria = (
        got.manager.TweetCriteria()
        .setUsername(user)
        .setSince(since_date)
        .setUntil(until_date)
        .setMaxTweets(max_tweets)
    )

    # Run the query to get the list of all matching tweets
    tweets = got.manager.TweetManager.getTweets(tweetCriteria)

    # Create filtered list of tweet data
    user_tweets = [[tweet.date, tweet.username, tweet.text, tweet.hashtags] for tweet in tweets]

    # Transform list into this user's dataframe
    tweets_df = pd.DataFrame(user_tweets)

    # Export the new dataframe individually, creating the target directory first
    # because to_csv does not create missing parent directories
    export_dir = './data/data_by_user'
    os.makedirs(export_dir, exist_ok=True)
    tweets_df.to_csv(f'{export_dir}/{user}_tweets', index=False)

    # Concatenate new tweets with old tweets
    updated = pd.concat([base_df, tweets_df], axis=0, ignore_index=True, sort=True)

    # Print progress update
    print(f'Done scraping. We added {len(user_tweets)} new tweets. Current df shape: {updated.shape}')

    # Return updated dataframe of all tweets
    return updated

In [8]:
# Get tweets from the above 38 Twitter users and add them to the all_tweets dataframe

# Minimum spacing between the starts of consecutive scrapes, according to Twitter API limits
wait_seconds = 15 * 60

# Loop through all Twitter users, counting them from 1 as they are scraped
for i, user in enumerate(twitter_users, start=1):

    # Get the initial time
    t0 = time.time()

    # Print progress update
    print(f'Twitter #{i} now scraping...')

    # Get the tweets and update the dataframe
    all_tweets = update_tweets(all_tweets, user)

    # Report how long this scrape took
    elapsed = time.time() - t0
    print(f'Time elapsed: {elapsed}')

    # Wait for the remainder of the 15 minutes. time.sleep raises ValueError on a
    # negative argument, so clamp at zero in case a scrape outlasted the window.
    time.sleep(max(0, wait_seconds - elapsed))
    

Twitter #1 now scraping...
Done scraping. We added 1196 new tweets. Current df shape: (5281, 4)

Time elapsed: 30.133867025375366
Twitter #2 now scraping...
Done scraping. We added 1757 new tweets. Current df shape: (7038, 4)

Time elapsed: 46.58091187477112
Twitter #3 now scraping...
Done scraping. We added 356 new tweets. Current df shape: (7394, 4)

Time elapsed: 10.286750793457031
Twitter #4 now scraping...
Done scraping. We added 5420 new tweets. Current df shape: (12814, 4)

Time elapsed: 144.51120901107788
Twitter #5 now scraping...
Done scraping. We added 2231 new tweets. Current df shape: (15045, 4)

Time elapsed: 52.47301006317139
Twitter #6 now scraping...
Done scraping. We added 586 new tweets. Current df shape: (15631, 4)

Time elapsed: 16.99058508872986
Twitter #7 now scraping...
Done scraping. We added 762 new tweets. Current df shape: (16393, 4)

Time elapsed: 19.221446752548218
Twitter #8 now scraping...
Done scraping. We added 1761 new tweets. Current df shape: (18154

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## Attempt 2

In [6]:
# Parameters for the first Twitter pull of the second attempt, skipping 'HoustonTranstar'
username = 'I35Travel'
since_date, until_date = '2016-01-01', '2020-02-10'
max_tweets = 18000

# Build the query object, one setter per line
tweetCriteria = (
    got.manager.TweetCriteria()
    .setUsername(username)
    .setSince(since_date)
    .setUntil(until_date)
    .setMaxTweets(max_tweets)
)

# Run the query to create the list of all tweets
tweets = got.manager.TweetManager.getTweets(tweetCriteria)

# Keep only the date, username, text and hashtags of each tweet
user_tweets = [
    [item.date, item.username, item.text, item.hashtags]
    for item in tweets
]

# This dataframe is the base collection for the second attempt
all_tweets2 = pd.DataFrame(user_tweets)

An error occured during an HTTP request: HTTP Error 429: Too Many Requests
Try to open in browser: https://twitter.com/search?q=%20from%3Ai35travel%20since%3A2016-01-01%20until%3A2020-02-10&src=typd


SystemExit: 

In [None]:
# Preview the first 5 rows scraped from 'I35Travel'
pd.DataFrame(user_tweets).head(5)

In [None]:
# Preview the last 5 rows scraped from 'I35Travel'
pd.DataFrame(user_tweets).tail(5)

In [11]:
# Continue scraping: get tweets from the remaining 22 Twitter users (everything after
# 'I35Travel' in twitter_users) and add them to the all_tweets2 dataframe.
# 'HoustonTranstar' is skipped; 'I35Travel' was pulled in the cell above.

# Position in twitter_users where this pass resumes
start_index = 16

# Minimum spacing between the starts of consecutive scrapes, according to Twitter API limits
wait_seconds = 15 * 60

# Loop through all remaining Twitter users, numbering each by its position in the
# full twitter_users list so progress messages line up with the first attempt
for i, user in enumerate(twitter_users[start_index:], start=start_index + 1):

    # Get the initial time
    t0 = time.time()

    # Print progress update
    print(f'Twitter #{i} now scraping...')

    # Get the tweets and update the dataframe
    all_tweets2 = update_tweets(all_tweets2, user)

    # Report how long this scrape took
    elapsed = time.time() - t0
    print(f'Done. Time elapsed: {elapsed}')
    print()

    # Wait for the remainder of the 15 minutes. time.sleep raises ValueError on a
    # negative argument, so clamp at zero in case a scrape outlasted the window.
    time.sleep(max(0, wait_seconds - elapsed))
    

Twitter #16 now scraping...
An error occured during an HTTP request: HTTP Error 429: Too Many Requests
Try to open in browser: https://twitter.com/search?q=%20from%3Ai35travel%20since%3A2016-01-01%20until%3A2020-02-10&src=typd


SystemExit: 

In [None]:
# Preview the first 5 collected tweets
# NOTE(review): this shows all_tweets (Attempt 1), not all_tweets2 — confirm which is intended
all_tweets.head(5)

In [None]:
# Preview the last 5 collected tweets
# NOTE(review): this shows all_tweets (Attempt 1), not all_tweets2 — confirm which is intended
all_tweets.tail(5)

# Export data

In [None]:
# Write the collected tweets to a single CSV, leaving out the row index
# NOTE(review): only all_tweets is exported; all_tweets2 from Attempt 2 is not included — confirm
all_tweets.to_csv(path_or_buf='./data/DOT_tweets.csv', index=False)