In [1]:
# Import useful libraries
import time
import config
import numpy as np
import pandas as pd
import os
import requests
import tweepy
import json

## Gather and Read Data

In [2]:
# Read the twitter archive data provided
wrd_archive = pd.read_csv('./twitter-archive-enhanced.csv')
wrd_archive.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,


In [3]:
# Programmatically download the image predictions
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
file_name = url.split('/')[-1]
response = requests.get(url)

start = time.time()
with open(file_name, 'wb') as f:
    f.write(response.content)
    
print('Process completed in {} seconds'.format(time.time()-start))

Process completed in 0.0030281543731689453 seconds


In [4]:
# Read in the image predictions
img_predictions = pd.read_csv('./image-predictions.tsv', sep='\t')
img_predictions.head(3)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True


In [5]:
# Create an API object to gater twitter data
consumer_key = config.API_KEY
consumer_secret = config.API_KEY_SECRET
access_token = config.ACCESS_TOKEN
access_secret = config.ACCESS_TOKEN_SECRET

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit =True,
                wait_on_rate_limit_notify=True)

In [6]:
# Pull tweet information using the ids in wrd_archive
# Extract the tweet ids from the wrd dataframe
tweet_ids = wrd_archive['tweet_id']
success, failure, counter = (0, 0, 0)
failed_attempts = {}
print('\033[1m'+'COMMENCING JSON EXTRACTION TASK'+'\033[0m'+'\n'+'-'*70)
start_time = time.time()

# Loop over each tweet id and collect the information
with open('tweet_info.json', 'w') as file:
    print('Pulling json data for the first 200 tweets...')
    for tweet_id in tweet_ids:
        if (success % 200 ==0) and (counter > 0):
            print('\033[94m'+'\033[1m'+'Sub-task Complete!'+'\033[0m')
            print('Successful pulls: {} || failed pulls: {} || Pulls pending: {}'.format(success, failure, tweet_ids.size - counter))
            print('\nPulling json data for the next 200 tweets...')
        try:
            tweet_info = api.get_status(tweet_id, tweet_mode='extended')
            json.dump(tweet_info._json, file)
            file.write('\n')
            success+=1
        except Exception as e:
            failed_attempts[tweet_id] = e
            failure+=1
            pass
        finally:
            counter+=1

# Print feedback on execution process  
duration = (time.time() - start_time)/60
failed = len(failed_attempts.keys())
print('\033[1m' + '\033[94m' +'Task Completed!\n'+'\033[0m' + '-'*70)
print('\033[1m'+'DISPLAYING RUNTIME SUMMARY'+'\033[0m')
print('The entire process took: {} minutes'.format(round(duration, 2)))

if (failed > 0):
    print('\033[91m'+'\033[1m'+'Could not pull information for '+ str(failed) + ' tweet ids:'+'\033[0m')
    print(pd.Series(failed_attempts))
else:
    print('\033[94m'+'\033[1m'+'No failed attempts'+'\033[0m')

[1mCOMMENCING JSON EXTRACTION TASK[0m
----------------------------------------------------------------------
Pulling json data for the first 200 tweets...
[91m[1mError occurred while pulling json for tweet id: 888202515573088257[0m
Skipping to the next tweet ids..
[91m[1mError occurred while pulling json for tweet id: 873697596434513921[0m
Skipping to the next tweet ids..
[91m[1mError occurred while pulling json for tweet id: 872668790621863937[0m
Skipping to the next tweet ids..
[91m[1mError occurred while pulling json for tweet id: 872261713294495745[0m
Skipping to the next tweet ids..
[91m[1mError occurred while pulling json for tweet id: 869988702071779329[0m
Skipping to the next tweet ids..
[91m[1mError occurred while pulling json for tweet id: 866816280283807744[0m
Skipping to the next tweet ids..
[91m[1mError occurred while pulling json for tweet id: 861769973181624320[0m
Skipping to the next tweet ids..
[91m[1mError occurred while pulling json for tweet

In [7]:
# Extract the information we want from the json file
json_tweet_details = []

with open('tweet_info.json', 'r', encoding='UTF-8') as file:
    for line in file:
        json_text = json.loads(line)
        # Extract the tweet_id, likes and retweet count
        tweet_id = json_text['id_str']
        retweets = json_text['retweet_count']
        likes = json_text['favorite_count']
        # Extract the hashtag from the json file
        hashtags_info = json_text['entities']['hashtags']
        if len(hashtags_info) !=0:
            hashtags = ['#'+item['text'] for item in hashtags_info]
        else:
            hashtags = 'None'
        # Assign these values into our list
        json_tweet_details.append({
            'tweet_id': tweet_id,
            'hashtag': hashtags,
            'retweets': retweets,
            'likes': likes}
        )
    
    json_tweet_info = pd.DataFrame(json_tweet_details)

In [8]:
json_tweet_info.head(3)

Unnamed: 0,tweet_id,hashtag,retweets,likes
0,892420643555336193,,7024,33866
1,892177421306343426,,5305,29364
2,891815181378084864,,3488,22089


## Assessing Data