In [1]:
# Import useful libraries
import time
import config
import numpy as np
import pandas as pd
import os
import requests
import tweepy
import json

## Gather and Read Data
---

In [2]:
# Read the twitter archive data provided
wrd_archive = pd.read_csv('./twitter-archive-enhanced.csv')
wrd_archive.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,


In [3]:
# Programmatically download the image predictions
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
file_name = url.split('/')[-1]
response = requests.get(url)

start = time.time()
with open(file_name, 'wb') as f:
    f.write(response.content)
    
print('Process completed in {} seconds'.format(time.time()-start))

Process completed in 0.0015802383422851562 seconds


In [4]:
# Read in the image predictions
img_predictions = pd.read_csv('./image-predictions.tsv', sep='\t')
img_predictions.head(3)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True


In [5]:
# Create an API object to gater twitter data
consumer_key = config.API_KEY
consumer_secret = config.API_KEY_SECRET
access_token = config.ACCESS_TOKEN
access_secret = config.ACCESS_TOKEN_SECRET

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit =True,
                wait_on_rate_limit_notify=True)

In [6]:
if False:
# Pull tweet information using the ids in wrd_archive
# Extract the tweet ids from the wrd dataframe
    tweet_ids = wrd_archive['tweet_id']
    success, failure, counter = (0, 0, 0)
    failed_attempts = {}
    print('\033[1m'+'COMMENCING JSON EXTRACTION TASK'+'\033[0m'+'\n'+'-'*70)
    start_time = time.time()

    # Loop over each tweet id and collect the information
    with open('tweet_info.json', 'w') as file:
        print('Pulling json data for the first 200 tweets...')
        for tweet_id in tweet_ids:
            if (success % 200 ==0) and (counter > 0):
                print('\033[94m'+'\033[1m'+'Sub-task Complete!'+'\033[0m')
                print('Successful pulls: {} || failed pulls: {} || Pulls pending: {}'.format(success, failure, tweet_ids.size - counter))
                print('\nPulling json data for the next 200 tweets...')
            try:
                tweet_info = api.get_status(tweet_id, tweet_mode='extended')
                json.dump(tweet_info._json, file)
                file.write('\n')
                success+=1
            except Exception as e:
                failed_attempts[tweet_id] = e
                failure+=1
                pass
            finally:
                counter+=1

    # Print feedback on execution process  
    duration = (time.time() - start_time)/60
    failed = len(failed_attempts.keys())
    print('\033[1m' + '\033[94m' +'Task Completed!\n'+'\033[0m' + '-'*70)
    print('\033[1m'+'DISPLAYING RUNTIME SUMMARY'+'\033[0m')
    print('The entire process took: {} minutes'.format(round(duration, 2)))

    if (failed > 0):
        print('\033[91m'+'\033[1m'+'Could not pull information for '+ str(failed) + ' tweet ids:'+'\033[0m')
        print(pd.Series(failed_attempts))
    else:
        print('\033[94m'+'\033[1m'+'No failed attempts'+'\033[0m')

In [7]:
# Extract the information we want from the json file
json_tweet_details = []

with open('tweet_info.json', 'r', encoding='UTF-8') as file:
    for line in file:
        json_text = json.loads(line)
        # Extract the tweet_id, likes and retweet count
        tweet_id = json_text['id_str']
        retweets = json_text['retweet_count']
        likes = json_text['favorite_count']
        # Extract the hashtag from the json file
        hashtags_info = json_text['entities']['hashtags']
        if len(hashtags_info) !=0:
            hashtags = ['#'+item['text'] for item in hashtags_info]
        else:
            hashtags = 'None'
        # Assign these values into our list
        json_tweet_details.append({
            'tweet_id': tweet_id,
            'hashtag': hashtags,
            'retweets': retweets,
            'likes': likes}
        )
    
    json_tweet_info = pd.DataFrame(json_tweet_details)

In [8]:
json_tweet_info.head(3)

Unnamed: 0,tweet_id,hashtag,retweets,likes
0,892420643555336193,,7024,33866
1,892177421306343426,,5305,29364
2,891815181378084864,,3488,22089


## Assessing Data
---
### A. Visual Assessment
**1. Examining a sample of 50 records from the `wrd_archive` dataframe in Jupyter notebook, including additional visual assessments in google sheets:**

In [9]:
wrd_archive.sample(50)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1787,677565715327688705,,,2015-12-17 19:07:09 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Contortionist pup here. Inside pentagram. Clea...,,,,https://twitter.com/dog_rates/status/677565715...,6,10,,,,,
2312,666776908487630848,,,2015-11-18 00:36:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Josep. He is a Rye Manganese mix. Can ...,,,,https://twitter.com/dog_rates/status/666776908...,5,10,Josep,,,,
1295,707983188426153984,7.079801e+17,2319108000.0,2016-03-10 17:35:20 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@serial @MrRoles OH MY GOD I listened to all o...,,,,,12,10,,,,,
764,777953400541634568,,,2016-09-19 19:31:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: Meet Gerald. He's a fairly exot...,7.681934e+17,4196984000.0,2016-08-23 21:09:14 +0000,https://twitter.com/dog_rates/status/768193404...,8,10,Gerald,doggo,,,
185,856330835276025856,,,2017-04-24 02:15:55 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @Jenna_Marbles: @dog_rates Thanks for ratin...,8.563302e+17,66699010.0,2017-04-24 02:13:14 +0000,,14,10,,,,,
1507,691675652215414786,,,2016-01-25 17:35:00 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Richie and Plip. They are the best of ...,,,,https://twitter.com/dog_rates/status/691675652...,10,10,Richie,,,,
1695,681261549936340994,,,2015-12-27 23:53:05 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to Panda. He's a Quackadilly Shooste...,,,,https://twitter.com/dog_rates/status/681261549...,9,10,Panda,,,,
1969,673317986296586240,,,2015-12-06 01:48:12 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Take a moment and appreciate how these two dog...,,,,https://twitter.com/dog_rates/status/673317986...,10,10,,,,,
1739,679511351870550016,,,2015-12-23 03:58:25 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to William. He makes fun of others b...,,,,https://twitter.com/dog_rates/status/679511351...,7,10,William,,,,
2259,667550904950915073,,,2015-11-20 03:51:52 +0000,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @dogratingrating: Exceptional talent. Origi...,6.675487e+17,4296832000.0,2015-11-20 03:43:06 +0000,https://twitter.com/dogratingrating/status/667...,12,10,,,,,


**Notes:**
> **Quality Issues**
>- Some records appear to be replies or retweets to previously created tweets; some may contain ratings, but they are not the original tweets. This information can be observed in the `in_reply_to_status_id`,	`in_reply_to_user_id`, `retweeted_status_id`, `retweeted_status_user_id` and `retweeted_status_timestamp` columns.
>- Unexpected ratings in the `rating_numerator` and `rating_denominator` columns. Examples are rating numerators as high as `666` and denominators as low as `0`.
>- Unusual dog names such as `a`, `an` and `not` in the `name` column.

> **Tidiness Issues**
>- The various stages of dog life: `doggo`, `pupper`, `puppo`, and `floofer` should be contained in one column.
>- Long and unneccessary links in the `source` column. All we need is the type of device users are tweeting from.
<br>

**2. Examining a sample of 50 records from the `img_predictions` dataframe in Jupyter notebook, including additional visual assessments in google sheets:**

In [10]:
img_predictions.sample(50)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
2038,884876753390489601,https://pbs.twimg.com/media/DEe2tZXXkAAwyX3.jpg,1,chow,0.822103,True,Norwich_terrier,0.106075,True,Norfolk_terrier,0.037348,True
1047,713177543487135744,https://pbs.twimg.com/media/CeW3MWMWQAEOMbq.jpg,1,whippet,0.734244,True,basenji,0.025948,True,Great_Dane,0.025874,True
1941,861005113778896900,https://pbs.twimg.com/media/C_LnlF5VoAEsL1K.jpg,1,German_shepherd,0.507951,True,Pembroke,0.136113,True,muzzle,0.075764,False
1115,724983749226668032,https://pbs.twimg.com/media/Cg-o3w0WgAANXdv.jpg,1,golden_retriever,0.67575,True,Great_Pyrenees,0.095168,True,cocker_spaniel,0.076043,True
1892,849336543269576704,https://pbs.twimg.com/media/C8lzFC4XcAAQxB4.jpg,1,patio,0.521788,False,prison,0.149544,False,restaurant,0.027153,False
1987,872620804844003328,https://pbs.twimg.com/media/DBwr_hzXkAEnZBW.jpg,1,cocker_spaniel,0.513191,True,Sussex_spaniel,0.159088,True,standard_poodle,0.149509,True
100,667782464991965184,https://pbs.twimg.com/media/CURwm3cUkAARcO6.jpg,1,lorikeet,0.466149,False,hummingbird,0.083011,False,African_grey,0.054247,False
573,678424312106393600,https://pbs.twimg.com/media/CWo_T8gW4AAgJNo.jpg,1,Maltese_dog,0.759945,True,toy_poodle,0.101194,True,Shih-Tzu,0.056037,True
1765,826598365270007810,https://pbs.twimg.com/media/C3iq0EEXUAAdBYC.jpg,1,French_bulldog,0.628119,True,Siamese_cat,0.117397,False,cougar,0.082765,False
657,682303737705140231,https://pbs.twimg.com/media/CXgHoLnWAAA8i52.jpg,1,seat_belt,0.997659,False,Lakeland_terrier,0.001731,True,Airedale,0.000204,True


**Notes:**
> **Quality Issues**
>- The Prediction in columns `p1`, `p2` and `p3` are not uniformly formatted. Some names are lowercase, some are uppercase and some are titlecase.
>- The predictions above also have words seperated by underscores instead of spaces.

> **Tidiness Issues**
>- From `p1`, `p2` and `p3`, we only need the most confident prediction that corresponds to an actual dog breed.
<br>

**3. Examining a sample of 25 records each from the `json_tweet_info` dataframe in Jupyter notebook**

In [11]:
json_tweet_info.sample(25)

Unnamed: 0,tweet_id,hashtag,retweets,likes
1326,703611486317502464,,1389,3560
40,884441805382717440,,4801,23895
1084,733109485275860992,,15576,49469
2284,666649482315059201,,527,996
502,809920764300447744,,3695,14804
1388,698355670425473025,,399,1709
2302,666293911632134144,,291,428
1151,719339463458033665,,1124,4119
2049,670822709593571328,,83,545
290,836001077879255040,,3907,17945


**Notes:**
> **Quality Issue**
>- It seems that where hashtags are present, they are stored in array format rather than as individual strings.

### B. Programmatic Assessment

In [12]:
wrd_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [13]:
img_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [14]:
json_tweet_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2324 entries, 0 to 2323
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  2324 non-null   object
 1   hashtag   2324 non-null   object
 2   retweets  2324 non-null   int64 
 3   likes     2324 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 72.8+ KB
