In [1]:
# Import useful libraries
import time
import config
import numpy as np
import pandas as pd
import os
import re
import requests
import tweepy
import json
from PIL import Image
from io import BytesIO
from tabulate import tabulate

In [2]:
class Color:
    purple = '\033[95m'
    cyan = '\033[96m'
    darkcyan = '\033[36m'
    blue = '\033[94m'
    green = '\033[92m'
    yellow = '\033[93m'
    red = '\033[91m'
    bold = '\033[1m'
    underline = '\033[4m'
    end = '\033[0m'

## Gather and Read Data
---

In [3]:
# Read the twitter archive data provided
wrd_archive = pd.read_csv('./twitter-archive-enhanced.csv')
wrd_archive.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,


In [4]:
# Programmatically download the image predictions
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
file_name = url.split('/')[-1]
response = requests.get(url)

start = time.time()
with open(file_name, 'wb') as f:
    f.write(response.content)
    
print('Process completed in {} seconds'.format(time.time()-start))

Process completed in 0.003099203109741211 seconds


In [5]:
# Read in the image predictions
img_predictions = pd.read_csv('./image-predictions.tsv', sep='\t')
img_predictions.head(3)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True


In [6]:
# Create an API object to gater twitter data
consumer_key = config.API_KEY
consumer_secret = config.API_KEY_SECRET
access_token = config.ACCESS_TOKEN
access_secret = config.ACCESS_TOKEN_SECRET

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit =True,
                wait_on_rate_limit_notify=True)

In [7]:
if False:
# Pull tweet information using the ids in wrd_archive
# Extract the tweet ids from the wrd dataframe
    tweet_ids = wrd_archive['tweet_id']
    success, failure, counter = (0, 0, 0)
    failed_attempts = {}
    print('\033[1m'+'COMMENCING JSON EXTRACTION TASK'+'\033[0m'+'\n'+'-'*70)
    start_time = time.time()

    # Loop over each tweet id and collect the information
    with open('tweet_info.json', 'w') as file:
        print('Pulling json data for the first 200 tweets...')
        for tweet_id in tweet_ids:
            if (success % 200 ==0) and (counter > 0):
                print('\033[94m'+'\033[1m'+'Sub-task Complete!'+'\033[0m')
                print('Successful pulls: {} || failed pulls: {} || Pulls pending: {}'.format(success, failure, tweet_ids.size - counter))
                print('\nPulling json data for the next 200 tweets...')
            try:
                tweet_info = api.get_status(tweet_id, tweet_mode='extended')
                json.dump(tweet_info._json, file)
                file.write('\n')
                success+=1
            except Exception as e:
                failed_attempts[tweet_id] = e
                failure+=1
                pass
            finally:
                counter+=1

    # Print feedback on execution process  
    duration = (time.time() - start_time)/60
    failed = len(failed_attempts.keys())
    print('\033[1m' + '\033[94m' +'Task Completed!\n'+'\033[0m' + '-'*70)
    print('\033[1m'+'DISPLAYING RUNTIME SUMMARY'+'\033[0m')
    print('The entire process took: {} minutes'.format(round(duration, 2)))

    if (failed > 0):
        print('\033[91m'+'\033[1m'+'Could not pull information for '+ str(failed) + ' tweet ids:'+'\033[0m')
        print(pd.Series(failed_attempts))
    else:
        print('\033[94m'+'\033[1m'+'No failed attempts'+'\033[0m')

In [8]:
# Extract the information we want from the json file
json_tweet_details = []

with open('tweet_info.json', 'r', encoding='UTF-8') as file:
    for line in file:
        json_text = json.loads(line)
        # Extract the tweet_id, likes and retweet count
        tweet_id = json_text['id_str']
        retweets = json_text['retweet_count']
        likes = json_text['favorite_count']
        # Extract the hashtag from the json file
        hashtags_info = json_text['entities']['hashtags']
        if len(hashtags_info) !=0:
            hashtags = ['#'+item['text'] for item in hashtags_info]
        else:
            hashtags = 'None'
        # Assign these values into our list
        json_tweet_details.append({
            'tweet_id': tweet_id,
            'hashtag': hashtags,
            'retweets': retweets,
            'likes': likes}
        )
    
    json_tweet_info = pd.DataFrame(json_tweet_details)

In [9]:
json_tweet_info.head(3)

Unnamed: 0,tweet_id,hashtag,retweets,likes
0,892420643555336193,,7024,33866
1,892177421306343426,,5305,29364
2,891815181378084864,,3488,22089


## Assessing Data
---
### A. Visual Assessment
**1. Examining a sample of 50 records from the `wrd_archive` dataframe in Jupyter notebook, including additional visual assessments in google sheets:**

In [10]:
wrd_archive.sample(50)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1199,716791146589110272,,,2016-04-04 00:55:01 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jennifur. She's supposed to be navigating...,,,,https://twitter.com/dog_rates/status/716791146...,11,10,Jennifur,,,,
133,866720684873056260,,,2017-05-22 18:21:28 +0000,"<a href=""http://twitter.com/download/iphone"" r...",He was providing for his family 13/10 how dare...,,,,https://twitter.com/nbcnews/status/86645871888...,13,10,,,,,
819,770655142660169732,,,2016-08-30 16:11:18 +0000,"<a href=""http://twitter.com/download/iphone"" r...",We only rate dogs. Pls stop sending in non-can...,,,,https://twitter.com/dog_rates/status/770655142...,11,10,very,,,,
198,854365224396361728,,,2017-04-18 16:05:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Einstein. He's having a really good da...,,,,https://twitter.com/dog_rates/status/854365224...,13,10,Einstein,,,,
1778,677895101218201600,,,2015-12-18 16:56:01 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Guys this was terrifying. Really spooked me up...,,,,https://twitter.com/dog_rates/status/677895101...,9,10,,,,,
1258,710283270106132480,,,2016-03-17 01:55:02 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Gunner. He's a Figamus Newton. King of...,,,,https://twitter.com/dog_rates/status/710283270...,11,10,Gunner,,,,
2249,667861340749471744,,,2015-11-21 00:25:26 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a Shotokon Macadamia mix named Cheryl....,,,,https://twitter.com/dog_rates/status/667861340...,9,10,a,,,,
1437,697242256848379904,,,2016-02-10 02:14:42 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Oakley. He has a massive tumor growing...,,,,https://twitter.com/dog_rates/status/697242256...,10,10,Oakley,,,,
1451,695794761660297217,,,2016-02-06 02:22:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Wyatt. His throne is modeled after him...,,,,https://twitter.com/dog_rates/status/695794761...,13,10,Wyatt,,,,
1589,686683045143953408,,,2016-01-11 22:56:10 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Mona. She's a Yarborough Splishnsplash...,,,,https://twitter.com/dog_rates/status/686683045...,11,10,Mona,,,pupper,


**Notes:**
> **Quality Issues**
>- Some records appear to be replies or retweets to previously created tweets; some may contain ratings, but they are not the original tweets. This information can be observed in the `in_reply_to_status_id`,	`in_reply_to_user_id`, `retweeted_status_id`, `retweeted_status_user_id` and `retweeted_status_timestamp` columns.
>- Unexpected ratings in the `rating_numerator` and `rating_denominator` columns. Examples are rating numerators as high as `666` and denominators as low as `0`.
>- Unusual dog names such as `a`, `an` and `not` in the `name` column.

> **Tidiness Issues**
>- The various stages of dog life: `doggo`, `pupper`, `puppo`, and `floofer` should be contained in one column.
>- Long and unneccessary links in the `source` column. All we need is the type of device users are tweeting from.
<br>

**2. Examining a sample of 50 records from the `img_predictions` dataframe in Jupyter notebook, including additional visual assessments in google sheets:**

In [11]:
img_predictions.sample(50)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
390,673363615379013632,https://pbs.twimg.com/media/CVhEoq4WcAE8pBm.jpg,1,ox,0.193431,False,warthog,0.123827,False,bison,0.111177,False
1652,809808892968534016,https://pbs.twimg.com/media/CwS4aqZXUAAe3IO.jpg,1,Labrador_retriever,0.861651,True,golden_retriever,0.044462,True,Staffordshire_bullterrier,0.016497,True
1650,809220051211603969,https://pbs.twimg.com/media/CzrtWDbWEAAmIhy.jpg,1,Pomeranian,0.819511,True,Samoyed,0.141241,True,Pembroke,0.013455,True
1522,788070120937619456,https://pbs.twimg.com/media/Co-hmcYXYAASkiG.jpg,1,golden_retriever,0.735163,True,Sussex_spaniel,0.064897,True,Labrador_retriever,0.047704,True
53,667012601033924608,https://pbs.twimg.com/media/CUG0bC0U8AAw2su.jpg,1,hyena,0.98723,False,African_hunting_dog,0.012601,False,coyote,5.7e-05,False
793,690735892932222976,https://pbs.twimg.com/media/CZX8nyeVAAEstKM.jpg,1,golden_retriever,0.883229,True,Labrador_retriever,0.109635,True,kuvasz,0.002795,True
420,674042553264685056,https://pbs.twimg.com/media/CVquIDRW4AEJrPk.jpg,1,toy_poodle,0.927975,True,miniature_poodle,0.068946,True,standard_poodle,0.001316,True
1833,836753516572119041,https://pbs.twimg.com/media/C5y-4VwWcAIcaoj.jpg,1,mortarboard,0.936882,False,academic_gown,0.020815,False,schipperke,0.011564,True
1253,748307329658011649,https://pbs.twimg.com/media/CmKFi-FXEAAeI37.jpg,2,paddle,0.589066,False,shovel,0.038062,False,mountain_tent,0.029203,False
1770,827600520311402496,https://pbs.twimg.com/media/C3w6RYbWQAAEQ25.jpg,1,Pembroke,0.325638,True,golden_retriever,0.317235,True,Labrador_retriever,0.116087,True


**Notes:**
> **Quality Issues**
>- The Prediction in columns `p1`, `p2` and `p3` are not uniformly formatted. Some names are lowercase, some are uppercase and some are titlecase.
>- The predictions above also have words seperated by underscores instead of spaces.

> **Tidiness Issues**
>- From `p1`, `p2` and `p3`, we only need the most confident prediction that corresponds to an actual dog breed.
<br>

**3. Examining a sample of 50 records each from the `json_tweet_info` dataframe in Jupyter notebook**

In [12]:
json_tweet_info.sample(50, random_state=1)

Unnamed: 0,tweet_id,hashtag,retweets,likes
579,798682547630837760,,4442,0
1255,708479650088034305,,622,2352
47,882992080364220416,,3250,21009
1117,727155742655025152,,1202,3340
614,793614319594401792,,2937,0
108,870804317367881728,,5293,29850
6,890971913173991426,,1673,10382
2162,668892474547511297,,126,354
1178,715733265223708672,,1540,4304
2254,667182792070062081,,5291,12569


**Notes:**
> **Quality Issue**
>- It seems that where hashtags are present, they are stored in list format rather than as individual strings.

### B. Programmatic Assessment
#### 1. WRD Archive Data

In [13]:
wrd_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

**Notes**
>- `tweet_id` stored as `int` instead of `string/object` type.
>- `181` records are retweets and `78` records are replies. We won't be using these records in our analysis.
>- `timestamp` column is stored as `string/object` type rather than as a `datetime` object.
>- The `expanded_urls` column has some null records

Lets narrow our observations to only records where the `expanded_url` entries are null:

In [14]:
print('Collating the number of null entries for records with missing expanded urls..')
print(wrd_archive[wrd_archive['expanded_urls'].isnull()].isnull().sum())

Collating the number of null entries for records with missing expanded urls..
tweet_id                       0
in_reply_to_status_id          4
in_reply_to_user_id            4
timestamp                      0
source                         0
text                           0
retweeted_status_id           58
retweeted_status_user_id      58
retweeted_status_timestamp    58
expanded_urls                 59
rating_numerator               0
rating_denominator             0
name                           0
doggo                          0
floofer                        0
pupper                         0
puppo                          0
dtype: int64


>Tweets with missing `expanded_urls` are either retweets or likes. We won't be including these records in our analysis.

In [15]:
# Check the archive for duplicate records
duplicates = wrd_archive.duplicated().sum()
print('wrd_archive has {} number of duplicate records'.format(duplicates))

wrd_archive has 0 number of duplicate records


In [16]:
# Examine the unique values in the source column
print(Color.bold+'Examining unique values in the source column'+Color.end)
for i, item in enumerate(wrd_archive['source'].unique()):
    print(i, ': ', item)

[1mExamining unique values in the source column[0m
0 :  <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
1 :  <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>
2 :  <a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>
3 :  <a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>


>- We only want to extract the information between the `<a> </a>` tags, signalling the tweet source.

In [17]:
# Examine the text column and expanded_url columns
pd.set_option("display.max_colwidth",0)
wrd_archive[['text', 'expanded_urls']].sample(5)

Unnamed: 0,text,expanded_urls
1124,This is Ziva. She doesn't know how her collar works. 11/10 would totally fix for her https://t.co/K7pthJXjWE,https://twitter.com/dog_rates/status/730211855403241472/photo/1
281,RT @alexmartindawg: THE DRINK IS DR. PUPPER 10/10 good pun @matt___nelson @GoodDogsGame https://t.co/act3duiqbL,"https://twitter.com/alexmartindawg/status/839289919298224128/photo/1,https://twitter.com/alexmartindawg/status/839289919298224128/photo/1,https://twitter.com/alexmartindawg/status/839289919298224128/photo/1,https://twitter.com/alexmartindawg/status/839289919298224128/photo/1"
415,RT @dog_rates: This is Paisley. She really wanted to be president this time. Dreams officially crushed. 13/10 https://t.co/liJGwMp17E,"https://twitter.com/dog_rates/status/822489057087389700/photo/1,https://twitter.com/dog_rates/status/822489057087389700/photo/1,https://twitter.com/dog_rates/status/822489057087389700/photo/1,https://twitter.com/dog_rates/status/822489057087389700/photo/1,https://twitter.com/dog_rates/status/822489057087389700/photo/1,https://twitter.com/dog_rates/status/822489057087389700/photo/1"
1510,This is Bailey. She likes flowers. 12/10 https://t.co/YBENhr24FV,"https://twitter.com/dog_rates/status/691444869282295808/photo/1,https://twitter.com/dog_rates/status/691444869282295808/photo/1"
30,@NonWhiteHat @MayhewMayhem omg hello tanner you are a scary good boy 12/10 would pet with extreme caution,


On clicking the links and testing them, one would discover that both the `text` and `expanded url` links lead to the same tweet for each record. Some records also have multiple expanded urls seperated by commas, all leading to the same tweet. As a result, we can make the following notes:
>- The `text` column contains both the tweet text and tweet url. 
>- The same tweet url is already present in the `expanded_urls` column

In [18]:
# Examine the distribution of ratings in the dataset
wrd_archive[['rating_numerator', 'rating_denominator']].describe()

Unnamed: 0,rating_numerator,rating_denominator
count,2356.0,2356.0
mean,13.126486,10.455433
std,45.876648,6.745237
min,0.0,0.0
25%,10.0,10.0
50%,11.0,10.0
75%,12.0,10.0
max,1776.0,170.0


In [19]:
# Examine the unique values in rating numerator and denominator
print(Color.bold+'Unique rating numerators'+Color.end)
print(wrd_archive['rating_numerator'].unique())
print(Color.bold+'\nUnique rating denominators'+Color.end)
print(wrd_archive['rating_denominator'].unique())

[1mUnique rating numerators[0m
[  13   12   14    5   17   11   10  420  666    6   15  182  960    0
   75    7   84    9   24    8    1   27    3    4  165 1776  204   50
   99   80   45   60   44  143  121   20   26    2  144   88]
[1m
Unique rating denominators[0m
[ 10   0  15  70   7  11 150 170  20  50  90  80  40 130 110  16 120   2]


Though WeRateDogs post can have numerators higher than 10, they almost always have denominators of 10. Having numerators as high as 1776 and denominators as low as 0 prompts us to inspect the dataframe further:

In [20]:
# Assess instances where rating numerators > 20 and denominators are !=10
rating_check_df = (wrd_archive[(wrd_archive['rating_numerator'] > 20) | (wrd_archive['rating_denominator']!=10)])
# filter out the retweets
rating_check_df = (rating_check_df[rating_check_df['retweeted_status_id'].isnull()])
# filter out the replies
rating_check_df = (rating_check_df[rating_check_df['in_reply_to_status_id'].isnull()])
# Finally examine the text and the ratings
print('{} records found'.format(rating_check_df.shape[0]))
rating_check_df[['text', 'rating_numerator', 'rating_denominator']]

22 records found


Unnamed: 0,text,rating_numerator,rating_denominator
433,The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd,84,70
516,Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx,24,7
695,"This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wuqaPS",75,10
763,This is Sophie. She's a Jubilant Bush Pupper. Super h*ckin rare. Appears at random just to smile at the locals. 11.27/10 would smile back https://t.co/QFaUiIHxHq,27,10
902,Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE,165,150
979,This is Atticus. He's quite simply America af. 1776/10 https://t.co/GRXwMxLBkh,1776,10
1068,"After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ",9,11
1120,Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv,204,170
1165,Happy 4/20 from the squad! 13/10 for all https://t.co/eV1diwds8a,4,20
1202,This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq,50,50


>- Some ratings were erroneously pulled from the original tweet. Especially when dates (e.g 24/7 and 9/11) or decimal ratings (e.g 11.27/10 and 9.75/10) are included in the tweet text.
>- Overly high ratings appear to be addressed to groups of dogs as can be seen in the following tweet urls among others: [165/150](https://t.co/HmwrdfEfUE), [84/70](https://t.co/NIYC820tmd), [88/80](https://t.co/y93p6FLvVw).

In [21]:
# Examine the name column further especially names with 4 characters or less
print(Color.bold+'Specially examine names with four string characters or less..'+Color.end)
print(wrd_archive.name[wrd_archive.name.apply(lambda x: len(x)<=4)].unique())

[1mSpecially examine names with four string characters or less..[0m
['None' 'Jax' 'Zoey' 'Koda' 'Ted' 'Jim' 'Zeke' 'such' 'Maya' 'Earl' 'Lola'
 'Yogi' 'Noah' 'Gus' 'Alfy' 'Koko' 'Rey' 'Gary' 'a' 'Jack' 'Emmy' 'Beau'
 'Aja' 'Cash' 'Coco' 'Jed' 'Kody' 'Dawn' 'Cody' 'Lili' 'Dave' 'Burt'
 'Carl' 'Thor' 'Luna' 'Arya' 'Iggy' 'Kyle' 'Leo' 'Odin' 'Tuck' 'Hank'
 'Ken' 'Max' 'Odie' 'Arlo' 'Lucy' 'Ava' 'Rory' 'Eli' 'Ash' 'Tobi' 'not'
 'Kuyu' 'Pete' 'Kyro' 'Loki' 'Mia' 'one' 'Mutt' 'Bear' 'Kona' 'Phil' 'Ike'
 'Mo' 'Toby' 'Nala' 'Gabe' 'Luca' 'Finn' 'Anna' 'Bo' 'Tom' 'Dido' 'Levi'
 'Alf' 'Sky' 'Tyr' 'Mary' 'Moe' 'Halo' 'Sam' 'Ito' 'Milo' 'Cali' 'Duke'
 'Chef' 'Doc' 'Sobe' 'Iroh' 'Ruby' 'Mack' 'Juno' 'Lily' 'Newt' 'Nida'
 'BeBe' 'mad' 'Dale' 'Hero' 'Godi' 'Dash' 'Bell' 'Jay' 'Mya' 'an' 'Huck'
 'very' 'O' 'Blue' 'Fizz' 'Chip' 'Grey' 'Al' 'just' 'Lou' 'Tito' 'Brat'
 'Tove' 'my' 'Kota' 'Eve' 'Rose' 'Theo' 'Fido' 'Emma' 'Gert' 'Dex' 'Ace'
 'Fred' 'Zoe' 'Blu' 'his' 'Cora' 'Abby' 'Geno' 'Beya' 'Kilo' 'D

>- Again we notice more unusual names like `the`, `my`, `by`, `his`, `all`, `mad`, `life`, `very`, `old`, `this`, `just` etc. All these unusual names are formatted in lower case, while the more viable names are properly capitalized.

In [22]:
#Examine the dog stage columns
for dog_stage in wrd_archive.columns[-4:]:
    print(Color.bold+'\nValue counts for {} column'.format(dog_stage)+Color.end)
    print(wrd_archive[dog_stage].value_counts())

[1m
Value counts for doggo column[0m
None     2259
doggo    97  
Name: doggo, dtype: int64
[1m
Value counts for floofer column[0m
None       2346
floofer    10  
Name: floofer, dtype: int64
[1m
Value counts for pupper column[0m
None      2099
pupper    257 
Name: pupper, dtype: int64
[1m
Value counts for puppo column[0m
None     2326
puppo    30  
Name: puppo, dtype: int64


>- Asides the fact that we have to tidy up these columns into one, everything looks good.
#### 2. Image Predictions Data