# Packages

In [1]:
# Packages used for the project
import pandas as pd
import numpy as np
import requests
import tweepy
import json
import os
import requests
#from PIL import Image
#from io import BytesIO

# Gather the Data

## Enhanced Twitter Archive 

We have this file on hand and so have downloaded it manually

In [2]:
# twitter archive is downloaded manually as a file we have on hand
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')

## Image Predictions

The file (image_predictions.tsv) hosted on Udacity's servers and was downloaded programmatically using the following URL: https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv

In [3]:
folder_name = 'image_predictions'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)
    
# Download the image_predictions.tsv programmatically from Udacity's servers
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
response

<Response [200]>

In [4]:
with open(os.path.join(folder_name, 
                      url.split('/')[-1]), mode='wb') as file:
    file.write(response.content)

In [5]:
image_predictions = pd.read_csv('U:/Professional Development/Udacity/Projects/T2 P2/image_predictions/image-predictions.tsv', sep='\t')

## Twitter API

Tweet ID, retweet count, favorite count and date created queried from Twitter's API

In [None]:
# Create authentication to Twtitter's API using Python's Tweepy library
# Take out consumer key, consumer secret, access token and access secret before submitting the project
consumer_key = 'consumer_key'
consumer_secret = 'consumer_secret'
access_token = 'access_token'
access_secret = 'access_secret'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth)

In [None]:
#Compiling the list of tweet ids from the twitter_archive
tweet_id_list = twitter_archive['tweet_id']
tweet_id_list

In [None]:
#querying twitter's api to get the retweet count, favorite count and the date created.
#https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object

#reply_count: Number of times this Tweet has been replied to
#retweet_count: Number of times this Tweet has been retweeted
#favorite_count: Indicates approximately how many times this Tweet has been liked by Twitter users.
#created_at: UTC time when this Tweet was created
#withheld_in_countries: When present, indicates a list of uppercase two-letter country codes this content is withheld from. Twitter supports the following non-country values for this field:
#“XX” - Content is withheld in all countries “XY” - Content is withheld due to a DMCA request.

tweet_count = []
tweet_errors = []
for tweetid in tweet_id_list:
    try:
        tweet = api.get_status(tweetid, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
        print(tweetid)
        retweet_count = tweet.retweet_count
        favorite_count = tweet.favorite_count
        created_at = tweet.created_at
        tweet_count.append({'tweet_id': str(tweetid),
                            'retweet_count': retweet_count,
                            'favorite_count': favorite_count,
                            'created_at': str(created_at)})
    except Exception as e:
        print(str(tweetid) + str(e))
        tweet_errors.append({'tweet_id': tweetid})


with open('tweet_json.txt', 'w') as outfile:  
    json.dump(tweet_count, outfile)     

In [7]:
# load the json file
with open('tweet_json.txt') as json_file:  
    tweet_count_list = json.load(json_file)

In [8]:
# Create a pandas data frame from the json file
tweet_count_list = pd.DataFrame(tweet_count_list, columns = ['tweet_id', 'retweet_count', 'favorite_count', 'created_at'])
tweet_count_list = tweet_count_list.sort_values('tweet_id').reset_index(drop=True)
tweet_count_list.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2351 entries, 0 to 2350
Data columns (total 4 columns):
tweet_id          2351 non-null object
retweet_count     2351 non-null int64
favorite_count    2351 non-null int64
created_at        2351 non-null object
dtypes: int64(2), object(2)
memory usage: 73.5+ KB


In [9]:
# Create a pandas data frame for the tweets with a tweet_id error
tweet_errors = pd.DataFrame(tweet_errors, columns = ['tweet_id'])
tweet_errors.to_csv('tweet_errors.csv', index=False)

NameError: name 'tweet_errors' is not defined

In [6]:
#For working take out before submitting project
tweet_count_list = pd.read_csv('tweet_count.csv')
tweet_errors = pd.read_csv('tweet_errors.csv')
tweet_errors

Unnamed: 0,tweet_id
0,888202515573088257
1,869988702071779329
2,861769973181624320
3,802247111496568832
4,775096608509886464


# Assessing the Data

In [7]:
twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [8]:
twitter_archive.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [9]:
image_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [10]:
image_predictions.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [11]:
tweet_count_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2351 entries, 0 to 2350
Data columns (total 4 columns):
tweet_id          2351 non-null int64
retweet_count     2351 non-null int64
favorite_count    2351 non-null int64
created_at        2351 non-null object
dtypes: int64(3), object(1)
memory usage: 73.5+ KB


In [12]:
tweet_count_list.describe()

Unnamed: 0,tweet_id,retweet_count,favorite_count
count,2351.0,2351.0,2351.0
mean,7.42566e+17,3127.64866,8106.415993
std,6.84656e+16,5232.892613,11980.413387
min,6.660209e+17,0.0,0.0
25%,6.783929e+17,617.0,1417.5
50%,7.193395e+17,1451.0,3589.0
75%,7.989987e+17,3618.5,10106.0
max,8.924206e+17,78939.0,132040.0


# Initial Issues

## Quality 

  - We only want original ratings (no retweets) that have images
  - Some dogs don't have a dog type
  - Missing image predictions
  - Missing dog type are not NAN, but 'none'
  - Incorrect ratings example tweet id 883482846933004... should be 13.5/10 not 5/10
  - Incorrect dog names
  - Incomplete dog stages
  - Invalid tweet ids {'tweet_id': 888202515573088257},
 {'tweet_id': 869988702071779329},
 {'tweet_id': 861769973181624320},
 {'tweet_id': 802247111496568832},
 {'tweet_id': 775096608509886464}] 
 
## Tidiness

  - Twitter archive has a column for each dog type
  - Dog ratings and dog type predictions in different files

## Copy

Make a copy of the three datasets to work on

In [13]:
twitter_archive_clean = twitter_archive.copy()
image_predictions_clean = image_predictions.copy()
tweet_count_clean = tweet_count_list.copy()

# Cleaning

## Quality Issue 1

We only want original ratings (no retweets) that have images

## Clean

Remove any tweet information with a retweeted status id

In [14]:
twitter_archive_clean = twitter_archive_clean[pd.isnull(twitter_archive_clean['retweeted_status_id'])]

In [15]:
# Test

twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2175 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2175 non-null object
source                        2175 non-null object
text                          2175 non-null object
retweeted_status_id           0 non-null float64
retweeted_status_user_id      0 non-null float64
retweeted_status_timestamp    0 non-null object
expanded_urls                 2117 non-null object
rating_numerator              2175 non-null int64
rating_denominator            2175 non-null int64
name                          2175 non-null object
doggo                         2175 non-null object
floofer                       2175 non-null object
pupper                        2175 non-null object
puppo                         2175 non-null object
dtypes: float64(4), int64(3), object(1

## Quality Issue 2 

Not all dog stages are recorded from the text

## Clean

Extract additional dog stages from the text

In [16]:
twitter_archive_clean['doggo'] = twitter_archive_clean['doggo'].replace('None', twitter_archive_clean.text.str.extract('(dogg\w*)', expand=False))
twitter_archive_clean['floofer'] = twitter_archive_clean['floofer'].replace('None', twitter_archive_clean.text.str.extract('(floof\w*)', expand=False))
twitter_archive_clean['pupper'] = twitter_archive_clean['pupper'].replace('None', twitter_archive_clean.text.str.extract('(pupper\w*)', expand=False))
twitter_archive_clean['puppo'] = twitter_archive_clean['puppo'].replace('None', twitter_archive_clean.text.str.extract('(puppo\w*)', expand=False))


In [17]:
# test
twitter_archive_clean['doggo'].value_counts()

doggo      87
doggos      8
doggles     3
dogg        2
doggy       2
Name: doggo, dtype: int64

In [18]:
twitter_archive_clean['floofer'].value_counts()

floof      15
floofer    10
floofy      3
floofs      1
Name: floofer, dtype: int64

In [19]:
twitter_archive_clean['pupper'].value_counts()

pupper          234
puppers          23
puppertunity      1
Name: pupper, dtype: int64

In [20]:
twitter_archive_clean['puppo'].value_counts()

puppo           25
puppos           1
puppoccino       1
pupporting       1
pupposes         1
puppon           1
pupporazzi       1
puppologize      1
puppologized     1
Name: puppo, dtype: int64

In [21]:
# Create a dictionary of all the different iterations of dog type
dog_type = {'doggos': 'doggo',
           'doggles': 'doggo',
           'dogg': 'doggo',
           'doggy': 'doggo',
           'floof': 'floofer',
           'floofy': 'floofer',
           'floofs': 'floofer',
           'puppers': 'pupper',
           'puppertunity': 'pupper',
           'puppologize': 'puppo',
           'puppon': 'puppo',
           'pupposes': 'puppo',
           'puppoccino': 'puppo',
           'pupporazzi': 'puppo',
           'puppos': 'puppo',
           'pupporting': 'puppo',
           'puppologized': 'puppo'}

## Tidiness Issue 1

Twitter archive has a column for each dog type

## Clean

Concatenate the four dog type columns into one column and remove the original four columns

In [22]:
cols = ['doggo', 'floofer', 'pupper', 'puppo']
twitter_archive_clean['dog_stage'] = twitter_archive_clean[cols].apply(lambda x: ','.join(x.dropna()), axis=1)

In [23]:
twitter_archive_clean = twitter_archive_clean.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis=1)

In [24]:
# Test

twitter_archive_clean.sample(10)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,dog_stage
544,805932879469572096,,,2016-12-06 00:32:26 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Major. He put on a tie for his first r...,,,,https://twitter.com/dog_rates/status/805932879...,12,10,Major,
1218,714957620017307648,,,2016-03-29 23:29:14 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Curtis. He's an Albino Haberdasher. Te...,,,,https://twitter.com/dog_rates/status/714957620...,10,10,Curtis,
1303,707420581654872064,,,2016-03-09 04:19:44 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Keurig. He's a rare dog. Laughs like a...,,,,https://twitter.com/dog_rates/status/707420581...,4,10,Keurig,
2248,667866724293877760,,,2015-11-21 00:46:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Shaggy. He knows exactly how to solve ...,,,,https://twitter.com/dog_rates/status/667866724...,10,10,Shaggy,
1521,690690673629138944,,,2016-01-23 00:21:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Cedrick. He's a spookster. Did me a di...,,,,https://twitter.com/dog_rates/status/690690673...,10,10,Cedrick,
11,889880896479866881,,,2017-07-25 16:11:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bruno. He is a service shark. Only get...,,,,https://twitter.com/dog_rates/status/889880896...,13,10,Bruno,
681,788765914992902144,,,2016-10-19 15:37:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Butter. She can have whatever she want...,,,,https://twitter.com/dog_rates/status/788765914...,12,10,Butter,
880,760539183865880579,,,2016-08-02 18:14:06 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Atlas. Swinging is his passion. 12/10 ...,,,,https://twitter.com/dog_rates/status/760539183...,12,10,Atlas,
1212,715342466308784130,,,2016-03-31 00:58:29 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Oscar. He's a world renowned snowball ...,,,,https://twitter.com/dog_rates/status/715342466...,10,10,Oscar,
1728,679872969355714560,,,2015-12-24 03:55:21 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine -...",This is Rocco. He's in a very intense game of ...,,,,https://vine.co/v/iAAxTbj1UAM,10,10,Rocco,


In [25]:
twitter_archive_clean['dog_stage'].value_counts()

                 1766
pupper            224
doggo              74
puppo              24
puppers            23
floof              15
doggo,pupper       10
floofer             9
doggos              8
doggles             3
floofy              3
doggy               2
dogg                2
puppologize         1
doggo,puppon        1
floofs              1
puppoccino          1
doggo,floofer       1
pupporting          1
doggo,puppo         1
pupporazzi          1
puppologized        1
puppertunity        1
pupposes            1
puppos              1
Name: dog_stage, dtype: int64

## Quality Issue 3

Extracted dog types are varied

## Clean

Change dog types to the column name dog types

In [26]:
def update_type(twitter_archive_clean):
    if twitter_archive_clean['dog_stage'] in dog_type.keys():
        update_dog = dog_type[twitter_archive_clean['dog_stage']]
        return update_dog
    else:
        return twitter_archive_clean['dog_stage']

twitter_archive_clean['dog_stage'] = twitter_archive_clean.apply(update_type, axis=1)

In [27]:
twitter_archive_clean['dog_stage'].value_counts()

                 1766
pupper            248
doggo              89
puppo              31
floofer            28
doggo,pupper       10
doggo,puppo         1
doggo,floofer       1
doggo,puppon        1
Name: dog_stage, dtype: int64

In [28]:
# puppon is not used as a dog type
twitter_archive_clean.dog_stage = twitter_archive_clean.dog_stage.replace('doggo,puppon', 'doggo')

In [29]:
twitter_archive_clean['dog_stage'].value_counts()

                 1766
pupper            248
doggo              90
puppo              31
floofer            28
doggo,pupper       10
doggo,puppo         1
doggo,floofer       1
Name: dog_stage, dtype: int64

## Quality Issue 4

Incorrect ratings example tweet id 883482846933004... should be 13.5/10 not 5/10

## Clean

Extract ratings from text

In [64]:
twitter_archive_clean['rating'] = twitter_archive_clean.text.str.extract('(\d\d?\d?.?\d?\d?/\d\d\d?)', expand=False)

In [65]:
# Test
twitter_archive_clean['rating'].value_counts()

12/10       500
10/10       442
11/10       425
13/10       306
9/10        156
8/10         98
7/10         52
14/10        43
5/10         34
6/10         32
3/10         19
4/10         15
2/10          9
1/10          7
4/20          2
420/10        2
0/10          2
99/90         1
960/00        1
204/170       1
20/16         1
17/10         1
45/50         1
11.27/10      1
182/10        1
9/11          1
3 13/10       1
143/130       1
84/70         1
11.26/10      1
9.75/10       1
88/80         1
007/10        1
1776/10       1
11/15/15      1
44/40         1
15/10         1
666/10        1
144/120       1
165/150       1
9.5/10        1
80/80         1
121/110       1
13.5/10       1
60/50         1
7/11          1
50/50         1
Name: rating, dtype: int64

In [56]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175 entries, 0 to 2355
Data columns (total 15 columns):
tweet_id                      2175 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2175 non-null object
source                        2175 non-null object
text                          2175 non-null object
retweeted_status_id           0 non-null float64
retweeted_status_user_id      0 non-null float64
retweeted_status_timestamp    0 non-null object
expanded_urls                 2117 non-null object
rating_numerator              2175 non-null int64
rating_denominator            2175 non-null int64
name                          2175 non-null object
dog_stage                     2175 non-null object
rating                        2166 non-null object
dtypes: float64(4), int64(3), object(8)
memory usage: 351.9+ KB


## Quality Issue 5

Date extracted from the tweet with tweet_id 832088576586297345	

## Clean

Replace rating of tweet_id that came out as date

In [66]:
twitter_archive_clean[twitter_archive_clean['rating'] == '11/15/15']

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,dog_stage,rating
342,832088576586297345,8.320875e+17,30582082.0,2017-02-16 04:45:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@docmisterio account started on 11/15/15,,,,,11,15,,,11/15/15


In [68]:
twitter_archive_clean['rating'] = twitter_archive_clean['rating'].replace('11/15/15', '')

In [69]:
twitter_archive_clean[twitter_archive_clean['rating'] == '11/15/15']

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,dog_stage,rating


In [None]:
twitter_archive_clean