# Gather data

### Import necessary libraries

In [1]:
import tweepy
import pandas as pd
import requests
import io
import json
import os
import glob
import numpy as np
import re

### Initialize twitter API access

In [45]:

consumer_key = ''
consumer_secret = ''
access_token = ''
access_secret = ''


auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth ,wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

### Import twitter archive of user WeRateDogs

In [46]:
df = pd.read_csv("twitter-archive-enhanced.csv")

### Gather additional tweet data and save it in a folder

In [47]:
# Check which staus info is already collected
folder_name = "tweet_data"
ids_collected = list(map(lambda x: int(x[11:-4]), glob.glob(folder_name+"/*.txt")))

In [260]:
def get_tweet_status():
    '''
    Takes the status ids of the twitter archive csv and queries the twitter api for the complete information.
    Saves a text file with the information for each twitter status. Also saves a list of statuses that were 
    not able to be retrieved from the API
    
    Args:
        None
    Output:
        None
    '''
    # Create a folder to store all the tweet data inside
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    # Error list for possibly deleted tweets
    error_list = []
    # Variable to measure progress
    i = 1
    # Get the status for each tweet id in the tweet_id column of the twitter archive
    
    for tweet_id in list(df.tweet_id):
        
        # Print progress status for every 20th call
        if i%20 ==0:
            print(str(i/len(df)*100)+"% of API download done")
        
        # If the id is already collected it should be skipped to save time
        if tweet_id not in ids_collected:
            try:
                tweet = api.get_status(tweet_id)
                data = tweet._json

                #Save the json data as a txt file with the corresponding tweet id (Saving as .txt to fulfill project rubric)
                with open(folder_name+"/"+str(tweet_id)+'.txt', 'w', encoding='utf-8') as f:
                    json.dump(data, f, ensure_ascii=False, indent=4)
            except Exception as e:
                error_list.append(tweet_id)
                print("The status of the tweet with id {} is not possible to access".format(tweet_id))
            # Store the tweet json data in a variable
            
                
        i+=1
        
    # Save the error list for later usage    
    pd.DataFrame(error_list).to_csv("na_ids.csv", index = False)
    

In [49]:
# Only used once to get the data
#get_tweet_status()

0.8488964346349746% of API download done
The status of the tweet with id 888202515573088257 is not possible to access
1.697792869269949% of API download done
2.5466893039049237% of API download done
3.395585738539898% of API download done
The status of the tweet with id 873697596434513921 is not possible to access
4.244482173174872% of API download done
The status of the tweet with id 872668790621863937 is not possible to access
The status of the tweet with id 872261713294495745 is not possible to access
The status of the tweet with id 869988702071779329 is not possible to access
5.093378607809847% of API download done
The status of the tweet with id 866816280283807744 is not possible to access
5.942275042444821% of API download done
The status of the tweet with id 861769973181624320 is not possible to access
6.791171477079796% of API download done
7.6400679117147705% of API download done
The status of the tweet with id 856602993587888130 is not possible to access
8.488964346349745% of

Rate limit reached. Sleeping for: 682


70.45840407470288% of API download done
71.30730050933785% of API download done
72.15619694397284% of API download done
73.00509337860781% of API download done
The status of the tweet with id 680055455951884288 is not possible to access
73.85398981324278% of API download done
74.70288624787777% of API download done
75.55178268251274% of API download done
76.4006791171477% of API download done
77.24957555178268% of API download done
78.09847198641766% of API download done
78.94736842105263% of API download done
79.79626485568761% of API download done
80.64516129032258% of API download done
81.49405772495756% of API download done
82.34295415959252% of API download done
83.1918505942275% of API download done
84.04074702886248% of API download done
84.88964346349745% of API download done
85.73853989813243% of API download done
86.5874363327674% of API download done
87.43633276740238% of API download done
88.28522920203736% of API download done
89.13412563667232% of API download done
89.983

### Import tweet status data pulled from the API as text and write it in a dataframe
Comment: I saved the data as txt to fulfill the rubric of this project to work with three different kind of data.
It would have been a lot easier to query the json structure than this regex stuff which is quite an unrobust and dirty solution I think.

In [248]:
def text_to_df():  
    '''
    Takes the text files from the subfolder and saves the tweet id,
    follower_count, favourites of user count, retweet count and the 
    favourites of the post count in a dataframe.
    
    Args:
        None
    Output:
        Saves the data as a dataframe, returns no output.
    '''
    # Initialize list of dictionaries
    df_list = []

    for status in glob.glob(folder_name+"/*.txt"):
        with open(status, encoding = "utf-8") as file:
            # Read only the couple of first lines for the id, afterwards the files differ from each other
            lines = file.readlines()[2:4]
            file.seek(0)
            text = file.read()
            text_id = lines[1].strip()[11:-2]

            # Be sure that ids are consistent
            if text_id != status[11:-4]:
                raise Exception("Id saved in file does not match id from filename")

            # Find the needed data with regex, since line by line reading not possible
            follower_count = re.findall(r"\"followers_count\": (\d+),", text)[0]
            favourites_of_user_count = re.findall(r"\"favourites_count\": (\d+),", text)[0]
            favourite_of_post_count = re.findall(r"\"favorite_count\": (\d+),", text)[0]
            retweet_count = re.findall(r"\"retweet_count\": (\d+),", text)[0]

            # Append the list entry of dictionaries to create a dataframe
            df_list.append({"tweet_id": text_id,
                           "follower_count": follower_count,
                           "favourites_of_user_count": favourites_of_user_count,
                           "retweet_count": retweet_count,
                           "favourite_of_post_count": favourite_of_post_count})

            file.close()


    df_additional_data = pd.DataFrame(df_list, columns = ["tweet_id", "follower_count", "favourites_of_user_count", "retweet_count", "favourite_of_post_count"])
    df_additional_data.to_csv("api_twitter_status.csv")

In [259]:
# Only needed to run once to create the df
#text_to_df()

### Download the dog predictions file

In [252]:
r = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv').content

dog_predictions = pd.read_csv(io.StringIO(r.decode('utf-8')), sep = "\t")

dog_predictions.to_csv("dog_predictions.csv")

# Asses
Now that all needed data is gathered the three data frames are assessed.
The dataframes are:
 - twitter-archives-enhanced.csv
 - api_twitter_status.csv
 - dog_predictions.csv
 
 *All found issues are recorded at the bottom of this chapter*


In [2]:
twitter_df = pd.read_csv("twitter-archive-enhanced.csv")
api_df = pd.read_csv("api_twitter_status.csv", index_col = 0)
dogs_df = pd.read_csv("dog_predictions.csv", index_col = 0)
# List of statuses that were not available over the api
na_id = pd.read_csv("na_ids.csv")

In [3]:
twitter_df.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
444,819238181065359361,,,2017-01-11 17:42:57 +0000,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Some happy pupper news to share. 10/10 for eve...,,,,http://us.blastingnews.com/news/2017/01/200-do...,10,10,,,,pupper,
1511,691416866452082688,,,2016-01-25 00:26:41 +0000,"<a href=""http://twitter.com/download/iphone"" r...",I present to you... Dog Jesus. 13/10 (he could...,,,,https://twitter.com/dog_rates/status/691416866...,13,10,,,,,
282,839239871831150596,,,2017-03-07 22:22:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Odie. He's big. 13/10 would attempt to...,,,,https://twitter.com/dog_rates/status/839239871...,13,10,Odie,,,,
2003,672475084225949696,,,2015-12-03 17:58:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Buddy. He's photogenic af. Loves to se...,,,,https://twitter.com/dog_rates/status/672475084...,8,10,Buddy,,,,
1925,674063288070742018,,,2015-12-08 03:09:46 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Earl. Earl is lost. Someone help Earl....,,,,https://twitter.com/dog_rates/status/674063288...,5,10,Earl,,,,


In [15]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [16]:
type(twitter_df.timestamp[0])

str

In [4]:
twitter_df.in_reply_to_status_id.value_counts(dropna = False)

NaN             2278
6.671522e+17       2
6.737159e+17       1
6.753494e+17       1
6.754971e+17       1
                ... 
8.707262e+17       1
8.482121e+17       1
6.715449e+17       1
6.936422e+17       1
8.406983e+17       1
Name: in_reply_to_status_id, Length: 78, dtype: int64

In [11]:
twitter_df.source.value_counts()

<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2221
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                          91
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       33
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64

In [21]:
twitter_df.duplicated().sum()

0

In [5]:
api_df.sample(5)

Unnamed: 0,tweet_id,follower_count,favourites_of_user_count,retweet_count,favourite_of_post_count
1506,838083903487373313,8709205,144674,3163,17791
1380,834458053273591808,8709205,144674,1703,9732
1543,836380477523124226,8709205,144674,3010,14898
965,666345417576210432,8709247,144674,128,275
1677,880095782870896641,8709204,144674,4002,26147


In [17]:
api_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2331 entries, 0 to 2330
Data columns (total 5 columns):
tweet_id                    2331 non-null int64
follower_count              2331 non-null int64
favourites_of_user_count    2331 non-null int64
retweet_count               2331 non-null int64
favourite_of_post_count     2331 non-null int64
dtypes: int64(5)
memory usage: 109.3 KB


In [22]:
api_df.duplicated().sum()

0

In [6]:
dogs_df.sample(5)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
2005,877556246731214848,https://pbs.twimg.com/media/DC20wEcW0AAf59m.jpg,1,basset,0.995368,True,Welsh_springer_spaniel,0.001936,True,bathtub,0.000468,False
167,668986018524233728,https://pbs.twimg.com/media/CUi3PIrWoAAPvPT.jpg,1,doormat,0.976103,False,Chihuahua,0.00564,True,Norfolk_terrier,0.003913,True
1438,774314403806253056,https://pbs.twimg.com/media/Cr7q1VxWIAA5Nm7.jpg,3,Eskimo_dog,0.596045,True,Siberian_husky,0.223067,True,Saluki,0.036325,True
1652,809808892968534016,https://pbs.twimg.com/media/CwS4aqZXUAAe3IO.jpg,1,Labrador_retriever,0.861651,True,golden_retriever,0.044462,True,Staffordshire_bullterrier,0.016497,True
1236,746726898085036033,https://pbs.twimg.com/media/ClzoJz7WYAELHSf.jpg,1,golden_retriever,0.256505,True,Labrador_retriever,0.252417,True,seat_belt,0.203163,False


In [18]:
dogs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 168.2+ KB


In [7]:
dogs_df.p1.value_counts()

golden_retriever      150
Labrador_retriever    100
Pembroke               89
Chihuahua              83
pug                    57
                     ... 
hand_blower             1
dining_table            1
zebra                   1
restaurant              1
slug                    1
Name: p1, Length: 378, dtype: int64

In [23]:
dogs_df.duplicated().sum()

0

## Recording of issues
### Quality issues (dirty data, content issues)
 - Completeness: Is something missing?
 - Validity: Records are present but are faulty or in the wrong scheme
 - Accuracy: Records that are valid but have inaccurate measurements
 - Consistency: Technical correct but in different formats across tables or rows
#### twitter table
 - Retweets are included although they are not of interest
 - Replies to other statuses included although they are not of interest
 - Replies to users included although they are not of interest
 - Source column does not hold any valuable information
 - Timestamp is of type string
 - tweet_id is of type int
#### api table
 - tweet_id is of type int
 - Has missing tweets compared to the twitter table
#### dogs table
 - Naming of dog type is incoherent
 - tweet_id is of type int
 - Has missing tweets compared to the twitter table
### Tidyness iddues (structural issues)
 - Type of dog in the twitter table is streched over the last four columns although it is one variable
 - The twitter table and the api table are the same type of an observational unit (a twitter post)


# Cleaning the data

In [249]:
t_clean = twitter_df.copy()
a_clean = api_df.copy()
d_clean = dogs_df.copy()

### `twitter and api table` are the same type of an observational unit and api table has missing tweets
#### Define
Merge the two tables together on the tweet_id. Since the api table has less observations an inner merge on the ids available in both tables resolves two issues at the same time.

#### Code

In [250]:
t_clean.head(1)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,


In [251]:
a_clean.head(1)

Unnamed: 0,tweet_id,follower_count,favourites_of_user_count,retweet_count,favourite_of_post_count
0,684588130326986752,8709217,144674,1419,4146


In [252]:
status_df = t_clean.merge(a_clean, how = "inner", on = "tweet_id")

#### Test

In [253]:
status_df.head(1)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,rating_denominator,name,doggo,floofer,pupper,puppo,follower_count,favourites_of_user_count,retweet_count,favourite_of_post_count
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,...,10,Phineas,,,,,8709203,144674,7776,36526


In [254]:
status_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2331 entries, 0 to 2330
Data columns (total 21 columns):
tweet_id                      2331 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2331 non-null object
source                        2331 non-null object
text                          2331 non-null object
retweeted_status_id           163 non-null float64
retweeted_status_user_id      163 non-null float64
retweeted_status_timestamp    163 non-null object
expanded_urls                 2272 non-null object
rating_numerator              2331 non-null int64
rating_denominator            2331 non-null int64
name                          2331 non-null object
doggo                         2331 non-null object
floofer                       2331 non-null object
pupper                        2331 non-null object
puppo                         2331 non-null object
follower_count                23

### `twitter table` (now status_df table) Type of dog in the twitter table is streched over the last four columns although it is one variable
#### Define
Make a new column with a variable for the type of dog and get rid of the four type of dog columns.

#### Code

In [255]:
status_df.head(1)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,rating_denominator,name,doggo,floofer,pupper,puppo,follower_count,favourites_of_user_count,retweet_count,favourite_of_post_count
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,...,10,Phineas,,,,,8709203,144674,7776,36526


In [277]:
# If no dogtype is present one of the dogtype gets an entry so it is easier to clean after melting
for i in range(len(status_df)):
    if status_df.iloc[i,13]==status_df.iloc[i,14]==status_df.iloc[i,15]==status_df.iloc[i,16]:
        status_df.loc[i, "doggo"] = "no dogtype"

In [281]:
id_columns = list(status_df.columns[:13])
for item in list(status_df.columns[17:]):
    id_columns.append(item)

In [282]:
status_df = pd.melt(status_df, id_vars = id_columns, var_name = "dog_type", value_name='dog')

In [283]:
status_df.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,follower_count,favourites_of_user_count,retweet_count,favourite_of_post_count,has_dogtype,dog_type,dog
1218,711998809858043904,,,2016-03-21 19:31:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @twitter: @dog_rates Awesome Tweet! 12/10. ...,7.119983e+17,783214.0,2016-03-21 19:29:52 +0000,https://twitter.com/twitter/status/71199827977...,12,10,,8709216,144674,126,920,no,doggo,
1292,706644897839910912,,,2016-03-07 00:57:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Shadow. She's tired of the responsibiliti...,,,,https://twitter.com/dog_rates/status/706644897...,9,10,Shadow,8709217,144674,1095,2591,no,doggo,
6954,666644823164719104,,,2015-11-17 15:51:26 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Jimothy. He is a Botwanian Gouda. Can ...,,,,https://twitter.com/dog_rates/status/666644823...,9,10,Jimothy,8709248,144674,76,221,no,pupper,
1325,704134088924532736,,,2016-02-29 02:40:23 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine -...",This sneezy pupper is just adorable af. 12/10 ...,,,,https://vine.co/v/igW2OEwu9vg,12,10,,8709217,144674,462,1514,yes,doggo,
4424,670420569653809152,,,2015-11-28 01:54:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Damon. The newest presidential candida...,,,,https://twitter.com/dog_rates/status/670420569...,10,10,Damon,8709246,144674,303,606,no,floofer,


In [248]:
status_df.query("dog != 'None'")

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,follower_count,favourites_of_user_count,retweet_count,favourite_of_post_count,dog_type,dog
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,8709203,144674,7776,36526,doggo,No dog type
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,8709202,144674,5762,31472,doggo,No dog type
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,8709202,144674,3814,23715,doggo,No dog type
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,8709203,144674,7948,39850,doggo,No dog type
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,8709202,144674,8574,38047,doggo,No dog type
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7916,752519690950500352,,,2016-07-11 15:07:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Hopefully this puppo on a swing will help get ...,,,,https://twitter.com/dog_rates/status/752519690...,11,10,,8709211,144674,3482,7463,puppo,puppo
7930,751132876104687617,,,2016-07-07 19:16:47 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Cooper. He's just so damn happy. 10/10...,,,,https://twitter.com/dog_rates/status/751132876...,10,10,Cooper,8709212,144674,1317,5146,puppo,puppo
8004,744995568523612160,,,2016-06-20 20:49:19 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Abby. She got her face stuck in a glas...,,,,https://twitter.com/dog_rates/status/744995568...,9,10,Abby,8709212,144674,626,2992,puppo,puppo
8017,743253157753532416,,,2016-06-16 01:25:36 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Kilo. He cannot reach the snackum. Nif...,,,,https://twitter.com/dog_rates/status/743253157...,10,10,Kilo,8709212,144674,1236,4261,puppo,puppo


#### Test

In [222]:
status_df.dog_type.value_counts()

doggo      2331
pupper     2331
floofer    2331
puppo      2331
Name: dog_type, dtype: int64