# Gather data

### Import necessary libraries

In [175]:
import tweepy
import pandas as pd
import requests
import io
import json
import os
import glob
import numpy as np
import re

### Initialize twitter API access

In [45]:
# Twitter API credentials are taken from environment variables so that no secret
# has to be typed into (and later deleted from) the notebook itself.
# A variable that is not set falls back to an empty string.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '')

# OAuth handshake: application keys first, then the user access token
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# wait_on_rate_limit makes the client sleep when the rate limit is reached,
# wait_on_rate_limit_notify prints a notice while it is sleeping
api = tweepy.API(auth ,wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

### Import twitter archive of user WeRateDogs

In [46]:
# Load the WeRateDogs archive; the tweet_id column of `df` drives the API download in get_tweet_status()
df = pd.read_csv("twitter-archive-enhanced.csv")

### Gather additional tweet data and save it in a folder

In [47]:
# Check which status info is already collected
folder_name = "tweet_data"
# Every saved file is named "<tweet_id>.txt". The id is taken from the file name itself
# (base name without extension) instead of fixed character offsets, so it keeps working
# if the folder name or the path separator changes.
ids_collected = [
    int(os.path.splitext(os.path.basename(path))[0])
    for path in glob.glob(os.path.join(folder_name, "*.txt"))
]

In [260]:
def get_tweet_status():
    '''
    Takes the status ids of the twitter archive csv and queries the twitter api for the complete information.
    Saves a text file with the information for each twitter status. Also saves a list of statuses that were
    not able to be retrieved from the API (written to "na_ids.csv").

    Uses the notebook-level names df, api, folder_name and ids_collected.

    Args:
        None
    Output:
        None
    '''
    # Create a folder to store all the tweet data inside (no error if it is already there)
    os.makedirs(folder_name, exist_ok=True)

    # A set gives constant-time "already downloaded?" checks inside the loop
    already_collected = set(ids_collected)

    # Error list for possibly deleted tweets
    error_list = []
    total = len(df)

    # Get the status for each tweet id in the tweet_id column of the twitter archive;
    # i counts from 1 and is only used to report progress
    for i, tweet_id in enumerate(list(df.tweet_id), start=1):

        # Print progress status for every 20th call
        if i % 20 == 0:
            print(str(i/total*100)+"% of API download done")

        # If the id is already collected it should be skipped to save time
        if tweet_id in already_collected:
            continue

        try:
            tweet = api.get_status(tweet_id)
            data = tweet._json

            #Save the json data as a txt file with the corresponding tweet id (Saving as .txt to fulfill project rubric)
            with open(folder_name+"/"+str(tweet_id)+'.txt', 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
        except Exception:
            # Best effort: remember the id and carry on with the next tweet
            error_list.append(tweet_id)
            print("The status of the tweet with id {} is not possible to access".format(tweet_id))

    # Save the error list for later usage
    pd.DataFrame(error_list).to_csv("na_ids.csv", index = False)
    

In [49]:
# Only used once to get the data
#get_tweet_status()

0.8488964346349746% of API download done
The status of the tweet with id 888202515573088257 is not possible to access
1.697792869269949% of API download done
2.5466893039049237% of API download done
3.395585738539898% of API download done
The status of the tweet with id 873697596434513921 is not possible to access
4.244482173174872% of API download done
The status of the tweet with id 872668790621863937 is not possible to access
The status of the tweet with id 872261713294495745 is not possible to access
The status of the tweet with id 869988702071779329 is not possible to access
5.093378607809847% of API download done
The status of the tweet with id 866816280283807744 is not possible to access
5.942275042444821% of API download done
The status of the tweet with id 861769973181624320 is not possible to access
6.791171477079796% of API download done
7.6400679117147705% of API download done
The status of the tweet with id 856602993587888130 is not possible to access
8.488964346349745% of

Rate limit reached. Sleeping for: 682


70.45840407470288% of API download done
71.30730050933785% of API download done
72.15619694397284% of API download done
73.00509337860781% of API download done
The status of the tweet with id 680055455951884288 is not possible to access
73.85398981324278% of API download done
74.70288624787777% of API download done
75.55178268251274% of API download done
76.4006791171477% of API download done
77.24957555178268% of API download done
78.09847198641766% of API download done
78.94736842105263% of API download done
79.79626485568761% of API download done
80.64516129032258% of API download done
81.49405772495756% of API download done
82.34295415959252% of API download done
83.1918505942275% of API download done
84.04074702886248% of API download done
84.88964346349745% of API download done
85.73853989813243% of API download done
86.5874363327674% of API download done
87.43633276740238% of API download done
88.28522920203736% of API download done
89.13412563667232% of API download done
89.983

### Import tweet status data pulled from the API as text and write it in a dataframe
Comment: I saved the data as txt to fulfill the rubric of this project, which requires working with three different kinds of data.
Querying the JSON structure directly would have been a lot easier than this regex approach, which I think is a rather fragile and dirty solution.

In [248]:
def text_to_df():
    '''
    Takes the text files from the subfolder and saves the tweet id,
    follower_count, favourites of user count, retweet count and the
    favourites of the post count in a dataframe.

    Uses the notebook-level name folder_name to locate the text files.

    Args:
        None
    Output:
        Saves the data as a dataframe in "api_twitter_status.csv", returns no output.
    Raises:
        Exception: if the id found inside a file differs from the id in its file name.
        IndexError: if a file has fewer than four lines or one of the searched keys is missing.
    '''
    # Initialize list of dictionaries
    df_list = []

    for status in glob.glob(folder_name+"/*.txt"):
        # Read the file once; the with-block closes it again
        with open(status, encoding = "utf-8") as file:
            text = file.read()

        # Only the first couple of lines have a fixed layout, afterwards the files differ.
        # The 4th line is sliced as '"<10-char key>": "<id>",' -> drop the first 11 and last 2 characters
        # (presumably the "id_str" entry of the dumped status -- TODO confirm)
        text_id = text.split("\n")[3].strip()[11:-2]

        # Be sure that ids are consistent; the file is named "<tweet_id>.txt"
        file_id = os.path.splitext(os.path.basename(status))[0]
        if text_id != file_id:
            raise Exception("Id saved in file does not match id from filename")

        # Find the needed data with regex, since line by line reading not possible.
        # NOTE(review): [0] takes the first occurrence in the file; if a key also appears
        # in a nested object earlier in the text, that nested value wins -- confirm this is intended
        follower_count = re.findall(r"\"followers_count\": (\d+),", text)[0]
        favourites_of_user_count = re.findall(r"\"favourites_count\": (\d+),", text)[0]
        favourite_of_post_count = re.findall(r"\"favorite_count\": (\d+),", text)[0]
        retweet_count = re.findall(r"\"retweet_count\": (\d+),", text)[0]

        # Append the list entry of dictionaries to create a dataframe
        df_list.append({"tweet_id": text_id,
                       "follower_count": follower_count,
                       "favourites_of_user_count": favourites_of_user_count,
                       "retweet_count": retweet_count,
                       "favourite_of_post_count": favourite_of_post_count})

    df_additional_data = pd.DataFrame(df_list, columns = ["tweet_id", "follower_count", "favourites_of_user_count", "retweet_count", "favourite_of_post_count"])
    df_additional_data.to_csv("api_twitter_status.csv")

In [259]:
# Only needed to run once to create the df
#text_to_df()

### Download the dog predictions file

In [252]:
# Download the image-prediction file (tab separated) from the course server
response = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')
# Stop on an HTTP error instead of decoding and parsing an error page as TSV
response.raise_for_status()
r = response.content

# Parse the downloaded bytes in memory
dog_predictions = pd.read_csv(io.StringIO(r.decode('utf-8')), sep = "\t")

# Keep a local copy so the assessment can start from a file on disk
dog_predictions.to_csv("dog_predictions.csv")

# Assess
Now that all needed data is gathered, the three data frames are assessed.
The dataframes are:
 - twitter-archive-enhanced.csv
 - api_twitter_status.csv
 - dog_predictions.csv
 
 *All found issues are recorded at the bottom of this chapter*


In [262]:
# Read the three gathered datasets and, as the fourth file, the list of
# statuses that were not available over the api -- in exactly this order
twitter_df, api_df, dogs_df, na_id = map(
    pd.read_csv,
    (
        "twitter-archive-enhanced.csv",
        "api_twitter_status.csv",
        "dog_predictions.csv",
        "na_ids.csv",
    ),
)

In [265]:
# Visual assessment: show five randomly drawn rows (no random_state is set, so the rows differ between runs)
twitter_df.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
2203,668641109086707712,,,2015-11-23 04:03:57 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Hanz. He heard some thunder. 10/10 https:...,,,,https://twitter.com/dog_rates/status/668641109...,10,10,Hanz,,,,
2092,670782429121134593,,,2015-11-29 01:52:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This dude slaps your girl's ass what do you do...,,,,https://twitter.com/dog_rates/status/670782429...,5,10,,,,,
276,840632337062862849,,,2017-03-11 18:35:42 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to Maddie and Gunner. They are consi...,,,,"https://www.gofundme.com/3hgsuu0,https://twitt...",12,10,Maddie,,,,
552,804413760345620481,,,2016-12-01 19:56:00 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Rusty. He's going D1 fo...,7.84826e+17,4196984000.0,2016-10-08 18:41:19 +0000,https://twitter.com/dog_rates/status/784826020...,13,10,Rusty,,,,
665,790698755171364864,,,2016-10-24 23:37:28 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Mosby. He appears to be rather h*ckin ...,,,,https://twitter.com/dog_rates/status/790698755...,12,10,Mosby,,,,


## Recording of issues
### Quality issues (dirty data, content issues)
#### twitter table
#### api table
#### dogs table

### Tidiness issues (structural issues)
 - The type of dog in the twitter table is stretched over the last four columns although it is one variable
 - The twitter table and the api table should be merged together, as they describe the same type of observational unit (a twitter post)
