# Data Gathering

In [1]:
import json
from timeit import default_timer as timer

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

%matplotlib inline

### 1. Create a DF out of the desired data from the JSON file created using the Twitter API

In [13]:
tweet_data = []
# Read the JSON file line by line; each non-empty line is parsed as one JSON object.
# The encoding is passed explicitly so decoding does not depend on the platform default.
with open("tweet_json.txt", 'r', encoding='utf-8') as json_file:
    for line in json_file:
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # json.loads would raise on them.
        if not line.strip():
            continue
        json_data = json.loads(line)
        # Pull out the 4 data points of interest: timestamp, likes, tweet_id and retweets.
        timestamp = json_data['created_at']
        likes = json_data['favorite_count']
        tweet_id = json_data['id']
        retweets = json_data['retweet_count']

        # Collect one dict per tweet; the list of dicts is converted into a DataFrame later.
        tweet_data.append({'timestamp': timestamp,
                           'likes': likes,
                           'tweet_id': tweet_id,
                           'retweets': retweets})


In [14]:
# Build the DataFrame from the collected per-tweet dicts, fixing the column order explicitly.
tweet_data_df = pd.DataFrame(
    data=tweet_data,
    columns=['timestamp', 'likes', 'tweet_id', 'retweets'],
)

In [16]:
# Preview the first rows to confirm the four extracted fields landed in the expected columns.
tweet_data_df.head()

Unnamed: 0,timestamp,likes,tweet_id,retweets
0,Tue Aug 01 16:23:56 +0000 2017,35562,892420643555336193,7534
1,Tue Aug 01 00:17:27 +0000 2017,30753,892177421306343426,5589
2,Mon Jul 31 00:18:03 +0000 2017,23130,891815181378084864,3703
3,Sun Jul 30 15:58:51 +0000 2017,38869,891689557279858688,7717
4,Sat Jul 29 16:00:24 +0000 2017,37129,891327558926688256,8316


In [18]:
# Summarize row count, non-null counts and dtypes of the API-derived DataFrame.
tweet_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2331 entries, 0 to 2330
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   timestamp  2331 non-null   object
 1   likes      2331 non-null   int64 
 2   tweet_id   2331 non-null   int64 
 3   retweets   2331 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 73.0+ KB


### 2. Create a DF from the provided WeRateDogs twitter archive data csv

In [19]:
# Load the provided WeRateDogs archive CSV with pandas' default type inference.
# NOTE(review): the info() output shows the reply/retweet ID columns parsed as float64
# (they contain nulls); 18-digit tweet IDs may lose precision as floats — confirm before
# using those columns as join keys.
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')

In [20]:
# Summarize row count, non-null counts and dtypes of the archive DataFrame.
twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [27]:
# Count how many rows have a missing (True) vs. present (False) in_reply_to_status_id.
twitter_archive['in_reply_to_status_id'].isna().value_counts()

True     2278
False      78
Name: in_reply_to_status_id, dtype: int64

### 3. Create a DF utilizing the requests package to work with the provided image predictions url

In [40]:
# Location of the image-predictions TSV file downloaded in the next cells.
# (`requests` is imported in the notebook's first cell together with the other imports.)
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

In [41]:
# Download the file. The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error response so an
# error page is never written to disk as if it were the TSV data.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [42]:
# Save the downloaded response body to a local file; binary mode ('wb') writes the
# content as-is, with no text encoding step.
with open('image_predictions.tsv', 'wb') as file:
    file.write(r.content)

In [44]:
# Parse the saved tab-separated file into a DataFrame.
image_predictions = pd.read_csv(
    'image_predictions.tsv',
    delimiter='\t',
)
# Show the column dtypes and non-null counts of the freshly loaded frame.
image_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [45]:
# Inspect three randomly chosen rows. random_state is fixed so the same rows are shown
# on every "Restart & Run All", keeping the notebook's output reproducible.
image_predictions.sample(3, random_state=42)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
347,672475084225949696,https://pbs.twimg.com/media/CVUchRHXAAE4rtp.jpg,1,terrapin,0.879286,False,cockroach,0.045252,False,box_turtle,0.016404,False
403,673700254269775872,https://pbs.twimg.com/media/CVl2ydUWsAA1jD6.jpg,1,water_bottle,0.614536,False,ashcan,0.050911,False,bucket,0.037432,False
1936,860184849394610176,https://pbs.twimg.com/media/C-_9jWWUwAAnwkd.jpg,1,chimpanzee,0.267612,False,gorilla,0.104293,False,orangutan,0.059908,False


#### Now that the DFs have been created, a copy of each is generated for future use during assessment or cleaning stages of the Data Wrangling process

In [46]:
# Make an independent working copy of each gathered DataFrame so that assessment and
# cleaning never modify the originals.
tweet_data_df_clean, image_predictions_clean, twitter_archive_clean = (
    gathered.copy()
    for gathered in (tweet_data_df, image_predictions, twitter_archive)
)

# Assess

### 1. Visual Assessment

#### Quality
<ul>
    <li>tweet_data_df_clean:
        <ol>
            <li></li>
        </ol>
    </li>
    <li>image_predictions_clean</li>
    <li>twitter_archive_clean</li>
</ul>

#### Tidiness
<ul>
    <li>tweet_data_df_clean:
        <ol>
            <li>timestamp is not in the correct datatype (it is stored as a string/object rather than a datetime)</li>
        </ol>
    </li>
    <li>image_predictions_clean</li>
    <li>twitter_archive_clean</li>
</ul>

In [47]:
# Display the working copy for visual assessment (pandas truncates the middle rows of long frames).
tweet_data_df_clean

Unnamed: 0,timestamp,likes,tweet_id,retweets
0,Tue Aug 01 16:23:56 +0000 2017,35562,892420643555336193,7534
1,Tue Aug 01 00:17:27 +0000 2017,30753,892177421306343426,5589
2,Mon Jul 31 00:18:03 +0000 2017,23130,891815181378084864,3703
3,Sun Jul 30 15:58:51 +0000 2017,38869,891689557279858688,7717
4,Sat Jul 29 16:00:24 +0000 2017,37129,891327558926688256,8316
...,...,...,...,...
2326,Mon Nov 16 00:24:50 +0000 2015,95,666049248165822465,40
2327,Mon Nov 16 00:04:52 +0000 2015,265,666044226329800704,126
2328,Sun Nov 15 23:21:54 +0000 2015,110,666033412701032449,39
2329,Sun Nov 15 23:05:30 +0000 2015,119,666029285002620928,41


In [48]:
# Check dtypes and non-null counts of the working copy as part of the assessment.
tweet_data_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2331 entries, 0 to 2330
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   timestamp  2331 non-null   object
 1   likes      2331 non-null   int64 
 2   tweet_id   2331 non-null   int64 
 3   retweets   2331 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 73.0+ KB
