# Data Gathering

In [1]:
import json
from timeit import default_timer as timer

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

%matplotlib inline

### 1. Create a DF out of the desired data from the JSON file created using the Twitter API

In [2]:
tweet_data = []
# Read the JSON file line by line; each non-blank line is parsed as one tweet object.
# The encoding is given explicitly so decoding does not depend on the platform default.
with open("tweet_json.txt", 'r', encoding='utf-8') as json_file:
    for line in json_file:
        if not line.strip():
            continue  # skip blank lines (e.g. a trailing newline) rather than crash in json.loads
        json_data = json.loads(line)
        # Pull out the 4 data points of interest (timestamp, tweet_id, likes, and retweets)
        timestamp = json_data['created_at']
        likes = json_data['favorite_count']
        tweet_id = json_data['id']
        retweets = json_data['retweet_count']

        # Collect each record as a dict (key/value pairs) so the list can be
        # converted into a DF in a single step later
        tweet_data.append({'timestamp': timestamp,
                           'likes': likes,
                           'tweet_id': tweet_id,
                           'retweets': retweets})


In [3]:
# Turn the collected tweet records into a DataFrame with an explicit column order
tweet_data_df = pd.DataFrame(
    data=tweet_data,
    columns=['timestamp', 'likes', 'tweet_id', 'retweets'],
)

In [4]:
# Preview the first five rows to check the parsed fields
tweet_data_df.head()

Unnamed: 0,timestamp,likes,tweet_id,retweets
0,Tue Aug 01 16:23:56 +0000 2017,35562,892420643555336193,7534
1,Tue Aug 01 00:17:27 +0000 2017,30753,892177421306343426,5589
2,Mon Jul 31 00:18:03 +0000 2017,23130,891815181378084864,3703
3,Sun Jul 30 15:58:51 +0000 2017,38869,891689557279858688,7717
4,Sat Jul 29 16:00:24 +0000 2017,37129,891327558926688256,8316


In [5]:
# Check column dtypes and non-null counts of the tweet data
tweet_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2331 entries, 0 to 2330
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   timestamp  2331 non-null   object
 1   likes      2331 non-null   int64 
 2   tweet_id   2331 non-null   int64 
 3   retweets   2331 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 73.0+ KB


### 2. Create a DF from the provided WeRateDogs twitter archive data csv

In [6]:
# Load the provided WeRateDogs twitter archive CSV into a DataFrame
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')

In [7]:
# Check column dtypes and non-null counts of the archive data
twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [8]:
# Count null vs. non-null values in the in_reply_to_status_id column
twitter_archive['in_reply_to_status_id'].isna().value_counts()

True     2278
False      78
Name: in_reply_to_status_id, dtype: int64

### 3. Create a DF utilizing the requests package to work with the provided image predictions url

In [9]:
# Location of the provided image-predictions TSV file
# (requests is imported with the other packages in the import cell at the top of the notebook)
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

In [10]:
# Download the image-predictions file. The timeout keeps the cell from hanging
# indefinitely, and raise_for_status() stops here on an HTTP error response
# instead of letting an error body be saved to disk as if it were the data.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [11]:
# Save the downloaded response bytes to a local TSV file
with open('image_predictions.tsv', mode='wb') as file:
    file.write(r.content)

In [12]:
# The saved predictions file is tab-separated, so the delimiter is given explicitly
image_predictions = pd.read_csv('image_predictions.tsv', delimiter='\t')
# Check column dtypes and non-null counts of the loaded predictions
image_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [13]:
# Inspect a few random rows; random_state is fixed so a re-run shows the same sample
image_predictions.sample(3, random_state=42)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
285,671147085991960577,https://pbs.twimg.com/media/CVBktzQXAAAPpUA.jpg,1,Yorkshire_terrier,0.467202,True,cairn,0.440122,True,silky_terrier,0.05869,True
1966,867774946302451713,https://pbs.twimg.com/media/DAr0tDZXUAEMvdu.jpg,2,Border_collie,0.661953,True,Cardigan,0.175718,True,collie,0.087142,True
1106,723179728551723008,https://pbs.twimg.com/media/CglAHjAUgAAfxcq.jpg,1,tennis_ball,0.176495,False,badger,0.059905,False,Norwegian_elkhound,0.05685,True


#### Now that the DFs have been created, a copy of each is generated for future use during assessment or cleaning stages of the Data Wrangling process

In [14]:
# Work on copies during assessment and cleaning so the gathered DataFrames stay untouched
tweet_data_df_clean, image_predictions_clean, twitter_archive_clean = (
    frame.copy() for frame in (tweet_data_df, image_predictions, twitter_archive)
)

# Assess

### 1. Programmatic and Visual Assessment

#### Quality
<ul> 
    <li>tweet_data_df_clean:
        <ol>
            <li>timestamp data provides more data than necessary</li>
            <li>there are 163 tweets with 0 likes, but every one of them has at least 19 retweets</li>
            <li>timestamp datatype is object and it should be datetime</li>
            <li>tweet_id datatype is int64 and it should be a string</li>
        </ol>
    </li>
    <li>image_predictions_clean
        <ol>
            <li>inconsistent formatting for the names of potential image matches (e.g. mixed capitalization such as golden_retriever vs Labrador_retriever)</li>
            <li>there are many image predictions (even in p1) which are not dogs, sometimes not even animals</li>
            <li>there are 64 image predictions where the first two predictions are not dogs</li>
            <li>there are 324 image predictions where all three predictions are not dogs</li>
        </ol>
    </li>
    <li>twitter_archive_clean
    </li>
</ul>        

#### Tidiness
<ul> 
    <li>tweet_data_df_clean:
        <ol>
            <li>tweet_data_df_clean does not need to be in a separate table and should be merged with twitter_archive_clean, the larger complete df</li>
        </ol>
    </li>
    <li>image_predictions_clean
        <ol>
            <li>image_predictions_clean data could be condensed and combined with twitter_archive_clean</li>
        </ol>
    </li>
    <li>twitter_archive_clean
    </li>
</ul>        

## tweet_data_df_clean

In [48]:
# Ten tweets with the fewest retweets
tweet_data_df_clean.sort_values('retweets').head(10)

Unnamed: 0,timestamp,likes,tweet_id,retweets
279,Sat Mar 04 17:56:49 +0000 2017,138,838085839343206401,1
1271,Thu Mar 10 17:35:20 +0000 2016,49,707983188426153984,2
262,Sat Mar 11 22:59:09 +0000 2017,171,840698636975636481,2
328,Thu Feb 16 04:45:50 +0000 2017,62,832088576586297345,2
109,Fri Jun 02 19:38:25 +0000 2017,113,870726314365509632,3
29,Sat Jul 15 16:51:35 +0000 2017,110,886267009285017600,4
1056,Sat Jun 04 00:32:32 +0000 2016,106,738891149612572673,6
54,Sun Jul 02 21:58:53 +0000 2017,115,881633300179243008,7
411,Tue Jan 17 00:33:26 +0000 2017,254,821153421864615936,10
63,Tue Jun 27 12:14:36 +0000 2017,291,879674319642796034,10


In [16]:
# Check column dtypes and non-null counts on the working copy
tweet_data_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2331 entries, 0 to 2330
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   timestamp  2331 non-null   object
 1   likes      2331 non-null   int64 
 2   tweet_id   2331 non-null   int64 
 3   retweets   2331 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 73.0+ KB


In [17]:
# Summary statistics for the numeric columns
tweet_data_df_clean.describe()

Unnamed: 0,likes,tweet_id,retweets
count,2331.0,2331.0,2331.0
mean,7430.58387,7.419079e+17,2648.574432
std,11539.21545,6.82317e+16,4479.174285
min,0.0,6.660209e+17,1.0
25%,1292.5,6.78267e+17,537.0
50%,3229.0,7.182469e+17,1239.0
75%,9097.0,7.986692e+17,3077.0
max,153472.0,8.924206e+17,76095.0


In [27]:
# Like counts in ascending order, to inspect both extremes
tweet_data_df_clean['likes'].sort_values()

453          0
439          0
437          0
434          0
431          0
         ...  
1055    113395
129     114540
517     118456
397     130823
1015    153472
Name: likes, Length: 2331, dtype: int64

In [37]:
# The 11 most frequent like counts, ordered from least to most frequent
tweet_data_df_clean['likes'].value_counts().sort_values().tail(11)

110       2
3062      3
1362      3
2771      3
752       3
491       3
291       3
755       3
2033      3
3727      3
0       163
Name: likes, dtype: int64

In [24]:
# List the tweets that have zero likes
tweet_data_df_clean.query("likes == 0")

Unnamed: 0,timestamp,likes,tweet_id,retweets
31,Sat Jul 15 02:45:48 +0000 2017,0,886054160059072513,99
35,Thu Jul 13 01:35:06 +0000 2017,0,885311592912609280,16528
67,Mon Jun 26 00:13:58 +0000 2017,0,879130579576475649,6094
72,Sat Jun 24 00:09:53 +0000 2017,0,878404777348136964,1148
73,Fri Jun 23 18:17:33 +0000 2017,0,878316110768087041,5950
...,...,...,...,...
999,Sat Jun 25 01:52:36 +0000 2016,0,746521445350707200,966
1019,Fri Jun 17 16:01:16 +0000 2016,0,743835915802583040,2024
1218,Mon Mar 21 19:31:59 +0000 2016,0,711998809858043904,125
2234,Fri Nov 20 03:51:52 +0000 2015,0,667550904950915073,31


In [25]:
# Show tweets with exactly one retweet
tweet_data_df_clean.query("retweets == 1")

Unnamed: 0,timestamp,likes,tweet_id,retweets
279,Sat Mar 04 17:56:49 +0000 2017,138,838085839343206401,1


In [26]:
# Among the zero-like tweets, show those with 20 or fewer retweets
tweet_data_df_clean.query("likes == 0 and retweets <= 20")

Unnamed: 0,timestamp,likes,tweet_id,retweets
203,Tue Apr 11 18:15:55 +0000 2017,0,851861385021730816,19


In [40]:
# Retweet counts in ascending order, to inspect both extremes
tweet_data_df_clean['retweets'].sort_values()

279         1
1271        2
262         2
328         2
109         3
        ...  
65      39888
397     42889
517     55468
1055    56601
1015    76095
Name: retweets, Length: 2331, dtype: int64

## image_predictions_clean

In [49]:
# Display the predictions table for visual assessment (the rendered output elides the middle rows)
image_predictions_clean

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.072010,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2070,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,basset,0.555712,True,English_springer,0.225770,True,German_short-haired_pointer,0.175219,True
2071,891689557279858688,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,paper_towel,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False
2072,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True
2073,892177421306343426,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True


In [50]:
# Check column dtypes and non-null counts on the working copy
image_predictions_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [51]:
# Summary statistics for the numeric columns
image_predictions_clean.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [63]:
# Frequency of each label in the first-prediction (p1) column
image_predictions_clean['p1'].value_counts()

golden_retriever      150
Labrador_retriever    100
Pembroke               89
Chihuahua              83
pug                    57
                     ... 
shopping_basket         1
sliding_door            1
coho                    1
wooden_spoon            1
tiger_shark             1
Name: p1, Length: 378, dtype: int64

In [67]:
# Rows where the first two predictions are flagged as not a dog but the third one is
image_predictions_clean.query("p1_dog == False and p2_dog == False and p3_dog == True")

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
8,666057090499244032,https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg,1,shopping_cart,0.962465,False,shopping_basket,0.014594,False,golden_retriever,0.007959,True
87,667524857454854144,https://pbs.twimg.com/media/CUOGUfJW4AA_eni.jpg,1,hare,0.447893,False,dhole,0.092435,False,Chesapeake_Bay_retriever,0.088122,True
126,668256321989451776,https://pbs.twimg.com/media/CUYflCXWEAAzQVu.jpg,1,canoe,0.407683,False,paddle,0.115550,False,Pembroke,0.094429,True
131,668297328638447616,https://pbs.twimg.com/media/CUZE4IWW4AAZmDf.jpg,1,king_penguin,0.606747,False,ice_bear,0.264221,False,Eskimo_dog,0.032784,True
158,668872652652679168,https://pbs.twimg.com/media/CUhQIAhXAAA2j7u.jpg,1,teddy,0.413379,False,pillow,0.325623,False,miniature_schnauzer,0.035537,True
...,...,...,...,...,...,...,...,...,...,...,...,...
1872,844979544864018432,https://pbs.twimg.com/media/C7n4aQ0VAAAohkL.jpg,3,tennis_ball,0.999281,False,racket,0.000370,False,Shetland_sheepdog,0.000132,True
1899,851224888060895234,https://pbs.twimg.com/media/C9AohFoWsAUmxDs.jpg,3,car_mirror,0.971512,False,seat_belt,0.007063,False,standard_poodle,0.005683,True
1942,861288531465048066,https://pbs.twimg.com/ext_tw_video_thumb/86128...,1,syringe,0.144712,False,oxygen_mask,0.106684,False,Bouvier_des_Flandres,0.082610,True
1944,861769973181624320,https://pbs.twimg.com/media/CzG425nWgAAnP7P.jpg,2,Arabian_camel,0.366248,False,house_finch,0.209852,False,cocker_spaniel,0.046403,True


In [68]:
# Rows where none of the three predictions is flagged as a dog
image_predictions_clean.query("p1_dog == False and p2_dog == False and p3_dog == False")

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
6,666051853826850816,https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg,1,box_turtle,0.933012,False,mud_turtle,0.045885,False,terrapin,0.017885,False
17,666104133288665088,https://pbs.twimg.com/media/CT56LSZWoAAlJj2.jpg,1,hen,0.965932,False,cock,0.033919,False,partridge,0.000052,False
18,666268910803644416,https://pbs.twimg.com/media/CT8QCd1WEAADXws.jpg,1,desktop_computer,0.086502,False,desk,0.085547,False,bookcase,0.079480,False
21,666293911632134144,https://pbs.twimg.com/media/CT8mx7KW4AEQu8N.jpg,1,three-toed_sloth,0.914671,False,otter,0.015250,False,great_grey_owl,0.013207,False
25,666362758909284353,https://pbs.twimg.com/media/CT9lXGsUcAAyUFt.jpg,1,guinea_pig,0.996496,False,skunk,0.002402,False,hamster,0.000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...
2021,880935762899988482,https://pbs.twimg.com/media/DDm2Z5aXUAEDS2u.jpg,1,street_sign,0.251801,False,umbrella,0.115123,False,traffic_light,0.069534,False
2022,881268444196462592,https://pbs.twimg.com/media/DDrk-f9WAAI-WQv.jpg,1,tusker,0.473303,False,Indian_elephant,0.245646,False,ibex,0.055661,False
2046,886680336477933568,https://pbs.twimg.com/media/DE4fEDzWAAAyHMM.jpg,1,convertible,0.738995,False,sports_car,0.139952,False,car_wheel,0.044173,False
2052,887517139158093824,https://pbs.twimg.com/ext_tw_video_thumb/88751...,1,limousine,0.130432,False,tow_truck,0.029175,False,shopping_cart,0.026321,False
