# Data Gathering

In [1]:
import pandas as pd
import json
from timeit import default_timer as timer
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

%matplotlib inline

### 1. Create a DF out of the desired data from the JSON file created using the Twitter API

In [2]:
tweet_data = []
#Open and read each line of the Json file. 
with open("tweet_json.txt", 'r') as json_file:
    for line in json_file:
        json_data = json.loads(line)
        timestamp = json_data['created_at'] #Pull out 4 data points (timestamp, tweet_id, likes, and retweets)
        likes = json_data['favorite_count']
        tweet_id = json_data['id']
        retweets = json_data['retweet_count']
        
        tweet_data.append({'timestamp': timestamp, #Append those 4 pieces of data to a list in the form of a dict (key/value pairs) to be converted later into a DF
                         'likes': likes,
                         'tweet_id': tweet_id,
                         'retweets': retweets})


In [3]:
tweet_data_df = pd.DataFrame(tweet_data, columns = ['timestamp', 'likes', 'tweet_id', 'retweets'])#Create the dataframe with the JSON data above and specific column names

In [4]:
tweet_data_df.head()

Unnamed: 0,timestamp,likes,tweet_id,retweets
0,Tue Aug 01 16:23:56 +0000 2017,35562,892420643555336193,7534
1,Tue Aug 01 00:17:27 +0000 2017,30753,892177421306343426,5589
2,Mon Jul 31 00:18:03 +0000 2017,23130,891815181378084864,3703
3,Sun Jul 30 15:58:51 +0000 2017,38869,891689557279858688,7717
4,Sat Jul 29 16:00:24 +0000 2017,37129,891327558926688256,8316


In [5]:
tweet_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2331 entries, 0 to 2330
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   timestamp  2331 non-null   object
 1   likes      2331 non-null   int64 
 2   tweet_id   2331 non-null   int64 
 3   retweets   2331 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 73.0+ KB


### 2. Create a DF from the provided WeRateDogs twitter archive data csv

In [6]:
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')

In [7]:
twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [8]:
twitter_archive.in_reply_to_status_id.isnull().value_counts()

True     2278
False      78
Name: in_reply_to_status_id, dtype: int64

### 3. Create a DF utilizing the requests package to work with the provided image predictions url

In [9]:
import requests
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

In [10]:
r = requests.get(url)

In [11]:
with open('image_predictions.tsv', 'wb') as file:
    file.write(r.content)

In [12]:
image_predictions = pd.read_csv('image_predictions.tsv', sep='\t')
image_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [13]:
image_predictions.sample(3)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1744,822872901745569793,https://pbs.twimg.com/media/C2tugXLXgAArJO4.jpg,1,Lakeland_terrier,0.196015,True,Labrador_retriever,0.160329,True,Irish_terrier,0.069126,True
1177,737800304142471168,https://pbs.twimg.com/media/Cj0xdMBVAAEbDHp.jpg,1,malamute,0.374682,True,Norwegian_elkhound,0.334853,True,limousine,0.068173,False
787,690374419777196032,https://pbs.twimg.com/media/CZSz3vWXEAACElU.jpg,1,kuvasz,0.286345,True,Labrador_retriever,0.107144,True,ice_bear,0.085086,False


#### Now that the DFs have been created, a copy of each is generated for future use during assessment or cleaning stages of the Data Wrangling process

In [14]:
tweet_data_df_clean = tweet_data_df.copy()
image_predictions_clean = image_predictions.copy()
twitter_archive_clean = twitter_archive.copy()

# Assess

### 1. Programmatic and Visual Assessment

#### Tidiness
<ul>
    <p>
        <li><em>tweet_data_df_clean:</em>
            <ul {
  list-style-type: square;
}>
                <li>tweet_data_df_clean does not need to be in a separate df</li>
            </ul>
        </li>
    </p>
    <p>
        <li><em>image_predictions_clean:</em>
            <ul {
  list-style-type: square;
}>
                <li>image_predictions_clean and twitter_archive_clean do not need to be separate dfs</li>
            </ul>
        </li>
    </p>
        <li><em>twitter_archive_clean:</em>
            <ul {
  list-style-type: square;
}>
                <li>dog stages are in four different columns</li>
                <li>text column has both a url and the tweet text</li>
                <li>combination of twitter_archive_clean and tweet_data_df_clean may duplicate the timestamp column</li>
            </ul>
        </li>
</ul>        

#### Quality
<ul> 
    <p>
        <li><em>tweet_data_df_clean:</em>
            <ul {
  list-style-type: square;
}>
                <li>data quality issues will be addressed after merge with twitter_archive_clean</li>
            </ul>
        </li>
    </p>
    <p>
        <li><em>image_predictions_clean:</em>
            <ul {
  list-style-type: square;
}>
                <li>inconsistent formatting for the names of potential image matches in p1, p2, and p3</li>
                <li>there are many image predictions (even in p1) which are not dogs, sometimes not even animals</li>
                <li>there are 64 image predictions where the first two predictions are not dogs</li>
                <li>there are 324 image predictions where all three predictions are not dogs</li>
                <li>tweet_id datatype is int64 and it should be a string</li>
            </ul>
        </li>
    </p>
        <li><em>twitter_archive_clean:</em>
            <ul {
  list-style-type: square;
}>
                <li>rows which are retweets or replies are unnecessary</li>
                <li>retweet and reply related columns are unnecessary after the related rows are removed</li>           
                <li>tweet_id datatype is int64 and it should be a string</li>
                <li>timestamp is datatype is an object and it should be datetime</li>
                <li>inaccurate rating_numerator and rating_denominator information</li>
                <li>source and expanded_url columns contains unecessary information after the url is stripped from the text</li>
                <li>url column created with split is missing https</li>
                <li>fix remaining timestamp column's name</li>
                <li>timestamp data provides more data than necessary</li>
                <li>3 url rows did not successfully split the text column in Tidiness #3</li>
            </ul>
        </li>
</ul>        

#### Tidiness
<ul>
    <p>
        <li><em>tweet_data_df_clean:</em>
            <ul {
  list-style-type: square;
}>
                <li><del>tweet_data_df_clean does not need to be in a separate df</del></li>
            </ul>
        </li>
    </p>
    <p>
        <li><em>image_predictions_clean:</em>
            <ul {
  list-style-type: square;
}>
                <li><del>image_predictions_clean and twitter_archive_clean do not need to be separate dfs</del></li>
            </ul>
        </li>
    </p>
        <li><em>twitter_archive_clean:</em>
            <ul {
  list-style-type: square;
}>
                <li><del>dog stages are in four different columns</del></li>
                <li><del>text column has both a url and the tweet text</del></li>
                <li><del>combination of twitter_archive_clean and tweet_data_df_clean may duplicate the timestamp column</del></li>
            </ul>
        </li>
</ul>        

#### Quality
<ul> 
    <p>
        <li><del><em>tweet_data_df_clean:</em>
            <ul {
  list-style-type: square;
}>
                <li>data quality issues will be addressed after merge with twitter_archive_clean</li>
            </ul>
        </del></li>
    </p>
    <p>
        <li><em>image_predictions_clean:</em>
            <ul {
  list-style-type: square;
}>
                <li><del>inconsistent formatting for the names of potential image matches in p1, p2, and p3</del></li>
                <li>there are many image predictions (even in p1) which are not dogs, sometimes not even animals</li>
                <li>there are 64 image predictions where the first two predictions are not dogs</li>
                <li>there are 324 image predictions where all three predictions are not dogs</li>
                <li>tweet_id datatype is int64 and it should be a string</li>
            </ul>
        </li>
    </p>
        <li><em>twitter_archive_clean:</em>
            <ul {
  list-style-type: square;
}>
                <li><del>rows which are retweets or replies are unnecessary</del></li>
                <li><del>retweet and reply related columns are unnecessary after the related rows are removed</del></li>       
                <li><del>tweet_id datatype is int64 and it should be a string</del></li>
                <li><del>timestamp is datatype is an object and it should be datetime</del></li>
                <li>inaccurate rating_numerator and rating_denominator information</li>
                <li><del>source and expanded_url columns contains unecessary information after the url is stripped from the text</del></li>   
                <li><del>url column created with split is missing https</del></li>
                <li><del>fix remaining timestamp column's name</del></li>
                <li><del>timestamp data provides more data than necessary</del></li>
                <li><del>3 url rows did not successfully split the text column in Tidiness #3</del></li>
            </ul>
        </li>
</ul>        

## tweet_data_df_clean

In [15]:
tweet_data_df_clean.sort_values(by = 'retweets').head(10)

Unnamed: 0,timestamp,likes,tweet_id,retweets
279,Sat Mar 04 17:56:49 +0000 2017,138,838085839343206401,1
1271,Thu Mar 10 17:35:20 +0000 2016,49,707983188426153984,2
262,Sat Mar 11 22:59:09 +0000 2017,171,840698636975636481,2
328,Thu Feb 16 04:45:50 +0000 2017,62,832088576586297345,2
109,Fri Jun 02 19:38:25 +0000 2017,113,870726314365509632,3
29,Sat Jul 15 16:51:35 +0000 2017,110,886267009285017600,4
1056,Sat Jun 04 00:32:32 +0000 2016,106,738891149612572673,6
54,Sun Jul 02 21:58:53 +0000 2017,115,881633300179243008,7
411,Tue Jan 17 00:33:26 +0000 2017,254,821153421864615936,10
63,Tue Jun 27 12:14:36 +0000 2017,291,879674319642796034,10


In [16]:
tweet_data_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2331 entries, 0 to 2330
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   timestamp  2331 non-null   object
 1   likes      2331 non-null   int64 
 2   tweet_id   2331 non-null   int64 
 3   retweets   2331 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 73.0+ KB


In [17]:
tweet_data_df_clean.describe()

Unnamed: 0,likes,tweet_id,retweets
count,2331.0,2331.0,2331.0
mean,7430.58387,7.419079e+17,2648.574432
std,11539.21545,6.82317e+16,4479.174285
min,0.0,6.660209e+17,1.0
25%,1292.5,6.78267e+17,537.0
50%,3229.0,7.182469e+17,1239.0
75%,9097.0,7.986692e+17,3077.0
max,153472.0,8.924206e+17,76095.0


In [18]:
tweet_data_df_clean.likes.sort_values()

453          0
439          0
437          0
434          0
431          0
         ...  
1055    113395
129     114540
517     118456
397     130823
1015    153472
Name: likes, Length: 2331, dtype: int64

In [19]:
tweet_data_df_clean.likes.value_counts().sort_values().tail(11)

110       2
3062      3
1362      3
2771      3
752       3
491       3
291       3
755       3
2033      3
3727      3
0       163
Name: likes, dtype: int64

In [20]:
tweet_data_df_clean.query("likes == 0")

Unnamed: 0,timestamp,likes,tweet_id,retweets
31,Sat Jul 15 02:45:48 +0000 2017,0,886054160059072513,99
35,Thu Jul 13 01:35:06 +0000 2017,0,885311592912609280,16528
67,Mon Jun 26 00:13:58 +0000 2017,0,879130579576475649,6094
72,Sat Jun 24 00:09:53 +0000 2017,0,878404777348136964,1148
73,Fri Jun 23 18:17:33 +0000 2017,0,878316110768087041,5950
...,...,...,...,...
999,Sat Jun 25 01:52:36 +0000 2016,0,746521445350707200,966
1019,Fri Jun 17 16:01:16 +0000 2016,0,743835915802583040,2024
1218,Mon Mar 21 19:31:59 +0000 2016,0,711998809858043904,125
2234,Fri Nov 20 03:51:52 +0000 2015,0,667550904950915073,31


In [21]:
tweet_data_df_clean.query("retweets == 1")

Unnamed: 0,timestamp,likes,tweet_id,retweets
279,Sat Mar 04 17:56:49 +0000 2017,138,838085839343206401,1


In [22]:
tweet_data_df_clean.query("likes == 0 and retweets <= 20")

Unnamed: 0,timestamp,likes,tweet_id,retweets
203,Tue Apr 11 18:15:55 +0000 2017,0,851861385021730816,19


In [23]:
tweet_data_df_clean.retweets.sort_values()

279         1
1271        2
262         2
328         2
109         3
        ...  
65      39888
397     42889
517     55468
1055    56601
1015    76095
Name: retweets, Length: 2331, dtype: int64

## image_predictions_clean

In [24]:
image_predictions_clean

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.072010,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2070,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,basset,0.555712,True,English_springer,0.225770,True,German_short-haired_pointer,0.175219,True
2071,891689557279858688,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,paper_towel,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False
2072,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True
2073,892177421306343426,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True


In [25]:
image_predictions_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [26]:
image_predictions_clean.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [27]:
image_predictions_clean.p1.value_counts()

golden_retriever      150
Labrador_retriever    100
Pembroke               89
Chihuahua              83
pug                    57
                     ... 
radio_telescope         1
sliding_door            1
bonnet                  1
shopping_basket         1
flamingo                1
Name: p1, Length: 378, dtype: int64

In [28]:
image_predictions_clean.query("p1_dog == False and p2_dog == False and p3_dog == True")

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
8,666057090499244032,https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg,1,shopping_cart,0.962465,False,shopping_basket,0.014594,False,golden_retriever,0.007959,True
87,667524857454854144,https://pbs.twimg.com/media/CUOGUfJW4AA_eni.jpg,1,hare,0.447893,False,dhole,0.092435,False,Chesapeake_Bay_retriever,0.088122,True
126,668256321989451776,https://pbs.twimg.com/media/CUYflCXWEAAzQVu.jpg,1,canoe,0.407683,False,paddle,0.115550,False,Pembroke,0.094429,True
131,668297328638447616,https://pbs.twimg.com/media/CUZE4IWW4AAZmDf.jpg,1,king_penguin,0.606747,False,ice_bear,0.264221,False,Eskimo_dog,0.032784,True
158,668872652652679168,https://pbs.twimg.com/media/CUhQIAhXAAA2j7u.jpg,1,teddy,0.413379,False,pillow,0.325623,False,miniature_schnauzer,0.035537,True
...,...,...,...,...,...,...,...,...,...,...,...,...
1872,844979544864018432,https://pbs.twimg.com/media/C7n4aQ0VAAAohkL.jpg,3,tennis_ball,0.999281,False,racket,0.000370,False,Shetland_sheepdog,0.000132,True
1899,851224888060895234,https://pbs.twimg.com/media/C9AohFoWsAUmxDs.jpg,3,car_mirror,0.971512,False,seat_belt,0.007063,False,standard_poodle,0.005683,True
1942,861288531465048066,https://pbs.twimg.com/ext_tw_video_thumb/86128...,1,syringe,0.144712,False,oxygen_mask,0.106684,False,Bouvier_des_Flandres,0.082610,True
1944,861769973181624320,https://pbs.twimg.com/media/CzG425nWgAAnP7P.jpg,2,Arabian_camel,0.366248,False,house_finch,0.209852,False,cocker_spaniel,0.046403,True


In [29]:
image_predictions_clean.query("p1_dog == False and p2_dog == False and p3_dog == False")

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
6,666051853826850816,https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg,1,box_turtle,0.933012,False,mud_turtle,0.045885,False,terrapin,0.017885,False
17,666104133288665088,https://pbs.twimg.com/media/CT56LSZWoAAlJj2.jpg,1,hen,0.965932,False,cock,0.033919,False,partridge,0.000052,False
18,666268910803644416,https://pbs.twimg.com/media/CT8QCd1WEAADXws.jpg,1,desktop_computer,0.086502,False,desk,0.085547,False,bookcase,0.079480,False
21,666293911632134144,https://pbs.twimg.com/media/CT8mx7KW4AEQu8N.jpg,1,three-toed_sloth,0.914671,False,otter,0.015250,False,great_grey_owl,0.013207,False
25,666362758909284353,https://pbs.twimg.com/media/CT9lXGsUcAAyUFt.jpg,1,guinea_pig,0.996496,False,skunk,0.002402,False,hamster,0.000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...
2021,880935762899988482,https://pbs.twimg.com/media/DDm2Z5aXUAEDS2u.jpg,1,street_sign,0.251801,False,umbrella,0.115123,False,traffic_light,0.069534,False
2022,881268444196462592,https://pbs.twimg.com/media/DDrk-f9WAAI-WQv.jpg,1,tusker,0.473303,False,Indian_elephant,0.245646,False,ibex,0.055661,False
2046,886680336477933568,https://pbs.twimg.com/media/DE4fEDzWAAAyHMM.jpg,1,convertible,0.738995,False,sports_car,0.139952,False,car_wheel,0.044173,False
2052,887517139158093824,https://pbs.twimg.com/ext_tw_video_thumb/88751...,1,limousine,0.130432,False,tow_truck,0.029175,False,shopping_cart,0.026321,False


In [30]:
image_predictions_clean.query("p1_dog == False and p2_dog == True and p3_dog == True")

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
22,666337882303524864,https://pbs.twimg.com/media/CT9OwFIWEAMuRje.jpg,1,ox,0.416669,False,Newfoundland,0.278407,True,groenendael,0.102643,True
43,666776908487630848,https://pbs.twimg.com/media/CUDeDoWUYAAD-EM.jpg,1,seat_belt,0.375057,False,miniature_pinscher,0.167175,True,Chihuahua,0.086951,True
95,667550904950915073,https://pbs.twimg.com/media/CUOb_gUUkAACXdS.jpg,1,web_site,0.999335,False,vizsla,0.000081,True,collie,0.000069,True
103,667806454573760512,https://pbs.twimg.com/media/CUSGbXeVAAAgztZ.jpg,1,toyshop,0.253089,False,Chihuahua,0.187155,True,Brabancon_griffon,0.112799,True
108,667878741721415682,https://pbs.twimg.com/media/CUTILFiWcAE8Rle.jpg,1,seat_belt,0.200373,False,miniature_pinscher,0.106003,True,schipperke,0.104733,True
...,...,...,...,...,...,...,...,...,...,...,...,...
1858,841833993020538882,https://pbs.twimg.com/ext_tw_video_thumb/81742...,1,ice_bear,0.336200,False,Samoyed,0.201358,True,Eskimo_dog,0.186789,True
1896,850145622816686080,https://pbs.twimg.com/media/C8xS655XkAAv9vo.jpg,2,tennis_ball,0.714798,False,kelpie,0.105390,True,malinois,0.058553,True
1904,852189679701164033,https://pbs.twimg.com/media/C9OV99SXsAEmj1U.jpg,1,barrow,0.423150,False,Bernese_mountain_dog,0.415374,True,EntleBucher,0.067345,True
1984,872122724285648897,https://pbs.twimg.com/media/DBpm-5UXcAUeCru.jpg,1,basketball,0.808396,False,pug,0.066736,True,dalmatian,0.054570,True


In [31]:
image_predictions_clean.query("p1_dog == True")

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.072010,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2068,890971913173991426,https://pbs.twimg.com/media/DF1eOmZXUAALUcq.jpg,1,Appenzeller,0.341703,True,Border_collie,0.199287,True,ice_lolly,0.193548,False
2069,891087950875897856,https://pbs.twimg.com/media/DF3HwyEWsAABqE6.jpg,1,Chesapeake_Bay_retriever,0.425595,True,Irish_terrier,0.116317,True,Indian_elephant,0.076902,False
2070,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,basset,0.555712,True,English_springer,0.225770,True,German_short-haired_pointer,0.175219,True
2072,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True


## twitter_archive_clean

In [32]:
twitter_archive_clean.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
244,846042936437604353,,,2017-03-26 16:55:29 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jarvis. The snow pupsets him. Officially ...,,,,https://twitter.com/dog_rates/status/846042936...,12,10,Jarvis,,,,
2058,671347597085433856,,,2015-11-30 15:18:34 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Lola. She was not fully prepared for t...,,,,https://twitter.com/dog_rates/status/671347597...,9,10,Lola,,,,
2253,667793409583771648,,,2015-11-20 19:55:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Dogs only please. Small cows and other non can...,,,,https://twitter.com/dog_rates/status/667793409...,8,10,,,,,
975,750041628174217216,,,2016-07-04 19:00:33 +0000,"<a href=""https://about.twitter.com/products/tw...",This is Beau. He's trying to keep his daddy fr...,,,,https://twitter.com/dog_rates/status/750041628...,13,10,Beau,,,,
1750,679001094530465792,,,2015-12-21 18:10:50 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine -...",This is Rascal. He's paddling an imaginary can...,,,,https://vine.co/v/iKIwAzEatd6,11,10,Rascal,,,,


In [33]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [34]:
twitter_archive_clean.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


In [35]:
twitter_archive_clean.duplicated().value_counts()

False    2356
dtype: int64

In [36]:
2356 - 2297

59

In [37]:
twitter_archive_clean.query("rating_denominator == 0")

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
313,835246439529840640,8.35246e+17,26259576.0,2017-02-24 21:54:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@jonnysun @Lin_Manuel ok jomny I know you're e...,,,,,960,0,,,,,


In [38]:
pd.set_option('display.max_colwidth', None)


In [39]:
twitter_archive_clean[['text', 'rating_numerator', 'rating_denominator']]

Unnamed: 0,text,rating_numerator,rating_denominator
0,This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,13,10
1,"This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV",13,10
2,This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB,12,10
3,This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ,13,10
4,"This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f",12,10
...,...,...,...
2351,Here we have a 1949 1st generation vulpix. Enjoys sweat tea and Fox News. Cannot be phased. 5/10 https://t.co/4B7cOc1EDq,5,10
2352,This is a purebred Piers Morgan. Loves to Netflix and chill. Always looks like he forgot to unplug the iron. 6/10 https://t.co/DWnyCjf2mx,6,10
2353,Here is a very happy pup. Big fan of well-maintained decks. Just look at that tongue. 9/10 would cuddle af https://t.co/y671yMhoiR,9,10
2354,This is a western brown Mitsubishi terrier. Upset about leaf. Actually 2 dogs here. 7/10 would walk the shit out of https://t.co/r7mOb2m0UI,7,10


In [40]:
twitter_archive_clean.head(20)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,,,,https://twitter.com/dog_rates/status/892420643555336193/photo/1,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV",,,,https://twitter.com/dog_rates/status/892177421306343426/photo/1,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB,,,,https://twitter.com/dog_rates/status/891815181378084864/photo/1,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ,,,,https://twitter.com/dog_rates/status/891689557279858688/photo/1,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f",,,,"https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1",12,10,Franklin,,,,
5,891087950875897856,,,2017-07-29 00:08:17 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Here we have a majestic great white breaching off South Africa's coast. Absolutely h*ckin breathtaking. 13/10 (IG: tucker_marlo) #BarkWeek https://t.co/kQ04fDDRmh,,,,https://twitter.com/dog_rates/status/891087950875897856/photo/1,13,10,,,,,
6,890971913173991426,,,2017-07-28 16:27:12 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Meet Jax. He enjoys ice cream so much he gets nervous around it. 13/10 help Jax enjoy more things by clicking below\n\nhttps://t.co/Zr4hWfAs1H https://t.co/tVJBRMnhxl,,,,"https://gofundme.com/ydvmve-surgery-for-jax,https://twitter.com/dog_rates/status/890971913173991426/photo/1",13,10,Jax,,,,
7,890729181411237888,,,2017-07-28 00:22:40 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",When you watch your owner call another dog a good boy but then they turn back to you and say you're a great boy. 13/10 https://t.co/v0nONBcwxq,,,,"https://twitter.com/dog_rates/status/890729181411237888/photo/1,https://twitter.com/dog_rates/status/890729181411237888/photo/1",13,10,,,,,
8,890609185150312448,,,2017-07-27 16:25:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Zoey. She doesn't want to be one of the scary sharks. Just wants to be a snuggly pettable boatpet. 13/10 #BarkWeek https://t.co/9TwLuAGH0b,,,,https://twitter.com/dog_rates/status/890609185150312448/photo/1,13,10,Zoey,,,,
9,890240255349198849,,,2017-07-26 15:59:51 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Cassie. She is a college pup. Studying international doggo communication and stick theory. 14/10 so elegant much sophisticate https://t.co/t1bfwz5S2A,,,,https://twitter.com/dog_rates/status/890240255349198849/photo/1,14,10,Cassie,doggo,,,


In [41]:
twitter_archive_clean.query("doggo == 'doggo' & floofer =='floofer'")

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
200,854010172552949760,,,2017-04-17 16:34:26 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","At first I thought this was a shy doggo, but it's actually a Rare Canadian Floofer Owl. Amateurs would confuse the two. 11/10 only send dogs https://t.co/TXdT3tmuYk",,,,"https://twitter.com/dog_rates/status/854010172552949760/photo/1,https://twitter.com/dog_rates/status/854010172552949760/photo/1",11,10,,doggo,floofer,,


# Clean

## Tidiness

### 1. tweet_data_df_clean and twitter_archive_clean do not need to be in separate dfs

<em><b>Define</b></em>

Merge tweet_data_df_clean and twitter_archive_clean on tweet_id

<em><b>Code</b></em>

In [42]:
columns= ['tweet_id']
twitter_archive_clean = twitter_archive_clean.merge(tweet_data_df_clean, how='outer', on=columns, )

<em><b>Test</b></em>

In [43]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp_x                 2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

### 2. image_predictions_clean and twitter_archive_clean do not need to be separate dfs

<em><b>Define</b></em>

Merge image_predictions_clean and twitter_archive_clean on tweet_id

<em><b>Code</b></em>

In [44]:
columns= ['tweet_id']
twitter_archive_clean = twitter_archive_clean.merge(image_predictions_clean, how='outer', on=columns)

<em><b>Test</b></em>

In [45]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp_x                 2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

### 3. text column has both a url and the tweet text

<em><b>Define</b></em>

display just the text by splitting on 'https' and getting the left (0 index) of the split for the text column

<em><b>Code</b></em>

In [46]:
twitter_archive_clean['text'] = twitter_archive_clean['text'].str.split('https').str.get(0)

In [47]:
#twitter_archive_clean['text'], twitter_archive_clean['url'] = twitter_archive_clean.text.str.split('https', 1).str


<em><b>Test</b></em>

In [48]:
twitter_archive_clean.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp_x,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10,,,,https://twitter.com/dog_rates/status/892420643555336193/photo/1,...,1.0,orange,0.097049,False,bagel,0.085851,False,banana,0.07611,False
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10",,,,https://twitter.com/dog_rates/status/892177421306343426/photo/1,...,1.0,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True


In [49]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp_x                 2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [50]:
#break

### 4. Combination of twitter_archive_clean and tweet_data_df_clean has duplicated the timestamp column

<em><b>Define</b></em>

Drop the secondary timestamp column so there are no duplicate columns in twitter_archive_clean

<em><b>Code</b></em>

In [51]:
twitter_archive_clean = twitter_archive_clean.drop('timestamp_y', axis=1)

<em><b>Test</b></em>

In [52]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp_x                 2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

### 5. four columns are used for each dog stage

<em><b>Define</b></em>

Combine four columns into one dog_stage column, replace 'None' with '', separate multiple stage doggs with ', ', and drop the four, now unnecessary, columns

<em><b>Code</b></em>

In [53]:
# https://knowledge.udacity.com/questions/155520

twitter_archive_clean.doggo.replace('None', '', inplace=True)
twitter_archive_clean.floofer.replace('None', '', inplace=True)
twitter_archive_clean.pupper.replace('None', '', inplace=True)
twitter_archive_clean.puppo.replace('None', '', inplace=True)


twitter_archive_clean['dog_stage'] = twitter_archive_clean.doggo + twitter_archive_clean.floofer + twitter_archive_clean.pupper + twitter_archive_clean.puppo

twitter_archive_clean.loc[twitter_archive_clean.dog_stage == 'doggopupper', 'dog_stage'] = 'doggo, pupper'
twitter_archive_clean.loc[twitter_archive_clean.dog_stage == 'doggopuppo', 'dog_stage'] = 'doggo, puppo'
twitter_archive_clean.loc[twitter_archive_clean.dog_stage == 'doggofloofer', 'dog_stage'] = 'doggo, floofer'
twitter_archive_clean.loc[twitter_archive_clean.dog_stage == 'flooferpupper', 'dog_stage'] = 'floofer, pupper'
twitter_archive_clean.loc[twitter_archive_clean.dog_stage == 'flooferpuppo', 'dog_stage'] = 'floofer, puppo'
twitter_archive_clean.loc[twitter_archive_clean.dog_stage == 'pupperpuppo', 'dog_stage'] = 'pupper, puppo'
twitter_archive_clean.loc[twitter_archive_clean.dog_stage == 'doggoflooferpupper', 'dog_stage'] = 'doggo, floofer, pupper'
twitter_archive_clean.loc[twitter_archive_clean.dog_stage == 'doggoflooferpuppo', 'dog_stage'] = 'doggo, floofer, puppo'
twitter_archive_clean.loc[twitter_archive_clean.dog_stage == 'flooferpupperpuppo', 'dog_stage'] = 'floofer, pupper, puppo'
twitter_archive_clean.loc[twitter_archive_clean.dog_stage == 'doggoflooferpupperpuppo', 'dog_stage'] = 'doggo, floofer, pupper, puppo'

In [54]:
twitter_archive_clean = twitter_archive_clean.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis=1)

<em><b>Test</b></em>

In [55]:
twitter_archive_clean[twitter_archive_clean['dog_stage'].notnull()].sample(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp_x,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,dog_stage
2322,666430724426358785,,,2015-11-17 01:40:41 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Oh boy what a pup! Sunglasses take this one to the next level. Weirdly folds front legs. Pretty big. 6/10,,,,https://twitter.com/dog_rates/status/666430724426358785/photo/1,...,llama,0.505184,False,Irish_terrier,0.104109,True,dingo,0.062071,False,
581,800443802682937345,,,2016-11-20 21:00:48 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",RT @dog_rates: This is Sampson. He's about to get hit with a vicious draw 2. Has no idea. 11/10 poor pupper,7.761133e+17,4196984000.0,2016-09-14 17:40:06 +0000,"https://twitter.com/dog_rates/status/776113305656188928/photo/1,https://twitter.com/dog_rates/status/776113305656188928/photo/1",...,mousetrap,0.777468,False,black_widow,0.09394,False,paddlewheel,0.017492,False,pupper
1805,676942428000112642,,,2015-12-16 01:50:26 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Who leaves the last cupcake just sitting there? 9/10,,,,https://twitter.com/dog_rates/status/676942428000112642/photo/1,...,black-footed_ferret,0.707199,False,polecat,0.15463,False,weasel,0.097626,False,


In [56]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp_x                 2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  likes                       2331 

## Quality

### 1. timestamp_x provides more data than necessary and has unecessary characters in name from merge

<em><b>Define</b></em>

Strip the ' +0000' from the timestamp_x column while it is an object and create a column without the '_x'

<em><b>Code</b></em>

In [57]:
new_dates = []

for timestamp in twitter_archive_clean.timestamp_x:
    new_date = timestamp[:-6]
    new_dates.append(new_date)
twitter_archive_clean['timestamp'] = new_dates

<em><b>Test</b></em>

In [58]:
twitter_archive_clean.timestamp

0       2017-08-01 16:23:56
1       2017-08-01 00:17:27
2       2017-07-31 00:18:03
3       2017-07-30 15:58:51
4       2017-07-29 16:00:24
               ...         
2351    2015-11-16 00:24:50
2352    2015-11-16 00:04:52
2353    2015-11-15 23:21:54
2354    2015-11-15 23:05:30
2355    2015-11-15 22:32:08
Name: timestamp, Length: 2356, dtype: object

### 2. timestamp is an object and it should be datetime

<em><b>Define</b></em>

Convert timestamp to datetime datatype using pd.to_datetime

<em><b>Code</b></em>

In [59]:
twitter_archive_clean.timestamp = pd.to_datetime(twitter_archive_clean['timestamp'])

<em><b>Test</b></em>

In [60]:
twitter_archive_clean.timestamp

0      2017-08-01 16:23:56
1      2017-08-01 00:17:27
2      2017-07-31 00:18:03
3      2017-07-30 15:58:51
4      2017-07-29 16:00:24
               ...        
2351   2015-11-16 00:24:50
2352   2015-11-16 00:04:52
2353   2015-11-15 23:21:54
2354   2015-11-15 23:05:30
2355   2015-11-15 22:32:08
Name: timestamp, Length: 2356, dtype: datetime64[ns]

### 3. duplicated timestamp column, timestamp_x

<em><b>Define</b></em>

Drop the remaining unnecessary timestamp_x column

<em><b>Code</b></em>

In [61]:
twitter_archive_clean = twitter_archive_clean.drop('timestamp_x', axis=1)

<em><b>Test</b></em>

In [62]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   tweet_id                    2356 non-null   int64         
 1   in_reply_to_status_id       78 non-null     float64       
 2   in_reply_to_user_id         78 non-null     float64       
 3   source                      2356 non-null   object        
 4   text                        2356 non-null   object        
 5   retweeted_status_id         181 non-null    float64       
 6   retweeted_status_user_id    181 non-null    float64       
 7   retweeted_status_timestamp  181 non-null    object        
 8   expanded_urls               2297 non-null   object        
 9   rating_numerator            2356 non-null   int64         
 10  rating_denominator          2356 non-null   int64         
 11  name                        2356 non-null   object      

### 4. rows which are retweets or replies are unnecessary

<em><b>Define</b></em>

Remove the rows where the entry is either a reply or a retweet

<em><b>Code</b></em>

In [63]:
twitter_archive_clean.drop(twitter_archive_clean.loc[twitter_archive_clean['in_reply_to_status_id'].notnull()].index, inplace=True)
twitter_archive_clean.drop(twitter_archive_clean.loc[twitter_archive_clean['retweeted_status_id'].notnull()].index, inplace=True)

<em><b>Test</b></em>

In [64]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   tweet_id                    2097 non-null   int64         
 1   in_reply_to_status_id       0 non-null      float64       
 2   in_reply_to_user_id         0 non-null      float64       
 3   source                      2097 non-null   object        
 4   text                        2097 non-null   object        
 5   retweeted_status_id         0 non-null      float64       
 6   retweeted_status_user_id    0 non-null      float64       
 7   retweeted_status_timestamp  0 non-null      object        
 8   expanded_urls               2094 non-null   object        
 9   rating_numerator            2097 non-null   int64         
 10  rating_denominator          2097 non-null   int64         
 11  name                        2097 non-null   object      

### 5.  in_reply_to_status_id,	in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp columns are no longer necessary

<em><b>Define</b></em>

Drop the in_reply_to_status_id,	in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp

<em><b>Code</b></em>

In [65]:
twitter_archive_clean = twitter_archive_clean.drop(['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp'], axis=1)

<em><b>Test</b></em>

In [66]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   tweet_id            2097 non-null   int64         
 1   source              2097 non-null   object        
 2   text                2097 non-null   object        
 3   expanded_urls       2094 non-null   object        
 4   rating_numerator    2097 non-null   int64         
 5   rating_denominator  2097 non-null   int64         
 6   name                2097 non-null   object        
 7   likes               2090 non-null   float64       
 8   retweets            2090 non-null   float64       
 9   jpg_url             1971 non-null   object        
 10  img_num             1971 non-null   float64       
 11  p1                  1971 non-null   object        
 12  p1_conf             1971 non-null   float64       
 13  p1_dog              1971 non-null   object      

### 6. tweet_id datatype is int64 and it should be a string

<em><b>Define</b></em>

Change datatype of tweet_id with .astype to make it a string

<em><b>Code</b></em>

In [67]:
twitter_archive_clean.tweet_id = twitter_archive_clean.tweet_id.astype('str')

<em><b>Test</b></em>

In [68]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   tweet_id            2097 non-null   object        
 1   source              2097 non-null   object        
 2   text                2097 non-null   object        
 3   expanded_urls       2094 non-null   object        
 4   rating_numerator    2097 non-null   int64         
 5   rating_denominator  2097 non-null   int64         
 6   name                2097 non-null   object        
 7   likes               2090 non-null   float64       
 8   retweets            2090 non-null   float64       
 9   jpg_url             1971 non-null   object        
 10  img_num             1971 non-null   float64       
 11  p1                  1971 non-null   object        
 12  p1_conf             1971 non-null   float64       
 13  p1_dog              1971 non-null   object      

### 7. dog_stage datatype is a string and it should be a category

<em><b>Define</b></em>

Change datatype of dog_stage with .astype to make it a category

<em><b>Code</b></em>

In [69]:
twitter_archive_clean.dog_stage = twitter_archive_clean.dog_stage.astype('category')

<em><b>Test</b></em>

In [70]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   tweet_id            2097 non-null   object        
 1   source              2097 non-null   object        
 2   text                2097 non-null   object        
 3   expanded_urls       2094 non-null   object        
 4   rating_numerator    2097 non-null   int64         
 5   rating_denominator  2097 non-null   int64         
 6   name                2097 non-null   object        
 7   likes               2090 non-null   float64       
 8   retweets            2090 non-null   float64       
 9   jpg_url             1971 non-null   object        
 10  img_num             1971 non-null   float64       
 11  p1                  1971 non-null   object        
 12  p1_conf             1971 non-null   float64       
 13  p1_dog              1971 non-null   object      

### 8.  3 rows do not have expanded_urls data

<em><b>Define</b></em>

Examine the three rows and collect the url information, if possible. Delete the rows if the tweets no longer exist or are not relevant to this project (i.e. contain no images)

<em><b>Code</b></em>

In [71]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   tweet_id            2097 non-null   object        
 1   source              2097 non-null   object        
 2   text                2097 non-null   object        
 3   expanded_urls       2094 non-null   object        
 4   rating_numerator    2097 non-null   int64         
 5   rating_denominator  2097 non-null   int64         
 6   name                2097 non-null   object        
 7   likes               2090 non-null   float64       
 8   retweets            2090 non-null   float64       
 9   jpg_url             1971 non-null   object        
 10  img_num             1971 non-null   float64       
 11  p1                  1971 non-null   object        
 12  p1_conf             1971 non-null   float64       
 13  p1_dog              1971 non-null   object      

In [72]:
twitter_archive_clean[twitter_archive_clean['expanded_urls'].isnull()]

Unnamed: 0,tweet_id,source,text,expanded_urls,rating_numerator,rating_denominator,name,likes,retweets,jpg_url,...,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,dog_stage,timestamp
375,828361771580813312,"<a href=""http://twitter.com"" rel=""nofollow"">Twitter Web Client</a>",Beebop and Doobert should start a band 12/10 would listen,,12,10,,2169.0,173.0,,...,,,,,,,,,,2017-02-05 21:56:51
707,785515384317313025,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","Today, 10/10, should be National Dog Rates Day",,10,10,,6128.0,1259.0,,...,,,,,,,,,,2016-10-10 16:20:36
1445,696518437233913856,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Oh my god 10/10 for every little hot dog pupper,,10,10,,3824.0,1676.0,,...,,,,,,,,,pupper,2016-02-08 02:18:30


In [73]:
image_predictions_clean.query("tweet_id == '828361771580813312' or tweet_id == '785515384317313025' or tweet_id == '696518437233913856'")

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog


Examination of the tweets in image_predictions and by following the links has revealed that the tweets do not have images and should be dropped by this project's parameters

In [74]:
indexes = twitter_archive_clean.query("tweet_id == '828361771580813312' or tweet_id == '785515384317313025' or tweet_id == '696518437233913856'").index
indexes

Int64Index([375, 707, 1445], dtype='int64')

In [75]:
# https://thispointer.com/python-pandas-how-to-drop-rows-in-dataframe-by-conditions-on-column-values/
twitter_archive_clean.drop(indexes, inplace=True)

<em><b>Test</b></em>

In [76]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2094 entries, 0 to 2355
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   tweet_id            2094 non-null   object        
 1   source              2094 non-null   object        
 2   text                2094 non-null   object        
 3   expanded_urls       2094 non-null   object        
 4   rating_numerator    2094 non-null   int64         
 5   rating_denominator  2094 non-null   int64         
 6   name                2094 non-null   object        
 7   likes               2087 non-null   float64       
 8   retweets            2087 non-null   float64       
 9   jpg_url             1971 non-null   object        
 10  img_num             1971 non-null   float64       
 11  p1                  1971 non-null   object        
 12  p1_conf             1971 non-null   float64       
 13  p1_dog              1971 non-null   object      

In [77]:
#twitter_archive_clean[twitter_archive_clean['expanded_urls'].isnull()]

### 9. inconsistent formatting for the names of potential image matches in p1, p2, and p3

<em><b>Define</b></em>

Replace the separation '\_' with a space and make all entries lowercase

<em><b>Code</b></em>

In [78]:
twitter_archive_clean.p1 = twitter_archive_clean.p1.str.replace('_', ' ').str.lower()
twitter_archive_clean.p2 = twitter_archive_clean.p2.str.replace('_', ' ').str.lower()
twitter_archive_clean.p3 = twitter_archive_clean.p3.str.replace('_', ' ').str.lower()

<em><b>Test</b></em>

In [79]:
twitter_archive_clean.head()

Unnamed: 0,tweet_id,source,text,expanded_urls,rating_numerator,rating_denominator,name,likes,retweets,jpg_url,...,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,dog_stage,timestamp
0,892420643555336193,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10,https://twitter.com/dog_rates/status/892420643555336193/photo/1,13,10,Phineas,35562.0,7534.0,https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,...,0.097049,False,bagel,0.085851,False,banana,0.07611,False,,2017-08-01 16:23:56
1,892177421306343426,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10",https://twitter.com/dog_rates/status/892177421306343426/photo/1,13,10,Tilly,30753.0,5589.0,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,...,0.323581,True,pekinese,0.090647,True,papillon,0.068957,True,,2017-08-01 00:17:27
2,891815181378084864,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10,https://twitter.com/dog_rates/status/891815181378084864/photo/1,12,10,Archie,23130.0,3703.0,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,...,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True,,2017-07-31 00:18:03
3,891689557279858688,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us,https://twitter.com/dog_rates/status/891689557279858688/photo/1,13,10,Darla,38869.0,7717.0,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,...,0.170278,False,labrador retriever,0.168086,True,spatula,0.040836,False,,2017-07-30 15:58:51
4,891327558926688256,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","This is Franklin. He would like you to stop calling him ""cute."" He is a very fierce shark and should be respected as such. 12/10 #BarkWeek","https://twitter.com/dog_rates/status/891327558926688256/photo/1,https://twitter.com/dog_rates/status/891327558926688256/photo/1",12,10,Franklin,37129.0,8316.0,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,...,0.555712,True,english springer,0.22577,True,german short-haired pointer,0.175219,True,,2017-07-29 16:00:24


### 10. source column contains unecessary information

<em><b>Define</b></em>

Drop the source column as it has unnecessary information

<em><b>Code</b></em>

In [80]:
twitter_archive_clean = twitter_archive_clean.drop(['source'], axis=1)

<em><b>Test</b></em>

In [81]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2094 entries, 0 to 2355
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   tweet_id            2094 non-null   object        
 1   text                2094 non-null   object        
 2   expanded_urls       2094 non-null   object        
 3   rating_numerator    2094 non-null   int64         
 4   rating_denominator  2094 non-null   int64         
 5   name                2094 non-null   object        
 6   likes               2087 non-null   float64       
 7   retweets            2087 non-null   float64       
 8   jpg_url             1971 non-null   object        
 9   img_num             1971 non-null   float64       
 10  p1                  1971 non-null   object        
 11  p1_conf             1971 non-null   float64       
 12  p1_dog              1971 non-null   object        
 13  p2                  1971 non-null   object      

### 11. 123 rows do not have image data also found in image_predictions_clean

<em><b>Define</b></em>

Drop the rows which do not have image prediction data

<em><b>Code</b></em>

In [82]:
no_images = twitter_archive_clean[twitter_archive_clean.p1.isnull()].index
no_images

Int64Index([  35,   42,   72,   83,   88,  110,  133,  190,  192,  241,
            ...
            1743, 1750, 1760, 1776, 1791, 1807, 1818, 1834, 1916, 2212],
           dtype='int64', length=123)

In [83]:
twitter_archive_clean.drop(no_images, inplace=True)

<em><b>Test</b></em>

In [84]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1971 entries, 0 to 2355
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   tweet_id            1971 non-null   object        
 1   text                1971 non-null   object        
 2   expanded_urls       1971 non-null   object        
 3   rating_numerator    1971 non-null   int64         
 4   rating_denominator  1971 non-null   int64         
 5   name                1971 non-null   object        
 6   likes               1964 non-null   float64       
 7   retweets            1964 non-null   float64       
 8   jpg_url             1971 non-null   object        
 9   img_num             1971 non-null   float64       
 10  p1                  1971 non-null   object        
 11  p1_conf             1971 non-null   float64       
 12  p1_dog              1971 non-null   object        
 13  p2                  1971 non-null   object      

### 12. 7 rows do not have like or retweet information

<em><b>Define</b></em>

These 7 rows do not have like or retweet information as they have been deleted and this information was not available via the twitter API. They should be dropped

<em><b>Code</b></em>

In [91]:
nolikes = twitter_archive_clean[twitter_archive_clean.likes.isnull()].index
nolikes

Int64Index([104, 253, 296, 363, 750, 932, 1726], dtype='int64')

In [92]:
twitter_archive_clean.drop(nolikes, inplace=True)

<em><b>Test</b></em>

In [93]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1964 entries, 0 to 2355
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   tweet_id            1964 non-null   object        
 1   text                1964 non-null   object        
 2   expanded_urls       1964 non-null   object        
 3   rating_numerator    1964 non-null   int64         
 4   rating_denominator  1964 non-null   int64         
 5   name                1964 non-null   object        
 6   likes               1964 non-null   float64       
 7   retweets            1964 non-null   float64       
 8   jpg_url             1964 non-null   object        
 9   img_num             1964 non-null   float64       
 10  p1                  1964 non-null   object        
 11  p1_conf             1964 non-null   float64       
 12  p1_dog              1964 non-null   object        
 13  p2                  1964 non-null   object      

### 13. rating_denominator is sometimes not 10

<em><b>Define</b></em>



### Reassessing twitter_archive_clean df after initial tidiness and quality cleaning steps of the 3 original dfs

In [86]:
twitter_archive_clean[twitter_archive_clean['rating_denominator'] != 10].tweet_id.count()

17

In [90]:
twitter_archive_clean[twitter_archive_clean.likes.isnull()]

Unnamed: 0,tweet_id,text,expanded_urls,rating_numerator,rating_denominator,name,likes,retweets,jpg_url,img_num,...,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,dog_stage,timestamp
104,872261713294495745,This is Harry. His ears are activated one at a time. Incredibly rare to witness in person. Very special moment here. 13/10 blessed as h*ck,"https://twitter.com/dog_rates/status/872261713294495745/photo/1,https://twitter.com/dog_rates/status/872261713294495745/photo/1",13,10,Harry,,,https://pbs.twimg.com/media/DBrlZk2UQAAfAkd.jpg,2.0,...,0.972019,True,flat-coated retriever,0.008178,True,chesapeake bay retriever,0.007359,True,,2017-06-07 01:19:32
253,844704788403113984,This is Luna. It's her first time outside and a bee stung her nose. Completely h*ckin uncalled for. 13/10 where's the bee I just wanna talk,https://twitter.com/dog_rates/status/844704788403113984/photo/1,13,10,Luna,,,https://pbs.twimg.com/media/C7j-hkSW0AIxCZC.jpg,1.0,...,0.980213,True,golden retriever,0.007012,True,beagle,0.003147,True,,2017-03-23 00:18:10
296,837366284874571778,This is Lucy. She has a portrait of herself on her ear. Excellent for identification pupposes. 13/10 innovative af,https://twitter.com/dog_rates/status/837366284874571778/photo/1,13,10,Lucy,,,https://pbs.twimg.com/media/C57sMJwXMAASBSx.jpg,1.0,...,0.660085,True,staffordshire bullterrier,0.334947,True,dalmatian,0.002697,True,,2017-03-02 18:17:34
363,829374341691346946,This is Astrid. She's a guide doggo in training. 13/10 would follow anywhere,"https://twitter.com/dog_rates/status/829374341691346946/photo/1,https://twitter.com/dog_rates/status/829374341691346946/photo/1",13,10,Astrid,,,https://pbs.twimg.com/media/C4KHj-nWQAA3poV.jpg,1.0,...,0.757547,True,american staffordshire terrier,0.14995,True,chesapeake bay retriever,0.047523,True,doggo,2017-02-08 17:00:26
750,779123168116150273,This is Reggie. He hugs everyone he meets. 12/10 keep spreading the love Reggie,https://twitter.com/dog_rates/status/779123168116150273/photo/1,12,10,Reggie,,,https://pbs.twimg.com/media/CtAAYizW8AAWzBZ.jpg,1.0,...,0.43108,True,soft-coated wheaten terrier,0.060365,True,cocker spaniel,0.059845,True,,2016-09-23 01:00:13
932,754011816964026368,This is Charlie. He pouts until he gets to go on the swing. 12/10 manipulative af,"https://twitter.com/dog_rates/status/754011816964026368/photo/1,https://twitter.com/dog_rates/status/754011816964026368/photo/1",12,10,Charlie,,,https://pbs.twimg.com/media/CnbJuPoXEAAjcVF.jpg,1.0,...,0.600985,True,boston bull,0.273176,True,boxer,0.056772,True,,2016-07-15 17:56:40
1726,680055455951884288,"Meet Sammy. At first I was like ""that's a snowflake. we only rate dogs,"" but he would've melted by now, so 10/10",https://twitter.com/dog_rates/status/680055455951884288/photo/1,10,10,Sammy,,,https://pbs.twimg.com/media/CW-ZRC_WQAAyFrL.jpg,1.0,...,0.995466,True,great pyrenees,0.001834,True,pomeranian,0.000667,True,,2015-12-24 16:00:30
