# Project: Wrangle and analyze 'WeRateDogs' Twitter Data 

## Introduction:

>

**Step 1:** First, all the necessary packages for wrangling, analyzing and visualizing data must be imported.

In [1]:
#import necessary packages
import numpy as np
import pandas as pd
import requests
import os
import json
import tweepy
import matplotlib.pyplot as plt
import seaborn as sns

#display visualizations in this notebook
%matplotlib inline

#format all visualization backgrounds with seaborn
sns.set()


 ## Part I: Gather Data

**Step 1:** The on-hand `twitter-archive-enhanced.csv` file is read into a pandas dataframe.

In [2]:
#read the archive .csv file into a pandas dataframe and assign it to the variable twdf
twdf = pd.read_csv('twitter-archive-enhanced.csv')
#preview the first few rows
twdf.head()


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


**Step 2:** Now, the Udacity-hosted file `image-predictions.tsv` is downloaded via the requests library, saved locally as `image_predictions.tsv`, and read into a pandas dataframe.

In [3]:
#download the hosted predictions file and write the response to the `image_predictions.tsv` file
def download_preds(url='https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv',
                   path='image_predictions.tsv',
                   timeout=30):
    '''
    Download the image predictions file with the requests library and save it to disk.

    Parameters:
        url: location of the file to download (defaults to the hosted image-predictions.tsv)
        path: local file the response body is written to
        timeout: seconds to wait for the server before requests gives up

    Raises:
        requests.exceptions.HTTPError when the server answers with a 4xx/5xx status,
        so an error page is never written to disk as if it were the data.
    '''
    response = requests.get(url, timeout=timeout)
    #fail loudly on a bad status instead of saving the error body
    response.raise_for_status()
    #binary mode writes the downloaded bytes unchanged
    with open(path, 'wb') as file:
        file.write(response.content)
#download_preds()

In [4]:
#read the tab-separated predictions file into a pandas dataframe and assign it to the pred_df variable
pred_df = pd.read_csv('image_predictions.tsv', sep='\t')
#preview the first few rows
pred_df.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


**Step 3:** Use the Tweepy library to download each tweet's JSON data into a single text file, `tweet_json.txt`. Then read the information on each line of the file into a pandas dataframe.

In [5]:
#read the twitter developer keys from environment variables so that no secret is stored in the notebook,
#authorize them, then assign twitter API to the api object variable
#NOTE(review): the credentials that were previously hardcoded in this cell should be regenerated
#a missing variable raises a KeyError that names it
consumer_key = os.environ['TWITTER_CONSUMER_KEY']
consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']
access_token = os.environ['TWITTER_ACCESS_TOKEN']
access_secret = os.environ['TWITTER_ACCESS_SECRET']

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

#the wait_on_rate_limit flags ask tweepy to wait (and announce it) when the rate limit is reached
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [6]:
#save every tweet's json data to a single text document
def save_tweet_jsons():
    '''
    Query the twitter API for each tweet_id in the dataframe twdf and write the
    tweet's JSON data to 'tweet_json.txt', one JSON string per line.

    A countdown of the tweets still to be processed is printed after every tweet.
    When the API raises a TweepError for a tweet, nothing is written for it, the
    countdown is printed with a 'TweepError' prefix, and the loop moves on.
    '''
    remaining = twdf.tweet_id.count()
    with open('tweet_json.txt', 'w') as out_file:
        for tweet_id in twdf['tweet_id']:
            #the countdown drops by one whether or not the lookup succeeds
            remaining -= 1
            try:
                status = api.get_status(tweet_id, tweet_mode='extended')
                json.dump(status._json, out_file)
                out_file.write('\n')
                print(remaining)
            except tweepy.error.TweepError:
                print('TweepError', remaining)
#save_tweet_jsons()

In [7]:
#read each tweet's JSON data as a line from `tweet_json.txt` and append the desired fields to a list.
df_list = []
#NOTE(review): index is not used anywhere in this cell; kept in case a later cell reads it
index = 0
with open('tweet_json.txt') as full_json_file:
    #iterate the file line by line instead of loading every line at once with readlines()
    for line in full_json_file:
        #skip blank lines, which json.loads would reject
        if not line.strip():
            continue
        data = json.loads(line)
        tweet_id = data['id']
        retweet_count = data['retweet_count']
        favorite_count = data['favorite_count']
        df_list.append({'tweet_id': tweet_id,
                        'retweet_count': retweet_count,
                        'favorite_count': favorite_count})
        

    

In [8]:
#convert the list of per-tweet dictionaries to a pandas dataframe assigned to the variable: json_df
json_df = pd.DataFrame(df_list)
#preview the first few rows
json_df.head()

Unnamed: 0,favorite_count,retweet_count,tweet_id
0,37524,8188,892420643555336193
1,32256,6057,892177421306343426
2,24300,4005,891815181378084864
3,40872,8330,891689557279858688
4,39065,9033,891327558926688256


## Part II: Assess Data

**Step 1:** First, a random sample of five observations is displayed for each of the three dataframes in order to become familiar with the data.

In [9]:
#inspect five randomly sampled rows of the twdf dataframe
twdf.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
530,808134635716833280,,,2016-12-12 02:21:26 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Milo. I would do terrib...,8.011679e+17,4196984000.0,2016-11-22 20:58:07 +0000,https://twitter.com/dog_rates/status/801167903...,13,10,Milo,,,,
1200,716730379797970944,,,2016-04-03 20:53:33 +0000,"<a href=""http://twitter.com/download/iphone"" r...",There has clearly been a mistake. Pup did noth...,,,,https://twitter.com/chpsanfrancisco/status/716...,12,10,,,,,
39,884876753390489601,,,2017-07-11 20:47:12 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Lola. It's her first time outside. Mus...,,,,https://twitter.com/dog_rates/status/884876753...,13,10,Lola,,,,
498,813130366689148928,8.131273e+17,4196984000.0,2016-12-25 21:12:41 +0000,"<a href=""http://twitter.com/download/iphone"" r...",I've been informed by multiple sources that th...,,,,,12,10,,,,,
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,


In [10]:
#inspect five randomly sampled rows of the pred_df dataframe
pred_df.sample(5)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1848,839549326359670784,https://pbs.twimg.com/media/C6atpTLWYAIL7bU.jpg,1,swing,0.393527,False,Norwich_terrier,0.05248,True,Pembroke,0.049901,True
1389,766423258543644672,https://pbs.twimg.com/media/CqLh4yJWcAAHomv.jpg,2,keeshond,0.995823,True,Pomeranian,0.003897,True,Norwegian_elkhound,0.000253,True
1742,822647212903690241,https://pbs.twimg.com/media/C2oRbOuWEAAbVSl.jpg,1,Samoyed,0.416769,True,malamute,0.252706,True,kuvasz,0.157028,True
94,667550882905632768,https://pbs.twimg.com/media/CUObvUJVEAAnYPF.jpg,1,web_site,0.998258,False,dishwasher,0.000201,False,oscilloscope,0.000142,False
978,707059547140169728,https://pbs.twimg.com/media/Cc_64zVWEAAeXs7.jpg,1,Samoyed,0.897312,True,Great_Pyrenees,0.03918,True,kuvasz,0.019516,True


In [11]:
#inspect five randomly sampled rows of the json_df dataframe
json_df.sample(5)

Unnamed: 0,favorite_count,retweet_count,tweet_id
443,11577,2453,818259473185828864
1522,10717,4070,689659372465688576
232,45734,12082,846514051647705089
603,0,15489,796177847564038144
1840,3187,1341,675497103322386432


**Step 2:** One thing that catches the eye is the inconsistent capitalization of dog names in the `pred_df['p1']` column, so the fifteen most common values are inspected below.  Interestingly, the fifteenth is not even a dog.

In [12]:
#inspect the fifteen most common first-prediction (p1) values in pred_df
pred_df.p1.value_counts().head(15)

golden_retriever            150
Labrador_retriever          100
Pembroke                     89
Chihuahua                    83
pug                          57
chow                         44
Samoyed                      43
toy_poodle                   39
Pomeranian                   38
malamute                     30
cocker_spaniel               30
French_bulldog               26
Chesapeake_Bay_retriever     23
miniature_pinscher           23
seat_belt                    22
Name: p1, dtype: int64

**Step 3:** To better understand the quality of data, the pandas `.info()` method is used on all three dataframes.  From this, various issues of wrong datatypes and missing values are revealed.

In [13]:
#view shape, column names and datatypes of twdf dataframe
twdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [14]:
#view shape, column names and datatypes of pred_df dataframe
pred_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [15]:
#view shape, column names and datatypes of json_df dataframe
json_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2337 entries, 0 to 2336
Data columns (total 3 columns):
favorite_count    2337 non-null int64
retweet_count     2337 non-null int64
tweet_id          2337 non-null int64
dtypes: int64(3)
memory usage: 54.9 KB


**Step 4:** Since there are not many rows with data in the `twdf['retweeted_status_id']` column, a sample of five of these rows is queried. It appears that these are retweets.

In [16]:
#sample five retweet observations (rows with a retweeted_status_id) in the twdf dataframe for closer inspection
twdf[twdf['retweeted_status_id'].notnull()].sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
506,812747805718642688,,,2016-12-24 19:52:31 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: Meet Sammy. At first I was like...,6.800555e+17,4196984000.0,2015-12-24 16:00:30 +0000,https://twitter.com/dog_rates/status/680055455...,10,10,Sammy,,,,
686,788070120937619456,,,2016-10-17 17:32:13 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Bo and Ty. Bo eats pape...,7.610045e+17,4196984000.0,2016-08-04 01:03:17 +0000,https://twitter.com/dog_rates/status/761004547...,11,10,Bo,,,,
415,822647212903690241,,,2017-01-21 03:29:14 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Paisley. She really wan...,8.224891e+17,4196984000.0,2017-01-20 17:00:46 +0000,https://twitter.com/dog_rates/status/822489057...,13,10,Paisley,,,,
476,816014286006976512,,,2017-01-02 20:12:21 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Larry. He has no self c...,7.320056e+17,4196984000.0,2016-05-16 00:31:53 +0000,https://twitter.com/dog_rates/status/732005617...,11,10,Larry,,,,
589,799308762079035393,,,2016-11-17 17:50:33 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: I WAS SENT THE ACTUAL DOG IN TH...,7.743144e+17,4196984000.0,2016-09-09 18:31:54 +0000,https://twitter.com/dog_rates/status/774314403...,14,10,,,,,


**Step 5:** Since the number of rows in the three dataframes is not the same, it might help to check how many of the tweet_ids do not match across dataframes for tidiness reasons.  This operation is performed below.

In [17]:
#collect the tweet_id's in pred_df in order to count how many twdf tweet_id's are not among them

#a set gives constant-time membership tests, where a list is scanned once per lookup
pred_ids = set(pred_df['tweet_id'])
count = sum(1 for tweet_id in twdf['tweet_id'] if tweet_id not in pred_ids)
count

281

In [18]:
#then count how many json_df tweet_id's are missing from pred_ids (the pred_df tweet_id's)

count = sum(1 for tweet_id in json_df['tweet_id'] if tweet_id not in pred_ids)
count

272

**Step 6:**  Finally, the quality and tidiness issues are summarized as follows.

### Quality Issues:

- tweet_id is the wrong data type in all three tables
- `twdf` timestamps are wrong data type
- dog names are 'None' strings when they should be null in the `twdf` table
- some of the `twdf` observations are retweets
- missing `twdf['expanded_urls']` data
- some of the `pred_df` observations are probably not dogs
- dog types in the p1, p2 and p3 columns of the `pred_df` table are inconsistently capitalized
- since some tweet_ids in `twdf` and `json_df` are not in the pred_df, there is missing image data

### Tidiness Issues:

- the four 'doggo', 'floofer', 'pupper' and 'puppo' columns represent one categorical variable
- some of the `json_df` tweet_id's have no prediction data
- there is no prediction info for some of the tweet_ids
- all data should be in a single table without the 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', and 'retweeted_status_user_id' columns
- p2 and p3 info from pred_df does not need to be included since it is not relevant to the desired statistical analysis and visualization

## Part III: Clean Data

**Step 1:** Before cleaning, a copy of the `twdf` dataframe is created as `master_df` to initiate the process.

In [19]:
#copy all three dataframes for cleaning:
#twdf becomes master_df, json_df becomes json_clean and pred_df becomes pred_clean
master_df, json_clean, pred_clean = (frame.copy() for frame in (twdf, json_df, pred_df))
master_df.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


**Step 2:** Address the following datatype issues:

- tweet_id is the wrong data type in all three tables
- `twdf` timestamps are wrong data type

##### Definition:
- use the pandas `.astype()` method to convert each table's `tweet_id` column to string datatype
- use the pandas `to_datetime()` function to convert the timestamps in `master_df` to datetime

##### Code:

In [20]:
#overwrite the tweet_id column of each table with its string version so the change is saved
for frame in (master_df, json_clean, pred_clean):
    frame['tweet_id'] = frame['tweet_id'].astype('str')

In [21]:
#convert the timestamp column in master_df from strings to datetime
master_df['timestamp'] = pd.to_datetime(master_df['timestamp'])


##### Test:

In [22]:
#print the datatype of the converted timestamp column, then of the tweet_id column in each table
print(master_df['timestamp'].dtypes)
for frame in (master_df, json_clean, pred_clean):
    print(frame['tweet_id'].dtypes)

datetime64[ns, UTC]
object
object
object


**Step 3:** Address the tidiness and cleanliness issue in the `doggo`, `floofer`, `pupper`, and `puppo` columns.  For convenience, the remaining two quality issues in `master_df` are addressed as well.

- dog names are 'None' strings when they should be null in the `twdf` table
- the four 'doggo', 'floofer', 'pupper' and 'puppo' columns represent one categorical variable
- some of the `twdf` observations are retweets


##### Definition:

- replace 'None' with '' in all four of the `doggo`, `floofer`, `pupper`, and `puppo` columns
- concatenate the `doggo`, `floofer`, `pupper`, and `puppo` columns into a single column
- query the rows without a `retweeted_status_id` value and assign to the `master_df` variable


##### Code:

In [23]:
#replace all 'None' values in the four dog title categories with an empty string
#the columns are selected by name, so the cell does not depend on their position in the table
title_columns = ['doggo', 'floofer', 'pupper', 'puppo']
#assigning the result back avoids an inplace replace on a selected column, which may act on a temporary copy
master_df[title_columns] = master_df[title_columns].replace('None', '')


In [24]:
#concatenate the strings of the four categories, left to right, into the new 'dog_title' column
dog_title = master_df['doggo']
for stage in ('floofer', 'pupper', 'puppo'):
    dog_title = dog_title + master_df[stage]
master_df['dog_title'] = dog_title


In [25]:
#drop the four columns that are no longer needed and keep the result as master_df
master_df = master_df.drop(columns=['doggo', 'floofer', 'pupper', 'puppo'])

In [49]:
#keep only the rows whose retweeted_status_id is null (i.e. not retweets) and assign as master_df
master_df = master_df[master_df['retweeted_status_id'].isnull()]

##### Test:

In [26]:
#check the columns for the four drops and single addition
master_df.head(1)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,dog_title
0,892420643555336193,,,2017-08-01 16:23:56+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,


In [27]:
#check the distinct 'dog_title' values and how often each occurs
master_df.dog_title.value_counts()

                1976
pupper           245
doggo             83
puppo             29
doggopupper       12
floofer            9
doggopuppo         1
doggofloofer       1
Name: dog_title, dtype: int64

**Note:** Uh oh! It looks like the possibility of two dog titles was not taken into consideration.  In order to completely clean this data set, one would need to iterate and create a secondary dog title column.

In [50]:
#check for remaining values in master_df.retweeted_status_id; an empty result means no retweets are left
master_df.retweeted_status_id.value_counts()

Series([], Name: retweeted_status_id, dtype: int64)

**Step 4:** Address the remaining two quality issues in the `pred_clean` table.

- some of the `pred_df` observations are probably not dogs
- dog types in the p1, p2 and p3 columns of the `pred_df` table are inconsistently capitalized

##### Definition:

- drop the rows in `pred_clean` where the `p1_dog` column is False
- convert all rows in the `p1` column to lowercase. (`p2` and `p3` will be dropped eventually, so they are ignored)

##### Code:

In [28]:
#keep only the rows where the observation is probably a dog
#.copy() makes the result an independent dataframe, so assigning to its columns later
#does not raise a SettingWithCopyWarning
pred_clean = pred_clean.query('p1_dog == True').copy()

In [29]:
#lowercase every string in the 'p1' column
pred_clean['p1'] = pred_clean['p1'].str.lower()

##### Test:

In [30]:
#confirm that pred_clean holds no observations that are probably not dogs
len(pred_clean.query('p1_dog != True')) == 0

True

In [32]:
#count the strings in 'p1' that still contain a capital letter; 0 means every value is lowercase
pred_clean['p1'].str.contains('[A-Z]', regex=True).sum()

0

**Step 5:** The remaining quality issue and four tidiness issues are addressed by joining tables.

- since some tweet_ids in `twdf` and `json_df` are not in the pred_df, there is missing image data
- missing `twdf['expanded_urls']` data
- some of the `json_df` tweet_id's have no prediction data
- there is no prediction info for some of the tweet_ids
- all data should be in a single table without the 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', and 'retweeted_status_user_id' columns
- p2 and p3 info from pred_df does not need to be included since it is not relevant to the desired statistical analysis and visualization

##### Define:

- First, drop the columns that are unnecessary for this analysis:
 - `in_reply_to_status_id`
 - `in_reply_to_user_id`
 - `source`
 - `retweeted_status_id`
 - `retweeted_status_user_id`
 - `retweeted_status_timestamp`
 - `expanded_urls`
 

##### Code:

In [62]:
#view shape, column names and datatypes of master_df
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175 entries, 0 to 2355
Data columns (total 14 columns):
tweet_id                      2175 non-null object
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2175 non-null datetime64[ns, UTC]
source                        2175 non-null object
text                          2175 non-null object
retweeted_status_id           0 non-null float64
retweeted_status_user_id      0 non-null float64
retweeted_status_timestamp    0 non-null object
expanded_urls                 2117 non-null object
rating_numerator              2175 non-null int64
rating_denominator            2175 non-null int64
name                          2175 non-null object
dog_title                     2175 non-null object
dtypes: datetime64[ns, UTC](1), float64(4), int64(2), object(7)
memory usage: 254.9+ KB


In [63]:
#view shape, column names and datatypes of pred_clean
pred_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1532 entries, 0 to 2073
Data columns (total 12 columns):
tweet_id    1532 non-null object
jpg_url     1532 non-null object
img_num     1532 non-null int64
p1          1532 non-null object
p1_conf     1532 non-null float64
p1_dog      1532 non-null bool
p2          1532 non-null object
p2_conf     1532 non-null float64
p2_dog      1532 non-null bool
p3          1532 non-null object
p3_conf     1532 non-null float64
p3_dog      1532 non-null bool
dtypes: bool(3), float64(3), int64(1), object(5)
memory usage: 124.2+ KB


##### Test:

## Part IV: Analyze and Visualize Data