### GATHER :
================

Working environment : Jupyter Notebook - python version : '0.23.3'

Gathering is the first step in the data wrangling process.
    
    1-Import all needed libraries 
    2-Obtaining data from different resources :

        -Reading from .csv file using pandas (twitter-archive-enhanced.csv) 
        -Downloading  .tsv file from the internet using requests (image-predictions.tsv) 
        -Getting JSON object of all the tweet_ids using TweepyQuerying an API (tweet_json.txt)
    3-Importing that data into data frames 

In [141]:
# Import all required libraries 
import numpy as np
import pandas as pd
import json
import re
import requests
import configparser
import datetime
import matplotlib.pyplot as plt
import os
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore') 
%matplotlib inline 
import tweepy
from tweepy import OAuthHandler
from timeit import default_timer as timer

In [142]:
pd. __version__ 

'0.23.3'

In [143]:
# load twitter-archive-enhanced.CSV into a data frame
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')

In [144]:
# load image_predictions.tsv file programatically using the Requests Library
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)

# save to .tsv file
with open('image_predictions.tsv', 'wb') as file:
    file.write(response.content)


In [145]:
image_predictions = pd.read_csv('image_predictions.tsv',sep='\t')

In [146]:
# Create a copy of the data frames 
archive_df = twitter_archive.copy()
Image_predictions_df = image_predictions.copy()

In [147]:
# inspect twitter_archive dataframe
archive_df.shape

(2356, 17)

In [148]:
archive_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [149]:
archive_df.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [150]:
# inspect Image_predictions data frame
Image_predictions_df.shape

(2075, 12)

In [151]:
Image_predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [152]:
Image_predictions_df.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [153]:
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
#consumer_key = 'HIDDEN'
#consumer_secret = 'HIDDEN'
#access_token = 'HIDDEN'
#access_secret = 'HIDDEN'

#auth = OAuthHandler(consumer_key, consumer_secret)
#auth.set_access_token(access_token, access_secret)

#api = tweepy.API(auth, wait_on_rate_limit=True)


# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor

# Tweet IDs for which to gather additional data via Twitter's API
#tweet_ids = twitter_archive.tweet_id.values
#len(tweet_ids)

# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
#count = 0
#fails_dict = {}
#start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
#with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
#    for tweet_id in tweet_ids:
#        count += 1
#        print(str(count) + ": " + str(tweet_id))
#        try:
#            tweet = api.get_status(tweet_id, tweet_mode='extended')
#            print("Success")
#            json.dump(tweet._json, outfile)
#            outfile.write('\n')
#        except tweepy.TweepError as e:
#            print("Fail")
#            fails_dict[tweet_id] = e
#            pass
#end = timer()
#print(end - start)
#print(fails_dict)

In [154]:
# Create new dataframe with id, retweet_count, and favorite_count from the gived JSON file from UDACITY
# change id to tweet_id for consistence in naming Variables

tweet_json = open('tweet-json.txt', 'r')
api_df = pd.DataFrame(columns=['tweet_id', 'retweets', 'favorites'])

for line in tweet_json:
    tweet = json.loads(line)
    api_df = api_df.append({'tweet_id': tweet['id'], 'retweets': tweet['retweet_count'], 'favorites': tweet['favorite_count']}, ignore_index=True)
tweet_json.close()

In [155]:
# make sure that dataframe is created and rows are selected
api_df.shape

(2354, 3)

In [156]:
api_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 3 columns):
tweet_id     2354 non-null object
retweets     2354 non-null object
favorites    2354 non-null object
dtypes: object(3)
memory usage: 55.2+ KB


In [157]:
# inspect image_predictions data frame 
api_df.head()

Unnamed: 0,tweet_id,retweets,favorites
0,892420643555336193,8853,39467
1,892177421306343426,6514,33819
2,891815181378084864,4328,25461
3,891689557279858688,8964,42908
4,891327558926688256,9774,41048


ASSESS:
========

After obtaining these data into data frames, we assess them ( in excel and jupyter notebook) visually and programmatically for detecting and documenting both quality and tidiness issues

archive_df Inspection:
===================

In [158]:
archive_df.describe()


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


In [159]:
archive_df.sample(50)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1669,682429480204398592,,,2015-12-31 05:14:01 +0000,"<a href=""http://twitter.com/download/iphone"" r...","I know we joke around on here, but this is get...",,,,https://twitter.com/dog_rates/status/682429480...,8,10,,,,,
2134,670069087419133954,,,2015-11-27 02:38:14 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Randall. He's from Chernobyl. Built pl...,,,,https://twitter.com/dog_rates/status/670069087...,5,10,Randall,,,,
614,796759840936919040,,,2016-11-10 17:02:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to Romeo. He was just told that it's...,,,,https://twitter.com/dog_rates/status/796759840...,11,10,Romeo,,,,
1823,676533798876651520,,,2015-12-14 22:46:41 +0000,"<a href=""http://twitter.com/download/iphone"" r...",ITSOFLUFFAYYYYY 12/10 https://t.co/bfw13CnuuZ,,,,https://twitter.com/dog_rates/status/676533798...,12,10,,,,,
1426,697881462549430272,,,2016-02-11 20:34:41 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Carter. He wakes up in the morning and...,,,,https://twitter.com/dog_rates/status/697881462...,10,10,Carter,,,,
1073,739932936087216128,,,2016-06-06 21:32:13 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to Rorie. She's zen af. Just enjoyin...,,,,https://twitter.com/dog_rates/status/739932936...,10,10,Rorie,,,,
829,768909767477751808,,,2016-08-25 20:35:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: When it's Janet from accounting...,7.001438e+17,4196984000.0,2016-02-18 02:24:13 +0000,https://twitter.com/dog_rates/status/700143752...,10,10,,,,pupper,
1689,681340665377193984,6.813394e+17,4196984000.0,2015-12-28 05:07:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",I've been told there's a slight possibility he...,,,,,5,10,,,,,
1873,675145476954566656,,,2015-12-11 02:49:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...",What an honor. 3 dogs here. Blond one is clear...,,,,https://twitter.com/dog_rates/status/675145476...,9,10,,,,,
1374,701952816642965504,,,2016-02-23 02:12:47 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Rilo. He's a Northern Curly Ticonderoga. ...,,,,https://twitter.com/dog_rates/status/701952816...,11,10,Rilo,,,,


Image_predictions_df Inspection:
============================

In [160]:
print ( 'Total duplicated rows:', archive_df.duplicated().sum())

Total duplicated rows: 0


In [161]:
print ( 'Total Null Values :', archive_df.isnull().sum().sum())

Total Null Values : 11140


In [162]:
archive_df.isnull().sum()

tweet_id                         0
in_reply_to_status_id         2278
in_reply_to_user_id           2278
timestamp                        0
source                           0
text                             0
retweeted_status_id           2175
retweeted_status_user_id      2175
retweeted_status_timestamp    2175
expanded_urls                   59
rating_numerator                 0
rating_denominator               0
name                             0
doggo                            0
floofer                          0
pupper                           0
puppo                            0
dtype: int64

In [163]:
print('Columns are :',archive_df.columns)

Columns are : Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo'],
      dtype='object')


In [164]:
# Make sure numerators are consistent.
archive_df.rating_numerator.unique()

array([  13,   12,   14,    5,   17,   11,   10,  420,  666,    6,   15,
        182,  960,    0,   75,    7,   84,    9,   24,    8,    1,   27,
          3,    4,  165, 1776,  204,   50,   99,   80,   45,   60,   44,
        143,  121,   20,   26,    2,  144,   88])

In [165]:
archive_df.rating_denominator.unique()

array([ 10,   0,  15,  70,   7,  11, 150, 170,  20,  50,  90,  80,  40,
       130, 110,  16, 120,   2])

In [166]:
# checking 'rating_numerator' for irregular values < 10 or >14 
archive_df.loc[(archive_df['rating_numerator'] < 10 ) | (archive_df['rating_numerator'] > 14 )]


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
45,883482846933004288,,,2017-07-08 00:28:19 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bella. She hopes her smile made you sm...,,,,https://twitter.com/dog_rates/status/883482846...,5,10,Bella,,,,
55,881633300179243008,8.816070e+17,4.738443e+07,2017-07-02 21:58:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@roushfenway These are good dogs but 17/10 is ...,,,,,17,10,,,,,
188,855862651834028034,8.558616e+17,1.943518e+08,2017-04-22 19:15:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@dhmontgomery We also gave snoop dogg a 420/10...,,,,,420,10,,,,,
189,855860136149123072,8.558585e+17,1.361572e+07,2017-04-22 19:05:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@s8n You tried very hard to portray this good ...,,,,,666,10,,,,,
229,848212111729840128,,,2017-04-01 16:35:01 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Jerry. He's doing a distinguished tong...,,,,https://twitter.com/dog_rates/status/848212111...,6,10,Jerry,,,,
285,838916489579200512,,,2017-03-07 00:57:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @KibaDva: I collected all the good dogs!! 1...,8.389060e+17,8.117408e+08,2017-03-07 00:15:46 +0000,https://twitter.com/KibaDva/status/83890598062...,15,10,,,,,
290,838150277551247360,8.381455e+17,2.195506e+07,2017-03-04 22:12:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@markhoppus 182/10,,,,,182,10,,,,,
291,838085839343206401,8.380855e+17,2.894131e+09,2017-03-04 17:56:49 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@bragg6of8 @Andy_Pace_ we are still looking fo...,,,,,15,10,,,,,
313,835246439529840640,8.352460e+17,2.625958e+07,2017-02-24 21:54:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@jonnysun @Lin_Manuel ok jomny I know you're e...,,,,,960,0,,,,,
315,835152434251116546,,,2017-02-24 15:40:31 +0000,"<a href=""http://twitter.com/download/iphone"" r...",When you're so blinded by your systematic plag...,,,,https://twitter.com/dog_rates/status/835152434...,0,10,,,,,


In [167]:
# checking 'rating_denominator' for irregular values < 10 or >14 
archive_df.loc[(archive_df['rating_denominator'] < 10 ) | (archive_df['rating_denominator'] > 14 )]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
313,835246439529840640,8.35246e+17,26259580.0,2017-02-24 21:54:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@jonnysun @Lin_Manuel ok jomny I know you're e...,,,,,960,0,,,,,
342,832088576586297345,8.320875e+17,30582080.0,2017-02-16 04:45:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@docmisterio account started on 11/15/15,,,,,11,15,,,,,
433,820690176645140481,,,2017-01-15 17:52:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",The floofs have been released I repeat the flo...,,,,https://twitter.com/dog_rates/status/820690176...,84,70,,,,,
516,810984652412424192,,,2016-12-19 23:06:23 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Sam. She smiles 24/7 &amp; secretly aspir...,,,,"https://www.gofundme.com/sams-smile,https://tw...",24,7,Sam,,,,
902,758467244762497024,,,2016-07-28 01:00:57 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Why does this never happen at my front door......,,,,https://twitter.com/dog_rates/status/758467244...,165,150,,,,,
1120,731156023742988288,,,2016-05-13 16:15:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to this unbelievably well behaved sq...,,,,https://twitter.com/dog_rates/status/731156023...,204,170,this,,,,
1165,722974582966214656,,,2016-04-21 02:25:47 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Happy 4/20 from the squad! 13/10 for all https...,,,,https://twitter.com/dog_rates/status/722974582...,4,20,,,,,
1202,716439118184652801,,,2016-04-03 01:36:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bluebert. He just saw that both #Final...,,,,https://twitter.com/dog_rates/status/716439118...,50,50,Bluebert,,,,
1228,713900603437621249,,,2016-03-27 01:29:02 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Happy Saturday here's 9 puppers on a bench. 99...,,,,https://twitter.com/dog_rates/status/713900603...,99,90,,,,,
1254,710658690886586372,,,2016-03-18 02:46:49 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here's a brigade of puppers. All look very pre...,,,,https://twitter.com/dog_rates/status/710658690...,80,80,,,,,


In [168]:
# rating statistics to get the meaan , 5 number summary ,count 
archive_df['rating_numerator'].describe()

count    2356.000000
mean       13.126486
std        45.876648
min         0.000000
25%        10.000000
50%        11.000000
75%        12.000000
max      1776.000000
Name: rating_numerator, dtype: float64

In [169]:
archive_df['rating_denominator'].describe()

count    2356.000000
mean       10.455433
std         6.745237
min         0.000000
25%        10.000000
50%        10.000000
75%        10.000000
max       170.000000
Name: rating_denominator, dtype: float64

In [170]:
#check empty rating 
archive_df['rating_denominator'].isnull().sum()

0

In [171]:
archive_df['rating_numerator'].isnull().sum()

0

In [172]:
# While assessing visually in excel " GÃ²rdÃ³n , AmÃ©lie ,  OliviÃ©r ,FrÃ¶nq ,RalphÃ©.. etc" this was inspected visually in the excel sheet and after inspecting it programatically 
# I figured out that its French names it not considered a quality issue any more 
archive_df.iloc[2217]
archive_df.iloc[2195]
archive_df.iloc[2164]

tweet_id                                                     669371483794317312
in_reply_to_status_id                                                       NaN
in_reply_to_user_id                                                         NaN
timestamp                                             2015-11-25 04:26:12 +0000
source                        <a href="http://twitter.com/download/iphone" r...
text                          This is Oliviér. He's a Baptist Hindquarter. A...
retweeted_status_id                                                         NaN
retweeted_status_user_id                                                    NaN
retweeted_status_timestamp                                                  NaN
expanded_urls                 https://twitter.com/dog_rates/status/669371483...
rating_numerator                                                             10
rating_denominator                                                           10
name                                    

In [174]:
# Many names are invalid names and many NONE values are observed
archive_df['name'].value_counts()

None         745
a             55
Charlie       12
Cooper        11
Lucy          11
Oliver        11
Penny         10
Tucker        10
Lola          10
Winston        9
Bo             9
Sadie          8
the            8
Buddy          7
Bailey         7
Daisy          7
Toby           7
an             7
Jax            6
Bella          6
Milo           6
Oscar          6
Stanley        6
Rusty          6
Dave           6
Jack           6
Scout          6
Leo            6
Koda           6
Alfie          5
            ... 
Jebberson      1
Ralpher        1
Sprout         1
Skittles       1
Ester          1
Harrison       1
Nimbus         1
Halo           1
Snicku         1
Bubba          1
Bode           1
Brandi         1
Dot            1
Lipton         1
Genevieve      1
Covach         1
Julius         1
Tango          1
Clybe          1
Birf           1
Venti          1
Grizz          1
Vince          1
River          1
Obie           1
Jeffrie        1
Baron          1
Tyrus         

In [175]:
# Inspect dog names 
# Noted : non meaningful / incorrect data and also don't conform to the writing standards 
# Name was extracted in a wrong way .Any value after "This is" is extracted as name
lower= archive_df['name'].str.islower()
lower_values = archive_df['name'].loc[lower].unique()
lower_values

array(['such', 'a', 'quite', 'not', 'one', 'incredibly', 'mad', 'an',
       'very', 'just', 'my', 'his', 'actually', 'getting', 'this',
       'unacceptable', 'all', 'old', 'infuriating', 'the', 'by',
       'officially', 'life', 'light', 'space'], dtype=object)

In [176]:
title= archive_df['name'].str.istitle()
title_values = archive_df['name'].loc[title].unique()
title_values

array(['Phineas', 'Tilly', 'Archie', 'Darla', 'Franklin', 'None', 'Jax',
       'Zoey', 'Cassie', 'Koda', 'Bruno', 'Ted', 'Stuart', 'Oliver', 'Jim',
       'Zeke', 'Ralphus', 'Canela', 'Gerald', 'Jeffrey', 'Maya', 'Mingus',
       'Derek', 'Roscoe', 'Waffles', 'Jimbo', 'Maisey', 'Lilly', 'Earl',
       'Lola', 'Kevin', 'Yogi', 'Noah', 'Bella', 'Grizzwald', 'Rusty',
       'Gus', 'Stanley', 'Alfy', 'Koko', 'Rey', 'Gary', 'Elliot', 'Louis',
       'Jesse', 'Romeo', 'Bailey', 'Duddles', 'Jack', 'Emmy', 'Steven',
       'Beau', 'Snoopy', 'Shadow', 'Terrance', 'Aja', 'Penny', 'Dante',
       'Nelly', 'Ginger', 'Benedict', 'Venti', 'Goose', 'Nugget', 'Cash',
       'Coco', 'Jed', 'Sebastian', 'Walter', 'Sierra', 'Monkey', 'Harry',
       'Kody', 'Lassie', 'Rover', 'Napolean', 'Dawn', 'Boomer', 'Cody',
       'Rumble', 'Clifford', 'Dewey', 'Scout', 'Gizmo', 'Cooper', 'Harold',
       'Shikha', 'Jamesy', 'Lili', 'Sammy', 'Meatball', 'Paisley', 'Albus',
       'Neptune', 'Quinn', 'Belle', 'Zooe

In [177]:
# all title names were changed to lower case for matching process
title= archive_df['name'].str.lower()
title.unique()

array(['phineas', 'tilly', 'archie', 'darla', 'franklin', 'none', 'jax',
       'zoey', 'cassie', 'koda', 'bruno', 'ted', 'stuart', 'oliver', 'jim',
       'zeke', 'ralphus', 'canela', 'gerald', 'jeffrey', 'such', 'maya',
       'mingus', 'derek', 'roscoe', 'waffles', 'jimbo', 'maisey', 'lilly',
       'earl', 'lola', 'kevin', 'yogi', 'noah', 'bella', 'grizzwald',
       'rusty', 'gus', 'stanley', 'alfy', 'koko', 'rey', 'gary', 'a',
       'elliot', 'louis', 'jesse', 'romeo', 'bailey', 'duddles', 'jack',
       'emmy', 'steven', 'beau', 'snoopy', 'shadow', 'terrance', 'aja',
       'penny', 'dante', 'nelly', 'ginger', 'benedict', 'venti', 'goose',
       'nugget', 'cash', 'coco', 'jed', 'sebastian', 'walter', 'sierra',
       'monkey', 'harry', 'kody', 'lassie', 'rover', 'napolean', 'dawn',
       'boomer', 'cody', 'rumble', 'clifford', 'quite', 'dewey', 'scout',
       'gizmo', 'cooper', 'harold', 'shikha', 'jamesy', 'lili', 'sammy',
       'meatball', 'paisley', 'albus', 'neptune', '

In [178]:
type(title)

pandas.core.series.Series

In [179]:
lower_list = set(lower_values)
title_list = set(title)
common_values = title_list.intersection(lower_list)
print (' number of common values are :',len(common_values),"\n",'intersected values are: ' ,common_values)

 number of common values are : 25 
 intersected values are:  {'light', 'actually', 'mad', 'such', 'space', 'just', 'all', 'this', 'life', 'very', 'the', 'by', 'an', 'old', 'getting', 'not', 'a', 'incredibly', 'quite', 'one', 'unacceptable', 'infuriating', 'officially', 'my', 'his'}


In [180]:
# inspect each type and if its complete or contains NONE 
# missing incomplete data are noted 
archive_df['doggo'].value_counts() 

None     2259
doggo      97
Name: doggo, dtype: int64

In [181]:
archive_df['floofer'].value_counts()

None       2346
floofer      10
Name: floofer, dtype: int64

In [182]:
archive_df['pupper'].value_counts()

None      2099
pupper     257
Name: pupper, dtype: int64

In [183]:
archive_df['puppo'].value_counts()

None     2326
puppo      30
Name: puppo, dtype: int64

In [184]:
# Detect records where there are more than one dog type 
archive_df.loc[(archive_df[['doggo', 'floofer', 'pupper', 'puppo']] != 'None').sum(axis=1) > 1]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
191,855851453814013952,,,2017-04-22 18:31:02 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here's a puppo participating in the #ScienceMa...,,,,https://twitter.com/dog_rates/status/855851453...,13,10,,doggo,,,puppo
200,854010172552949760,,,2017-04-17 16:34:26 +0000,"<a href=""http://twitter.com/download/iphone"" r...","At first I thought this was a shy doggo, but i...",,,,https://twitter.com/dog_rates/status/854010172...,11,10,,doggo,floofer,,
460,817777686764523521,,,2017-01-07 16:59:28 +0000,"<a href=""http://twitter.com/download/iphone"" r...","This is Dido. She's playing the lead role in ""...",,,,https://twitter.com/dog_rates/status/817777686...,13,10,Dido,doggo,,pupper,
531,808106460588765185,,,2016-12-12 00:29:28 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have Burke (pupper) and Dexter (doggo)...,,,,https://twitter.com/dog_rates/status/808106460...,12,10,,doggo,,pupper,
565,802265048156610565,7.331095e+17,4196984000.0,2016-11-25 21:37:47 +0000,"<a href=""http://twitter.com/download/iphone"" r...","Like doggo, like pupper version 2. Both 11/10 ...",,,,https://twitter.com/dog_rates/status/802265048...,11,10,,doggo,,pupper,
575,801115127852503040,,,2016-11-22 17:28:25 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bones. He's being haunted by another d...,,,,https://twitter.com/dog_rates/status/801115127...,12,10,Bones,doggo,,pupper,
705,785639753186217984,,,2016-10-11 00:34:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Pinot. He's a sophisticated doggo. You...,,,,https://twitter.com/dog_rates/status/785639753...,10,10,Pinot,doggo,,pupper,
733,781308096455073793,,,2016-09-29 01:42:20 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine -...","Pupper butt 1, Doggo 0. Both 12/10 https://t.c...",,,,https://vine.co/v/5rgu2Law2ut,12,10,,doggo,,pupper,
778,775898661951791106,,,2016-09-14 03:27:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...","RT @dog_rates: Like father (doggo), like son (...",7.331095e+17,4196984000.0,2016-05-19 01:38:16 +0000,https://twitter.com/dog_rates/status/733109485...,12,10,,doggo,,pupper,
822,770093767776997377,,,2016-08-29 03:00:36 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is just downright precious...,7.410673e+17,4196984000.0,2016-06-10 00:39:48 +0000,https://twitter.com/dog_rates/status/741067306...,12,10,just,doggo,,pupper,


In [185]:
# check duplicated name and inspect its records
# retweets are noted which need to be excluded as per project requirement 
archive_df['name'].loc[archive_df['name'].duplicated()]
archive_df.loc[archive_df['name'] == 'Charlie']

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
230,847978865427394560,,,2017-04-01 01:08:10 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Charlie. He fell asleep...,8.323699e+17,4196984000.0,2017-02-16 23:23:38 +0000,https://twitter.com/dog_rates/status/832369877...,11,10,Charlie,,,,
254,844580511645339650,,,2017-03-22 16:04:20 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Charlie. He wants to know if you have ...,,,,https://twitter.com/dog_rates/status/844580511...,11,10,Charlie,,,,
261,842846295480000512,,,2017-03-17 21:13:10 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Charlie. He's wishing you a very fun a...,,,,https://twitter.com/dog_rates/status/842846295...,13,10,Charlie,,,,
326,833826103416520705,,,2017-02-20 23:50:09 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Charlie. She asked u to change the channe...,,,,https://twitter.com/dog_rates/status/833826103...,13,10,Charlie,,,,
338,832369877331693569,,,2017-02-16 23:23:38 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Charlie. He fell asleep on a heating v...,,,,https://twitter.com/dog_rates/status/832369877...,11,10,Charlie,,,,
383,827199976799354881,,,2017-02-02 17:00:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Charlie. He wins every game of chess h...,,,,https://twitter.com/dog_rates/status/827199976...,13,10,Charlie,,,,
813,771102124360998913,,,2016-08-31 21:47:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Charlie. He works for @TODAYshow. Supe...,,,,https://twitter.com/dog_rates/status/771102124...,12,10,Charlie,,,,
932,754011816964026368,,,2016-07-15 17:56:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Charlie. He pouts until he gets to go ...,,,,https://twitter.com/dog_rates/status/754011816...,12,10,Charlie,,,,
1358,703382836347330562,,,2016-02-27 00:55:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Charlie. He's a West Side Niddlewog. M...,,,,https://twitter.com/dog_rates/status/703382836...,12,10,Charlie,,,,
1436,697255105972801536,,,2016-02-10 03:05:46 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Charlie. He likes to kiss all the big mil...,,,,https://twitter.com/dog_rates/status/697255105...,10,10,Charlie,,,,


Image_predictions_df Inspection:
============================

In [186]:
Image_predictions_df.sample(50)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1238,746818907684614144,https://pbs.twimg.com/media/Cl071YVWEAAlF7N.jpg,1,dingo,0.175518,False,timber_wolf,0.133647,False,Ibizan_hound,0.101537,True
274,670833812859932673,https://pbs.twimg.com/media/CU9HyzSWIAAVcte.jpg,1,Pekinese,0.609853,True,Persian_cat,0.265442,False,Japanese_spaniel,0.02746,True
277,670842764863651840,https://pbs.twimg.com/media/CU9P717W4AAOlKx.jpg,1,microphone,0.096063,False,accordion,0.094075,False,drumstick,0.061113,False
1804,832215726631055365,https://pbs.twimg.com/media/CwJR1okWIAA6XMp.jpg,1,Afghan_hound,0.274637,True,borzoi,0.142204,True,doormat,0.109677,False
1503,784826020293709826,https://pbs.twimg.com/media/CuRDF-XWcAIZSer.jpg,1,chow,0.090341,True,binoculars,0.083499,False,Irish_setter,0.077456,True
718,685667379192414208,https://pbs.twimg.com/media/CYP62A6WkAAOnL4.jpg,1,sliding_door,0.344526,False,doormat,0.190027,False,washbasin,0.046326,False
1512,786363235746385920,https://pbs.twimg.com/media/Cum5LlfWAAAyPcS.jpg,1,golden_retriever,0.929266,True,Labrador_retriever,0.062867,True,Saluki,0.002157,True
1235,746507379341139972,https://pbs.twimg.com/media/Clwgf4bWgAAB15c.jpg,1,toy_poodle,0.508292,True,Lakeland_terrier,0.234458,True,affenpinscher,0.084563,True
118,668154635664932864,https://pbs.twimg.com/media/CUXDGR2WcAAUQKz.jpg,1,Arctic_fox,0.473584,False,wallaby,0.261411,False,white_wolf,0.080948,False
1628,804738756058218496,https://pbs.twimg.com/media/CysBn-lWIAAoRx1.jpg,1,Tibetan_mastiff,0.91579,True,German_shepherd,0.06248,True,Leonberg,0.008297,True


In [187]:
Image_predictions_df.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [188]:
#checking completeness of data
Image_predictions_df['p1'].isnull().sum()

0

In [189]:
Image_predictions_df['p2'].isnull().sum()

0

In [190]:
Image_predictions_df['p3'].isnull().sum()

0

In [191]:
#Examine p1, p2 and p3 to find top predicted 
#noted that (Labrador_retriever) is most predicted breed type
Image_predictions_df['p1'].describe()

count                 2075
unique                 378
top       golden_retriever
freq                   150
Name: p1, dtype: object

In [192]:
Image_predictions_df['p2'].describe()

count                   2075
unique                   405
top       Labrador_retriever
freq                     104
Name: p2, dtype: object

In [193]:
Image_predictions_df['p3'].describe()

count                   2075
unique                   408
top       Labrador_retriever
freq                      79
Name: p3, dtype: object

In [194]:
#find count of false vaues for p1_dogs 
#locate the predicted values for them 
(Image_predictions_df['p1_dog']== False ).sum()

543

In [195]:
#locate the predicted values for them 
Image_predictions_df['p1'].loc[(Image_predictions_df['p1_dog']== False )]

6             box_turtle
8          shopping_cart
17                   hen
18      desktop_computer
21      three-toed_sloth
22                    ox
25            guinea_pig
29                  coho
33                 llama
43             seat_belt
45                 snail
50           triceratops
51                  swab
52                   hay
53                 hyena
56         jigsaw_puzzle
69                vacuum
73                 teddy
77             porcupine
78                 goose
87                  hare
93          electric_fan
94              web_site
95              web_site
96                  ibex
98           fire_engine
100             lorikeet
103              toyshop
106        jigsaw_puzzle
107        common_iguana
              ...       
1900            web_site
1902          pencil_box
1904              barrow
1905              prison
1906             barbell
1910              grille
1931            revolver
1932              Angora
1936          chimpanzee


In [196]:
#find count of false vaules for p2_dogs
(Image_predictions_df['p2_dog']== False ).sum()

522

In [197]:
#locate the predicted values for them 
Image_predictions_df['p1'].loc[(Image_predictions_df['p1_dog']== False )]

6             box_turtle
8          shopping_cart
17                   hen
18      desktop_computer
21      three-toed_sloth
22                    ox
25            guinea_pig
29                  coho
33                 llama
43             seat_belt
45                 snail
50           triceratops
51                  swab
52                   hay
53                 hyena
56         jigsaw_puzzle
69                vacuum
73                 teddy
77             porcupine
78                 goose
87                  hare
93          electric_fan
94              web_site
95              web_site
96                  ibex
98           fire_engine
100             lorikeet
103              toyshop
106        jigsaw_puzzle
107        common_iguana
              ...       
1900            web_site
1902          pencil_box
1904              barrow
1905              prison
1906             barbell
1910              grille
1931            revolver
1932              Angora
1936          chimpanzee


In [198]:
#find count of false vaules for p3_dogs
(Image_predictions_df['p3_dog']== False ).sum()

576

In [199]:
#locate the predicted values for them 
Image_predictions_df['p1'].loc[(Image_predictions_df['p1_dog']== False )]

6             box_turtle
8          shopping_cart
17                   hen
18      desktop_computer
21      three-toed_sloth
22                    ox
25            guinea_pig
29                  coho
33                 llama
43             seat_belt
45                 snail
50           triceratops
51                  swab
52                   hay
53                 hyena
56         jigsaw_puzzle
69                vacuum
73                 teddy
77             porcupine
78                 goose
87                  hare
93          electric_fan
94              web_site
95              web_site
96                  ibex
98           fire_engine
100             lorikeet
103              toyshop
106        jigsaw_puzzle
107        common_iguana
              ...       
1900            web_site
1902          pencil_box
1904              barrow
1905              prison
1906             barbell
1910              grille
1931            revolver
1932              Angora
1936          chimpanzee


In [200]:
#examine the confidence of each prediction ('p1_conf', 'p2_conf', 'p3_conf') statistically.
Image_predictions_df['p1_conf'].describe()

count    2075.000000
mean        0.594548
std         0.271174
min         0.044333
25%         0.364412
50%         0.588230
75%         0.843855
max         1.000000
Name: p1_conf, dtype: float64

In [201]:
Image_predictions_df['p2_conf'].describe()

count    2.075000e+03
mean     1.345886e-01
std      1.006657e-01
min      1.011300e-08
25%      5.388625e-02
50%      1.181810e-01
75%      1.955655e-01
max      4.880140e-01
Name: p2_conf, dtype: float64

In [202]:
Image_predictions_df['p3_conf'].describe()

count    2.075000e+03
mean     6.032417e-02
std      5.090593e-02
min      1.740170e-10
25%      1.622240e-02
50%      4.944380e-02
75%      9.180755e-02
max      2.734190e-01
Name: p3_conf, dtype: float64

In [203]:
#need to check number of images and corresponding number of tweets to see the missing records 
Image_predictions_df['jpg_url'].value_counts()

https://pbs.twimg.com/media/Cq9guJ5WgAADfpF.jpg                                            2
https://pbs.twimg.com/ext_tw_video_thumb/817423809049493505/pu/img/5OFW0yueFu9oTUiQ.jpg    2
https://pbs.twimg.com/media/ChK1tdBWwAQ1flD.jpg                                            2
https://pbs.twimg.com/media/CvyVxQRWEAAdSZS.jpg                                            2
https://pbs.twimg.com/media/Cp6db4-XYAAMmqL.jpg                                            2
https://pbs.twimg.com/tweet_video_thumb/CeBym7oXEAEWbEg.jpg                                2
https://pbs.twimg.com/media/CkNjahBXAAQ2kWo.jpg                                            2
https://pbs.twimg.com/media/DFDw2tyUQAAAFke.jpg                                            2
https://pbs.twimg.com/media/CdHwZd0VIAA4792.jpg                                            2
https://pbs.twimg.com/media/CuRDF-XWcAIZSer.jpg                                            2
https://pbs.twimg.com/media/CvJCabcWgAIoUxW.jpg                       

api_df Inspection:
==================

In [204]:
#examine the 'retweets', 'favorite'_count' statistically.
api_df.columns

Index(['tweet_id', 'retweets', 'favorites'], dtype='object')

In [205]:
api_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 3 columns):
tweet_id     2354 non-null object
retweets     2354 non-null object
favorites    2354 non-null object
dtypes: object(3)
memory usage: 55.2+ KB


In [206]:
# need to change the types of variables for statistical inspections
api_df = api_df.astype(int)
api_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 3 columns):
tweet_id     2354 non-null int64
retweets     2354 non-null int64
favorites    2354 non-null int64
dtypes: int64(3)
memory usage: 55.2 KB


In [207]:
api_df['tweet_id'].describe()

count    2.354000e+03
mean     7.426978e+17
std      6.852812e+16
min      6.660209e+17
25%      6.783975e+17
50%      7.194596e+17
75%      7.993058e+17
max      8.924206e+17
Name: tweet_id, dtype: float64

In [208]:
api_df['retweets'].describe()

count     2354.000000
mean      3164.797366
std       5284.770364
min          0.000000
25%        624.500000
50%       1473.500000
75%       3652.000000
max      79515.000000
Name: retweets, dtype: float64

In [209]:
api_df['favorites'].describe()

count      2354.000000
mean       8080.968564
std       11814.771334
min           0.000000
25%        1415.000000
50%        3603.500000
75%       10122.250000
max      132810.000000
Name: favorites, dtype: float64

In [210]:
api_df['retweets'].value_counts()

1972     5
3652     5
83       5
146      4
61       4
748      4
2243     4
336      4
183      4
179      4
1207     4
265      4
115      4
71       4
1124     4
542      4
819      4
577      4
516      4
397      3
619      3
661      3
2511     3
261      3
431      3
482      3
403      3
557      3
572      3
576      3
        ..
2088     1
1271     1
2030     1
43       1
5365     1
4143     1
3316     1
1263     1
16439    1
2104     1
4125     1
27       1
4121     1
4119     1
4079     1
1285     1
10226    1
8183     1
2042     1
11524    1
6148     1
7        1
1281     1
2060     1
1825     1
8209     1
19       1
2068     1
30742    1
0        1
Name: retweets, Length: 1724, dtype: int64

In [211]:
api_df['favorites'].describe()

count      2354.000000
mean       8080.968564
std       11814.771334
min           0.000000
25%        1415.000000
50%        3603.500000
75%       10122.250000
max      132810.000000
Name: favorites, dtype: float64

In [212]:
api_df['favorites'].value_counts()

0        179
610        3
345        3
2918       3
1691       3
2176       3
2768       3
1339       3
2706       3
522        2
3134       2
1618       2
250        2
2250       2
2660       2
2262       2
2305       2
1111       2
784        2
4878       2
346        2
14685      2
780        2
6923       2
6515       2
2433       2
3603       2
13518      2
3593       2
1536       2
        ... 
4681       1
523        1
559        1
802        1
527        1
27154      1
6676       1
535        1
537        1
6682       1
8731       1
23074      1
21029      1
667        1
6696       1
2608       1
35400      1
21041      1
4659       1
10804      1
4099       1
68152      1
10812      1
573        1
6718       1
33345      1
814        1
23108      1
2630       1
8143       1
Name: favorites, Length: 2007, dtype: int64

In [213]:
#api_df.describe()
api_df.sample(50)

Unnamed: 0,tweet_id,retweets,favorites
1602,685906723014619143,3303,8277
428,821107785811234820,2487,10645
365,828801551087042563,3901,0
2308,666786068205871104,521,800
246,845459076796616705,15071,0
558,803276597545603072,2887,11207
530,808106460588765185,2525,9701
234,847606175596138505,3774,20208
68,879050749262655488,4941,23022
680,788765914992902144,12014,30658


ASSESSMENT SUMMARY :
=====================

Quality Issues : 
=============

A- archive_df:
============

1-WRONG DATA TYPES:
    
    -change 'retweeted_status_timestamp'  type  to datetime 
    -change 'timestamp'          		  type  to datetime


2-INACOMPLETE DATA
	
    -inspect each type and if its complete or contains NONE 
	 
    -missing incomplete data are noted 

MISSING VALUES:

	Total data count :  2356
    78 missing values  from ‘ in_reply_to_status_id ‘      2278     
    78 missing values  from  ‘in_reply_to_user_id’         2278
    181 missing values  from  ‘ retweeted_status_id’         2175
    181 missing values  from  ‘retweeted_status_user_id’    2175
    181 missing values  from  retweeted_status_timestamp’  2175
    2297 missing values  from  ‘expanded_urls ‘              59


3-INVALID DATA
	
    
    -wrong ‘rating_denominator’ , ‘rating_numerator’ must be float
    -Invalid values ‘rating_denominator’ , ‘rating_numerator’  must be higher than 10 & less than 14 
   
     weird data is noted (165 ,1776,143,165,420,666,6,182,960,75,1,27,3,4,48,121)

    -Name: 
	Many names are invalid names and many NONE values are observed
	Noted : non meaningful / incorrect data and also don't conform to the writing standards 
 	Name was extracted in a wrong way .Any value after "This is" is extracted as name





4-INACCURATE DATA
     
     -Name: the main problem in this column arised because it takes any word in the "text" column and after
      "This is ....." statement and it considers it as the dog name
 
     -inaccurate names holding value :  ( a ,an ,the , quite…etc )
     
     -missing names holding value : (none)
     
     - While assessing visually in excel " GÃ²rdÃ³n , AmÃ©lie ,  OliviÃ©r ,FrÃ¶nq ,RalphÃ©.. etc" 
     this was inspected visually in the excel sheet and after inspecting it programatically 
     I figured out that its French names it not considered a quality issue any more 



5-DUPLICATE DATA
    
    -Column'name' has 1399 duplicated 958 unique to be inspected 

6-NON DESCRIPTIVE VAR NAMES 
    
    -TEXT could be descriptive better if its changed to tweet


B- Image_predictions_df:
=====================

1-NON DESCRIPTIVE VAR NAMES:
    
    -Rename column p1 	      as 'predicted_dog1' 
    -Rename column p1_conf    as 'predicted_confidence1' , 
    -Rename column p1_dog'    as prediction_check’ with meaningful names

2-INCONSISTENT DATA
   
    - column  'p1' 
    - column  'p2'
    - column  'p3'
writing standards are not consistent sometimes lower case others are title case  

3-INACOMPLETE DATA
    
    -missing images image_predictions count is 2075 while twitter archive count is 2356

C- api_df:
========

    -Delete august data because there are no corresponding data in image prediction table
    
    -change all column to int for statistical inspections
 
Tidying issues :
=============


archive_df
----------------
melt names of dogs

Image_predictions_df
--------------------------------
melt p, configurations


## CLEAN
    
    
       start cleaninng the inspected issues above



In [214]:
archive_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [215]:
Image_predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [216]:
api_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 3 columns):
tweet_id     2354 non-null int64
retweets     2354 non-null int64
favorites    2354 non-null int64
dtypes: int64(3)
memory usage: 55.2 KB


In [249]:
#Create a master file merging all 3 files together 
master_df = pd.merge(archive_df, Image_predictions_df, how = 'left', on = ['tweet_id'] )
master_df = pd.merge(master_df, api_df, how = 'left', on = ['tweet_id'])
master_df.to_csv('df_master.csv', encoding = 'utf-8')
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 30 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
jpg_url                       20

In [250]:
#remove duplicates / null values / tweers without pictures
# Delete the retweets
master_df = master_df[pd.isnull(master_df.retweeted_status_id)]
# Delete duplicated tweet_id
master_df = master_df.drop_duplicates()
# Delete tweets with no pictures
master_df = master_df.dropna(subset = ['jpg_url'])


In [251]:
master_df.shape

(1994, 30)

In [252]:
# Delete columns related to retweet we don't need anymore
master_df = master_df.drop('retweeted_status_id', 1)
master_df = master_df.drop('retweeted_status_user_id', 1)
master_df = master_df.drop('retweeted_status_timestamp', 1)

In [253]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1994 entries, 0 to 2355
Data columns (total 27 columns):
tweet_id                 1994 non-null int64
in_reply_to_status_id    23 non-null float64
in_reply_to_user_id      23 non-null float64
timestamp                1994 non-null object
source                   1994 non-null object
text                     1994 non-null object
expanded_urls            1994 non-null object
rating_numerator         1994 non-null int64
rating_denominator       1994 non-null int64
name                     1994 non-null object
doggo                    1994 non-null object
floofer                  1994 non-null object
pupper                   1994 non-null object
puppo                    1994 non-null object
jpg_url                  1994 non-null object
img_num                  1994 non-null float64
p1                       1994 non-null object
p1_conf                  1994 non-null float64
p1_dog                   1994 non-null object
p2                    

In [254]:
#need to investigate visually 
master_df.to_csv (r'master.csv', index = False, header=True)

In [255]:
master_df.columns                 

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo',
       'jpg_url', 'img_num', 'p1', 'p1_conf', 'p1_dog', 'p2', 'p2_conf',
       'p2_dog', 'p3', 'p3_conf', 'p3_dog', 'retweets', 'favorites'],
      dtype='object')

In [257]:
# change timestamp to date 
master_df['timestamp']= pd.to_datetime(master_df['timestamp'])

#change rating_denominator  , rating_numerator  to float 

master_df['rating_numerator']= master_df['rating_numerator'].astype(dtype=np.float64)
master_df['rating_denominator']= master_df['rating_denominator'].astype(dtype=np.float64)

master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1994 entries, 0 to 2355
Data columns (total 27 columns):
tweet_id                 1994 non-null int64
in_reply_to_status_id    23 non-null float64
in_reply_to_user_id      23 non-null float64
timestamp                1994 non-null datetime64[ns]
source                   1994 non-null object
text                     1994 non-null object
expanded_urls            1994 non-null object
rating_numerator         1994 non-null float64
rating_denominator       1994 non-null float64
name                     1994 non-null object
doggo                    1994 non-null object
floofer                  1994 non-null object
pupper                   1994 non-null object
puppo                    1994 non-null object
jpg_url                  1994 non-null object
img_num                  1994 non-null float64
p1                       1994 non-null object
p1_conf                  1994 non-null float64
p1_dog                   1994 non-null object
p2        

In [258]:
#make copy for reverting 
master_copy=master_df.copy() 

In [259]:
#calculate rating ratio then drop 'rating_numerator' ,'rating_denominator' column
master_df['rating_ratio'] = master_df['rating_numerator']/master_df['rating_denominator']
master_df = master_df.drop('rating_numerator',1) 
master_df = master_df.drop('rating_denominator',1) 
master_df.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,expanded_urls,name,doggo,floofer,...,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,retweets,favorites,rating_ratio
0,892420643555336193,,,2017-08-01 16:23:56,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,Phineas,,,...,False,bagel,0.085851,False,banana,0.07611,False,8853.0,39467.0,1.3
1,892177421306343426,,,2017-08-01 00:17:27,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,Tilly,,,...,True,Pekinese,0.090647,True,papillon,0.068957,True,6514.0,33819.0,1.3
2,891815181378084864,,,2017-07-31 00:18:03,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,Archie,,,...,True,malamute,0.078253,True,kelpie,0.031379,True,4328.0,25461.0,1.2
3,891689557279858688,,,2017-07-30 15:58:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,https://twitter.com/dog_rates/status/891689557...,Darla,,,...,False,Labrador_retriever,0.168086,True,spatula,0.040836,False,8964.0,42908.0,1.3
4,891327558926688256,,,2017-07-29 16:00:24,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,https://twitter.com/dog_rates/status/891327558...,Franklin,,,...,True,English_springer,0.22577,True,German_short-haired_pointer,0.175219,True,9774.0,41048.0,1.2


In [260]:
# rename the name column to dog_name
master_df.rename(columns={'name':'dog_name','timestamp': 'tweet_date', 'text': 'tweet_text','jpg_url':'img_url','source':'tweet_source'},inplace=True)
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1994 entries, 0 to 2355
Data columns (total 26 columns):
tweet_id                 1994 non-null int64
in_reply_to_status_id    23 non-null float64
in_reply_to_user_id      23 non-null float64
tweet_date               1994 non-null datetime64[ns]
tweet_source             1994 non-null object
tweet_text               1994 non-null object
expanded_urls            1994 non-null object
dog_name                 1994 non-null object
doggo                    1994 non-null object
floofer                  1994 non-null object
pupper                   1994 non-null object
puppo                    1994 non-null object
img_url                  1994 non-null object
img_num                  1994 non-null float64
p1                       1994 non-null object
p1_conf                  1994 non-null float64
p1_dog                   1994 non-null object
p2                       1994 non-null object
p2_conf                  1994 non-null float64
p2_dog     

In [261]:
#check lower case values and change them to title case 
master_df['dog_name'].str.islower().sum()

98

In [262]:
master_df['dog_name'] = master_df['dog_name'].apply(lambda x: x.title())
master_df['dog_name'].str.islower().sum()

0

In [263]:
# change all None to nan values
master_df.dog_name.replace('None', np.nan, inplace= True)
master_df.doggo.replace('None', np.nan, inplace= True)
master_df.floofer.replace('None', np.nan, inplace= True)
master_df.pupper.replace('None', np.nan, inplace= True)
master_df.puppo.replace('None', np.nan, inplace= True)
master_df.sample(20)


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,tweet_date,tweet_source,tweet_text,expanded_urls,dog_name,doggo,floofer,...,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,retweets,favorites,rating_ratio
2225,668286279830867968,,,2015-11-22 04:33:59,"<a href=""http://twitter.com/download/iphone"" r...",Meet Rusty. Rusty's dreaming of a world where ...,https://twitter.com/dog_rates/status/668286279...,Rusty,,,...,True,basset,0.189214,True,Cardigan,0.11301,True,149.0,535.0,1.1
242,846153765933735936,,,2017-03-27 00:15:53,"<a href=""http://twitter.com/download/iphone"" r...",This is Vixen. He really likes bananas. Steals...,https://twitter.com/dog_rates/status/846153765...,Vixen,,,...,True,flat-coated_retriever,0.218451,True,Labrador_retriever,0.10802,True,10226.0,34394.0,1.3
632,793962221541933056,,,2016-11-02 23:45:19,"<a href=""http://twitter.com/download/iphone"" r...",This is Maximus. His face is stuck like that. ...,https://twitter.com/dog_rates/status/793962221...,Maximus,,,...,True,golden_retriever,0.044462,True,Staffordshire_bullterrier,0.016497,True,5711.0,18910.0,1.2
1363,702932127499816960,,,2016-02-25 19:04:13,"<a href=""http://twitter.com/download/iphone"" r...",This is Chip. He's an Upper West Nile Pantaloo...,https://twitter.com/dog_rates/status/702932127...,Chip,,,...,False,wombat,0.239332,False,beaver,0.149605,False,811.0,2825.0,0.6
1191,717841801130979328,,,2016-04-06 22:29:56,"<a href=""http://twitter.com/download/iphone"" r...",This is Barclay. His father was a banana. 11/1...,https://twitter.com/dog_rates/status/717841801...,Barclay,,,...,True,English_springer,0.070113,True,bath_towel,0.002561,False,670.0,2660.0,1.1
135,866450705531457537,,,2017-05-22 00:28:40,"<a href=""http://twitter.com/download/iphone"" r...",This is Jamesy. He gives a kiss to every other...,https://twitter.com/dog_rates/status/866450705...,Jamesy,,,...,True,Boston_bull,0.07806,True,pug,0.001771,True,32883.0,106827.0,1.3
252,844973813909606400,,,2017-03-23 18:07:10,"<a href=""http://twitter.com/download/iphone"" r...",This is Brady. He's a recovering alcoholic. De...,https://twitter.com/dog_rates/status/844973813...,Brady,,,...,True,golden_retriever,0.195218,True,Chihuahua,0.01732,True,3617.0,16361.0,1.2
325,833863086058651648,,,2017-02-21 02:17:06,"<a href=""http://twitter.com/download/iphone"" r...",This is Bentley. Hairbrushes are his favorite ...,https://twitter.com/dog_rates/status/833863086...,Bentley,,,...,True,Great_Pyrenees,0.312632,True,golden_retriever,0.141736,True,2729.0,14661.0,1.2
1249,711306686208872448,,,2016-03-19 21:41:44,"<a href=""http://twitter.com/download/iphone"" r...",What hooligan sent in pictures w/out a dog in ...,https://twitter.com/dog_rates/status/711306686...,,,,...,False,loggerhead,0.12329,False,Dandie_Dinmont,0.086792,True,819.0,3596.0,0.3
161,860563773140209665,,,2017-05-05 18:36:06,"<a href=""http://twitter.com/download/iphone"" r...",Meet Lorenzo. He's an avid nifty hat wearer an...,https://www.gofundme.com/help-lorenzo-beat-can...,Lorenzo,,,...,True,Pembroke,0.055979,True,beagle,0.045896,True,2334.0,7878.0,1.3


In [264]:
master_df['dog_name'].isnull().sum()

546

In [265]:
nan_dogs= master_df.loc[master_df.dog_name.isnull()]
master_df.drop(nan_dogs.index,axis= 0,inplace=True)
master_df['dog_name'].isnull().sum()

0

In [267]:
master_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1448 entries, 0 to 2354
Data columns (total 26 columns):
tweet_id                 1448 non-null int64
in_reply_to_status_id    1 non-null float64
in_reply_to_user_id      1 non-null float64
tweet_date               1448 non-null datetime64[ns]
tweet_source             1448 non-null object
tweet_text               1448 non-null object
expanded_urls            1448 non-null object
dog_name                 1448 non-null object
doggo                    43 non-null object
floofer                  5 non-null object
pupper                   131 non-null object
puppo                    15 non-null object
img_url                  1448 non-null object
img_num                  1448 non-null float64
p1                       1448 non-null object
p1_conf                  1448 non-null float64
p1_dog                   1448 non-null object
p2                       1448 non-null object
p2_conf                  1448 non-null float64
p2_dog               

In [268]:
# extracting confidence and dog breed from all corresponding values per each prediction algorithm
dog_breed = []
confidence = []

def breed_confidence(data):
    if data.p1_dog:
        dog_breed.append(data.p1)
        confidence.append(data.p1_conf)
    elif data.p2_dog:
        dog_breed.append(data.p2)
        confidence.append(data.p2_conf)
    elif data.p3_dog :
        dog_breed.append(data.p3)
        confidence.append(data.p3_conf)
    else:
        dog_breed.append('No breed')
        confidence.append(0)
# apply the function row wise        
master_df.apply(breed_confidence,axis =1)
# assign the new column names
master_df['dog_breed'] = dog_breed
master_df['confidence'] = confidence
# drop the un needed columns now
master_df.drop(columns = ['p1', 'p1_dog', 'p1_conf' , 'p2', 'p2_dog',
                                 'p2_conf' , 'p3', 'p3_dog', 'p3_conf'],axis=1, inplace =True)

In [269]:
master_df

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,tweet_date,tweet_source,tweet_text,expanded_urls,dog_name,doggo,floofer,pupper,puppo,img_url,img_num,retweets,favorites,rating_ratio,dog_breed,confidence
0,892420643555336193,,,2017-08-01 16:23:56,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,Phineas,,,,,https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,1.0,8853.0,39467.0,1.3,No breed,0.000000
1,892177421306343426,,,2017-08-01 00:17:27,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,Tilly,,,,,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1.0,6514.0,33819.0,1.3,Chihuahua,0.323581
2,891815181378084864,,,2017-07-31 00:18:03,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,Archie,,,,,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1.0,4328.0,25461.0,1.2,Chihuahua,0.716012
3,891689557279858688,,,2017-07-30 15:58:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,https://twitter.com/dog_rates/status/891689557...,Darla,,,,,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1.0,8964.0,42908.0,1.3,Labrador_retriever,0.168086
4,891327558926688256,,,2017-07-29 16:00:24,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,https://twitter.com/dog_rates/status/891327558...,Franklin,,,,,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2.0,9774.0,41048.0,1.2,basset,0.555712
6,890971913173991426,,,2017-07-28 16:27:12,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jax. He enjoys ice cream so much he gets ...,"https://gofundme.com/ydvmve-surgery-for-jax,ht...",Jax,,,,,https://pbs.twimg.com/media/DF1eOmZXUAALUcq.jpg,1.0,2158.0,12041.0,1.3,Appenzeller,0.341703
8,890609185150312448,,,2017-07-27 16:25:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Zoey. She doesn't want to be one of th...,https://twitter.com/dog_rates/status/890609185...,Zoey,,,,,https://pbs.twimg.com/media/DFwUU__XcAEpyXI.jpg,1.0,4429.0,28226.0,1.3,Irish_terrier,0.487574
9,890240255349198849,,,2017-07-26 15:59:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Cassie. She is a college pup. Studying...,https://twitter.com/dog_rates/status/890240255...,Cassie,doggo,,,,https://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg,1.0,7711.0,32467.0,1.4,Pembroke,0.511319
10,890006608113172480,,,2017-07-26 00:31:25,"<a href=""http://twitter.com/download/iphone"" r...",This is Koda. He is a South Australian decksha...,https://twitter.com/dog_rates/status/890006608...,Koda,,,,,https://pbs.twimg.com/media/DFnwSY4WAAAMliS.jpg,1.0,7624.0,31166.0,1.3,Samoyed,0.957979
11,889880896479866881,,,2017-07-25 16:11:53,"<a href=""http://twitter.com/download/iphone"" r...",This is Bruno. He is a service shark. Only get...,https://twitter.com/dog_rates/status/889880896...,Bruno,,,,,https://pbs.twimg.com/media/DFl99B1WsAITKsg.jpg,1.0,5156.0,28268.0,1.3,French_bulldog,0.377417


In [270]:
master_df.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id',
       'tweet_date', 'tweet_source', 'tweet_text', 'expanded_urls', 'dog_name',
       'doggo', 'floofer', 'pupper', 'puppo', 'img_url', 'img_num', 'retweets',
       'favorites', 'rating_ratio', 'dog_breed', 'confidence'],
      dtype='object')

In [271]:
master_melted = pd.melt(master_df, id_vars = ['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'tweet_date','tweet_source',
                                              'tweet_text','expanded_urls', 'img_url', 'img_num', 'retweets', 'favorites', 
                                              'rating_ratio','dog_breed', 'confidence' ,'dog_name'],value_vars =['doggo', 'floofer', 'pupper', 'puppo'],var_name = 'types', value_name = 'dog_type')
master_melted

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,tweet_date,tweet_source,tweet_text,expanded_urls,img_url,img_num,retweets,favorites,rating_ratio,dog_breed,confidence,dog_name,types,dog_type
0,892420643555336193,,,2017-08-01 16:23:56,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,1.0,8853.0,39467.0,1.3,No breed,0.000000,Phineas,doggo,
1,892177421306343426,,,2017-08-01 00:17:27,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1.0,6514.0,33819.0,1.3,Chihuahua,0.323581,Tilly,doggo,
2,891815181378084864,,,2017-07-31 00:18:03,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1.0,4328.0,25461.0,1.2,Chihuahua,0.716012,Archie,doggo,
3,891689557279858688,,,2017-07-30 15:58:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,https://twitter.com/dog_rates/status/891689557...,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1.0,8964.0,42908.0,1.3,Labrador_retriever,0.168086,Darla,doggo,
4,891327558926688256,,,2017-07-29 16:00:24,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,https://twitter.com/dog_rates/status/891327558...,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2.0,9774.0,41048.0,1.2,basset,0.555712,Franklin,doggo,
5,890971913173991426,,,2017-07-28 16:27:12,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jax. He enjoys ice cream so much he gets ...,"https://gofundme.com/ydvmve-surgery-for-jax,ht...",https://pbs.twimg.com/media/DF1eOmZXUAALUcq.jpg,1.0,2158.0,12041.0,1.3,Appenzeller,0.341703,Jax,doggo,
6,890609185150312448,,,2017-07-27 16:25:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Zoey. She doesn't want to be one of th...,https://twitter.com/dog_rates/status/890609185...,https://pbs.twimg.com/media/DFwUU__XcAEpyXI.jpg,1.0,4429.0,28226.0,1.3,Irish_terrier,0.487574,Zoey,doggo,
7,890240255349198849,,,2017-07-26 15:59:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Cassie. She is a college pup. Studying...,https://twitter.com/dog_rates/status/890240255...,https://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg,1.0,7711.0,32467.0,1.4,Pembroke,0.511319,Cassie,doggo,doggo
8,890006608113172480,,,2017-07-26 00:31:25,"<a href=""http://twitter.com/download/iphone"" r...",This is Koda. He is a South Australian decksha...,https://twitter.com/dog_rates/status/890006608...,https://pbs.twimg.com/media/DFnwSY4WAAAMliS.jpg,1.0,7624.0,31166.0,1.3,Samoyed,0.957979,Koda,doggo,
9,889880896479866881,,,2017-07-25 16:11:53,"<a href=""http://twitter.com/download/iphone"" r...",This is Bruno. He is a service shark. Only get...,https://twitter.com/dog_rates/status/889880896...,https://pbs.twimg.com/media/DFl99B1WsAITKsg.jpg,1.0,5156.0,28268.0,1.3,French_bulldog,0.377417,Bruno,doggo,


In [272]:
master_melted = master_melted.drop('types', 1)
master_melted

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,tweet_date,tweet_source,tweet_text,expanded_urls,img_url,img_num,retweets,favorites,rating_ratio,dog_breed,confidence,dog_name,dog_type
0,892420643555336193,,,2017-08-01 16:23:56,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,1.0,8853.0,39467.0,1.3,No breed,0.000000,Phineas,
1,892177421306343426,,,2017-08-01 00:17:27,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1.0,6514.0,33819.0,1.3,Chihuahua,0.323581,Tilly,
2,891815181378084864,,,2017-07-31 00:18:03,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1.0,4328.0,25461.0,1.2,Chihuahua,0.716012,Archie,
3,891689557279858688,,,2017-07-30 15:58:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,https://twitter.com/dog_rates/status/891689557...,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1.0,8964.0,42908.0,1.3,Labrador_retriever,0.168086,Darla,
4,891327558926688256,,,2017-07-29 16:00:24,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,https://twitter.com/dog_rates/status/891327558...,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2.0,9774.0,41048.0,1.2,basset,0.555712,Franklin,
5,890971913173991426,,,2017-07-28 16:27:12,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jax. He enjoys ice cream so much he gets ...,"https://gofundme.com/ydvmve-surgery-for-jax,ht...",https://pbs.twimg.com/media/DF1eOmZXUAALUcq.jpg,1.0,2158.0,12041.0,1.3,Appenzeller,0.341703,Jax,
6,890609185150312448,,,2017-07-27 16:25:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Zoey. She doesn't want to be one of th...,https://twitter.com/dog_rates/status/890609185...,https://pbs.twimg.com/media/DFwUU__XcAEpyXI.jpg,1.0,4429.0,28226.0,1.3,Irish_terrier,0.487574,Zoey,
7,890240255349198849,,,2017-07-26 15:59:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Cassie. She is a college pup. Studying...,https://twitter.com/dog_rates/status/890240255...,https://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg,1.0,7711.0,32467.0,1.4,Pembroke,0.511319,Cassie,doggo
8,890006608113172480,,,2017-07-26 00:31:25,"<a href=""http://twitter.com/download/iphone"" r...",This is Koda. He is a South Australian decksha...,https://twitter.com/dog_rates/status/890006608...,https://pbs.twimg.com/media/DFnwSY4WAAAMliS.jpg,1.0,7624.0,31166.0,1.3,Samoyed,0.957979,Koda,
9,889880896479866881,,,2017-07-25 16:11:53,"<a href=""http://twitter.com/download/iphone"" r...",This is Bruno. He is a service shark. Only get...,https://twitter.com/dog_rates/status/889880896...,https://pbs.twimg.com/media/DFl99B1WsAITKsg.jpg,1.0,5156.0,28268.0,1.3,French_bulldog,0.377417,Bruno,


In [273]:
master_melted['dog_type'].value_counts()

pupper     131
doggo       43
puppo       15
floofer      5
Name: dog_type, dtype: int64

In [274]:
master_melted['dog_type'].isnull().sum()

5598

In [275]:
master_melted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5792 entries, 0 to 5791
Data columns (total 16 columns):
tweet_id                 5792 non-null int64
in_reply_to_status_id    4 non-null float64
in_reply_to_user_id      4 non-null float64
tweet_date               5792 non-null datetime64[ns]
tweet_source             5792 non-null object
tweet_text               5792 non-null object
expanded_urls            5792 non-null object
img_url                  5792 non-null object
img_num                  5792 non-null float64
retweets                 5792 non-null float64
favorites                5792 non-null float64
rating_ratio             5792 non-null float64
dog_breed                5792 non-null object
confidence               5792 non-null float64
dog_name                 5792 non-null object
dog_type                 194 non-null object
dtypes: datetime64[ns](1), float64(7), int64(1), object(7)
memory usage: 724.1+ KB


In [309]:
master_melted.to_csv('Twitter_archive_master.csv',encoding='utf-8',index=False)
