In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Gather Data:

### Import WeRateDogs Twitter archive

In [None]:
# Read the WeRateDogs archive CSV (path is relative to the notebook's working directory)
df_WRD_twitter = pd.read_csv('twitter-archive-enhanced.csv')

### Import image prediction file from url

In [None]:
# Reference: https://www.codementor.io/aviaryan/downloading-files-from-urls-in-python-77q3bs0un

import requests
import os
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

# A timeout keeps the cell from hanging indefinitely if the server stops responding.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx so an HTTP error page is never written to disk as the TSV.
r.raise_for_status()

# The context manager closes the file handle, guaranteeing the bytes are flushed
# before the next cell reads 'image-predictions.tsv'.
with open('image-predictions.tsv', 'wb') as tsv_file:
    tsv_file.write(r.content)

In [None]:
# Parse the tab-separated image-predictions file into a DataFrame
df_img_pred = pd.read_csv('image-predictions.tsv', sep='\t')

### Import Tweet JSON data

In [None]:
# Reference: Dhaval P's answer on his question: https://knowledge.udacity.com/questions/47704
import json

# 'tweet_json.txt' is read line by line; each line is parsed as its own JSON document
with open('tweet_json.txt') as f:
    data = [json.loads(line) for line in f]

# One row per parsed line
df_twit_JSON = pd.DataFrame(data)


## Assess Data:

### Assessing WeRateDogs Twitter Archive Data:

#### Quality Issues:
- Of the 2356 entries, there are only approximately 400 which have a declared dog type (i.e. doggo, puppo, etc.). This is either because there are not enough established variables for the wide variety of dog categories, the majority of tweets do not implement use of dog categories, or the dataset did not extract all the category mentions from the tweets.
- The dog name column ('name') has 745 values extracted as 'None', and several names extracted as 'a', 'the', and 'an'. It appears the original extraction took whatever word followed the tweet's first 'is'. Most of the 'None' values are appropriate, whereas others were simply not extracted, potentially due to the location of the name within the tweet.
- There are 181 retweet entries, and the project dictates only having original tweets. Should be removed.
- There are 78 reply tweet entries, and I'm not sure whether a reply fits the definition of an 'original tweet', even if it includes a new photo, name, and rating. Better to err on the side of caution and remove them.
- Entry at index 313 extracted a rating of '960/0', and needs to be changed to the revised rating of '13/10'

#### Tidiness Issues:
- Dog types (i.e. doggo, puppo, etc.) are in separate variable columns, where if a dog is described as such, the value is the dogtype, whereas if it isn't, the value is a non-null 'None'. Instead the columns could either be framed as Boolean 1's and 0's, or all placed into one 'dog_type' variable column.

In [None]:
# Preview the first rows of the archive frame
df_WRD_twitter.head()

In [None]:
# Column dtypes and non-null counts for the archive frame
df_WRD_twitter.info()

In [None]:
#Reference: https://stackoverflow.com/questions/33042633/selecting-last-n-columns-and-excluding-last-n-columns-in-dataframe
# Labels of the trailing five columns of the archive frame, as a NumPy array.
# NOTE(review): the slice is positional -- confirm the last five columns are the ones intended.
dog_type_cols = df_WRD_twitter.columns.values[-5:]

# Print the frequency table of each selected column in turn
for col_name in dog_type_cols:
    print(df_WRD_twitter[col_name].value_counts())

In [None]:
# Reference: https://stackoverflow.com/questions/25351968/how-to-display-full-non-truncated-dataframe-information-in-html-when-convertin/25352191
# Show full tweet text instead of truncating long strings. `None` means "no limit";
# the older `-1` sentinel was deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

# Text of the first ten tweets whose extracted name is the literal string 'None'.
# A single .loc call selects rows and column together instead of chained indexing.
df_WRD_twitter.loc[df_WRD_twitter['name'] == 'None', 'text'].head(10)

In [None]:
# Text of the first ten tweets whose extracted name is 'a'
df_WRD_twitter.loc[df_WRD_twitter['name'] == 'a', 'text'].head(10)

In [None]:
# Frequency of each distinct rating numerator
df_WRD_twitter['rating_numerator'].value_counts()

In [None]:
# Rows with a numerator of 20 or more, shown with the denominator and tweet text
df_WRD_twitter.loc[
    df_WRD_twitter['rating_numerator'] >= 20,
    ['rating_numerator', 'rating_denominator', 'text'],
]

### Assessing Image Prediction Data:

#### Quality Issues:
- 1
- 2
- 3

#### Tidiness Issues:
- 1
- 2
- 3

In [None]:
# Preview the first rows of the image-prediction frame
df_img_pred.head()

In [None]:
# Column dtypes and non-null counts for the image-prediction frame
df_img_pred.info()

### Assessing Twitter JSON Data:

#### Quality Issues:
- 1
- 2
- 3

#### Tidiness Issues:
- 1
- 2
- 3

In [None]:
# Preview the first rows of the tweet JSON frame
df_twit_JSON.head()

In [None]:
# Column dtypes and non-null counts for the tweet JSON frame
df_twit_JSON.info()

## Clean:

#### Define

#### Code

In [None]:
# Remove the listed columns from the tweet JSON frame.
# `columns=` is required: a bare positional list is interpreted as ROW labels
# (axis=0), which raises KeyError because no row is labelled 'contributors'.
# The result is bound to a new name (drop() returns a copy) so the raw frame stays
# intact and this cell can be re-run without error.
# NOTE(review): 'id' is dropped here -- presumably another column (e.g. 'id_str')
# is kept as the tweet key; confirm before merging with the other frames.
df_twit_JSON_clean = df_twit_JSON.drop(
    columns=[
        'contributors',
        'coordinates',
        'geo',
        'id',
        'in_reply_to_screen_name',
        'in_reply_to_status_id',
        'in_reply_to_status_id_str',
        'in_reply_to_user_id',
        'in_reply_to_user_id_str',
        'place',
        'quoted_status',
        'quoted_status_id',
        'quoted_status_id_str',
    ]
)

# Show a small preview rather than the whole frame
df_twit_JSON_clean.head()

#### Test