# Project: Wrangling and Analyze Data

In [None]:
import tweepy
import requests
import pandas as pd
import json
import time
import math
import re
import os
from timeit import default_timer as timer
from tweepy import OAuthHandler
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline

## Data Gathering
In the cell below, gather **all** three pieces of data for this project and load them in the notebook. **Note:** the methods required to gather each data are different.
1. Directly download the WeRateDogs Twitter archive data (twitter_archive_enhanced.csv)

In [None]:
# Load the WeRateDogs Twitter archive CSV from the working directory.
arch_df = pd.read_csv('twitter-archive-enhanced.csv')

2. Use the Requests library to download the tweet image prediction (image_predictions.tsv)

In [None]:
# Download the image-prediction TSV and load it into a DataFrame.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# The local file name is the last path component of the URL
# ('image-predictions.tsv'), so the download and the read always agree.
predictions_file = url.split('/')[-1]

response = requests.get(url)
# Fail loudly on an HTTP error instead of saving an error page as the TSV.
response.raise_for_status()
with open(predictions_file, mode='wb') as file:
    file.write(response.content)

images = pd.read_csv(predictions_file, sep='\t')

    

3. Use the Tweepy library to query additional data via the Twitter API (tweet_json.txt)

In [None]:
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# The credentials are hidden to comply with Twitter's API terms and conditions
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_secret = 'HIDDEN'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

# NOTE TO STUDENT WITH MOBILE VERIFICATION ISSUES:
# the instructor's script expects a DataFrame holding twitter_archive_enhanced.csv;
# here that DataFrame is arch_df.
# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor

# Tweet IDs for which to gather additional data via Twitter's API
tweet_ids = arch_df.tweet_id.values

# Failed lookups are collected as {tweet_id: exception} and printed at the end
fails_dict = {}
count = 0
start = timer()
# Each tweet's returned JSON is written as one line of the output file.
# The loop will likely take 20-30 minutes because of Twitter's rate limit.
with open('tweet_json.txt', 'w') as outfile:
    for count, tweet_id in enumerate(tweet_ids, start=1):
        print('{}: {}'.format(count, tweet_id))
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:
            print("Fail")
            fails_dict[tweet_id] = e
end = timer()
# Elapsed seconds, then every failure that was recorded
print(end - start)
print(fails_dict)

In [None]:
# Parse the line-delimited tweet JSON, keeping only the id and the two
# engagement counts of each status.
# NOTE(review): the API cell writes 'tweet_json.txt' (underscore) while this
# cell reads 'tweet-json.txt' (hyphen) — confirm which file is intended.
with open('tweet-json.txt', 'r') as json_file:
    statuses = (json.loads(line) for line in json_file)
    df_list = [
        {
            'tweet_id': status['id'],
            'retweet_count': status['retweet_count'],
            'favorite_count': status['favorite_count'],
        }
        for status in statuses
    ]

# Create a DataFrame with tweet ID, favorite count and retweet count
json_tweets = pd.DataFrame(df_list, columns=['tweet_id', 'favorite_count', 'retweet_count'])

## Assessing Data
In this section, detect and document at least **eight (8) quality issues and two (2) tidiness issues**. You must use **both** visual assessment and
programmatic assessment to assess the data.


In [None]:
# Let's look at a preview (first 5 rows) of the archive DataFrame
arch_df.head(5)


In [None]:
# Now let's preview (first 5 rows) the DataFrame built from the tweet JSON file
json_tweets.head(5)

Here I am checking for any missing data within the columns. 

In [None]:
# Print column dtypes and non-null counts for both DataFrames to spot missing data
arch_df.info()
json_tweets.info()

Here, I am checking whether any of the columns contain values outside the range they are supposed to have. For instance, the denominator column should only have 10 as a value, while the numerator column is allowed to vary.

In [None]:
# Summary statistics for rating_denominator: every value should be 10, so a
# min or max other than 10 points at erroneous data.
arch_df['rating_denominator'].describe()

In [None]:
# Count duplicated tweet IDs in each gathered table; any non-zero count is a
# quality issue. Each label names the table that is actually being checked.
print('archived', arch_df.duplicated('tweet_id').sum())
print('json', json_tweets.duplicated('tweet_id').sum())
print('predictions', images.duplicated('tweet_id').sum())

In [None]:
# Compare the tweet_id columns of the two DataFrames: True counts archive IDs
# that also appear in json_tweets, False counts archive IDs missing from it.

arch_df['tweet_id'].isin(json_tweets['tweet_id']).value_counts()

### Quality issues

1. Erroneous Columns

2. Need to delete retweets 

3. Delete reply tweets 

4. Erroneous dog names are present, such as 'by' or 'None'.

5. Abbreviating Source links

6. Fill in all of the missing (NoneType) values with 'None' for more flexibility

7. Some rating denominator values are more or less than 10

8. There are tweets without images. They will need to be removed. 

9. Choose the dog with the highest confidence rating. 


### Tidiness issues
1. Merge Dataframes and get rid of unnecessary columns

2. The dog stage columns (doggo,floofer,pupper,puppo) can be combined into one column. 


In [None]:
# Make copies of the original pieces of data so cleaning never touches them.
# Note: `images` is not copied here; the merge cell uses it directly.
clean_arch = arch_df.copy()
clean_json = json_tweets.copy()


### Tidy Issue #1: 

Merge Dataframes and get rid of unnecessary columns

#### Define:  

We will be merging all 3 dataframes (clean_json, images, and clean_arch) into one master data frame using the pandas merge function. This will prepare us for cleaning the data later on.

#### Code

In [None]:
# Build the master table by left-joining onto the archive, so every archived
# tweet is kept even when it has no image prediction or no JSON counts
# (those columns are then NaN for that row).
master_df = pd.merge(clean_arch, images, how='left', on=['tweet_id'])
master_df = pd.merge(master_df, clean_json, how='left', on=['tweet_id'])



Test

In [None]:
# Confirm the merged frame contains the columns of all three sources
master_df.info()

### Issue #2:  

We need to combine the 4 dog stages (doggo, floofer, pupper, puppo) into one column. 

#### Define

We will be combining the 4 dog stage columns into one column to make the data tidier. We will add a dog_stage column, fill it with the values from the four stage columns, and then drop the extra columns.

#### Code

In [None]:
# Confirm which values occur in each of the four dog-stage columns
for stage_column in ('doggo', 'floofer', 'pupper', 'puppo'):
    print(master_df[stage_column].value_counts())


In [None]:
# Blank out the 'None' placeholders in the four stage columns.
# The result is assigned back to master_df: calling replace(..., inplace=True)
# on `master_df.doggo` works on an intermediate Series and is not guaranteed
# to modify master_df itself (it does not under pandas Copy-on-Write).
stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']
master_df[stage_columns] = master_df[stage_columns].replace('None', '')

# Create the new column by concatenating the four stage columns as strings.
# astype(str) turns a missing value into the text 'nan', which is what the
# keys of the renaming step in the next cell match on.
master_df['dog_stage'] = (
    master_df['doggo'].astype(str)
    + master_df['floofer'].astype(str)
    + master_df['pupper'].astype(str)
    + master_df['puppo'].astype(str)
)
# Testing
master_df.dog_stage.value_counts()


In [None]:
# Rename the concatenated stage strings to readable labels.
# Keys are the exact concatenations produced when dog_stage was built:
#   - exactly one stage present  -> that stage, capitalised
#   - no stage present           -> 'None'
#   - more than one stage        -> 'Unknown'
#   - blank cell                 -> 'Unknown'
stage_labels = {
    'nannannanpuppo': 'Puppo',
    'nannanpuppernan': 'Pupper',
    'nanfloofernannan': 'Floofer',
    'doggonannannan': 'Doggo',
    'nannannannan': 'None',
    'doggonanpuppernan': 'Unknown',
    'doggonannanpuppo': 'Unknown',
    'doggofloofernannan': 'Unknown',
    '': 'Unknown',
}

# Assign the result back instead of using inplace=True on `master_df.dog_stage`,
# which acts on an intermediate Series and may leave master_df unchanged.
master_df['dog_stage'] = master_df['dog_stage'].replace(stage_labels)
master_df.dog_stage.value_counts()



In [None]:
# Drop extra columns
# NOTE(review): drop() returns a new DataFrame and the result is neither
# assigned nor applied with inplace=True, so master_df still contains these
# four columns after this cell. They are actually removed later, in the
# "Delete irrelevant columns" cell, which lists them again.
master_df.drop(['doggo','pupper','floofer','puppo'], axis=1)

#### Test

In [None]:
# List the current column names of master_df
master_df.columns.tolist()

## Cleaning Data
In this section, clean **all** of the issues you documented while assessing. 

In [None]:
# Keep only original tweets that carry an image:
#2 retweets are rows with a retweeted_status_id
is_original = master_df['retweeted_status_id'].isna()
#3 replies are rows with an in_reply_to_status_id
is_not_reply = master_df['in_reply_to_status_id'].isna()
#4 tweets without pictures have no jpg_url
has_image = master_df['jpg_url'].notna()

master_df = master_df[is_original & is_not_reply & has_image]
master_df.info()

In [None]:
# For every tweet, take the first of the three image predictions (p1, p2, p3)
# that is flagged as a dog. np.select uses the first condition that is True,
# so p1 wins over p2, and p2 over p3.
# NOTE(review): presumably p1..p3 are ordered by decreasing confidence, which
# would make this "the most confident dog prediction" — confirm against the data.
conditions = [
    (master_df['p1_dog'] == True),
    (master_df['p2_dog'] == True),
    (master_df['p3_dog'] == True),
]

# Breed name belonging to each condition, in the same order
choices_breed = [master_df['p1'], master_df['p2'], master_df['p3']]

# Confidence level belonging to each condition, in the same order
choices_confidence = [master_df['p1_conf'], master_df['p2_conf'], master_df['p3_conf']]

# Select breed based on first successful condition; 'none' when no prediction is a dog
master_df['breed'] = np.select(conditions, choices_breed, default='none')

# Select predicted confidence level based on first successful condition; 0 when none is a dog
master_df['confidence'] = np.select(conditions, choices_confidence, default=0)

# Test
master_df.head()

In [None]:
# Express the confidence level as a whole-number percentage; the int cast
# discards the fractional part.
master_df['confidence'] = master_df['confidence'].mul(100).astype(int)


In [None]:
# Delete irrelevant columns, grouped by where they came from
prediction_cols = ['p1', 'p1_dog', 'p1_conf',
                   'p2', 'p2_dog', 'p2_conf',
                   'p3', 'p3_dog', 'p3_conf']
reply_cols = ['in_reply_to_status_id', 'in_reply_to_user_id']
retweet_cols = ['retweeted_status_id', 'retweeted_status_user_id',
                'retweeted_status_timestamp']
stage_cols = ['doggo', 'floofer', 'pupper', 'puppo']

master_df.drop(columns=prediction_cols + reply_cols + retweet_cols + stage_cols,
               inplace=True)

master_df.head(1)

In [None]:
#5 Check which columns remain after deleting the irrelevant ones
# (this cell only lists the column names; it does not drop anything)
master_df.columns.tolist()


In [None]:
# Show the first row of master_df
master_df.head(1)

In [None]:
#6 Renaming source links

# Lookup table from the raw HTML anchor stored in `source` to its visible text
source_text = {
    '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>' : 'Twitter for iPhone',
    '<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>' : 'Vine - Make a Scene',
    '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>' : 'Twitter Web Client',
    '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>' : 'TweetDeck'
}


def rename_s(df):
    """Return the abbreviated source name for one row.

    `df` is a single row (anything indexable by 'source'). When its 'source'
    value is a key of `source_text` the mapped short name is returned;
    otherwise the original value is returned unchanged.
    """
    raw_source = df['source']
    return source_text.get(raw_source, raw_source)
    
# Apply rename_s row by row and store the abbreviated names back in `source`
master_df['source'] = master_df.apply(rename_s, axis=1)
# Test: tally the tweets per abbreviated source
master_df['source'].value_counts()

In [None]:
# Change the favorite/retweet counts to whole integers.
# The counts come from a left join, so a tweet missing from the JSON data has
# NaN counts; NaN cannot be cast to int, so such rows are dropped first.
count_columns = ['favorite_count', 'retweet_count']
master_df = master_df.dropna(subset=count_columns).copy()
master_df[count_columns] = master_df[count_columns].astype(int)

#1 Replace the missing entries of the text columns with "None".
# Only non-numeric columns are filled: writing the string 'None' into a
# numeric column would turn it into a mixed-type column.
text_columns = [
    column for column in master_df.columns
    if not pd.api.types.is_numeric_dtype(master_df[column])
]
master_df[text_columns] = master_df[text_columns].fillna('None')

# Names starting with a lowercase letter (e.g. 'by') are treated as
# erroneous dog names; list them for inspection.
names_mask = master_df.name.str.contains('^[a-z]', regex = True)

master_df[names_mask].name.value_counts().sort_index()



In [None]:
# Let's replace the names selected by names_mask (those starting with a
# lowercase letter) with the "None" value.
master_df.loc[names_mask, 'name'] = "None"


In [None]:
# Goal: drop all ratings whose denominator is more than 10 or less than 10.
# This cell only previews the data and drops nothing: it tallies the
# denominators that are >= 10 (so 10 itself is included), largest first.

rating_mask = master_df.rating_denominator >= 10
master_df[rating_mask].rating_denominator.value_counts().sort_index(ascending = False)

In [None]:
# Keep only ratings whose denominator is exactly 10. A `<= 10` filter would
# still keep denominators below 10, which the assessment flagged as erroneous.
master_df = master_df[master_df.rating_denominator == 10]
# Test: 10 should be the only denominator left. The tally is recomputed from
# the filtered frame rather than through a mask built before the filter.
master_df.rating_denominator.value_counts().sort_index(ascending = False)

In [None]:
# Final look at the cleaned data.
# NOTE(review): in a notebook only the last expression of a cell is displayed,
# so the describe() result is computed but not shown — wrap it in print() or
# move it to its own cell if the summary is wanted.
master_df.describe()
master_df.head()

## Storing Data
Save gathered, assessed, and cleaned master dataset to a CSV file named "twitter_archive_master.csv".

In [None]:
# Store the cleaned master dataset under the file name the project brief
# requires ("twitter_archive_master.csv").
# index=False: after the row filtering the index is only a leftover of the
# original row positions, so it is not written as an extra unnamed column.
master_df.to_csv('twitter_archive_master.csv', encoding = 'utf-8', index = False)

## Analyzing and Visualizing Data
In this section, analyze and visualize your wrangled data. You must produce at least **three (3) insights and one (1) visualization.**

1. According to the data, WeRateDogs rates golden retrievers more than any other breed.
2. Which UI source is the most popular? 
3. There is a strong correlation between the number of retweets and the number of favorites a tweet receives.


# Top Favorited Pups

In [None]:
# Count how often each dog stage is mentioned.
# Blank stages are labelled 'Unknown'; the result is assigned back because an
# inplace replace on `master_df.dog_stage` may not reach master_df itself.
master_df['dog_stage'] = master_df['dog_stage'].replace('', 'Unknown')
# Stage labels kept for reference; not used by the visible cells.
labelz = ['Pupper','Doggo','Puppo','Unknown','Floofer']
dog_stage_number = master_df.dog_stage.value_counts().head(6)

# Report the most-mentioned real stage, ignoring the 'None'/'Unknown'
# placeholders, so the printed conclusion always matches the data.
named_stages = dog_stage_number.drop(['None', 'Unknown'], errors='ignore')
top_stage = named_stages.idxmax() if not named_stages.empty else 'n/a'
print('Most Mentioned Dog Stages:', top_stage)
print(dog_stage_number)

# Top Rated Dog

In [None]:
# Total of the rating numerators per dog stage, largest total first
stage_totals = master_df.groupby('dog_stage')['rating_numerator'].sum()
top_rated_dog = stage_totals.sort_values(ascending=False)
print('Top Rated Pup')
print(top_rated_dog)

In [None]:
# Scatter plot of favorites (x) against retweets (y), one point per tweet.
fav_count = master_df.favorite_count
retweet_count = master_df.retweet_count

# A single default colour is used: passing four random numbers as `c` is read
# by matplotlib as one random RGBA colour, not as per-point colours.
plt.scatter(fav_count, retweet_count, alpha = 0.5)

# Title and axis labels replace plt.legend(title=...), which needs labelled
# artists and otherwise draws an empty legend box.
plt.xlabel('Favorites')
plt.ylabel('Retweets')
plt.title('Retweets vs Favorites')

# Most Common Dog Name

In [None]:
# Three most common real dog names. The 'None' placeholder is removed before
# taking the top entries; errors='ignore' keeps this from raising when 'None'
# does not occur at all.
name_counts = master_df.name.value_counts()
common_dog_name = name_counts.drop('None', errors='ignore').head(3)
print(common_dog_name)

### Visualization

In [None]:
# Pie chart of the most common predicted dog breeds.

breed_counts = master_df.breed.value_counts().head(5)
# 'none' marks tweets where no prediction was a dog; errors='ignore' keeps
# this from raising when 'none' is not among the top entries.
most_breeds = breed_counts.drop(index = 'none', errors = 'ignore')
# Labels are derived from the data (e.g. 'golden_retriever' -> 'Golden Retriever')
# so every wedge is guaranteed to carry the name of the breed it represents.
labels = [breed.replace('_', ' ').title() for breed in most_breeds.index]
print(labels)
plt.pie(most_breeds, labels = labels)
plt.legend(loc='upper left', title = 'Common Dog Breeds')
# value_counts() sorts descending, so the first label is the most common breed
print('Most Common Dog Breeds: ', labels[0] if labels else 'n/a')

In [None]:
# Which UI Source is the most popular?

In [None]:
# The owners of the account use the mobile app far more than any other Twitter platform.
# Tally the tweets per posting source to back this up.
master_df['source'].value_counts()
