# Project: Wrangling and Analyze Data

## Import required Python Libraries 

In [1]:
import pandas as pd
import requests
import json
import tweepy
import timeit
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline

ModuleNotFoundError: No module named 'tweepy'

## Data Gathering
In the cell below, gather **all** three pieces of data for this project and load them in the notebook. **Note:** the methods required to gather each data are different.
1. Directly download the WeRateDogs Twitter archive data (twitter_archive_enhanced.csv)

2. Use the Requests library to download the tweet image prediction (image_predictions.tsv)

In [None]:
# Download the image-prediction TSV and store it next to the notebook.
pred_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# Fetch first so a failed request never leaves an empty or partial file behind;
# the timeout keeps the cell from hanging forever on a stalled connection.
pred_data = requests.get(pred_url, timeout=30)
# Raise on an HTTP error instead of saving the error page as if it were the TSV.
pred_data.raise_for_status()
with open('image_predictions.tsv', 'wb') as file:
    file.write(pred_data.content)

3. Use the Tweepy library to query additional data via the Twitter API (tweet_json.txt)

In [None]:
# Read the four API credentials from a local file so they are never hard-coded here.
# The context manager closes the handle as soon as the lines are read.
with open("twitter_keys.txt") as keys:
    lines = keys.readlines()
# Each credential is taken as the second space-separated token of its line.
consumer_key = lines[0].split(' ')[1].strip()
consumer_secret = lines[1].split(' ')[1].strip()
access_token = lines[2].split(' ')[1].strip()
access_token_secret = lines[3].split(' ')[1].strip()

In [None]:
# Build an authenticated tweepy client from the four credentials loaded above.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# NOTE(review): wait_on_rate_limit_notify (and tweepy.TweepError, used in the query
# loop) appear to belong to the tweepy 3.x API — confirm the installed version.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [None]:
df_imagePred = pd.read_csv('image_predictions.tsv',sep='\t')
tweet_ids = df_imagePred.tweet_id.values
len(tweet_ids)

In [None]:
# Query Twitter's API for the JSON of every id in tweet_ids (built above from the
# image-predictions table) and write one JSON document per line to tweet_json.txt.
count = 0
fails_dict = {}
start = timeit.default_timer()
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for count, tweet_id in enumerate(tweet_ids, start=1):
        print(f"{count}: {tweet_id}")
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as err:
            # Remember the failure and carry on with the next id.
            print("Fail", err)
            fails_dict[tweet_id] = err
end = timeit.default_timer()
# Elapsed seconds, followed by every id that could not be fetched and why.
print(end - start)
print(fails_dict)

## Assessing Data
In this section, detect and document at least **eight (8) quality issues and two (2) tidiness issues**. You must use **both** visual assessment and
programmatic assessment to assess the data.

**Note:** pay attention to the following key points when you assess the data.

* You only want original ratings (no retweets) that have images. Though there are 5000+ tweets in the dataset, not all are dog ratings and some are retweets.
* Assessing and cleaning the entire dataset completely would require a lot of time, and is not necessary to practice and demonstrate your skills in data wrangling. Therefore, the requirements of this project are only to assess and clean at least 8 quality issues and at least 2 tidiness issues in this dataset.
* The fact that the rating numerators are greater than the denominators does not need to be cleaned. This [unique rating system](http://knowyourmeme.com/memes/theyre-good-dogs-brent) is a big part of the popularity of WeRateDogs.
* You do not need to gather the tweets beyond August 1st, 2017. You can, but note that you won't be able to gather the image predictions for these tweets since you don't have access to the algorithm used.



In [None]:
# Show the full contents of every cell when a DataFrame is displayed.
# None means "no limit"; the old -1 sentinel is deprecated since pandas 1.0
# and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [None]:
df_wrdArchive = pd.read_csv('twitter-archive-enhanced.csv')
df_imagePred = pd.read_csv('image_predictions.tsv',sep='\t')
df_tweetStatus = pd.read_json('tweet_json.txt', lines = True)

In [None]:
df_wrdArchive.info()

In [None]:
df_wrdArchive

In [None]:
df_wrdArchive[df_wrdArchive.retweeted_status_id.notna()]

In [None]:
df_wrdArchive[df_wrdArchive.in_reply_to_status_id.notna()]

In [None]:
# Keep original tweets only: remove every row that is a retweet or a reply.
not_original = (
    df_wrdArchive.retweeted_status_id.notna()
    | df_wrdArchive.in_reply_to_status_id.notna()
)
df_wrdArchive.drop(index=df_wrdArchive.index[not_original], inplace=True)

In [None]:
df_wrdArchive.info()

In [None]:
df_tweetStatus

In [None]:
df_imagePred

In [None]:
df_wrdArchive.describe()

In [None]:
df_wrdArchive[df_wrdArchive.tweet_id.duplicated()]

In [None]:
df_tweetStatus.info()

In [None]:
df_tweetStatus.describe()

In [None]:
df_tweetStatus[df_tweetStatus.id.duplicated()]

In [None]:
df_imagePred.info()

In [None]:
df_imagePred.describe()

In [None]:
df_imagePred[df_imagePred.tweet_id.duplicated()]

In [None]:
df_tweetStatus[df_tweetStatus['retweeted'] == False]

In [None]:
df_tweetStatus['retweeted'].value_counts()

In [None]:
df_tweetStatus.retweeted_status.notna().value_counts()

In [None]:
df_tweetStatus[df_tweetStatus.retweeted_status.isna()]

In [None]:
df_imagePred[df_imagePred.jpg_url.duplicated()]

In [None]:
df_imagePred[df_imagePred['jpg_url'] == 'https://pbs.twimg.com/media/CWza7kpWcAAdYLc.jpg']

In [None]:
df_wrdArchive[df_wrdArchive.tweet_id.isin(df_imagePred[df_imagePred['jpg_url'] == 'https://pbs.twimg.com/media/CWza7kpWcAAdYLc.jpg'].tweet_id)]

In [None]:
df_wrdArchive.doggo.value_counts()

In [None]:
df_wrdArchive.floofer.value_counts()

In [None]:
df_wrdArchive.pupper.value_counts()

In [None]:
df_wrdArchive.puppo.value_counts()

In [None]:
# Collect the tweets tagged with more than one dog stage. A stage column holds the
# literal string "None" when that stage is absent, so count the columns holding
# anything else and keep the rows where two or more are set.
# Vectorised on purpose: DataFrame.append was removed in pandas 2.0, and appending
# row by row re-copies the whole frame on every match.
stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']
stages_per_tweet = (df_wrdArchive[stage_columns] != "None").sum(axis=1)
# reset_index gives the same fresh 0..n-1 index that append(ignore_index=True) produced.
df_wrong_dog_stage = df_wrdArchive[stages_per_tweet > 1].reset_index(drop=True)

In [None]:
df_wrong_dog_stage

In [None]:
df_wrdArchive.rating_numerator.value_counts()

In [None]:
df_wrdArchive.rating_denominator.value_counts()

### Quality issues

#### WeRateDogs Twitter Archive
1. Datetime fields are stored as object instead of datetime.    
2. The dog names format should be consistent. Capitalize.
3. Replace 'None' with np.nan to indicate the missing values.
4. Dogs classified as more than one type(doggo, floofer, pupper and puppo) in some cases. 
5. Denominator with value other than 10.
6. Numerator with value of 0.
#### Image Predictions
7. Non-descriptive column names (Image Predictions).
8. The predicted dog breed names are formatted inconsistently.
9. Many predicted names in the dog breed prediction are not likely to be a dog.
#### Tweet Status
10. Remove all columns that are not needed.

### Tidiness issues
1. The columns doggo, floofer, pupper and puppo need to be merged together to form a single column. These are values converted into columns.

2. Rename the column **id** to **tweet_id** in tweetStatus and **timestamp** to **created_at** in tweetArchive

3. Drop columns that are not needed for our analysis.

4. Twitter archive and status data can be consolidated and merged together into a well-formed observation set, as together they form the observation. For that matter, we can add the dog breed prediction too, without much overhead, as we will keep only the original tweets.

5. The dog rating numerator and denominator are stored separately, which is not ideal. The rating should be one single column, and for that matter ratings can be decimal. Let's derive the rating as a decimal and store it in a column called rating.

## Cleaning Data
In this section, clean **all** of the issues you documented while assessing. 

**Note:** Make a copy of the original data before cleaning. Cleaning includes merging individual pieces of data according to the rules of [tidy data](https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html). The result should be a high-quality and tidy master pandas DataFrame (or DataFrames, if appropriate).

In [None]:
# Make copies of original pieces of data
df_ta_clean = df_wrdArchive.copy()
df_img_clean = df_imagePred.copy()
df_ts_clean = df_tweetStatus.copy()


### Tidiness issues

### Issue #1: These are values converted into columns - doggo, floofer, pupper and puppo

#### Define:
doggo, floofer, pupper and puppo seem to be dog stages as per the dogtionary. However, doggo might be an exception, as it is sometimes used affectionately and does not directly relate to any age stage; any dog, for that matter, can qualify as a doggo. For the sake of keeping it simple, let's merge these 4 columns into one. During programmatic assessment it was found that some dogs had more than one stage. With this cleanup we will lose this additional information. Precedence of the selection is in the order doggo, floofer, pupper and puppo - doggo is given the highest precedence and puppo the lowest.

#### Code

In [None]:
def getDogStage (row):
    """Return the first dog stage recorded on ``row``, or None when there is none.

    The stage columns are checked in precedence order doggo, floofer, pupper,
    puppo; a column counts as set when its value is anything other than the
    string "None". The value of the first set column is returned as-is.
    """
    for stage_column in ('doggo', 'floofer', 'pupper', 'puppo'):
        stage_value = row[stage_column]
        if stage_value != "None":
            return stage_value
    return None
    
# Collapse the four stage columns into a single 'stage' column, then discard the originals.
df_ta_clean['stage'] = df_ta_clean.apply(getDogStage, axis=1)
df_ta_clean = df_ta_clean.drop(columns=['doggo','floofer','pupper','puppo'])

#### Test

In [None]:
df_ta_clean.stage.value_counts()

In [None]:
df_ta_clean.sample(5)

### Issue #2: Rename the column id to tweet_id in tweetStatus and timestamp to created_at in tweetArchive

#### Define
The **id** column in tweet status is the tweet_id, though the name suggests something else. Visual assessment confirmed that. Let's rename this column to **tweet_id** to facilitate restructuring our gathered data, which will help us simplify our analysis.

#### Code

In [None]:
df_ts_clean.rename(columns = {'id':'tweet_id'}, inplace = True)
df_ta_clean.rename(columns = {'timestamp':'created_at'}, inplace = True)

#### Test

In [None]:
df_ts_clean.info()

### Issue #3: Drop columns that are not needed for our analysis.

#### Define
Let's clean up the additional columns that do not relate to dog ratings explicitly. As one observation should form one entry in the table - and we are interested only in dog ratings - let's shed the extra noise in these data sources.

**Needed columns:**
>**Twitter Archive:**'tweet_id','created_at','name','stage','rating_numerator','rating_denominator','text'

>**Twitter Status:** 'tweet_id','created_at','favorite_count','retweet_count'

>**Dog breed prediction by Image:** Lets keep all except for img_num. In my opinion we dont need to have p2 and p3 columns.




#### Code

In [None]:
df_img_clean.info()

In [None]:
df_ta_clean = df_ta_clean[['tweet_id','created_at','name','stage','rating_numerator','rating_denominator','text']].copy()
df_ts_clean = df_ts_clean[['tweet_id','created_at','favorite_count','retweet_count']].copy()
df_img_clean = df_img_clean[['tweet_id','jpg_url','p1','p1_conf','p1_dog']].copy()

#### Test

In [None]:
df_ta_clean.info()

In [None]:
df_ts_clean.info()

In [None]:
df_img_clean.info()

### Issue #4: Consolidate Archive, Status, Dog breed predictions into a single dataset.

#### Define
Twitter archive and status data can be consolidated and merged together into a well-formed observation set, as together they form the observation. For that matter, we can add the dog breed prediction too, without much overhead, as we will keep only the original tweets.

#### Code
We will do this in step of storing data in master Archive as mentioned in below.

#### Test
Will test in the step that stores data in the master archive, as mentioned below.

### Issue #5: Dog ratings are stored in two different columns, rating denominator and rating numerator.

#### Define
The dog rating should be in a single column. Even though it is the rating system that makes WeRateDogs stand out in the crowd, for data analysis it is not ideal. Hence, derive the rating using numerator/denominator and store it in a single column. To keep it simple, let's round the value to one decimal place.

#### Code
There are some erroneous entries in the denominator and numerator. These values need to be cleaned up first. We will work on this tidiness issue after resolving the data quality issues.

#### Test
Will do later when tidiness issue coding is done.

### Quality Issues

### Issue #1: Datetime fields are stored as object instead of datetime

#### Define:
Change column dataType of timestamp (in archive) and created_at(in tweet status) to datetime datatype with same format. Please keep a note that in previous step(Tidiness Issue #2) column name 'timestamp' in archive been changed to 'created_at'.

#### Code

In [None]:
df_ta_clean['created_at'] = pd.to_datetime(df_ta_clean['created_at'], utc=False)
df_ts_clean['created_at'] = pd.to_datetime(df_ts_clean['created_at'], utc=False)

#### Test

In [None]:
df_ta_clean.info()

In [None]:
df_ts_clean.info()

### Issue #2: Dog names in archive is not consistent. Make the name capitalize.

#### Define
Dog names in archive is not consistent. Make the name capitalize.

#### Code

In [None]:
# Normalise name casing: str.capitalize() upper-cases the first character and
# lower-cases the rest.
# NOTE(review): this also rewrites names with interior capitals, and all-lower-case
# entries (possibly parsing artefacts rather than real names) become
# indistinguishable from genuine names — confirm that is intended.
df_ta_clean['name'] = df_ta_clean.name.str.capitalize()

#### Test

In [None]:
df_wrdArchive['name'].str.islower().sum()

In [None]:
df_ta_clean['name'].str.islower().sum()

In [None]:
# Sample rows that carry a real name. At this stage a missing name is still the
# literal string 'None'; comparing against the None object would match every row.
df_ta_clean[df_ta_clean['name'] != 'None'].sample(5)

### Issue #3: Replace 'None' with np.nan to indicate the missing values.

#### Define
There are entries in the dataset where "None" is the cell value, whereas in some other places it is NaN. Let's make this consistent.

#### Code

In [None]:
df_ta_clean = df_ta_clean.replace('None', np.nan)
df_ts_clean = df_ts_clean.replace('None', np.nan)
df_img_clean = df_img_clean.replace('None', np.nan)

#### Test

In [None]:
df_ta_clean[df_ta_clean.stage.notna()].shape

In [None]:
df_ta_clean[df_ta_clean.isna().any(axis=1)]

In [None]:
df_ts_clean[df_ts_clean.isna().any(axis=1)]

In [None]:
df_img_clean[df_img_clean.isna().any(axis=1)]

### Issue #4: Dogs classified as more than one type(doggo, floofer, pupper and puppo) in some cases.

#### Define
As the issue describe, there are Many dogs which has been categorized as more than one stage (doggo, floofer, pupper and puppo). This needs to be fixed. 

#### Code
This already has been taken care in the Issue #1 in Tidiness cleaning. Please refer to Define section of that issue.

#### Test
doggo, floofer, pupper and puppo has already been converted into a single column. Therefore, it can not hold more than one value in the cell

In [None]:
df_ta_clean.stage.value_counts()

### Issue #5: Denominator with value other than 10

#### Define
Most of the rating denominators (>95%) have the value 10, which is the standard. Though it was mentioned at the beginning that the rating system in WeRateDogs is what sets them apart, in this case we can standardize the denominator to 10. This will not have any impact on the overall ratings given to a dog.

#### Code

In [None]:
# Standardise every rating to a denominator of 10. The numerator is rescaled by the
# same factor first (e.g. 84/70 -> 12/10) so the rating itself is unchanged;
# overwriting only the denominator would silently turn 84/70 into 84/10.
non_standard = df_ta_clean['rating_denominator'] != 10
# A zero denominator cannot be rescaled; those rows keep their numerator as-is.
rescalable = non_standard & (df_ta_clean['rating_denominator'] != 0)
# Rescaled numerators may be fractional, so hold the column as float.
df_ta_clean['rating_numerator'] = df_ta_clean['rating_numerator'].astype(float)
df_ta_clean.loc[rescalable, 'rating_numerator'] = (
    df_ta_clean.loc[rescalable, 'rating_numerator'] * 10
    / df_ta_clean.loc[rescalable, 'rating_denominator']
)
df_ta_clean.loc[non_standard, 'rating_denominator'] = 10

#### Test

In [None]:
df_ta_clean[df_ta_clean['rating_denominator'] != 10].shape

### Issue #6: Numerator with value 0

#### Define
Rating with 0 does not seems to be a realistic one. We also dont have any standard value to set in this case either. Lets drop those entries where we have numerator 0.

#### Code

In [None]:
df_ta_clean.info()

In [None]:
df_ta_clean.drop(df_ta_clean[df_ta_clean['rating_numerator'] == 0].index, inplace=True)

#### Test

In [None]:
df_ta_clean[df_ta_clean['rating_numerator'] == 0]

In [None]:
df_ta_clean.info()

### Issue #7: Image Predictions Non descriptive column name.

#### Define
Column names in dog breed predictions by image are not descriptive enough. Lets give them more meaningful names to easily relate to the content of the columns and use them effectively. Not only that, from analysis perspective it is enough to keep the first set of prediction outcome and drop the rest.

#### Code

In [None]:
df_img_clean.rename(columns = {'p1':'first_pridicted_breed',
                              'p1_conf': 'first_pridiction_confidence',
                              'p1_dog':'is_first_pridiction_dog_breed',
                             }, inplace = True)

#### Test

In [None]:
df_img_clean.info()

### Issue #8: The predicted dog breed names are formatted inconsistently.

#### Define
Predicted dog breed naming convention are not consistent. Lets make it consistent by capitalizing first letter of every word. Please note, in some cases there are multiple words in a single dog breed separated by either a underscore or space.

#### Code

In [None]:
df_img_clean['first_pridicted_breed'] = df_img_clean.first_pridicted_breed.str.title()

#### Test

In [None]:
df_img_clean.first_pridicted_breed.sample(5)

### Issue #9: Many predicted dog breed names are not dog breed

#### Define
It is true that some predicted dog breed are not dog breeds and the same is flagged as false in such cases. We can either drop these entries or keep it as it will not harm as much. The upside is we can still have the picture url available in master and dog breed can be corrected manually. Manual assessment revealed that in some cases the background picture of dog took precedence, in other cases there are no dog picture at all. For the sake of keeping this wrangling part clean, lets remove such entries from the table.

#### Code

In [None]:
# Let's find out how many prediction ended up with no dog breed
df_img_clean[df_img_clean['is_first_pridiction_dog_breed'] == False]

In [None]:
df_img_clean.drop(df_img_clean[df_img_clean['is_first_pridiction_dog_breed'] == False].index, inplace = True)

#### Test

In [None]:
df_img_clean[df_img_clean['is_first_pridiction_dog_breed'] == False]

### Issue #10: Delete not needed column from all three datasets

#### Define
One final cleanup is still necessary to see whether any duplicate columns exist. If yes, drop them. Revisit the columns to see whether any column that is not needed for our analysis still exists. If yes, drop it as well. This will ensure good-quality, concise data that we can merge easily into a good master archive.

#### Code

In [None]:
columns = pd.Series(list(df_ta_clean.columns) + list(df_ts_clean.columns) + list(df_img_clean.columns))
columns[columns.duplicated()]

**Note:** We need **tweet_id** to merge all three datasets, so we will keep it. Let's analyse **created_at**. It occurs in both the archive and the status dataset. Let's have a look at them.

In [None]:
df_temp = pd.merge(df_ta_clean,df_ts_clean, how='inner', on='tweet_id')

In [None]:
df_temp.sample(5)

In [None]:
# Flag, row by row, whether the archive timestamp equals the status timestamp.
df_temp['matches'] = df_temp['created_at_x'] == df_temp['created_at_y']

In [None]:
df_temp[df_temp['matches'] == False]

This clearly shows that the timestamps are the same in both tables. Hence we can delete created_at from the tweet status dataset.

In [None]:
df_ts_clean.drop(['created_at'], axis = 1, inplace = True)

#### Test

In [None]:
columns = pd.Series(list(df_ta_clean.columns) + list(df_ts_clean.columns) + list(df_img_clean.columns))
columns[columns.duplicated()]

### Revisit Tidiness Issue #5: Rating distributed over two columns

#### Code

In [None]:
df_ta_clean.info()

In [None]:
# Single rating column: numerator over denominator, scaled by 10 and rounded to one
# decimal place as the Define step for this tidiness issue specifies.
df_ta_clean['rating'] = (
    (df_ta_clean['rating_numerator'] / df_ta_clean['rating_denominator']) * 10
).round(1)

#### Test

In [None]:
df_ta_clean.sample(5)

## Storing Data
Save gathered, assessed, and cleaned master dataset to a CSV file named "twitter_archive_master.csv".

In [None]:
df_ta_clean.info()

In [None]:
df_ts_clean.info()

In [None]:
df_img_clean.info()

In [None]:
# Master table: archive joined to tweet status, then to image predictions, on
# tweet_id. Inner joins keep only the tweets present in all three sources.
df_archive_master = (
    df_ta_clean
    .merge(df_ts_clean, how='inner', on='tweet_id')
    .merge(df_img_clean, how='inner', on='tweet_id')
)

In [None]:
df_archive_master.shape

In [None]:
df_archive_master.sample(5)

In [None]:
df_archive_master.to_csv('twitter_archive_master.csv', index = False)

## Analyzing and Visualizing Data
In this section, analyze and visualize your wrangled data. You must produce at least **three (3) insights and one (1) visualization.**

In [None]:
df_archive_master.info()

In [None]:
df_archive_master.sample(5)

#### Most preferred dog breed as pet

In [None]:
df_archive_master.groupby(['first_pridicted_breed'])['tweet_id'].count().nlargest()

#### Least preferred dog breed as pet

In [None]:
df_archive_master.groupby(['first_pridicted_breed'])['tweet_id'].count().nsmallest()

#### Most loved dog breed

In [None]:
df_archive_master.groupby(['first_pridicted_breed'])['favorite_count'].sum().nlargest()

#### Not so loved dog breed

In [None]:
df_archive_master.groupby(['first_pridicted_breed'])['favorite_count'].sum().nsmallest()

#### Most re-tweeted dog breed

In [None]:
df_archive_master.groupby(['first_pridicted_breed'])['retweet_count'].sum().nlargest()

#### Least re-tweeted dog breed

In [None]:
df_archive_master.groupby(['first_pridicted_breed'])['retweet_count'].sum().nsmallest()

#### Highest rated Dog Breed

In [None]:
# Mean rating per predicted breed, five highest. Only a cell's last expression is
# rendered, so this is the single result the cell shows.
df_archive_master.groupby(['first_pridicted_breed'])['rating'].mean().nlargest()

#### Lowest rated dog breed

In [None]:
df_archive_master.groupby(['first_pridicted_breed'])['rating'].mean().nsmallest()

#### Dog breed easily identifiable

In [None]:
df_archive_master.groupby(['first_pridicted_breed'])['first_pridiction_confidence'].mean().nlargest()

#### Dog breed difficult to identify

In [None]:
df_archive_master.groupby(['first_pridicted_breed'])['first_pridiction_confidence'].mean().nsmallest()

### Insights:
Many insights can be inferred from the analysis done on the cleaned and consolidated data, and an experienced data scientist could recover many more. From my limited experience and the analysis done, here is what I can say:
1. The most loved (favourited) / re-tweeted dog breeds are the ones that are most often adopted by people as pets.
2. The highest rated dogs are not the most famous ones (favourited/re-tweeted).
3. People tend to mark a tweet as a favourite more often than they re-tweet it.
4. It is highly likely that the higher the favourite count, the higher the chances of the tweet being re-tweeted.
5. Some dog breeds are predicted with higher confidence and some with lower confidence. This indicates that the easily identifiable breeds have some distinct features. On the other hand, some breeds share very common features, which probably leads to lower confidence. Here, I think it was a mistake to drop the p2 and p3 columns from the prediction table. I assume the confidence is evenly distributed between the 3 predictions.

### Visualization

In [None]:
# Five breeds with the most tweets, drawn as a horizontal bar chart and saved to disk.
df = (
    df_archive_master.groupby(['first_pridicted_breed'])['tweet_id']
    .count()
    .nlargest()
    .rename('Tweet_Count')
    .to_frame()
)
ax = df.plot.barh(title='Most preferred dog breed as pet')
ax.set_xlabel('Tweet_count')
ax.set_ylabel('Breed')
save_options = dict(bbox_inches="tight", pad_inches=0.2, transparent=True,
                    facecolor="g", edgecolor='r', orientation='landscape')
plt.savefig('most_preferred_dog_breed', **save_options)
plt.show()

In [None]:
# Five breeds with the fewest tweets, drawn as a horizontal bar chart and saved to disk.
df = (
    df_archive_master.groupby(['first_pridicted_breed'])['tweet_id']
    .count()
    .nsmallest()
    .rename('Tweet_Count')
    .to_frame()
)
ax = df.plot.barh(title='Least preferred dog breed as pet')
ax.set_xlabel('Tweet_count')
ax.set_ylabel('Breed')
save_options = dict(bbox_inches="tight", pad_inches=0.2, transparent=True,
                    facecolor="g", edgecolor='r', orientation='landscape')
plt.savefig('least_preferred_dog_breed', **save_options)
plt.show()

In [None]:
# Five breeds with the largest total favorite_count, charted and saved to disk.
df = (
    df_archive_master.groupby(['first_pridicted_breed'])['favorite_count']
    .sum()
    .nlargest()
    .to_frame()
)
ax = df.plot.barh(title='Most favoured dog breed')
ax.set_xlabel('favorite_count')
ax.set_ylabel('Breed')
save_options = dict(bbox_inches="tight", pad_inches=0.2, transparent=True,
                    facecolor="g", edgecolor='r', orientation='landscape')
plt.savefig('most_favoured_dog_breed', **save_options)
plt.show()

In [None]:
# Five breeds with the smallest total favorite_count, charted and saved to disk.
df = (
    df_archive_master.groupby(['first_pridicted_breed'])['favorite_count']
    .sum()
    .nsmallest()
    .to_frame()
)
ax = df.plot.barh(title='Least favoured dog breed')
ax.set_xlabel('favorite_count')
ax.set_ylabel('Breed')
save_options = dict(bbox_inches="tight", pad_inches=0.2, transparent=True,
                    facecolor="g", edgecolor='r', orientation='landscape')
plt.savefig('least_favoured_dog_breed', **save_options)
plt.show()

In [None]:
# Five breeds with the largest total retweet_count. The title and file name both say
# "most", so the selection must be nlargest(), not nsmallest().
df = df_archive_master.groupby(['first_pridicted_breed'])['retweet_count'].sum().nlargest().to_frame()
ax = df.plot.barh(title='Most re-tweeted dog breed')
ax.set(xlabel='retweet_count', ylabel='Breed')
plt.savefig('most_re-tweeted_dog_breed',
            bbox_inches ="tight",
            pad_inches = 0.2,
            transparent = True,
            facecolor ="g",
            edgecolor ='r',
            orientation ='landscape')
plt.show()

In [None]:
# Five breeds with the smallest total retweet_count. The chart is saved as
# 'least_re-tweeted_dog_breed', so it selects nsmallest() and is titled accordingly.
df = df_archive_master.groupby(['first_pridicted_breed'])['retweet_count'].sum().nsmallest().to_frame()
ax = df.plot.barh(title='Least re-tweeted dog breed')
ax.set(xlabel='retweet_count', ylabel='Breed')
plt.savefig('least_re-tweeted_dog_breed',
            bbox_inches ="tight",
            pad_inches = 0.2,
            transparent = True,
            facecolor ="g",
            edgecolor ='r',
            orientation ='landscape')
plt.show()

In [None]:
# Five breeds with the highest mean first-prediction confidence, charted and saved.
df = (
    df_archive_master.groupby(['first_pridicted_breed'])['first_pridiction_confidence']
    .mean()
    .nlargest()
)
ax = df.plot.barh(title='Easily identifiable dog breed')
ax.set_xlabel('Predicted Confidence')
ax.set_ylabel('Breed')
save_options = dict(bbox_inches="tight", pad_inches=0.2, transparent=True,
                    facecolor="g", edgecolor='r', orientation='landscape')
plt.savefig('breed_easily_identifiable', **save_options)
plt.show()

In [None]:
# Five breeds with the lowest mean first-prediction confidence, charted and saved.
df=df_archive_master.groupby(['first_pridicted_breed'])['first_pridiction_confidence'].mean().nsmallest()
# Title spelling corrected ("identify"); it is rendered on the saved figure.
ax = df.plot.barh(title='Dog breed difficult to identify')
ax.set(xlabel='Predicted Confidence', ylabel='Breed')
plt.savefig('breed_not_easily_identifiable',
            bbox_inches ="tight",
            pad_inches = 0.2,
            transparent = True,
            facecolor ="g",
            edgecolor ='r',
            orientation ='landscape')
plt.show()

#### Correlation between favourite, retweet, rating and confidence

In [None]:
# Pairwise correlation between engagement counts, rating and prediction confidence,
# rendered as a colour-graded table.
df = df_archive_master[['favorite_count','retweet_count','rating','first_pridiction_confidence']]
corr = df.corr()
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0; a format
# string gives the same two-decimal rendering on every pandas version.
corr.style.background_gradient(cmap='coolwarm').format('{:.2f}')