In [None]:
# PROJECT 4 - Wrangle Twitter data via API (made in Vscode)

## Table of Contents
* [Introduction](#intro)
* [Initial Brief](#1.1-initial-brief)
* [General Outline](#general_outline)
* [Import Libraries](#)
* [](#)

`Note: Fill at the end. Automate with python library/extension.`

# Introduction
Gather readily available data from an existing source on the web to gain first-hand experience of wrangling data.<br>
Wrangling is a significant task because data will not always be provided, and when it is provided it is rarely clean: <br>
 - Best case: Spelling mistakes and/or equivalent,
 - Worst case: No schema/format, duplicates, incomplete and/or incorrect values recorded.

## Initial Brief
- User has provided archived twitter data for analysis
 - [ ] Twitter archive export in CSV
 - [ ] URL to Machine Learning image predictions
<br>
- Identify minimum:
 - [ ] 8 quality issues
 - [ ] 2 tidiness issues
<br>
- Out of scope:
 - [ ] Unique rating system
 - [ ] No gathering required past 01 Aug 2017

## General outline
- [ ] Read-in CSV data
- [ ] Access URL data (_over manually downloading file_)

In [None]:
## install modules via terminal
#pip install pandas # also downloads numpy
#pip install requests
#pip install tweepy
#pip install pandasgui
#pip install autoviz
#pip install pandas-profiling
#pip install sweetviz
#pip install bs4

## Optional - provides Table of Contents and minimizes lines (only on Jupyter)
#pip install jupyter_contrib_nbextensions && jupyter contrib nbextension install

## Import Libraries

In [None]:
import pandas as pd
import numpy as np

import requests
import os

import msvcrt
import sys

from pandasgui import show

from bs4 import BeautifulSoup

import tweepy

import json

from pathlib import Path

## Defined Functions

- add_files(*filename)  `Created for the ability to scale`
- go_assess(df)         `Created to reiterate through assessment steps`

In [None]:
filelist = [] # declare
print('{} Files in list'.format(len(filelist)) ) # initial print

# Adds and tracks files
def add_files(*filename): # PARAMETER: <string>
    """Append one or more file names to the module-level ``filelist``.

    Parameters
    ----------
    *filename : str
        Any number of file names to track.

    Returns
    -------
    str or None
        The last file name that was added, or ``None`` when the function is
        called without arguments (previously this raised UnboundLocalError
        because the loop variable was never bound).
    """
    last_added = None  # stays None when no names are passed
    for file in filename:
        filelist.append(file)
        print('{} added to file list.'.format(file) )
        last_added = file

    # report the running total, with singular/plural wording
    if len(filelist) > 1:
        print('{} files now in list.'.format( len(filelist)) )
    else:
        print('{} file now in list.'.format( len(filelist)) )
    return last_added

In [None]:
def get_values(df, col, name):
    """Summarise how often each value occurs in one column.

    Parameters
    ----------
    df : unused
        Accepted for call compatibility; the function only reads ``col``.
    col : pandas.Series
        The column to assess.
    name : str
        Column name used in the text summary.

    Returns
    -------
    list
        ``[value_counts Series, summary sentence]``.
    """
    counts = col.value_counts()
    occurrences = counts.values

    # When every value is unique, the total of the counts equals the number
    # of distinct values; a larger total means at least one value repeats.
    has_duplicates = occurrences.sum() > occurrences.shape[0]
    if not has_duplicates:
        message = "No duplicates found in column '{}'.".format(name)
    else:
        message = "Duplicates found in column '{}', the max duplicate item repeats {} times.".format(name, occurrences.max())

    return [counts, message]

In [None]:
def go_assess(df):
    """Run ``get_values`` on every column of ``df`` and collect the results.

    Prints the column index and one progress line per column.

    Returns
    -------
    list
        ``[summary, results]`` where ``summary[x]`` is the text summary and
        ``results[x]`` is the value_counts Series for column number ``x``.
    """
    print('Dataframe contains the following columns:')
    print('{}\n'.format(df.columns) )

    # fresh lists on every call, so repeated assessments never accumulate
    summary, results = [], []
    for position, column in enumerate(df.columns):
        print("Column {} - '{}' has been assessed. Assessment saved in results[{}] and summary[{}]".format(position, column, position, position))

        assessed = get_values(df, df[column], column)
        summary.append(assessed[1])   # text summary
        results.append(assessed[0])   # value_counts Series

    print('NOTE: To access variables, set a series name e.g below:\nseries[0][x] to access summary details.\nseries[1][x] to access the value_counts results.\nx represents column number')
    return [summary, results]

In [None]:
def trim_strings(df):
    """Strip leading and trailing whitespace from the object-dtype columns of ``df``.

    The frame is modified in place (the stripped values are written back to
    each column) and one line per object column is printed saying whether any
    whitespace was removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object columns are trimmed.

    Returns
    -------
    None
    """
    for col in df:
        if df[col].dtype == 'object':
            startcount = df[col].str.len()
            # .str.strip() returns a new Series; it has to be assigned back,
            # otherwise nothing is trimmed and the length comparison below
            # can never detect whitespace.
            df[col] = df[col].str.strip()
            endcount = df[col].str.len()

            # total character count shrinks only if something was stripped
            if (startcount.sum() - endcount.sum()) > 0:
                print('Whitespaces were present in {}.'.format(col) )
            else:
                print('No whitespaces in {}.'.format(col) )

## Data Wrangling

## Iteration 1
Import data from a twitter user archive provided by the end-user

`Note: Add edit# upon addition of new issue.`

### Gathering 1
#### Initialize
Enter Known Input Info
Format: file name inside ''

In [None]:
# FILE 1 - TWITTER ARCHIVE DATA
folder = 'Incoming_Files/'
twitter_file = 'twitter-archive-enhanced-2.csv'
add_files(twitter_file)

In [None]:
# FILE 2 - TWITTER ML IMAGE PREDICTIONS
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

# assign to a response object
# timeout: do not hang the notebook indefinitely if the server never answers
response = requests.get(url, timeout=30)
# stop here on an HTTP error instead of saving an error page as the .tsv file
response.raise_for_status()

image_predictions = url.split('/')[-1] # extract file name

# make sure the download folder exists before writing into it
os.makedirs(folder, exist_ok=True)

# with open, allows for the auto close file when complete
# split after last delimiter /, indicating file name
with open(os.path.join(folder, image_predictions), mode='wb') as file:
    # write the downloaded bytes to disk
    file.write(response.content)
    print('{} has been saved in: "/{}"'.format(image_predictions, folder) )

# call function and add name to end of list
add_files(image_predictions)

In [None]:
run_script = str(input('Run script to Access Twitter API (Y/N)?'))
valid_input = ['n', 'N', 'y', 'Y']
yes_list = ['y','Y']
no_list = ['n','N']

In [None]:
# FILE 3 - TWITTER API JSON
# run python script, pass dataframe name (dataframe could not be passed)
# Keep prompting until the answer is one of the accepted values in valid_input.
while run_script not in valid_input:
    run_script = input('Wrong input. Run script to Access Twitter API (Y/N)?')

if run_script in yes_list:
    # Spaces in the folder name are replaced with underscores before it is passed on.
    # NOTE(review): presumably so the name stays a single command-line token - confirm against twitter-api.py.
    folderarg = folder.replace(' ', '_')
    print('Running. Will indicate when complete.\n')
    # IPython line magic: executes twitter-api.py with two arguments.
    # NOTE(review): `$filelist[0]` is intended to expand to the first entry of filelist - confirm the expansion.
    %run twitter-api.py $folderarg $filelist[0]
elif run_script in no_list:
    print('Script not running.')

In [None]:
## Twitter API data
# column names for the dataframe built from the API export
API_cols = ['tweet_id', 'retweet_count', 'fav_count']

# read in txt and convert to json
API_export = 'tweet_json.txt'
# call function and add name to end of list
add_files(API_export)

# one list per extracted field, plus the full parsed objects in json_keys
json_keys, json_id, json_fav_count, json_retweet_cnt = [], [],[],[]

# the file is read as one JSON object per line
with open(API_export) as txt_file:
    for line in txt_file:
        # a blank line (e.g. a trailing newline at the end of the file) is
        # not valid JSON and would make json.loads raise; skip it
        if not line.strip():
            continue
        json_obj = json.loads(line)
        #append to list then combine lists 
        json_keys.append(json_obj)
        json_id.append(json_obj['id_str'])
        json_fav_count.append(json_obj['favorite_count'])
        json_retweet_cnt.append(json_obj['retweet_count'])
        

In [None]:
json_obj

#### Import into dataframes

In [None]:
 # IMPROVE
 # create empty list
df_raw = []
file_extensions = []

# dataframe to contain original imports
for num, file in enumerate(filelist):
    ext = file.split('.')[-1]
    file_extensions.append(ext)
    # read extension type
    ## catch CSV, TSV, JSON, no Switch/Case in Python
    if ext == 'csv':
        df_raw.append(pd.read_csv(folder + file) )
    elif ext == 'tsv':
        df_raw.append(pd.read_csv(folder + file, sep='\t') )
    elif file == 'tweet_json.txt': # improve for general twitter api scrap import
        # Build the frame from explicitly named columns. The previous
        # positional zip(json_id, json_fav_count, json_retweet_cnt) did not
        # follow the order of API_cols, so favourite counts were stored under
        # 'retweet_count' and retweet counts under 'fav_count'.
        api_records = {
            'tweet_id': json_id,
            'retweet_count': json_retweet_cnt,
            'fav_count': json_fav_count,
        }
        df_raw.append(pd.DataFrame(api_records, columns=API_cols))
    else:
        print('filelist({}) - "{}", could not be read into a dataframe.'.format(num, filelist[num]) )

In [None]:
print(filelist)

In [None]:
df_raw[0].sample(3)  # visually assess file was read in correctly

In [None]:
df_twitter = df_raw[0].copy()

In [None]:
df_raw[1].sample(3)  # visually assess file was read in correctly

In [None]:
df_image_predictor = df_raw[1].copy() # create copy

In [None]:
df_raw[2].sample(3)  # visually assess file was read in correctly

In [None]:
df_twitter_api = df_raw[2].copy()

In [None]:
df_twitter_api.sample(3)

In [None]:
# DO - search Incoming Files directory, files not in list to be added.

## Assessing data
### Assess 1 - Twitter Data Archive
#### Define:<br>

In [None]:
# external windows open
twitter_gui = show(df_raw[0])

In [None]:
df_twitter.info()

In [None]:
df_twitter.describe()

In [None]:
# call go_assess function
archive_assessed = go_assess(df_twitter)

### Column 0 - tweet_id

In [None]:
### Column 0 - 
archive_assessed[0][0], archive_assessed[1][0]

### Column 1 - in reply

In [None]:
### Column 1 - 
archive_assessed[0][1], archive_assessed[1][1]

In [None]:
df_twitter[df_twitter.in_reply_to_status_id.notna()]['in_reply_to_status_id'].sample(5)

In [None]:
### Column 2 - 
archive_assessed[0][2], archive_assessed[1][2]

In [None]:
### Column 3 - 

In [None]:
archive_assessed[0][3], archive_assessed[1][3]

In [None]:
### Column 4 - 
archive_assessed[0][4], archive_assessed[1][4]

In [None]:
### Column 5 - 
archive_assessed[0][5], archive_assessed[1][5]

In [None]:
### Column 6 - 
archive_assessed[0][6], archive_assessed[1][6]

In [None]:
### Column 7 - 
archive_assessed[0][7], archive_assessed[1][7]

In [None]:
### Column 8 - 
archive_assessed[0][8], archive_assessed[1][8]

In [None]:
### Column 9 - 
archive_assessed[0][9], archive_assessed[1][9]

In [None]:
### Column 10 - 
archive_assessed[0][10], archive_assessed[1][10]

In [None]:
### Column 11 - 
archive_assessed[0][11], archive_assessed[1][11]

In [None]:
### Column 12 - name

In [None]:
archive_assessed[0][12], archive_assessed[1][12]

## Assess 2 - Twitter Image Predictions

In [None]:
# external windows open
predictions_gui = show(df_raw[1])

In [None]:
df_image_predictor.sample(3)

In [None]:
df_image_predictor.info()

In [None]:
df_image_predictor.describe()

In [None]:
img_assessed = go_assess(df_image_predictor)

In [None]:
### Column 0
img_assessed[0][0], img_assessed[1][0]

In [None]:
### Column 1
# search for files other than .jpg, use .split and sift through values
# regex=False matches the literal text '.jpg'; as a regular expression the
# '.' would match any character (e.g. 'xjpg' would also count as a match)
not_jpg = df_image_predictor[~df_image_predictor.jpg_url.str.contains('.jpg', regex=False)]
not_jpg.jpg_url

In [None]:
img_assessed[0][1], img_assessed[1][1]

In [None]:
### Column 2
img_assessed[0][2], img_assessed[1][2]

In [None]:
### Column 3
img_assessed[0][3], img_assessed[1][3]

In [None]:
is_ws = df_image_predictor[df_image_predictor.p1.str.contains(' ',)]
is_ws

In [None]:
mask = img_assessed[1][3] == 1
img_assessed[1][3][mask]

In [None]:
### Column 4
img_assessed[0][4], img_assessed[1][4]

In [None]:
### Column 5
img_assessed[0][5], img_assessed[1][5]

In [None]:
df_image_predictor.query('p1_dog == False').iloc[:, [0,1,3,5]]

In [None]:
p1_false_results = df_image_predictor.query('p1_dog == False').iloc[:,:6]
p1_false_results

In [None]:
p1_false_results.groupby(['p1']).size()

In [None]:
### Column 6
img_assessed[0][6], img_assessed[1][6]

In [None]:
### Column 7
img_assessed[0][7], img_assessed[1][7]

In [None]:
### Column 8
img_assessed[0][8], img_assessed[1][8]

In [None]:
### Column 9
img_assessed[0][9], img_assessed[1][9]

In [None]:
### Column 10
img_assessed[0][10], img_assessed[1][10]

In [None]:
### Column 11
img_assessed[0][11], img_assessed[1][11]

## Assess 3 - Twitter API Raw Data

In [None]:
# external windows open
api_gui = show(df_raw[2])

In [None]:
df_twitter_api.sample(3)

In [None]:
df_twitter_api.info()

In [None]:
df_twitter_api.describe()

In [None]:
#assess raw json api data
api_assessed = go_assess(df_twitter_api)

In [None]:
api_assessed = go_assess(df_twitter_api)

In [None]:
api_assessed[0][0], api_assessed[1][0]

In [None]:
api_assessed[0][1], api_assessed[1][1]

In [None]:
api_assessed[0][2], api_assessed[1][2]

### Assess Iteration 2

In [None]:
df_clean = []
df_clean.append(df_twitter)
df_clean.append(df_image_predictor)
df_clean.append(df_twitter_api)

In [None]:
for df in df_clean:
    print(df.shape)

## Cleaning data
### Quality Issue 1:
#### Define:
col0: tweet_id data type change to string, all dataframes

#### Code:

In [None]:
q1 = 'tweet_id'

In [None]:
# Print previous data types 
df_image_predictor[q1].head(1)
for df in df_clean:
    print(df[q1].head(1))

In [None]:
# Convert to string
for df in df_clean:
    df[q1] = df[q1].astype(str)

#### Test

In [None]:
df_image_predictor[q1].head(1)
for i, df in enumerate(df_clean):
    print(df[q1].head(1))

### Quality issue 2:
#### Define:
col3: change timestamp datatype to datetime

#### Code:

In [None]:
df_clean[0].timestamp = pd.to_datetime(df_clean[0].timestamp)

#### Test:

In [None]:
df_clean[0].timestamp

### Quality Issue 3:
#### Define:
col4: split string to remove html tag and extract content within

In [None]:
archive_assessed[1][4]

#### Code:
strip string prior to splitting

In [None]:
df_clean[0].iloc[:,4] = df_clean[0].iloc[:,4].str.strip()

In [None]:
# RESET COLUMN if coded incorrectly
df_clean[0].iloc[:,4] = df_raw[0].iloc[:,4]

In [None]:
df_clean[0].iloc[:,4] = df_clean[0].iloc[:,4].apply(lambda text: BeautifulSoup(text, 'html.parser').get_text())

In [None]:
df_clean[0].rename(columns={'source':'source_app'}, inplace=True)

#### Test:

In [None]:
df_clean[0].iloc[:,4].value_counts()

### Quality Issue 4:
#### Define:
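col1,2,6,7: change datatype from float to string (these columns hold IDs; the code below uses `astype(str)`, so missing values become the string 'nan')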
#### Code:

In [None]:
q4 = list(df_twitter.iloc[:0, [1,2,6,7]])
# Print previous data types 
for column in q4:
    print(df_twitter[column].head(0))

In [None]:
# Convert to string
for column in q4:
    df_twitter[column] = df_twitter[column].astype(str)

#### Test:

In [None]:
for column in q4:
    print(df_twitter[column].head(0))

### Quality Issue 5:
#### Define:
remove potential whitespaces across all string/objects, trim front and end as visual inspection appeared to show start of strings not inline when scrolling down.

In [None]:
df_clean[0].info()

#### Code & Test:
`Improvement opportunity. Scan for object/string dtype and return if true to easily filter`

In [None]:
# call function
trim_strings(df_clean[0])

In [None]:
trim_strings(df_clean[1])

In [None]:
trim_strings(df_clean[2])

### Quality Issue 6:
#### Define:
df_image_predictor<br>
col3,6,9: change to lower case

In [None]:
q6 = list(df_clean[1].iloc[:0, [3,6,9]])

#### Code:

In [None]:
q6 = list(df_clean[1].iloc[:0, [3,6,9]])
# Print previous data types 
for column in q6:
    print(df_clean[1][column].head(5))

In [None]:
# RESET COLUMN if coded incorrectly
df_clean[1].iloc[:, [3,6,9]] = df_raw[1].iloc[:, [3,6,9]]

In [None]:
for column in q6:
    df_clean[1][column] = df_clean[1][column].str.lower()

#### Test:

In [None]:
for column in q6:
    print(df_clean[1][column].head(5))

### Quality Issue 7:
#### Define:
col1: rename from jpg_url to img_url

In [None]:
df_clean[1].iloc[:0, 1]

#### Code:

In [None]:
df_clean[1].rename(columns={'jpg_url':'img_url'}, inplace=True)

#### Test:

In [None]:
df_clean[1].iloc[:0, 1]

### Quality Issue 8:
#### Define:
col2: rename from img_num to conf_tweet_img

#### Code:

In [None]:
df_clean[1].iloc[:0, 2]

In [None]:
### Quality Issue 8:
df_clean[1].rename(columns={'img_num':'conf_tweet_img'}, inplace=True)

#### Test:

In [None]:
df_clean[1].iloc[:0, 2]

### Quality Issue 9:
#### Define:
<br>check col12 to remove/replace incorrect names with None

#### Code:

In [None]:
names_list = df_clean[0].name.value_counts().index
names_list

In [None]:
# extract names - regex test
name_mask = df_clean[0].name.str.match('[^A-Z]')
name_mask.value_counts()

In [None]:
df_clean[0].name[name_mask].value_counts()

In [None]:
df_clean[0].name.where(~name_mask)

In [None]:
df_clean[0].name = df_clean[0].name.where(~name_mask,None)

#### Test:

In [None]:
df_clean[0].name

### Quality Issue 10:
#### Define:
<br>check numerator rating value is correct.

In [None]:
# preview of strings in text column
list(df_clean[0].text.sample(5) )

#### Code:

In [None]:
# regex filter to extract 123.34/123 found visually and programmatically
df_clean[0]['rating'] = df_clean[0].text.str.extract(r'(\b\d{0,3}\.?\d{1,2}\/\d{2,3})', expand=True)

In [None]:
df_clean[0]['rating'].value_counts(dropna=False)

In [None]:
#remove .13 and .10 manually
# remove .13
# x is an index LABEL (taken from .index), so it is used with .loc;
# a single .loc[row, column] assignment also avoids chained indexing,
# which may write to a temporary copy instead of the dataframe
x = df_clean[0].query('rating==".13/10"').rating.index[0]
df_clean[0].loc[x, 'rating'] = df_clean[0].loc[x, 'rating'].split('.')[1]
df_clean[0].loc[x, 'rating']

In [None]:
# remove .10
# x is an index LABEL (taken from .index), so it is used with .loc;
# a single .loc[row, column] assignment also avoids chained indexing,
# which may write to a temporary copy instead of the dataframe
x = df_clean[0].query('rating==".10/10"').rating.index[0]
df_clean[0].loc[x, 'rating'] = df_clean[0].loc[x, 'rating'].split('.')[1]
df_clean[0].loc[x, 'rating']

In [None]:
df_clean[0].rating.value_counts(dropna=False), df_clean[0].rating.shape

In [None]:
# check for non regex matches
checknull = df_clean[0].rating.isnull()

In [None]:
list(df_clean[0][checknull].text), list(df_clean[0][checknull].rating)

In [None]:
# raw string: '\.' in a normal string literal is an invalid escape sequence
# na=False: rows with no extracted rating count as "no decimal point"
decimal_mask = df_clean[0].rating.str.contains(r'\.', na=False) # to remove error
decimal_mask.value_counts()

In [None]:
df_clean[0][decimal_mask].rating

In [None]:
decimal_index = df_clean[0][decimal_mask].index
decimal_index

In [None]:
# split 'numerator/denominator' into two float columns (0 and 1)
clean_ratings = df_clean[0].rating.str.split('/', n=2, expand=True).astype(float)
mid = clean_ratings[0].median()
# Assign the filled columns back explicitly: fillna(inplace=True) on a column
# selected from the frame is chained assignment and may not update clean_ratings.
clean_ratings[0] = clean_ratings[0].fillna(mid)   # missing numerator -> median numerator
clean_ratings[1] = clean_ratings[1].fillna(10)    # missing denominator -> 10
clean_ratings

In [None]:
# show the dtype of both columns (numerator, denominator); the same column was previously shown twice
clean_ratings[0].dtype, clean_ratings[1].dtype

In [None]:
clean_ratings[0].value_counts()

In [None]:
clean_ratings[1].value_counts()

In [None]:
df_clean[0].rating_numerator = clean_ratings[0].astype(float)
df_clean[0].rating_denominator = clean_ratings[1].astype(int)

#### Test:

In [None]:
df_clean[0].rating_denominator.isnull().values.any(), df_clean[0].rating_numerator.isnull().values.any()

In [None]:
df_clean[0].drop('rating', axis=1, inplace=True)

In [None]:
df_clean[0].info()

In [None]:
df_clean[0].rating_numerator.value_counts()

In [None]:
df_clean[0].rating_denominator.value_counts()

In [None]:
df_clean[0].info()

### Quality Issue 11:
#### Define:
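remove retweets and replies, i.e. keep only rows where both in_reply_to_user_id and retweeted_status_user_id are missing (stored as the string 'nan' after the earlier conversion to string)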

In [None]:
df_clean[0].in_reply_to_user_id.value_counts()

#### Code:

In [None]:
# Keep only rows whose reply and retweet user IDs are the string "nan".
# .copy() makes the filtered result an independent dataframe, so the column
# assignments made in later cells write to it directly instead of to a slice
# of the previous frame (which triggers SettingWithCopyWarning).
df_clean[0] = df_clean[0].query('in_reply_to_user_id=="nan" & retweeted_status_user_id=="nan"').copy()

#### Test:

In [None]:
df_clean[0].in_reply_to_user_id.value_counts()

In [None]:
df_clean[0].retweeted_status_id.value_counts()

### Tidiness Issue 1:
#### Define:
timestamp split into three columns, date, time, timezone

In [None]:
df_clean[0].shape, df_clean[1].shape, df_clean[2].shape

#### Code:


In [None]:
df_twitter.timestamp.sample(5)

In [None]:
df_clean[0]['date'] = df_clean[0]['timestamp'].dt.date 
df_clean[0]['time'] = df_clean[0]['timestamp'].dt.time 
df_clean[0]['timezone'] = df_clean[0]['timestamp'].astype(str).str[-6:]
df_clean[0].drop(labels='timestamp', axis=1, inplace = True)

#### Test:


In [None]:
df_clean[0].iloc[:,16:]

### Tidiness Issue 2:
#### Define:
categorize dog type into one column, and drop redundant columns.
### Quality Issue #:
#### Define:
change datatype into categorical

In [None]:
df_clean[0].iloc[:,11:16].sample(10)

#### Code:

In [None]:
df_clean[0]['dog_type'] = df_clean[0].text.str.extract('(doggo|floofer|pupper|puppo)', expand=False)
df_clean[0]['dog_type'] = df_clean[0]['dog_type'].astype('category')

In [None]:
drop_cols = list(df_clean[0].iloc[:1,12:16])
drop_cols

In [None]:
df_clean[0]['dog_type'].value_counts(dropna=False)

In [None]:
# Fill missing dog types with 'doggo' and assign the result back to the frame:
# fillna(inplace=True) on a selected column is chained assignment and may not
# update df_clean[0].
# NOTE(review): this labels every tweet without a detected stage as 'doggo' - confirm this is intended.
dog_type_filled = df_clean[0]['dog_type']
# a categorical column can only be filled with a value that is already one of its categories
if 'doggo' not in dog_type_filled.cat.categories:
    dog_type_filled = dog_type_filled.cat.add_categories('doggo')
df_clean[0]['dog_type'] = dog_type_filled.fillna('doggo')

In [None]:
df_clean[0]['dog_type'].value_counts(dropna=False)

In [None]:
df_clean[0].drop(drop_cols, axis=1, inplace=True)

#### Test:

In [None]:
df_clean[0]['dog_type'].value_counts(dropna=False)

In [None]:
list(df_clean[0].iloc[:0,:])

### Tidiness Issue 3:
#### Define:
merge dataframes to contain the relevant columns required for analysis ensuring each is relevant to the information it pertains. Two dataframes in total.
One observation consisting of Twitter Data, another consisting of image predictions.

#### Code:


In [None]:
df_clean[0].shape

In [None]:
# merge twitter archive and api data first, keep predictions at the end (width wise) of data frame
twitter_archive_master = pd.merge(df_clean[0], df_clean[2], on='tweet_id', how='inner')

In [None]:
twitter_archive_master = pd.merge(twitter_archive_master, df_clean[1], on='tweet_id', how='inner')

#### Test:

In [None]:
twitter_archive_master.shape

In [None]:
twitter_archive_master.head()

In [None]:
twitter_archive_master.info()

### Tidiness Issue 4:
#### Define:
drop redundant columns: the in_reply_to and retweeted_status columns plus expanded_urls, six (6) in total (selected by position in the code below)

#### Code:

In [None]:
drop_cols = list(twitter_archive_master.iloc[:0,[1,2,5,6,7,8]])
drop_cols

In [None]:
twitter_archive_master.drop(columns=drop_cols, inplace=True)

#### Test:

In [None]:
twitter_archive_master.info()

In [None]:
df_clean[1].head()

## Save Clean data

In [None]:
directory = 'Working_Files'
#from pathlib import Path
Path(directory).mkdir(parents=True, exist_ok=True)

In [None]:
### Twitter Master file
filename_out1 = 'twitter_archive_master.csv'
twitter_archive_master.to_csv(directory+'/'+filename_out1, index=True)

In [None]:
import shutil
def move_file(folder, file):
    """Move ``file`` from the current working directory into ``folder``.

    Parameters
    ----------
    folder : str
        Destination folder, relative to the current working directory.
        It is created if it does not exist yet.
    file : str
        Name of the file to move.

    Raises
    ------
    FileNotFoundError
        If ``file`` does not exist in the current working directory.
    """
    path = os.getcwd()
    dest_dir = os.path.join(path, folder)
    # shutil.move does not create missing parent folders, so make sure the
    # destination exists before moving
    os.makedirs(dest_dir, exist_ok=True)

    dest = os.path.join(dest_dir, file)
    source = os.path.join(path, file)

    shutil.move(source, dest)
    print('{} moved.'.format(file))

## Exploratory Data Analysis and Visualization

In [None]:
twitter_archive_master.info()

### Automated EDA

#### Sweetviz

In [None]:
import sweetviz as sv

# output file names; the raw-data reports reuse them with a 'Raw_' prefix
sweetviz_file1 = 'SweetViz-Twitter_Data_Report.html'
sweetviz_file2 = 'SweetViz-Img_Predictions_Report.html'

# Clean
clean_report1 = sv.analyze([twitter_archive_master,'Twitter_Data'])
clean_report1.show_html(filepath=sweetviz_file1, open_browser=False)
print('')
# Raw
raw_report1 = sv.analyze([df_raw[0],'Twitter_Data_Raw'])
raw_report1.show_html(filepath='Raw_'+sweetviz_file1, open_browser=False)
print('')
# df_raw[1] is the raw image-predictions import; df_raw[0] (the twitter
# archive) was analysed here before, duplicating the report above
raw_report2 = sv.analyze([df_raw[1], 'Image_Predictions_Raw'])
raw_report2.show_html(filepath='Raw_'+sweetviz_file2, open_browser=False)

In [None]:
clean_report2 = sv.analyze([df_image_predictor, 'Image_Predictions'])
clean_report2.show_html(filepath=sweetviz_file2, open_browser=False)

In [None]:
# move into Reports Folder
#dest = path+'/'+folder+'/'+file
#source = path+'/'+file
move_file('Reports', sweetviz_file1)
move_file('Reports', sweetviz_file2)
move_file('Reports', 'Raw_'+sweetviz_file1)
move_file('Reports', 'Raw_'+sweetviz_file2)

#### Pandas Profiling
Limitation of n=10000 data points to be analysed

from pandas_profiling import ProfileReport

##### Twitter data

In [None]:
from pandas_profiling import ProfileReport
twitter_profile = ProfileReport(twitter_archive_master, title="Pandas_Profiling-Twitter_Data_Report")

In [None]:
def save_profile(profile, rep_name):
    """Write a profile report to ``Reports/<rep_name>.html``.

    Parameters
    ----------
    profile : object
        Report object exposing a ``to_file(path)`` method.
    rep_name : str
        Report file name without the ``.html`` extension.
    """
    report_dir = Path('Reports')
    # create the output folder on first use; no error if it already exists
    report_dir.mkdir(parents=True, exist_ok=True)

    target = ''.join(['Reports', '/', rep_name, '.html'])
    profile.to_file(target)

In [None]:
twitter_profile.to_widgets() # to display report above

In [None]:
# save report as html, provide profile and report name
pandasprof = 'Twitter_Report.html'
twitter_profile.to_file(pandasprof)
move_file('Reports', pandasprof)

### Manual Visualization
#### Tableau Public

In [None]:
%%HTML
<div class='tableauPlaceholder' id='viz1607928848578' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;We&#47;WeLoveDogsTwitterData&#47;Dashboard1&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='WeLoveDogsTwitterData&#47;Dashboard1' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;We&#47;WeLoveDogsTwitterData&#47;Dashboard1&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1607928848578');                    var vizElement = divElement.getElementsByTagName('object')[0];                    if ( divElement.offsetWidth > 800 ) { vizElement.style.width='650px';vizElement.style.height='887px';} else if ( divElement.offsetWidth > 500 ) { vizElement.style.width='650px';vizElement.style.height='887px';} else { vizElement.style.width='100%';vizElement.style.height='1577px';}                     var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

In [None]:
%%HTML
<div class='tableauPlaceholder' id='viz1607928908675' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;We&#47;WeLoveDogsTwitterData&#47;Dashboard2&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='WeLoveDogsTwitterData&#47;Dashboard2' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;We&#47;WeLoveDogsTwitterData&#47;Dashboard2&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1607928908675');                    var vizElement = divElement.getElementsByTagName('object')[0];                    if ( divElement.offsetWidth > 800 ) { vizElement.style.width='650px';vizElement.style.height='887px';} else if ( divElement.offsetWidth > 500 ) { vizElement.style.width='650px';vizElement.style.height='887px';} else { vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*1.77)+'px';}                     var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

In [None]:
%%HTML
<div class='tableauPlaceholder' id='viz1607928926606' style='position: relative'><noscript><a href='#'><img alt=' ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;We&#47;WeLoveDogsTwitterData&#47;Dashboard3&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='WeLoveDogsTwitterData&#47;Dashboard3' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;We&#47;WeLoveDogsTwitterData&#47;Dashboard3&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1607928926606');                    var vizElement = divElement.getElementsByTagName('object')[0];                    if ( divElement.offsetWidth > 800 ) { vizElement.style.width='650px';vizElement.style.height='887px';} else if ( divElement.offsetWidth > 500 ) { vizElement.style.width='650px';vizElement.style.height='887px';} else { vizElement.style.width='100%';vizElement.style.height='527px';}                     var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>