# Introduction: Stats for Medium Articles

In this notebook, we will explore the statistics for my Medium articles. We'll work with the raw HTML of the Medium stats page.

In [None]:
# Data science imports
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = 25

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Interactive plotting
import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks
cufflinks.go_offline()

# Parsing articles
from bs4 import BeautifulSoup

# Utilities
from collections import Counter, defaultdict
import re

# Getting webpages
import requests


In [None]:
# Parse the saved stats page; the context manager closes the file handle
# once BeautifulSoup has read it
with open('data/stats.html', 'r') as stats_file:
    soup = BeautifulSoup(stats_file, 'html.parser')

# Peek at the first 100 characters of the page text
soup.text[:100]

In [None]:
# Attribute filter that identifies one row of the stats table
row_selector = {'class': "sortableTable-row js-statsTableRow"}
table_rows = soup.find_all(attrs=row_selector)

n_rows = len(table_rows)
print(f'Found {n_rows} table entries.')

In [None]:
# Take a single row (index 3) as the worked example for the exploration cells below
entry = table_rows[3]
entry

In [None]:
# Text nodes in this row that are exactly ' Unlisted' (note the leading space)
entry.find_all(text=' Unlisted')

In [None]:
# Raw value of the row's data-timestamp attribute
entry.get('data-timestamp')

In [None]:
def convert_timestamp(ts: int):
    """
    Convert a Unix timestamp in milliseconds to a pandas Timestamp

    :param ts: milliseconds since the Unix epoch, as an int or as a numeric
        string (HTML attribute values arrive as strings)

    :return: pandas Timestamp, or pd.NaT when ts is missing

    """
    # A missing attribute comes through as None
    if pd.isna(ts):
        return pd.NaT
    # Cast explicitly: pandas deprecates parsing strings together with `unit`
    return pd.to_datetime(int(ts), origin='unix', unit='ms')

# Convert the example row's data-timestamp attribute, interpreted as Unix milliseconds
convert_timestamp(entry.get('data-timestamp'))

In [None]:
# Show every element of the row that carries the 'sortableTable-value' class
value_cells = entry.find_all(attrs={'class':'sortableTable-value'})
for cell in value_cells:
    print(cell)

In [None]:
# Show every element of the row that carries the 'u-sm-show' class
small_screen_cells = entry.find_all(attrs={'class':'u-sm-show'})
for cell in small_screen_cells:
    print(cell)

In [None]:
# The first 'readingTime' element keeps its value in the title attribute;
# the number is the first space-separated token of that title
reading_time_tag = entry.find_all(attrs={'class':'readingTime'})[0]
minutes_text = reading_time_tag.get('title').split(' ')[0]
int(minutes_text)

In [None]:
# Elements with class 'sortableTable-link' whose text is 'View story'
entry.find_all(text='View story', attrs={'class': 'sortableTable-link'})

In [None]:
# Names for the numeric cells, in the order they are paired with the row's cells
stat_names = ['published_timestamp', 'views', 'reads', 'ratio', 'fans']
stat_cells = entry.find_all(attrs={'class':'sortableTable-value'})

entry_dict = {}
for name, cell in zip(stat_names, stat_cells):
    # 'ratio' is the only fractional statistic; everything else is a whole number
    if name == 'ratio':
        entry_dict[name] = float(cell.text)
    else:
        entry_dict[name] = int(cell.text)

# Turn the raw timestamps into datetimes and pull the reading time from the title attribute
published = convert_timestamp(entry_dict['published_timestamp'])
started = convert_timestamp(entry.get('data-timestamp'))
reading_time_title = entry.find_all(attrs={'class':'readingTime'})[0].get('title')
minutes = int(reading_time_title.split(' ')[0])

entry_dict.update(published_timestamp=published,
                  started_timestamp=started,
                  read_time=minutes)

print(entry_dict)

In [None]:
# Elements carrying the 'js-statsTableBody' class; inspect the type of the first match
table = soup.find_all(attrs={'class': 'js-statsTableBody'})
type(table[0])

In [None]:
# Type of the example row object
type(entry)

In [None]:
def process_table_entry(entry, parallel=True):
    """
    Extract data from one entry in the stats table and from the article it links to

    :param entry: BeautifulSoup tag for one table row, or the same row as an
        HTML string (the form handed to the multiprocessing workers)
    :param parallel: kept for backward compatibility; the type of `entry` now
        decides whether parsing is needed (strings are parsed, tags are used as is)

    :return entry_dict: dictionary with data about entry

    """
    if isinstance(entry, (str, bytes)):
        parsed = BeautifulSoup(entry, 'html.parser')
        # The parsed document only wraps the row: attributes such as
        # data-timestamp live on the row element, so work with that element
        entry = parsed.find(attrs={'data-timestamp': True}) or parsed

    # Numeric cells of the row, paired with their names in order
    entry_dict = {}
    for value, key in zip(entry.find_all(attrs={'class':'sortableTable-value'}),
            ['published_timestamp', 'views', 'reads', 'ratio', 'fans']):
        entry_dict[key] = float(value.text) if key == 'ratio' else int(value.text)

    # Attribute values are strings, so cast before converting
    raw_started = entry.get('data-timestamp')
    entry_dict['published_timestamp'] = convert_timestamp(entry_dict['published_timestamp'])
    entry_dict['started_timestamp'] = (convert_timestamp(int(raw_started))
                                       if raw_started is not None else pd.NaT)
    entry_dict['read_time'] = int(entry.find_all(attrs={'class':'readingTime'})[0].get('title').split(' ')[0])
    entry_dict['unlisted'] = len(entry.find_all(text=' Unlisted')) > 0

    link = entry.find_all(text='View story',
                          attrs={'class': 'sortableTable-link'})[0].get('href')

    # Retrieve the article and create a soup; the timeout stops one stalled
    # request from blocking the whole run
    article_html = requests.get(link, timeout=30).content
    entry_soup = BeautifulSoup(article_html, 'html.parser')

    # Find the title header (determines if an article or a response).
    # Responses have no header, so label them with the row's timestamp
    if entry_soup.h1 is not None:
        title = entry_soup.h1.text
    else:
        title = f'response-{raw_started}'

    # Text as single long string
    entry_text = ' '.join(p.text for p in entry_soup.find_all('p'))

    # Word count; split() without arguments ignores repeated whitespace and
    # gives 0 for empty text
    word_count = len(entry_text.split())

    # Reading time in minutes from the article page, keeping the value from
    # the stats table when the page does not provide one
    read_time = entry_soup.find_all(attrs={'class': 'readingTime'})
    if read_time:
        entry_dict['read_time'] = int(read_time[0].get('title').split(' ')[0])

    # Number of claps, written either as a plain number or as e.g. '1.2K'
    clap_pattern = re.compile(r'^[0-9]+ claps|^[0-9]+\.[0-9]+K claps|^[0-9]+K claps')
    claps = entry_soup.find_all(text = clap_pattern)

    if len(claps) > 0:
        if 'K' in claps[0]:
            # round() guards against float artefacts such as 1100.0000000000002
            clap_number = int(round(1e3 * float(claps[0].split('K')[0])))
        else:
            clap_number = int(claps[0].split(' ')[0])
    else:
        clap_number = 0

    # Post tags (empty list when the page has no tag list)
    tag_lists = entry_soup.find_all(attrs={'class': 'tags tags--postTags tags--borderless'})
    tags = [li.text for li in tag_lists[0].find_all('li')] if tag_lists else []

    # Add the article-level fields to the row-level statistics
    entry_dict['title'] = title
    entry_dict['text'] = entry_text
    entry_dict['word_count'] = word_count
    entry_dict['claps'] = clap_number
    entry_dict['tags'] = tags

    return entry_dict

In [None]:
# `entry` is already a parsed tag, so the string re-parsing step must be skipped
entry_dict = process_table_entry(entry, parallel=False)

In [None]:
# Process every row one after another. The rows are parsed tags, so pass
# parallel=False to skip the string re-parsing step
r = []
for i, e in enumerate(table_rows):
    print(f'{100 * i / len(table_rows):.2f}% complete. Total read time: {sum([t["read_time"] for t in r])}', 
          end = '\r')
    r.append(process_table_entry(e, parallel=False))

df = pd.DataFrame(r)
df.head()

In [None]:
# Correlate the numeric/boolean columns only: the frame also holds text,
# list and datetime columns, which recent pandas refuses to correlate
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Colorscale names to choose from when drawing the heatmap
colorscales = [
    'Greys', 'YlGnBu', 'Greens', 'YlOrRd', 'Bluered', 'RdBu',
    'Reds', 'Blues', 'Picnic', 'Rainbow', 'Portland', 'Jet',
    'Hot', 'Blackbody', 'Earth', 'Electric', 'Viridis', 'Cividis',
]

In [None]:
import plotly.figure_factory as ff
from plotly.offline import iplot 

# Correlate the numeric/boolean columns only: the frame also holds text,
# list and datetime columns, which recent pandas refuses to correlate
corrs = df.select_dtypes(include=['number', 'bool']).corr()

# Annotated heatmap with the correlations rounded to two decimals as labels
figure = ff.create_annotated_heatmap(z=corrs.values,
                                     x=list(corrs.columns),
                                     y=list(corrs.index),
                                     showscale=True,
                                     colorscale='Picnic',
                                     annotation_text=corrs.round(2).values)
iplot(figure)

In [None]:
# Add extra columns with more data
df['response'] = ['response' if x == True else 'article' for x in df['title'].str.contains('response')]
df['claps_per_word'] = df['claps'] / df['word_count']
df['words_per_minute'] = df['word_count'] / df['read_time']

# Add 10 most common tags with flag if data has it
n = 10
all_tags = list(chain(*df['tags'].tolist()))
tag_counts = Counter(all_tags)
tags = tag_counts.most_common(n)

for tag, count in tags:
    flag = [1 if tag in tags else 0 for tags in df['tags']]
    df.loc[:, f'<tag>{tag}'] = flag

In [None]:
# Column types after adding the derived columns
df.dtypes

In [None]:
from multiprocessing import Pool
import sys

# Worker processes receive each row as an HTML string
table_rows_str = [str(r) for r in table_rows]

# The context manager shuts the workers down even if processing a row raises;
# map() blocks until every row is done, so all results are in before the pool exits
with Pool(processes=10) as pool:
    r = pool.map(process_table_entry, 
                 table_rows_str)

In [None]:
# One dataframe row per processed table entry
df = pd.DataFrame(r)
df.head()

In [None]:
from itertools import chain

# Derived columns: post type plus two simple ratios
title_mentions_response = df['title'].str.contains('response')
df['response'] = ['response' if hit == True else 'article' for hit in title_mentions_response]
df['claps_per_word'] = df['claps'].div(df['word_count'])
df['words_per_minute'] = df['word_count'].div(df['read_time'])

# Indicator column for each of the 10 most common tags
n = 10
all_tags = list(chain.from_iterable(df['tags'].tolist()))
tag_counts = Counter(all_tags)
tags = tag_counts.most_common(n)

for tag, count in tags:
    flag = [int(tag in post_tags) for post_tags in df['tags']]
    df.loc[:, f'<tag>{tag}'] = flag

In [None]:
# Correlate the numeric/boolean columns only: the frame also holds text,
# list and datetime columns, which recent pandas refuses to correlate
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
from timeit import default_timer as timer

In [None]:
def _add_derived_columns(df, n_tags=10):
    """
    Add the post-type label, two ratios and indicator columns for the most common tags

    :param df: dataframe with 'title', 'claps', 'word_count', 'read_time' and 'tags' columns
    :param n_tags: number of most common tags that get an indicator column

    :return df: the same dataframe, with the columns added in place

    """
    df['response'] = ['response' if is_response == True else 'article'
                      for is_response in df['title'].str.contains('response')]
    df['claps_per_word'] = df['claps'] / df['word_count']
    df['words_per_minute'] = df['word_count'] / df['read_time']

    # Count tags over all posts, then flag each post for the most common ones
    tag_counts = Counter(chain.from_iterable(df['tags']))
    for tag, _ in tag_counts.most_common(n_tags):
        df[f'<tag>{tag}'] = [1 if tag in post_tags else 0 for post_tags in df['tags']]

    return df


def process_in_parallel(table_rows, processes):
    """
    Process all the stats in a table in parallel

    :param table_rows: BeautifulSoup table rows
    :param processes: number of worker processes in the pool

    :return df: dataframe of information about each post

    """
    # Workers are handed each row as an HTML string
    table_rows_str = [str(row) for row in table_rows]

    results = []
    start = timer()
    # The context manager shuts the workers down even if processing a row raises;
    # the iterator is fully consumed inside the block
    with Pool(processes=processes) as pool:
        finished = pool.imap_unordered(process_table_entry, table_rows_str)
        # Count from 1 so the last update reads 100%
        for done, result in enumerate(finished, start=1):
            print(f'{100 * done / len(table_rows_str):.2f}% complete.', end = '\r')
            results.append(result)
    end = timer()
    print(f'Processed {len(table_rows_str)} articles in {end-start:.2f} seconds.')

    df = pd.DataFrame(results)
    # Nothing to derive from an empty table (the columns would not exist)
    if df.empty:
        return df

    return _add_derived_columns(df)

In [None]:
# Build the full dataframe using a pool of 20 worker processes
df = process_in_parallel(table_rows, processes=20)

In [None]:
# Store the unlisted flag as strings; a later cell filters on the string 'False'
df['unlisted'] = df['unlisted'].astype(str)

In [None]:
# Word count against publication time, with 'unlisted' passed as the categories column;
# the trailing semicolon suppresses the cell's return value
df.iplot(x = 'published_timestamp', y = 'word_count', categories='unlisted');

In [None]:
# Reading time against publication time for the posts whose unlisted flag is 'False'
listed_posts = df.loc[df['unlisted'] == 'False']
listed_posts.iplot(x='published_timestamp',
                   y='read_time',
                   mode='markers')

In [None]:
# First rows of the final dataframe
df.head()

In [None]:
# (rows, columns) of the final dataframe
df.shape