In [56]:
import datetime as dt
from steemdata import SteemData

import pandas as pd
import numpy as np

try:
    import plotly.plotly as py
    import plotly.graph_objs as go
    import cufflinks as cf
except:
    !pip install plotly
    !pip install cufflinks
    
# helpers
from toolz import keyfilter

def keep(d, whitelist):
    """Return the entries of ``d`` whose key is contained in ``whitelist``.

    The filtering itself is delegated to toolz' ``keyfilter``.
    """
    def is_wanted(key):
        return key in whitelist

    return keyfilter(is_wanted, d)

def omit(d, blacklist):
    """Return the entries of ``d`` whose key is NOT contained in ``blacklist``.

    The filtering itself is delegated to toolz' ``keyfilter``.
    """
    def is_allowed(key):
        return key not in blacklist

    return keyfilter(is_allowed, d)

In [57]:
# Database handle shared by every query cell below; collections are reached
# by name, e.g. db['Posts'].
# NOTE(review): presumably a pymongo database (the queries use Mongo-style
# operators such as '$gte') - confirm against the steemdata package.
db = SteemData().db

## Methods

In [58]:
import json
import requests as rs
from steem.post import Post

def get_csrf(url, timeout=30):
    """Download ``url`` and extract the CSRF token embedded in the page body.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds ``requests`` waits on the server before raising a
            timeout error; without one a stalled connection blocks forever.

    Returns:
        A ``(csrf, cookies)`` tuple when the page answers with HTTP 200 and
        contains the ``"csrf":"`` marker, otherwise ``None``.
    """
    marker = '"csrf":"'
    # The token is read as a fixed-width slice right after the marker.
    # NOTE(review): 36 characters matches a UUID-style token - confirm format.
    token_length = 36

    resp = rs.get(url, timeout=timeout)
    if resp.status_code != 200:
        return None

    body = resp.text
    start_index = body.find(marker)
    if start_index == -1:
        # str.find() reports "not found" as -1; slicing from there would
        # hand back an unrelated chunk of the page instead of a token.
        return None

    token_start = start_index + len(marker)
    return body[token_start:token_start + token_length], resp.cookies

def get_view_count(url_partial, csrf, cookies):
    """POST a page-view request to steemit.com and return the decoded reply.

    Args:
        url_partial: Value sent as the ``page`` field of the JSON payload.
        csrf: Token sent as the ``csrf`` field of the JSON payload.
        cookies: Cookies forwarded with the request.

    Returns:
        The JSON-decoded response body, or an empty dict when the status is
        not 200 or the body is empty.
    """
    endpoint = 'https://steemit.com/api/v1/page_view'
    request_body = {"csrf": csrf, "page": url_partial, "ref": ""}

    reply = rs.post(endpoint, json=request_body, cookies=cookies)

    # Anything but a successful, non-empty answer counts as "no data".
    if reply.status_code != 200 or not reply.text:
        return {}
    return json.loads(reply.text)

def get_views(identifier):
    """Return the steemit.com view counter for a single post.

    Args:
        identifier: Post identifier understood by ``steem.post.Post``.

    Returns:
        The ``views`` value of the page-view reply, or ``-1`` when it cannot
        be determined (page not retrievable, or the reply has no ``views``).
    """
    p = Post(identifier)
    session = get_csrf('https://steemit.com' + p.url)
    if session is None:
        # get_csrf() yields None when the page does not load; unpacking that
        # would raise TypeError instead of reporting the -1 sentinel.
        return -1

    csrf, cookies = session
    return get_view_count(p.url, csrf, cookies).get('views', -1)

## Top Viewed Posts

In [60]:
# Look-back window: posts created during the last 8 days.
# NOTE(review): datetime.now() is naive local time; if 'created' is stored in
# UTC the window is shifted by the local offset - confirm.
time_constraints = {'$gte': dt.datetime.now() - dt.timedelta(days=8)}

# Popularity thresholds. Query-level filters on net_rshares and
# author_reputation are intentionally not applied here; negative values are
# removed afterwards by filter_spam().
conditions = {
    'created': time_constraints,
    'net_votes': {'$gt': 100},
    'children': {'$gt': 3},
}

# Fetch only the fields used downstream and suppress the _id field.
projection = {
    '_id': 0,
    **dict.fromkeys(
        (
            'identifier',
            'net_votes',
            'net_rshares',
            'author_reputation',
            'title',
            'author',
            'pending_payout_value',
            'total_payout_value',
        ),
        1,
    ),
}

posts = list(db['Posts'].find(conditions, projection=projection))

In [61]:
def filter_spam(post):
    """Predicate for ``filter()``: True for posts that should be kept.

    A post is rejected (False) when its ``net_rshares`` or its
    ``author_reputation``, converted with ``int()``, is negative.
    Note that, despite the name, True means "not spam".
    """
    return int(post['net_rshares']) >= 0 and int(post['author_reputation']) >= 0

In [62]:
# Keep only the posts accepted by filter_spam() (it returns True for keepers).
posts = [post for post in posts if filter_spam(post)]

In [63]:
len(posts)

2140

In [79]:
# posts2 = []
# posts_gen = iter(posts)

In [80]:
# from contextlib import suppress
# import time

# for p in posts_gen:
#     with suppress(Exception):
#         views = get_views(p['identifier'])
#         time.sleep(0.01)
#         print(views)
        
#         posts2.append({
#             **p,
#             'views': views
#         })
        

In [65]:
from funcy import silent

# One sequential lookup per post, so this cell is slow for thousands of posts.
# silent() converts any Exception raised by get_views() into None, so a failed
# lookup leaves 'views' as None instead of aborting the whole run.
posts2 = [
    {**post, 'views': silent(get_views)(post['identifier'])}
    for post in posts
]

INFO:steembase.http_client:non 200 response:502
INFO:steembase.http_client:failed to load response
INFO:steembase.http_client:non 200 response:502
INFO:steembase.http_client:failed to load response
INFO:steembase.http_client:non 200 response:502
INFO:steembase.http_client:failed to load response
INFO:steembase.http_client:non 200 response:502
INFO:steembase.http_client:failed to load response
INFO:steembase.http_client:non 200 response:502
INFO:steembase.http_client:failed to load response
INFO:steembase.http_client:non 200 response:502
INFO:steembase.http_client:failed to load response


KeyboardInterrupt: 

In [74]:
# Flatten the two payout fields: each is a nested mapping whose 'amount'
# entry replaces the mapping itself; all other keys are copied unchanged.
posts3 = [
    {
        **post,
        **{
            field: post[field]['amount']
            for field in ('pending_payout_value', 'total_payout_value')
        },
    }
    for post in posts2
]

In [75]:
# One row per post, one column per key of the flattened post dicts.
# NOTE: the name `df` is rebound to an unrelated frame in the language
# section further down, so re-run this cell before reusing the cells below.
df = pd.DataFrame(posts3)

In [76]:
# Presentation frame: keep the display columns, drop rows with any missing
# value (e.g. views that could not be fetched), then format the columns.
df2 = (
    df[['author', 'title', 'net_votes', 'pending_payout_value', 'views']]
    .dropna()
    .assign(
        author=lambda frame: frame['author'].apply(lambda name: "@%s" % name),
        views=lambda frame: frame['views'].apply(int),
        title=lambda frame: frame['title'].apply(lambda text: text[:30]),  # cap title at 30 chars
    )
)

In [77]:
# The 25 most viewed posts, highest view count first; preview the top five.
df3 = df2.sort_values(by='views', ascending=False).iloc[:25]
df3.head(5)

Unnamed: 0,author,title,net_votes,pending_payout_value,views
246,@noisy,We just hacked 11 accounts on,1518,5323.804,14821
1816,@joseph,Why Will Ethereum Fail?,1234,3285.757,9205
881,@darthnava,"Fellow Steemers, I Need Help a",3129,14800.853,8587
1239,@timcliff,You should join Steemit. Here,1441,2723.802,5138
474,@teamsteem,Some Steem Tips For Some Steem,1137,2955.529,3785


In [78]:
from tabulate import tabulate

# Emit the ranking as raw HTML markup (printed as text, not rendered).
print(
    tabulate(
        df3.values.tolist(),
        headers=df3.columns.tolist(),
        tablefmt="html",
    )
)

<table>
<thead>
<tr><th>author          </th><th>title                                         </th><th style="text-align: right;">  net_votes</th><th style="text-align: right;">  pending_payout_value</th><th style="text-align: right;">  views</th></tr>
</thead>
<tbody>
<tr><td>@noisy          </td><td>We just hacked 11 accounts on                 </td><td style="text-align: right;">       1518</td><td style="text-align: right;">              5323.8  </td><td style="text-align: right;">  14821</td></tr>
<tr><td>@joseph         </td><td>Why Will Ethereum Fail?                       </td><td style="text-align: right;">       1234</td><td style="text-align: right;">              3285.76 </td><td style="text-align: right;">   9205</td></tr>
<tr><td>@darthnava      </td><td>Fellow Steemers, I Need Help a                </td><td style="text-align: right;">       3129</td><td style="text-align: right;">             14800.9  </td><td style="text-align: right;">   8587</td></tr>
<tr><td>@timcli

## Top Posts by Languages

In [None]:
# Look-back window: posts created during the last 7 days.
# NOTE(review): datetime.now() is naive local time; if 'created' is stored in
# UTC the window is shifted by the local offset - confirm.
time_constraints = {'$gte': dt.datetime.now() - dt.timedelta(days=7)}

# Low engagement bar: more than 3 net votes and more than 1 child.
conditions = {
    'created': time_constraints,
    'net_votes': {'$gt': 3},
    'children': {'$gt': 1},
}

# Only the fields needed for language detection; the _id field is suppressed.
projection = {
    '_id': 0,
    **dict.fromkeys(('identifier', 'title', 'author', 'body'), 1),
}

lang_posts = list(db['Posts'].find(conditions, projection=projection))

In [None]:
len(lang_posts)

In [None]:
from langdetect import detect_langs
from funcy.colls import pluck
from funcy.seqs import first, last
from toolz.functoolz import compose, thread_last
from contextlib import suppress
from collections import Counter

In [None]:
def detect(body):
    """Return the first candidate reported by ``detect_langs`` for ``body``.

    Any ``Exception`` raised along the way is swallowed. When detection fails
    or yields no candidates an empty list is returned, which is falsy and can
    therefore be dropped with ``filter(bool, ...)``.

    NOTE(review): langdetect documents its results as non-deterministic unless
    ``DetectorFactory.seed`` is set - consider seeding for reproducible counts.
    """
    try:
        candidates = detect_langs(body)
        if candidates:
            return first(candidates)
    except Exception:
        pass

    return []

In [None]:
# Pipeline, innermost call first: posts with a body longer than 100
# characters -> their 'body' text -> detect() -> drop falsy results.
# The outer filter() is lazy, so nothing runs until `languages` is iterated.
languages = filter(
    bool,
    map(
        detect,
        pluck('body', filter(lambda post: len(post['body']) > 100, lang_posts)),
    ),
)

In [None]:
# Keep only detections with probability above 0.8 and reduce each one to its
# language code. This rebinds `languages` from detection objects to strings,
# so the cell cannot simply be re-run on its own output.
languages = [
    guess.lang
    for guess in languages
    if guess and guess.prob > 0.8
]

In [None]:
# Tally how many posts were attributed to each language code.
c = Counter(languages)

In [None]:
c.most_common(10)

In [None]:
# Share of each of the 10 most common languages, as a percentage of all
# confidently detected posts, rounded to 3 decimals.
normalized = [
    {'language': lang, 'pct_share': round(count / len(languages) * 100, 3)}
    for lang, count in c.most_common(10)
]

In [None]:
normalized

In [None]:
# Language-share table with a 1-based index so the index doubles as the rank.
# NOTE: this rebinds `df`, which earlier held the per-post frame.
df = pd.DataFrame(normalized, index=range(1, len(normalized) + 1))

In [None]:
df.head(10)

In [None]:
# NOTE(review): newer plotly releases moved `plotly.plotly` into the separate
# chart-studio package - confirm which plotly version this notebook targets.
import plotly.plotly as py
import plotly.graph_objs as go

# Pie slices: the 7 most common languages and their post counts.
labels = [lang for lang, _ in c.most_common(7)]
values = [count for _, count in c.most_common(7)]
colors = ['#FEBFB3', '#E1396C', '#96D38C', '#D0F9B1']

trace = go.Pie(
    labels=labels,
    values=values,
    hoverinfo='label+percent',
    textinfo='label',
    textfont={'size': 20},
    marker={
        'colors': colors,
        'line': {'color': '#000000', 'width': 2},
    },
)

py.iplot([trace], filename='styled_pie_chart')

In [None]:
## TODO: de-duplicate on the author field so the percentages count unique authors rather than the number of posts