In [None]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
from textblob import TextBlob
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Medium tag-archive URL templates, keyed by the publication label stored with
# each article. Placeholders: {0} = year, {1} = month, {2} = day (zero-padded).
_ARCHIVE_SUFFIX = '/archive/{0}/{1:02d}/{2:02d}'
_TAG_LABELS = ['News', 'Politics', 'Culture', 'Music', 'History', 'Journalism', 'Technology']

urls = {
    label: 'https://medium.com/tag/' + label.lower() + _ARCHIVE_SUFFIX
    for label in _TAG_LABELS
}

In [None]:
def is_leap(year):
    """Return True when `year` is a leap year in the Gregorian calendar.

    A year is a leap year if it is divisible by 4, except century years,
    which must also be divisible by 400.
    """
    divisible_by_4 = year % 4 == 0
    century = year % 100 == 0
    divisible_by_400 = year % 400 == 0
    return divisible_by_4 and (not century or divisible_by_400)
    
def convert_day(day, year):
    """Convert a 1-based day-of-year into a (month, day_of_month) pair.

    Args:
        day: Day of the year, 1 for 1 January up to 365 (366 in leap years).
        year: Calendar year, used only to decide whether February has 29 days.

    Returns:
        Tuple `(month, day_of_month)`, both 1-based.

    Raises:
        ValueError: If `day` is outside the valid range for `year`. Previously
            a non-positive day silently produced `(0, 0)` and a too-large day
            failed with an unexplained IndexError.
    """
    import calendar  # function-scope import keeps this helper self-contained

    month_days = [31, 29 if calendar.isleap(year) else 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    days_in_year = sum(month_days)
    if day < 1 or day > days_in_year:
        raise ValueError(f'day must be between 1 and {days_in_year} for year {year}, got {day}')

    # Peel off whole months until the remainder fits inside the current month.
    remaining = day
    for month, length in enumerate(month_days, start=1):
        if remaining <= length:
            return (month, remaining)
        remaining -= length

def get_claps(claps_str):
    """Parse a clap counter label such as '53' or '1.2K' into an integer.

    Args:
        claps_str: Text of the clap button. Anything that is not a string
            (None, or a non-text node such as a nested tag taken from the
            parsed page) is treated as "no claps".

    Returns:
        The clap count as an int; 0 for missing, empty or non-string input.

    Raises:
        ValueError: If the text is a non-empty string that is not a number
            with an optional K/M suffix.
    """
    # The original tested `claps_str.split is None` to spot non-text nodes;
    # an explicit type check expresses the same intent without relying on
    # attribute-lookup side effects.
    if not isinstance(claps_str, str):
        return 0

    text = claps_str.strip().replace(',', '')
    if text == '':
        return 0

    multipliers = {'K': 1000, 'M': 1000000}
    multiplier = multipliers.get(text[-1].upper())
    if multiplier is None:
        return int(float(text))

    # round() instead of truncation: float products like 1.005 * 1000 can land
    # just below the intended integer and would otherwise lose one clap.
    return int(round(float(text[:-1]) * multiplier))

def get_img(img_url, dest_folder, dest_filename):
    """Download an image and store it as `<dest_folder>/<dest_filename>.<ext>`.

    The extension is taken from the path component of the URL (query string
    and host are ignored); when it is missing or longer than four characters
    the file is saved as 'jpg'.

    Args:
        img_url: Address of the image.
        dest_folder: Existing directory to write into.
        dest_filename: File name without extension.

    Returns:
        The file name (with extension) that was written inside `dest_folder`.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response. Nothing is written to disk in that case.
    """
    import os
    from urllib.parse import urlparse

    # Look only at the URL path so "?w=100" suffixes or a dotted host name
    # cannot end up in (or corrupt) the file name.
    ext = os.path.splitext(urlparse(img_url).path)[1].lstrip('.')
    if not ext or len(ext) > 4:
        ext = 'jpg'
    dest_filename = f'{dest_filename}.{ext}'

    # Fetch before opening the target so a failed download leaves no empty file.
    response = requests.get(img_url, allow_redirects=False, timeout=30)
    response.raise_for_status()

    with open(f'{dest_folder}/{dest_filename}', 'wb') as f:
        f.write(response.content)
    return dest_filename

def Average(lst):
    """Return the arithmetic mean of the numbers in `lst`.

    Raises ZeroDivisionError when `lst` is empty.
    """
    total = sum(lst)
    count = len(lst)
    return total / count

def return_content(url):
  """Fetch an article page and return its text plus an image count.

  Returns a tuple `(text, num_imgs)`: `text` is the space-joined text of every
  <li>, <p>, <strong>, <em> and <h1> element whose text is longer than five
  characters; `num_imgs` is the number of <img alt="Image for post"> tags
  divided by three (integer part).
  """
  page = requests.get(url)
  parsed = BeautifulSoup(page.text, 'html.parser')

  fragments = []
  for node in parsed.find_all(['li', 'p', 'strong', 'em', 'h1']):
    # NOTE(review): with BeautifulSoup, `node.href` looks for a child tag named
    # <href>, not the href attribute, so this presumably never skips anything.
    # Kept as-is to preserve behavior; confirm the intent.
    if node.href:
      continue
    content = node.get_text()
    if len(content) > 5:
      fragments.append(content)

  # Number of images. NOTE(review): presumably each post image appears three
  # times in the markup, hence the division; TODO confirm.
  post_images = parsed.find_all('img', alt="Image for post")
  num_imgs = len(post_images) // 3

  return ' '.join(fragments), num_imgs

In [None]:
# Sampling configuration: which days of `year` get scraped.
year = 2020
n_samples = 1  # only referenced by the disabled random-sampling variant below

# Disabled alternative: draw the days at random instead of on a fixed grid.
# selected_days = random.sample([i for i in range(1, 367 if is_leap(year) else 366)], n_samples)
# if year == 2020:
#   selected_days = random.sample([i for i in range(1, 300)], n_samples)

# Every 22nd day of the year starting at day 1, keeping only the entries from
# the eighth one onwards.
every_22nd_day = np.arange(1, 300, 22)
selected_days = every_22nd_day[7:]
print(selected_days)

# Residues modulo 7, printed to check how the sample spreads over the week.
weekday_residues = [day_no % 7 for day_no in selected_days]
print(weekday_residues)

[155 177 199 221 243 265 287]
[1, 2, 3, 4, 5, 6, 0]


In [None]:
def dummify(df, cate_variables):
    '''
    Replace categorical columns with binary indicator (dummy) columns.

    For every column named in `cate_variables`, one indicator column per
    category is created (named `<column>_<category>`), the indicator for the
    first category is left out, and the original column is dropped.

    @param df: source DataFrame; it is not modified.
    @param cate_variables: iterable of column names to convert.
    @return: a copy of `df` with the listed columns replaced by their dummies.
    '''
    # Work on a copy so the caller's frame stays untouched.
    df_new = df.copy()

    for var_name in cate_variables:
        dummies = pd.get_dummies(df[var_name], prefix=var_name)

        # Drop the original column by keyword (`columns=`): passing the axis
        # positionally, as in `drop(var_name, 1)`, is rejected by pandas >= 2.0.
        # The first dummy is skipped because it is implied by the others.
        df_new = pd.concat([df_new.drop(columns=var_name), dummies.iloc[:, 1:]], axis=1)

    return df_new

In [None]:
# Scrape the archive page of every tag for each selected day and collect one
# feature row per article in `data`.
data = []
article_id = 1024  # incremented before each stored row, so the first ID is 1025
n = len(selected_days)

# Build the stop-word lookup once. Evaluating stopwords.words() for every token
# (as the list comprehension used to do) reloads the corpus each time; a set
# also makes each membership test constant-time.
stop_words = set(stopwords.words())

for i, d in enumerate(selected_days, start=1):

    # Date
    month, day = convert_day(d, year)
    date = '{0}-{1:02d}-{2:02d}'.format(year, month, day)
    # 1 -> Monday. NOTE(review): the +2 offset appears tailored to 2020;
    # confirm before changing `year`.
    weekday = (2 + d) % 7
    # NOTE(review): 347 is presumably the day-of-year of the scrape; TODO confirm.
    timedelta = 347 - d
    print(f'{i} / {n} ; {date}')

    for publication, url in urls.items():
        archive_url = url.format(year, month, day)
        response = requests.get(archive_url, allow_redirects=True)
        # Skip this tag/day when the request ended up on a different page
        # than the archive URL that was asked for.
        if not response.url.startswith(archive_url):
            continue

        # Parse the archive page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Articles in the archive
        articles = soup.find_all("div", class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")
        for article in articles:
            # Title (articles without one are skipped)
            title = article.find("h3", class_="graf--title")
            if title is None:
                continue
            title = title.contents[0]

            # Subtitle (articles without one are skipped)
            subtitle = article.find("h4", class_="graf--subtitle")
            if subtitle is None:
                continue
            subtitle = subtitle.contents[0]

            # Scrape Title Image - Unnecessary
            # image = article.find("img", class_="graf-image")
            # image = '' if image is None else get_img(image['src'], 'images', f'{article_id}')

            # Article URL without its query string
            article_url = article.find_all("a")[3]['href'].split('?')[0]

            # Claps; articles with none are skipped
            claps = get_claps(article.find_all("button")[1].contents[0])
            if claps == 0:
                continue

            # Full article content
            text, num_imgs = return_content(article_url)
            contentBlob = TextBlob(text).lower()

            # Nothing to measure on an empty article; skip it before doing
            # any further work for it.
            content_words = contentBlob.words
            if len(content_words) == 0:
                continue

            # Number of words in the content
            n_tokens_content = len(content_words)
            n_unique_tokens = len(set(content_words))

            # Article sentiment
            content_sentiment_polarity = contentBlob.sentiment.polarity
            content_subjectivity = contentBlob.sentiment.subjectivity

            # The first child of the title element may not be plain text;
            # TextBlob rejects that with a TypeError and the article is skipped.
            try:
                titleBlob = TextBlob(title).lower()
            except TypeError:
                continue

            # Language detection; any failure skips the article. `Exception`
            # (rather than a bare except) keeps Ctrl-C able to stop the run.
            try:
                lang = titleBlob.detect_language()
            except Exception:
                continue

            # Number of words in the title
            n_tokens_title = len(titleBlob.words)

            # Title sentiment
            title_sentiment_polarity = titleBlob.sentiment.polarity
            title_subjectivity = titleBlob.sentiment.subjectivity

            # Stop words
            tokens_not_sw = [word for word in content_words if word not in stop_words]
            n_non_stop_words = len(tokens_not_sw)
            n_non_stop_unique_tokens = len(set(tokens_not_sw))

            # Word length
            wordlength = [len(word) for word in content_words]
            average_token_length = Average(wordlength)

            # Reading time: leading number of the span's title attribute
            reading_time = article.find("span", class_="readingTime")
            reading_time = 0 if reading_time is None else int(reading_time['title'].split(' ')[0])

            # Article ID
            article_id += 1

            # Responses: only present when the article block has exactly
            # seven links; the count is the first word of the last link's
            # text (kept as a string, as before).
            anchors = article.find_all("a")
            if len(anchors) == 7:
                responses = anchors[6].contents[0].split(' ')[0]
            else:
                responses = 0

            data.append([
                article_id, article_url, title, subtitle, text, num_imgs,
                claps, responses, reading_time, publication, date, timedelta,
                weekday, n_tokens_content, n_unique_tokens,
                content_sentiment_polarity, content_subjectivity,
                n_tokens_title, title_sentiment_polarity, title_subjectivity,
                n_non_stop_words, n_non_stop_unique_tokens,
                average_token_length, lang,
            ])

1 / 7 ; 2020-06-03
2 / 7 ; 2020-06-25
3 / 7 ; 2020-07-17
4 / 7 ; 2020-08-08
5 / 7 ; 2020-08-30
6 / 7 ; 2020-09-21
7 / 7 ; 2020-10-13


In [None]:
# Column names, in the same order as the values appended to each row of `data`.
column_names = [
    'ID', 'URL', 'Title', 'Subtitle', 'Content', 'Number of Images',
    'Claps', 'Responses', 'Reading_time', 'Publication', 'Date', 'TimeDelta',
    'Weekday', 'n_tokens_content', 'n_unique_tokens',
    'content_sentiment_polarity', 'content_subjectivity',
    'n_tokens_title', 'title_sentiment_polarity', 'title_subjectivity',
    'n_non_stop_words', 'n_non_stop_unique_tokens',
    'average_token_length', 'Language',
]
medium_df = pd.DataFrame(data, columns=column_names)

In [None]:
# Display the assembled article dataset.
medium_df

Unnamed: 0,ID,URL,Title,Subtitle,Content,Number of Images,Claps,Responses,Reading_time,Publication,Date,TimeDelta,Weekday,n_tokens_content,n_unique_tokens,content_sentiment_polarity,content_subjectivity,n_tokens_title,title_sentiment_polarity,title_subjectivity,n_non_stop_words,n_non_stop_unique_tokens,average_token_length,Language
0,1025,https://blog.gojekengineering.com/global-techn...,Global technology and payments companies inves...,The investment will boost Southeast Asia’s…,Culture Design Stories We're Hiring! Global t...,2,20,0,5,News,2020-06-03,192,3,1130,420,0.184232,0.386133,8,0.0,0.000000,685,359,5.120354,en
1,1026,https://medium.com/discourse/the-libertarian-t...,The Libertarian to Fascist Pipeline is Shorter...,"Though opposing ideologies, Libertarians…",Politics Economy Energy & Climate Technology S...,1,14,2,8,News,2020-06-03,192,3,2055,738,0.139039,0.458405,10,0.0,0.000000,1129,623,5.016058,en
2,1027,https://medium.com/@hcsiratt/a-plea-for-unity-...,A Plea for Unity and Peace,"In times of crisis, we must unite as a community",A Plea for Unity and Peace The recent death of...,1,10,0,7,News,2020-06-03,192,3,1782,640,-0.000158,0.512571,6,0.0,0.000000,865,537,4.602694,en
3,1028,https://medium.com/quatria/is-this-cave-painti...,Is This Cave Painting A Prehistoric Map To Ant...,Scientists say this old drawing contains an…,Is This Cave Painting A Prehistoric Map To Ant...,1,53,0,3,News,2020-06-03,192,3,472,264,0.111264,0.356929,9,0.0,0.000000,269,211,5.076271,en
4,1029,https://medium.com/@ypi_78836/read-news-read-c...,"Read News, Read China, an NLP approach",Tracking the polices change from the enormous ...,"Read News, Read China, an NLP approach Thanks ...",9,1,0,6,News,2020-06-03,192,3,1098,494,0.076150,0.350728,7,0.0,0.000000,646,422,5.030055,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1550,2575,https://medium.com/an-idea/the-e-reader-is-a-p...,The E-reader Is a Powerful Tool for Writers,An E-Reader Is a Writer’s Card File,"Life, Health & Nutrition Business & Travel Sci...",1,53,0,4,Technology,2020-10-13,60,2,981,348,0.198784,0.544020,8,0.3,1.000000,445,269,4.126402,en
1551,2576,https://medium.com/callforcode/a-global-crisis...,A global crisis needs a global response,"Mami Mizutori, UNDRR talks about what it will ...",Regional Wrap-Up Series A global crisis needs ...,4,1,0,3,Technology,2020-10-13,60,2,429,230,0.228416,0.403002,7,0.0,0.000000,249,183,4.780886,en
1552,2577,https://medium.com/illumination/backlinks-impo...,Backlinks Important For SEO But You Don’t Need...,I can’t promise your blog the first position…,Business Poetry Fiction Philosophy Science Tec...,1,56,0,4,Technology,2020-10-13,60,2,916,322,0.135354,0.495042,12,0.4,1.000000,466,244,4.529476,en
1553,2578,https://medium.com/genius-in-a-bottle/i-found-...,I Found My Lost Attention,A poem,Submission Guidelines Prompt Guidelines Archiv...,1,510,1,1,Technology,2020-10-13,60,2,138,89,0.425000,0.650000,5,0.0,0.000000,85,65,4.934783,en


In [None]:
# Save the dataset to 'medium_data.csv'; index=False leaves the row index out of the file.
medium_df.to_csv('medium_data.csv', index=False)

In [None]:
# url = 'https://medium.com/unifiprotocol/unifi-protocol-is-launching-on-ethereum-5501f19c4b6'
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'html.parser')

# paragraphs = soup.find_all(['li', 'p', 'strong', 'em', 'h1'])
# text = []
# for p in paragraphs:
#       if not p.href:
#           if len(p.get_text()) > 5:
#               text.append(p.get_text())

# num_imgs = int(len(soup.findAll('img', alt="Image for post")) / 3)