# Python Web Scraping Project Using Beautiful Soup

In [35]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import os

In [36]:
urls = {
    'Towards Data Science': 'https://towardsdatascience.com/archive/{0}/{1:02d}/{2:02d}',
    'UX Collective': 'https://uxdesign.cc/archive/{0}/{1:02d}/{2:02d}',
    'The Startup': 'https://medium.com/swlh/archive/{0}/{1:02d}/{2:02d}',
    'The Writing Cooperative': 'https://writingcooperative.com/archive/{0}/{1:02d}/{2:02d}',
    'Data Driven Investor': 'https://medium.com/datadriveninvestor/archive/{0}/{1:02d}/{2:02d}',
    'Better Humans': 'https://medium.com/better-humans/archive/{0}/{1:02d}/{2:02d}',
    'Better Marketing': 'https://medium.com/better-marketing/archive/{0}/{1:02d}/{2:02d}',
}

In [37]:
def is_leap(year):
    if year % 4 != 0:
        return False
    elif year % 100 != 0:
        return True
    elif year % 400 != 0:
        return False
    else:
        return True
    
def convert_day(day, year):
    month_days = [31, 29 if is_leap(year) else 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    m = 0
    d = 0
    while day > 0:
        d = day
        day -= month_days[m]
        m += 1
    return (m, d)

def get_claps(claps_str):
    if (claps_str is None) or (claps_str == '') or (claps_str.split is None):
        return 0
    split = claps_str.split('K')
    claps = float(split[0])
    claps = int(claps*1000) if len(split) == 2 else int(claps)
    return claps

def get_img(img_url, dest_folder, dest_filename):
    ext = img_url.split('.')[-1]
    if len(ext) > 4:
        ext = 'jpg'
    dest_filename = f'{dest_filename}.{ext}'
    with open(f'{dest_folder}/{dest_filename}', 'wb') as f:
        f.write(requests.get(img_url, allow_redirects=False).content)
    return dest_filename

In [38]:
year = 2019
selected_days = random.sample([i for i in range(1, 367 if is_leap(year) else 366)], 50)

In [39]:
img_dir = 'images'
if not os.path.exists(img_dir):
    os.mkdir(img_dir)

In [41]:
data = []
article_id = 0
i = 0
n = len(selected_days)
for d in selected_days:
    i += 1
    month, day = convert_day(d, year)
    date = '{0}-{1:02d}-{2:02d}'.format(year, month, day)
    print(f'{i} / {n} ; {date}')
    for publication, url in urls.items():
        response = requests.get(url.format(year, month, day), allow_redirects=True)
        if not response.url.startswith(url.format(year, month, day)):
            continue
        page = response.content
        soup = BeautifulSoup(page, 'html.parser')
        articles = soup.find_all("div", class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")
        for article in articles:
            title = article.find("h3", class_="graf--title")
            if title is None:
                continue
            title = title.contents[0]
            article_id += 1
            subtitle = article.find("h4", class_="graf--subtitle")
            subtitle = subtitle.contents[0] if subtitle is not None else ''
            image = article.find("img", class_="graf-image")
            image = '' if image is None else get_img(image['src'], 'images', f'{article_id}')
            article_url = article.find_all("a")[3]['href'].split('?')[0]
            claps = get_claps(article.find_all("button")[1].contents[0])
            reading_time = article.find("span", class_="readingTime")
            reading_time = 0 if reading_time is None else int(reading_time['title'].split(' ')[0])
            responses = article.find_all("a")
            if len(responses) == 7:
                responses = responses[6].contents[0].split(' ')
                if len(responses) == 0:
                    responses = 0
                else:
                    responses = responses[0]
            else:
                responses = 0

            data.append([article_id, article_url, title, subtitle, image, claps, responses, reading_time, publication, date])

1 / 50 ; 2019-02-28


IndexError: list index out of range

In [42]:
medium_df = pd.DataFrame(data, columns=['id', 'url', 'title', 'subtitle', 'image', 'claps', 'responses', 'reading_time', 'publication', 'date'])

In [43]:
medium_df

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/the-ultimate-gu...,The Ultimate Guide to Data Cleaning,When the data is spewing garbage,1.png,3100,10,15,Towards Data Science,2019-02-28
1,2,https://towardsdatascience.com/10-git-commands...,10 Git Commands You Should Know,Plus tips to save time with Git,2.jpeg,6600,6,6,Towards Data Science,2019-02-28
2,3,https://towardsdatascience.com/implementing-a-...,Implementing a Naive Bayes classifier for text...,The Naive Bayes classifier…,3.png,243,2,8,Towards Data Science,2019-02-28
3,4,https://towardsdatascience.com/data-engineerin...,"Data Engineering with Python, Django, and Post...",,4.jpeg,389,2,10,Towards Data Science,2019-02-28
4,5,https://towardsdatascience.com/reinforcement-l...,Reinforcement Learning Tutorial Part 3: Basic ...,,5.jpeg,344,6,5,Towards Data Science,2019-02-28
5,6,https://towardsdatascience.com/speaker-diariza...,Speaker Diarization with Kaldi,the ability to process audio of multiple speak...,6.jpg,640,9,8,Towards Data Science,2019-02-28
6,7,https://towardsdatascience.com/markov-chain-mo...,Markov Chain Monte Carlo,Lifting your understanding of MCMC to an inter...,7.png,390,2,5,Towards Data Science,2019-02-28
7,8,https://towardsdatascience.com/can-we-stop-wit...,Can we stop with the SQL JOINs venn diagrams i...,,8.png,1000,12,5,Towards Data Science,2019-02-28
8,9,https://towardsdatascience.com/how-to-make-you...,How to make your model awesome with Optuna,Easily and efficiently optimize model’s hyperp...,9.jpeg,435,4,5,Towards Data Science,2019-02-28
9,10,https://towardsdatascience.com/one-neural-netw...,"One neural network, many uses","Build image search, image captioning, similar ...",10.jpeg,1800,7,15,Towards Data Science,2019-02-28


In [44]:
medium_df.to_csv('medium_data.csv', index=False)