# Medium Article Scraper

This notebook scrapes Medium articles to extract:
- Title
- Author and author URL
- Article content
- Claps count
- Reading time
- Publication
- Date
- Image sources

The data will be saved in a CSV file with 500 rows.

## 1. Import Required Libraries

In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
import random
import json
import re

## 2. Define Helper Functions

In [16]:
def clean_text(text):
    """Clean and normalize text content"""
    if text is None:
        return None
    return ' '.join(text.strip().split())

def extract_article_info(soup, url):
    """Extract all required information from a Medium article"""
    article = {'url': url}
    
    # Extract title
    try:
        title_tag = soup.find('h1')
        article['title'] = clean_text(title_tag.text) if title_tag else None
    except Exception as e:
        print(f"Error extracting title: {str(e)}")
        article['title'] = None

    # Extract subtitles (h2 tags)
    try:
        subtitles = soup.find_all('h2')
        article['subtitles'] = [clean_text(h2.text) for h2 in subtitles]
    except Exception as e:
        print(f"Error extracting subtitles: {str(e)}")
        article['subtitles'] = None

    # Extract author information
    try:
        author_tag = soup.find('a', {'rel': 'author'}) or soup.find('a', {'class': 'author'})
        if author_tag:
            article['author'] = clean_text(author_tag.text)
            article['author_url'] = author_tag.get('href')
            if not article['author_url'].startswith('http'):
                article['author_url'] = f"https://medium.com{article['author_url']}"
        else:
            article['author'] = None
            article['author_url'] = None
    except Exception as e:
        print(f"Error extracting author info: {str(e)}")
        article['author'] = None
        article['author_url'] = None

    # Extract claps count
    try:
        claps_button = soup.find('button', string=re.compile(r'[\d.KM]+\s*claps?', re.IGNORECASE))
        if claps_button:
            claps_text = claps_button.text.strip().lower()  # e.g., "1.2K claps" or "50 claps"
            # Extract number and multiplier
            match = re.search(r'([\d.]+)([km]?)', claps_text)
            if match:
                number = float(match.group(1))  # e.g., 1.2 or 50
                multiplier = match.group(2)     # e.g., "k", "m", or empty
                if multiplier == 'k':
                    article['claps'] = int(number * 1000)    # e.g., 1.2K -> 1200
                elif multiplier == 'm':
                    article['claps'] = int(number * 1000000) # e.g., 3M -> 3000000
                else:
                    article['claps'] = int(number)           # e.g., 50 -> 50
            else:
                article['claps'] = 0
        else:
            article['claps'] = 0
    except Exception as e:
        print(f"Error extracting claps: {str(e)}")
        article['claps'] = 0

    # Extract reading time
    try:
        reading_time = soup.find('span', string=re.compile(r'\d+\s*min read'))
        if reading_time:
            article['reading_time'] = int(''.join(filter(str.isdigit, reading_time.text)))
        else:
            article['reading_time'] = None
    except Exception as e:
        print(f"Error extracting reading time: {str(e)}")
        article['reading_time'] = None

    # Extract full article content
    try:
        article_sections = soup.find_all(['p', 'h2', 'h3', 'blockquote'])
        article['content'] = '\n'.join([clean_text(section.text) for section in article_sections])
    except Exception as e:
        print(f"Error extracting content: {str(e)}")
        article['content'] = None

    # Extract image sources
    try:
        images = soup.find_all('img')
        article['image_sources'] = [img.get('src') for img in images if img.get('src') 
                                   and not img.get('src').endswith(('.svg', '.gif'))]
    except Exception as e:
        print(f"Error extracting images: {str(e)}")
        article['image_sources'] = None

    return article

## 3. Main Scraping Loop

In [17]:
# Load URLs
data = pd.read_csv('url_technology.csv')
data.columns = ['urls']
# Take first 500 URLs
data = data.head(500)
print(f"Total URLs to process: {len(data)}")

# Initialize lists to store data
articles = []
failed_urls = []

# Main scraping loop with progress bar
print("\nStarting to scrape articles...")
for i, url in tqdm(enumerate(data['urls']), total=len(data)):
    try:
        # Add random delay between requests (1-3 seconds)
        time.sleep(random.uniform(1, 3))
        
        # Try to scrape the article
        article_info = scrape_medium_article(url)
        articles.append(article_info)
            
    except Exception as e:
        print(f"\nError processing {url} - {str(e)}")
        failed_urls.append({'url': url, 'error': str(e)})
    
    # Save progress every 50 articles
    if (i + 1) % 50 == 0:
        df = pd.DataFrame(articles)
        df.to_csv(f'articles_checkpoint_{i+1}.csv', index=False)
        print(f"\nCheckpoint saved: {i+1} articles processed")

Total URLs to process: 500

Starting to scrape articles...


 10%|█         | 50/500 [02:51<24:27,  3.26s/it]


Checkpoint saved: 50 articles processed


 13%|█▎        | 63/500 [03:39<22:52,  3.14s/it]


Error processing https://siddarth.design/taking-back-the-control-e6c28305ce51?source=tag_archive---------62----------------------- - HTTPSConnectionPool(host='siddarth.design', port=443): Max retries exceeded with url: /taking-back-the-control-e6c28305ce51?source=tag_archive---------62----------------------- (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001AE5AC86420>: Failed to resolve 'siddarth.design' ([Errno 11001] getaddrinfo failed)"))


 17%|█▋        | 83/500 [04:43<18:47,  2.70s/it]


Error processing https://blog.polyverse.io/polyverse-weekly-breach-report-1d1601e11e3f?source=tag_archive---------82----------------------- - HTTPSConnectionPool(host='blog.polyverse.io', port=443): Max retries exceeded with url: /polyverse-weekly-breach-report-1d1601e11e3f?source=tag_archive---------82----------------------- (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001AE5F55FD70>: Failed to resolve 'blog.polyverse.io' ([Errno 11001] getaddrinfo failed)"))


 20%|██        | 100/500 [05:40<22:48,  3.42s/it]


Checkpoint saved: 100 articles processed


 30%|███       | 150/500 [08:34<19:00,  3.26s/it]


Checkpoint saved: 150 articles processed


 34%|███▍      | 169/500 [09:37<17:30,  3.17s/it]


Error processing https://blog.pixels.camp/first-keynote-speakers-9c5d778764be?source=tag_archive---------63----------------------- - HTTPSConnectionPool(host='blog.pixels.camp', port=443): Max retries exceeded with url: /first-keynote-speakers-9c5d778764be?source=tag_archive---------63----------------------- (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001AE5EA48C80>: Failed to resolve 'blog.pixels.camp' ([Errno 11001] getaddrinfo failed)"))


 34%|███▍      | 172/500 [09:50<19:59,  3.66s/it]


Error processing https://tincture.io/2019-forecast-amaras-law-996a8ef7f2c9?source=tag_archive---------66----------------------- - HTTPSConnectionPool(host='tincture.io', port=443): Max retries exceeded with url: /2019-forecast-amaras-law-996a8ef7f2c9?source=tag_archive---------66----------------------- (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001AE5EA9F590>: Failed to resolve 'tincture.io' ([Errno 11002] getaddrinfo failed)"))


 40%|████      | 200/500 [11:33<17:57,  3.59s/it]


Checkpoint saved: 200 articles processed


 50%|█████     | 250/500 [14:29<14:45,  3.54s/it]


Checkpoint saved: 250 articles processed


 52%|█████▏    | 260/500 [15:02<12:44,  3.19s/it]


Error processing https://metizamagazine.com/the-best-brain-training-apps-4cde3973e27f?source=tag_archive---------154----------------------- - HTTPSConnectionPool(host='metizamagazine.com', port=443): Max retries exceeded with url: /the-best-brain-training-apps-4cde3973e27f?source=tag_archive---------154----------------------- (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1010)')))


 60%|██████    | 300/500 [17:05<10:20,  3.10s/it]


Checkpoint saved: 300 articles processed


 62%|██████▏   | 310/500 [17:39<11:10,  3.53s/it]


Error processing https://blog.usejournal.com/the-mythical-10x-programmer-e759a4ba2f0?source=tag_archive---------11----------------------- - HTTPSConnectionPool(host='blog.usejournal.com', port=443): Max retries exceeded with url: /the-mythical-10x-programmer-e759a4ba2f0?source=tag_archive---------11----------------------- (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001AE607C60C0>: Failed to resolve 'blog.usejournal.com' ([Errno 11001] getaddrinfo failed)"))


 63%|██████▎   | 314/500 [17:52<11:01,  3.56s/it]


Error processing https://blog.gojekengineering.com/gojeks-impact-on-indonesia-s-gig-economy-990a60cd23b9?source=tag_archive---------15----------------------- - HTTPSConnectionPool(host='blog.gojekengineering.com', port=443): Max retries exceeded with url: /gojeks-impact-on-indonesia-s-gig-economy-990a60cd23b9?source=tag_archive---------15----------------------- (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001AE5A0311C0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))


 68%|██████▊   | 341/500 [19:19<07:47,  2.94s/it]


Error processing https://blog.kstart.in/this-and-that-i-am-excited-for-2019-497e6fda4461?source=tag_archive---------42----------------------- - HTTPSConnectionPool(host='blog.kstart.in', port=443): Max retries exceeded with url: /this-and-that-i-am-excited-for-2019-497e6fda4461?source=tag_archive---------42----------------------- (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001AE5FAF2A50>: Failed to resolve 'blog.kstart.in' ([Errno 11001] getaddrinfo failed)"))


 70%|███████   | 350/500 [19:52<08:53,  3.55s/it]


Checkpoint saved: 350 articles processed


 80%|████████  | 400/500 [22:37<04:59,  3.00s/it]


Checkpoint saved: 400 articles processed


 90%|█████████ | 450/500 [25:23<02:56,  3.52s/it]


Checkpoint saved: 450 articles processed


 96%|█████████▌| 478/500 [26:59<01:09,  3.16s/it]


Error processing https://blog.uplabs.com/10-years-2-startups-later-i-finally-built-what-i-always-needed-to-create-websites-94bdd0b498cf?source=tag_archive---------179----------------------- - HTTPSConnectionPool(host='blog.uplabs.com', port=443): Max retries exceeded with url: /10-years-2-startups-later-i-finally-built-what-i-always-needed-to-create-websites-94bdd0b498cf?source=tag_archive---------179----------------------- (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001AE607A1010>: Failed to resolve 'blog.uplabs.com' ([Errno 11001] getaddrinfo failed)"))


100%|██████████| 500/500 [28:03<00:00,  3.37s/it]


Checkpoint saved: 500 articles processed





## 4. Save Results

In [18]:
# Save final results
print("\nSaving final results...")
articles_df = pd.DataFrame(articles)

# Convert list columns to string representation
articles_df['image_sources'] = articles_df['image_sources'].apply(lambda x: json.dumps(x) if x else None)

# Save to CSV
articles_df.to_csv('medium_articles_full.csv', index=False)
failed_df = pd.DataFrame(failed_urls)
failed_df.to_csv('failed_urls.csv', index=False)

print(f"\nScraping completed!")
print(f"Successfully scraped: {len(articles)} articles")
print(f"Failed URLs: {len(failed_urls)}")

# Display first few rows of the results
articles_df.head()


Saving final results...

Scraping completed!
Successfully scraped: 491 articles
Failed URLs: 9


Unnamed: 0,url,title,author,author_url,claps,reading_time,date,content,image_sources
0,https://medium.com/javascript-scene/top-javasc...,Top JavaScript Frameworks and Topics to Learn ...,Eric Elliott,https://medium.com/@_ericelliott,36.0,10.0,"Jan 1, 2019",Top JavaScript Frameworks and Topics to Learn ...,"[""https://miro.medium.com/v2/resize:fill:88:88..."
1,https://medium.com/job-advice-for-software-eng...,What I want (and don’t want) to see on your so...,James S. Fisher,https://medium.com/@jamessfisher,25.0,8.0,"Jan 1, 2019",What I want (and don’t want) to see on your so...,"[""https://miro.medium.com/v2/resize:fill:88:88..."
2,https://itnext.io/load-testing-using-apache-jm...,Load Testing Using Apache JMeter,Mitesh,https://itnext.io/@mitesh_shamra,1.0,6.0,"Jan 1, 2019",Load Testing Using Apache JMeter Mitesh · Publ...,"[""https://miro.medium.com/v2/resize:fill:88:88..."
3,https://medium.com/s/story/black-mirror-bander...,The Illusion of Control in ‘Black Mirror: Band...,Howard Chai,https://howard-chai.medium.com,10.0,8.0,"Jan 1, 2019",The Illusion of Control in ‘Black Mirror: Band...,"[""https://miro.medium.com/v2/resize:fill:88:88..."
4,https://medium.com/fast-company/the-worst-desi...,The Worst Design Crimes of 2018,Fast Company,https://medium.com/@FastCompany,10.0,5.0,"Jan 1, 2019",The Worst Design Crimes of 2018 From tech comp...,"[""https://miro.medium.com/v2/resize:fill:88:88..."
