In [77]:
import requests
from bs4 import BeautifulSoup
import lxml
import html2text
from urllib.parse import urljoin, urlparse

from tqdm import tqdm

In [55]:
sitemap_url = 'https://www.deeplearning.ai/sitemap-0.xml'
scrap_url = 'https://www.deeplearning.ai/the-batch/'

In [11]:
def get_urls_from_sitemap(url) -> list:
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'xml')

    return [item.text for item in soup.find_all('loc') if item.text.startswith(scrap_url)]

links = get_urls_from_sitemap(sitemap_url)
len(links)


4668

## Now fetch into articles

In [101]:
def try_or_default(fn, default):
    try:
        return fn()
    except:
        return default

In [107]:
import re
import os
import pandas as pd

def download_file(url, folder):
    try:
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            parsed_url = urlparse(url)
            filename = os.path.basename(parsed_url.path)

            if parsed_url.query:
                filename += re.sub(r"[^\w\-]", "_", parsed_url.query)

            if not filename or filename == "/":
                filename = "downloaded_image"

            if not re.match(r"^.*\.(jpg|jpeg|png|gif)$", filename):
                filename += '.jpg'

            filepath = os.path.join(folder, filename)
            os.makedirs(folder, exist_ok=True)

            with open(filepath, "wb") as f:
                f.write(response.content)

            # print(f"Downloaded {filename} from {url}")
    except Exception as e:
        # print(f"Failed to download {url}: {e}")
        return

def extract_and_download_images(soup: BeautifulSoup, base_url):
    images = soup.find_all("img", src=True)

    folder_path = os.path.join('article_images', base_url.replace(scrap_url, ''))

    for img_tag in images:
        img_url = urljoin(base_url, img_tag["src"])
        download_file(img_url, folder_path)


def extract_content_from_link(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    article = soup.select_one('article')

    title = try_or_default(lambda: article.select_one("h1.leading-tight").get_text(), '')

    publication_date = try_or_default(lambda: article.select_one('.container--boxed > div > div:first-child > div:last-child > div:first-child > div:last-child').get_text(), '')

    for data in article(['style', 'script', 'aside', 'footer', 'nav']) + soup.find_all("section", id="subscribe"):
        # Remove tags
        data.decompose()

    article_folder = os.path.join('articles', url.replace(scrap_url, ''))
    extract_and_download_images(article, url)

    for data in article(['header']):
        # Remove tags
        data.decompose()

    return title, publication_date, html2text.html2text(' '.join(soup.stripped_strings))


data = []
for link in tqdm(links[3:]):
    data.append((link, *extract_content_from_link(link)))

 30%|███       | 1402/4665 [40:56<1:35:17,  1.75s/it]


KeyboardInterrupt: 

In [109]:
for link in tqdm(links[1403:]):
    data.append((link, *extract_content_from_link(link)))

100%|██████████| 3265/3265 [1:42:03<00:00,  1.88s/it]  


In [110]:
df = pd.DataFrame(data, columns =['Url', 'Title', 'Publication_Date', 'Content'])


In [111]:
df.set_index('Url', inplace=True)

In [116]:
df

Unnamed: 0_level_0,Title,Publication_Date,Content
Url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://www.deeplearning.ai/the-batch/art-team-sells-robots-painting-for-1-1-million/,Art team sells robot’s painting for $1.1 milli...,"Nov 11, 2024",Data Points: Art team sells robot’s painting f...
https://www.deeplearning.ai/the-batch/swe-kit-helps-developers-build-their-own-assistants/,SWE-Kit helps developers build their own assis...,"Nov 8, 2024",Data Points: SWE-Kit helps developers build th...
https://www.deeplearning.ai/the-batch/tensions-mount-as-automation-transforms-u-s-shipping-port/,Robots On the Loading Dock Tensions mount as a...,"Nov 06, 2024",Tensions Mount As Automation Transforms U.S. S...
https://www.deeplearning.ai/the-batch/social-media-bots-and-the-amplification-effect/,Social Media Bots and the Amplification Effect...,"Nov 06, 2024",Social Media Bots and the Amplification Effect...
https://www.deeplearning.ai/the-batch/issue-274/,,,"AI Controls Desktops, Agents Train Algorithms,..."
...,...,...,...
https://www.deeplearning.ai/the-batch/tag/weather-predictions/,,,Weather Predictions | The Batch Explore Course...
https://www.deeplearning.ai/the-batch/tag/webank/,,,WeBank | The Batch | AI News & Insights Explor...
https://www.deeplearning.ai/the-batch/tag/weizmann-institute/,,,Weizmann Institute | The Batch Explore Courses...
https://www.deeplearning.ai/the-batch/tag/wells-fargo/,,,Wells Fargo | The Batch | AI News & Insights E...


In [122]:
deduped = df.drop_duplicates()

In [123]:
new_df = deduped[(deduped['Title'] != '')]

In [126]:
new_df

Unnamed: 0_level_0,Title,Publication_Date,Content
Url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://www.deeplearning.ai/the-batch/art-team-sells-robots-painting-for-1-1-million/,Art team sells robot’s painting for $1.1 milli...,"Nov 11, 2024",Data Points: Art team sells robot’s painting f...
https://www.deeplearning.ai/the-batch/swe-kit-helps-developers-build-their-own-assistants/,SWE-Kit helps developers build their own assis...,"Nov 8, 2024",Data Points: SWE-Kit helps developers build th...
https://www.deeplearning.ai/the-batch/tensions-mount-as-automation-transforms-u-s-shipping-port/,Robots On the Loading Dock Tensions mount as a...,"Nov 06, 2024",Tensions Mount As Automation Transforms U.S. S...
https://www.deeplearning.ai/the-batch/social-media-bots-and-the-amplification-effect/,Social Media Bots and the Amplification Effect...,"Nov 06, 2024",Social Media Bots and the Amplification Effect...
https://www.deeplearning.ai/the-batch/openais-mle-bench-tests-ai-coding-agents/,When Agents Train Algorithms OpenAI’s MLE-benc...,"Nov 06, 2024",OpenAI’s MLE-bench Tests AI Coding Agents Expl...
...,...,...,...
https://www.deeplearning.ai/the-batch/europe-tightens-the-screws/,Europe Tightens the Screws,"Apr 17, 2019",Europe Tightens the Screws | AI News & Insight...
https://www.deeplearning.ai/the-batch/vcs-bet-on-nlp/,VCs Bet on NLP,"Apr 17, 2019",VCs Bet on NLP | AI News & Insights Explore Co...
https://www.deeplearning.ai/the-batch/drones-go-commercial/,Drones Go Commercial,"Apr 17, 2019",Drones Go Commercial | AI News & Insights Expl...
https://www.deeplearning.ai/the-batch/automatic-annotation/,Automatic Annotation,"Apr 17, 2019",Automatic Annotation | AI News & Insights Expl...


In [133]:
folders_clean = [url.replace('https://www.deeplearning.ai/the-batch/', '') for url in new_df.index]

In [155]:
import zipfile
import os
from tqdm import tqdm  # Ensure tqdm is used for the progress bar

def zip_selected_folders(output_filename, folder_list, base_directory='.'):
    with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for folder_name in tqdm(folder_list):
            folder_path = os.path.join(base_directory, folder_name)
            if os.path.isdir(folder_path):
                for root, dirs, files in os.walk(folder_path):
                    for file in files:
                        file_path = os.path.join(root, file)
                        arcname = os.path.relpath(file_path, base_directory)
                        zipf.write(file_path, arcname)

# Usage
output_filename = 'images_clean.zip'
base_directory = 'article_images'  # Replace with your base directory path

zip_selected_folders(output_filename, folders_clean, base_directory)

100%|██████████| 1654/1654 [01:19<00:00, 20.74it/s]


In [136]:
new_df.to_parquet('clean_df.parquet.gzip',
              compression='gzip')

In [137]:
new_df.to_csv('clean_df.csv')

In [139]:
from_csv_df = pd.read_csv('clean_df.csv')
from_csv_df

Unnamed: 0,Url,Title,Publication_Date,Content
0,https://www.deeplearning.ai/the-batch/art-team...,Art team sells robot’s painting for $1.1 milli...,"Nov 11, 2024",Data Points: Art team sells robot’s painting f...
1,https://www.deeplearning.ai/the-batch/swe-kit-...,SWE-Kit helps developers build their own assis...,"Nov 8, 2024",Data Points: SWE-Kit helps developers build th...
2,https://www.deeplearning.ai/the-batch/tensions...,Robots On the Loading Dock Tensions mount as a...,"Nov 06, 2024",Tensions Mount As Automation Transforms U.S. S...
3,https://www.deeplearning.ai/the-batch/social-m...,Social Media Bots and the Amplification Effect...,"Nov 06, 2024",Social Media Bots and the Amplification Effect...
4,https://www.deeplearning.ai/the-batch/openais-...,When Agents Train Algorithms OpenAI’s MLE-benc...,"Nov 06, 2024",OpenAI’s MLE-bench Tests AI Coding Agents Expl...
...,...,...,...,...
1649,https://www.deeplearning.ai/the-batch/europe-t...,Europe Tightens the Screws,"Apr 17, 2019",Europe Tightens the Screws | AI News & Insight...
1650,https://www.deeplearning.ai/the-batch/vcs-bet-...,VCs Bet on NLP,"Apr 17, 2019",VCs Bet on NLP | AI News & Insights Explore Co...
1651,https://www.deeplearning.ai/the-batch/drones-g...,Drones Go Commercial,"Apr 17, 2019",Drones Go Commercial | AI News & Insights Expl...
1652,https://www.deeplearning.ai/the-batch/automati...,Automatic Annotation,"Apr 17, 2019",Automatic Annotation | AI News & Insights Expl...


In [142]:
def extract_unformatted_content_from_link(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    article = soup.select_one('article')


    for data in article(['style', 'script', 'aside', 'footer', 'nav', 'header']) + soup.find_all("section", id="subscribe"):
        # Remove tags
        data.decompose()

    return article

new_content_list = []

# extract_unformatted_content_from_link(new_df.index[0])

In [143]:
for url in tqdm(new_df.index):
    new_content_list.append(extract_unformatted_content_from_link(url))

100%|██████████| 1654/1654 [17:05<00:00,  1.61it/s]


In [144]:
new_df['Content_HTML'] = new_content_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Content_HTML'] = new_content_list


In [145]:
new_df

Unnamed: 0_level_0,Title,Publication_Date,Content,Content_HTML
Url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://www.deeplearning.ai/the-batch/art-team-sells-robots-painting-for-1-1-million/,Art team sells robot’s painting for $1.1 milli...,"Nov 11, 2024",Data Points: Art team sells robot’s painting f...,"[[[<p>Twice a week, Data Points brings you the..."
https://www.deeplearning.ai/the-batch/swe-kit-helps-developers-build-their-own-assistants/,SWE-Kit helps developers build their own assis...,"Nov 8, 2024",Data Points: SWE-Kit helps developers build th...,"[[[<p>Twice a week, Data Points brings you the..."
https://www.deeplearning.ai/the-batch/tensions-mount-as-automation-transforms-u-s-shipping-port/,Robots On the Loading Dock Tensions mount as a...,"Nov 06, 2024",Tensions Mount As Automation Transforms U.S. S...,[[[<p>Shipping ports are the latest front in t...
https://www.deeplearning.ai/the-batch/social-media-bots-and-the-amplification-effect/,Social Media Bots and the Amplification Effect...,"Nov 06, 2024",Social Media Bots and the Amplification Effect...,"[[[<p>Dear friends,</p>, <p>Trump and the Repu..."
https://www.deeplearning.ai/the-batch/openais-mle-bench-tests-ai-coding-agents/,When Agents Train Algorithms OpenAI’s MLE-benc...,"Nov 06, 2024",OpenAI’s MLE-bench Tests AI Coding Agents Expl...,"[[[<p>Coding agents are improving, but can the..."
...,...,...,...,...
https://www.deeplearning.ai/the-batch/europe-tightens-the-screws/,Europe Tightens the Screws,"Apr 17, 2019",Europe Tightens the Screws | AI News & Insight...,[[[<p>The European Commission pulled ahead of ...
https://www.deeplearning.ai/the-batch/vcs-bet-on-nlp/,VCs Bet on NLP,"Apr 17, 2019",VCs Bet on NLP | AI News & Insights Explore Co...,[[[<p>Two startups specializing in NLP reporte...
https://www.deeplearning.ai/the-batch/drones-go-commercial/,Drones Go Commercial,"Apr 17, 2019",Drones Go Commercial | AI News & Insights Expl...,"[[[<p>Alphabet spin-out Wing <a href=""https://..."
https://www.deeplearning.ai/the-batch/automatic-annotation/,Automatic Annotation,"Apr 17, 2019",Automatic Annotation | AI News & Insights Expl...,[[[<p>A new tool promises to speed up the labo...


In [146]:
new_df[['Content', 'Content_HTML']]

Unnamed: 0_level_0,Content,Content_HTML
Url,Unnamed: 1_level_1,Unnamed: 2_level_1
https://www.deeplearning.ai/the-batch/art-team-sells-robots-painting-for-1-1-million/,Data Points: Art team sells robot’s painting f...,"[[[<p>Twice a week, Data Points brings you the..."
https://www.deeplearning.ai/the-batch/swe-kit-helps-developers-build-their-own-assistants/,Data Points: SWE-Kit helps developers build th...,"[[[<p>Twice a week, Data Points brings you the..."
https://www.deeplearning.ai/the-batch/tensions-mount-as-automation-transforms-u-s-shipping-port/,Tensions Mount As Automation Transforms U.S. S...,[[[<p>Shipping ports are the latest front in t...
https://www.deeplearning.ai/the-batch/social-media-bots-and-the-amplification-effect/,Social Media Bots and the Amplification Effect...,"[[[<p>Dear friends,</p>, <p>Trump and the Repu..."
https://www.deeplearning.ai/the-batch/openais-mle-bench-tests-ai-coding-agents/,OpenAI’s MLE-bench Tests AI Coding Agents Expl...,"[[[<p>Coding agents are improving, but can the..."
...,...,...
https://www.deeplearning.ai/the-batch/europe-tightens-the-screws/,Europe Tightens the Screws | AI News & Insight...,[[[<p>The European Commission pulled ahead of ...
https://www.deeplearning.ai/the-batch/vcs-bet-on-nlp/,VCs Bet on NLP | AI News & Insights Explore Co...,[[[<p>Two startups specializing in NLP reporte...
https://www.deeplearning.ai/the-batch/drones-go-commercial/,Drones Go Commercial | AI News & Insights Expl...,"[[[<p>Alphabet spin-out Wing <a href=""https://..."
https://www.deeplearning.ai/the-batch/automatic-annotation/,Automatic Annotation | AI News & Insights Expl...,[[[<p>A new tool promises to speed up the labo...


In [152]:
new_df['Content'] = new_df['Content_HTML']
new_df.drop(['Content_HTML'], axis=1, inplace=True)

In [153]:
new_df.to_csv('articles_html.csv')

In [154]:
pd.read_csv('articles_html.csv')

Unnamed: 0,Url,Title,Publication_Date,Content
0,https://www.deeplearning.ai/the-batch/art-team...,Art team sells robot’s painting for $1.1 milli...,"Nov 11, 2024","<article class=""pb-16 bg-white""><div class=""po..."
1,https://www.deeplearning.ai/the-batch/swe-kit-...,SWE-Kit helps developers build their own assis...,"Nov 8, 2024","<article class=""pb-16 bg-white""><div class=""po..."
2,https://www.deeplearning.ai/the-batch/tensions...,Robots On the Loading Dock Tensions mount as a...,"Nov 06, 2024","<article class=""pb-16 bg-white""><div class=""po..."
3,https://www.deeplearning.ai/the-batch/social-m...,Social Media Bots and the Amplification Effect...,"Nov 06, 2024","<article class=""pb-16 bg-white""><div class=""po..."
4,https://www.deeplearning.ai/the-batch/openais-...,When Agents Train Algorithms OpenAI’s MLE-benc...,"Nov 06, 2024","<article class=""pb-16 bg-white""><div class=""po..."
...,...,...,...,...
1649,https://www.deeplearning.ai/the-batch/europe-t...,Europe Tightens the Screws,"Apr 17, 2019","<article class=""pb-16 bg-white""><div class=""po..."
1650,https://www.deeplearning.ai/the-batch/vcs-bet-...,VCs Bet on NLP,"Apr 17, 2019","<article class=""pb-16 bg-white""><div class=""po..."
1651,https://www.deeplearning.ai/the-batch/drones-g...,Drones Go Commercial,"Apr 17, 2019","<article class=""pb-16 bg-white""><div class=""po..."
1652,https://www.deeplearning.ai/the-batch/automati...,Automatic Annotation,"Apr 17, 2019","<article class=""pb-16 bg-white""><div class=""po..."
