# Data parser

In [1]:
import json
import re
from typing import List, Optional
from urllib.parse import parse_qs, urlsplit

import pandas as pd

In [113]:
# Show up to 100 characters per cell so long URLs / article text stay readable.
pd.options.display.max_colwidth = 100

# Single articles

In [114]:
# Scraper export for the single-article pages; `raw_df` is kept untouched so the
# cleaning cell below can be re-run from it.
input_filename = 'data/single_articles.xlsx'
raw_df = pd.read_excel(io=input_filename)

In [178]:
# One pass from the raw export to the working frame:
#   1. drop scraper bookkeeping and the image columns that are not used here,
#   2. give the link column a readable name,
#   3. keep only single-article pages (URLs containing '/issue-' are excluded),
#   4. drop every row that still has a missing value in any remaining column.
df = (
    raw_df
    .drop(columns=[
        'web-scraper-order',
        'web-scraper-start-url',
        'image_header-src',
        'image_content',
        'image_content-src',
    ])
    .rename(columns={'thebatch_root-href': 'article_url'})
    .loc[lambda frame: ~frame['article_url'].str.contains('/issue-')]
    .dropna()
)
print(df.shape)
df.head()

(1747, 3)


Unnamed: 0,article_url,text,image_header
1,https://www.deeplearning.ai/the-batch/the-robots-are-winning/,Two prominent economists cast doubt on rosy predictions that automation will create more jobs th...,[]
2,https://www.deeplearning.ai/the-batch/automatic-annotation/,A new tool promises to speed up the laborious process of annotating computer-vision training dat...,[]
3,https://www.deeplearning.ai/the-batch/drones-go-commercial/,"Alphabet spin-out Wing launched its consumer drone delivery service, opening doors for specialis...",[]
4,https://www.deeplearning.ai/the-batch/vcs-bet-on-nlp/,Two startups specializing in NLP reported new financing in the past week as the field heats up.W...,[]
5,https://www.deeplearning.ai/the-batch/europe-tightens-the-screws/,"The European Commission pulled ahead of the geopolitical pack, issuing guidelines for ethical de...",[]


In [179]:
# Column dtypes and non-null counts after the drop / filter / dropna step.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1747 entries, 1 to 2048
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article_url   1747 non-null   object
 1   text          1747 non-null   object
 2   image_header  1747 non-null   object
dtypes: object(3)
memory usage: 54.6+ KB


In [180]:
# Collapse both "missing" and the literal empty-list marker '[]' into None:
# missing cells are first filled with '[]', then every '[]' becomes None.
df = df.fillna('[]')
df = df.replace('[]', None)
# Swap non-breaking spaces in the article text for ordinary spaces.
df['text'] = df['text'].str.replace('\xa0', ' ')
df.head(10)

Unnamed: 0,article_url,text,image_header
1,https://www.deeplearning.ai/the-batch/the-robots-are-winning/,Two prominent economists cast doubt on rosy predictions that automation will create more jobs th...,
2,https://www.deeplearning.ai/the-batch/automatic-annotation/,A new tool promises to speed up the laborious process of annotating computer-vision training dat...,
3,https://www.deeplearning.ai/the-batch/drones-go-commercial/,"Alphabet spin-out Wing launched its consumer drone delivery service, opening doors for specialis...",
4,https://www.deeplearning.ai/the-batch/vcs-bet-on-nlp/,Two startups specializing in NLP reported new financing in the past week as the field heats up.W...,
5,https://www.deeplearning.ai/the-batch/europe-tightens-the-screws/,"The European Commission pulled ahead of the geopolitical pack, issuing guidelines for ethical de...",
6,https://www.deeplearning.ai/the-batch/attack-of-the-robot-dogs/,"Boston Dynamics' robot dog is straining at the leash. In a new promotional video, a pack of the ...","[{""image_header"":"""",""image_header-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost..."
7,https://www.deeplearning.ai/the-batch/un-redacting-mueller/,Last week’s release of the redacted Mueller Report prompted calls to fill in the blanks using th...,"[{""image_header"":"""",""image_header-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost..."
8,https://www.deeplearning.ai/the-batch/transparency-for-training-data/,"AI is only as good as the data it trains on, but there’s no easy way to assess training data’s q...","[{""image_header"":"""",""image_header-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost..."
9,https://www.deeplearning.ai/the-batch/our-new-dota-playing-overlords/,"A software agent from OpenAI crushed human players of Defense of The Ancients 2, a multiplayer o...","[{""image_header"":"""",""image_header-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost..."
10,https://www.deeplearning.ai/the-batch/smile-as-you-board/,"U.S. authorities, in a bid to stop aliens from overstaying their visas, aim to apply face recogn...","[{""image_header"":"""",""image_header-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost..."


In [153]:
# NOTE(review): the output recorded under this cell (RangeIndex of 2050 rows, 6 columns
# including image_content / image_content_cleaned) does not match the `df` built above;
# it looks like a leftover from an earlier kernel state (execution count 153).
# Re-run after Restart & Run All to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2050 entries, 0 to 2049
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   article_url            2050 non-null   object
 1   text                   2049 non-null   object
 2   image_header           1740 non-null   object
 3   image_content          309 non-null    object
 4   image_header_cleaned   1643 non-null   object
 5   image_content_cleaned  309 non-null    object
dtypes: object(6)
memory usage: 96.2+ KB


In [182]:
def clean_image_header(image_header: Optional[str]) -> Optional[str]:
    """Extract the original https image URL from a scraped ``image_header`` cell.

    The cell is JSON text holding a list of objects; the first object's
    ``'image_header-src'`` value is a link whose ``url`` query parameter
    carries the percent-encoded address of the real image.

    Args:
        image_header: JSON text from the ``image_header`` column, or a
            missing value (``None`` / NaN).

    Returns:
        The decoded ``https://`` URL, or ``None`` when the cell is missing,
        the JSON list is empty, the first entry has no ``'image_header-src'``,
        or no https ``url`` parameter is present.

    Raises:
        json.JSONDecodeError: if ``image_header`` is a string but not valid JSON.
    """
    # Missing cells arrive as None or as a float NaN; neither can be parsed.
    if not isinstance(image_header, str):
        return None

    entries = json.loads(image_header)
    if not entries:
        return None

    img_src = entries[0].get('image_header-src') or ''
    # parse_qs splits the query on '&' and fully percent-decodes each value,
    # so every escape is handled (not just %3A and %2F).
    for candidate in parse_qs(urlsplit(img_src).query).get('url', []):
        if candidate.startswith('https://'):
            return candidate
    return None

# Element-wise conversion of the raw JSON cell into a plain image URL (or None).
df['image_header_cleaned'] = df['image_header'].map(clean_image_header)
df['image_header_cleaned'].head(10)

1                                                                                                    None
2                                                                                                    None
3                                                                                                    None
4                                                                                                    None
5                                                                                                    None
6     https://dl-staging-website.ghost.io/content/images/2022/10/6b8f5b65-0b7e-495c-be3c-3613671651ed-...
7     https://dl-staging-website.ghost.io/content/images/2022/10/6a7a5517-58c9-490a-a603-722e62484ad1-...
8     https://dl-staging-website.ghost.io/content/images/2022/10/afe604b0-915a-4e9d-a0c6-5a728f219cc3-...
9     https://dl-staging-website.ghost.io/content/images/2022/10/60ae2339-9012-4631-82d5-c99f911dacd5-...
10    https://dl-staging-website.ghost.io/cont

In [184]:
# Non-null counts show how many rows ended up with a value in image_header_cleaned.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1747 entries, 1 to 2048
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   article_url           1747 non-null   object
 1   text                  1747 non-null   object
 2   image_header          1740 non-null   object
 3   image_header_cleaned  1643 non-null   object
dtypes: object(4)
memory usage: 68.2+ KB


### Write to CSV

In [185]:
# Export only the cleaned columns; the row index is not written.
df.to_csv(
    'data/single_articles.csv',
    columns=['article_url', 'text', 'image_header_cleaned'],
    index=False,
)

# Weekly Issues

In [188]:
# Scraper export for the weekly issues.
# NOTE(review): this rebinds `input_filename` and `raw_df`, which held the
# single-articles export earlier in the notebook.
input_filename = 'data/weekly_articles.xlsx'
raw_df = pd.read_excel(io=input_filename)

In [191]:
# Drop scraper bookkeeping, rename the link column, and discard rows with any
# missing value. Note that `df` now refers to the weekly issues.
df = (
    raw_df
    .drop(columns=['web-scraper-order', 'web-scraper-start-url'])
    .rename(columns={'thebatch_root-href': 'article_url'})
    .dropna()
)
print(df.shape)
df.head()

(302, 2)


Unnamed: 0,article_url,text
0,https://www.deeplearning.ai/the-batch/issue-i/,"<p><em>Dear friends,</em></p><p><em>We're busily wrapping up the Machine Learning Yearning book...."
1,https://www.deeplearning.ai/the-batch/issue-ii/,"<p><em>Dear friends,</em><br><br><em>I spent my birthday last week thinking about how AI can be ..."
2,https://www.deeplearning.ai/the-batch/issue-iii/,"<p><em>Dear friends,</em><br><br><em>On Monday, I delivered a keynote via teleconference for Dub..."
3,https://www.deeplearning.ai/the-batch/issue-iv/,"<p><em>Dear friends,</em><br><br><em>A first for my three-month-old daughter Nova: an outing to ..."
4,https://www.deeplearning.ai/the-batch/issue-v/,"<p><em>Dear friends,</em></p><p><em>I’ve been thinking a lot about ""small data."" If you have an ..."


In [195]:
# Keep the first row's `text` value at hand for ad-hoc inspection.
temp_text = df['text'].iloc[0]

In [None]:
from bs4 import BeautifulSoup

def extract_articles(html):
    """Split an HTML string into articles at ``<hr>`` tags.

    Each fragment between horizontal rules is parsed on its own, so the
    input is never parsed as a whole document.

    Args:
        html: HTML markup as a string.

    Returns:
        A list with one dict per fragment, in document order:
        ``'text'`` is the fragment's visible text with pieces joined by a
        single space, and ``'images'`` is the list of ``src`` values of the
        fragment's ``<img>`` tags (tags without ``src`` are skipped).
    """
    # Accept every spelling of the separator: <hr>, <hr/>, <hr /> and
    # <hr class="...">, in any letter case. \b keeps tags such as <hrx> intact.
    raw_articles = re.split(r'<hr\b[^>]*>', html, flags=re.IGNORECASE)

    articles = []
    for raw_article in raw_articles:
        article_soup = BeautifulSoup(raw_article, 'html.parser')
        articles.append({
            'text': article_soup.get_text(separator=' ', strip=True),
            'images': [
                img['src']
                for img in article_soup.find_all('img')
                if 'src' in img.attrs
            ],
        })

    return articles

# Demo on a tiny hand-written snippet: two fragments separated by one <hr>.
html_content = """<p><em>Dear friends,</em></p>...<hr><h2 id="automatic-annotation">Automatic Annotation</h2>..."""
articles = extract_articles(html_content)

for number, article in enumerate(articles, start=1):
    print(f"Article {number}:")
    print("Text:", article['text'])
    print("Images:", article['images'])
    print("-" * 40)


In [194]:
# Same normalisation as for the single articles: missing cells and the literal
# '[]' both end up as None, and non-breaking spaces in `text` become spaces.
df = df.fillna('[]')
df = df.replace('[]', None)
df['text'] = df['text'].str.replace('\xa0', ' ')
df.head(10)

Unnamed: 0,article_url,text
0,https://www.deeplearning.ai/the-batch/issue-i/,"<p><em>Dear friends,</em></p><p><em>We're busily wrapping up the Machine Learning Yearning book...."
1,https://www.deeplearning.ai/the-batch/issue-ii/,"<p><em>Dear friends,</em><br><br><em>I spent my birthday last week thinking about how AI can be ..."
2,https://www.deeplearning.ai/the-batch/issue-iii/,"<p><em>Dear friends,</em><br><br><em>On Monday, I delivered a keynote via teleconference for Dub..."
3,https://www.deeplearning.ai/the-batch/issue-iv/,"<p><em>Dear friends,</em><br><br><em>A first for my three-month-old daughter Nova: an outing to ..."
4,https://www.deeplearning.ai/the-batch/issue-v/,"<p><em>Dear friends,</em></p><p><em>I’ve been thinking a lot about ""small data."" If you have an ..."
5,https://www.deeplearning.ai/the-batch/issue-vi/,"<p><em>Dear friends,</em></p><p><em>So many people who are just starting out in machine learning..."
6,https://www.deeplearning.ai/the-batch/issue-vii/,"<p><em>Dear friends,</em><br><br><em>In March, I announced our Pie &amp; AI series of meetups. S..."
7,https://www.deeplearning.ai/the-batch/issue-viii/,"<p><em>Dear friends,</em><br><br><em>Healthcare is one of many sectors being transformed by AI. ..."
8,https://www.deeplearning.ai/the-batch/issue-ix/,"<p><em>Dear friends,</em><br><br><em>I spoke last week at re:MARS, Amazon's conference focusing ..."
9,https://www.deeplearning.ai/the-batch/issue-x/,"<p><em>Dear friends,</em><br><br><em>Last Friday, I attended the International Conference on Mac..."


In [None]:
def clean_image_content(image_content: Optional[str]) -> Optional[List[str]]:
    """Collect the ``'image_content-src'`` value of every entry in a JSON cell.

    Args:
        image_content: JSON text holding a list of objects, or a missing
            value (``None`` / NaN).

    Returns:
        A list with one ``'image_content-src'`` value per entry (empty for an
        empty JSON list), or ``None`` when the cell is missing.

    Raises:
        json.JSONDecodeError: if ``image_content`` is a string but not valid JSON.
        KeyError: if an entry has no ``'image_content-src'`` key.
    """
    # Missing cells arrive as None or as a float NaN; json.loads rejects both.
    if not isinstance(image_content, str):
        return None
    return [el['image_content-src'] for el in json.loads(image_content)]

# NOTE(review): neither `df` built in this notebook has an `image_content`
# column (the single-articles frame drops it, the weekly frame only has
# article_url and text), so this cell raises KeyError as written. It probably
# needs to start from the raw single-articles export — confirm the intent.
df['image_content_cleaned'] = df['image_content'].map(clean_image_content)
df['image_content_cleaned'].head(10)

In [None]:
# NOTE(review): this repeats the export from the "Write to CSV" section. In a
# top-to-bottom run `df` is the weekly frame here, which has no
# `image_header_cleaned` column — confirm whether this cell should be removed.
df.to_csv(
    'data/single_articles.csv',
    columns=['article_url', 'text', 'image_header_cleaned'],
    index=False,
)

## Download Images

In [186]:
# Distinct header-image URLs; a missing value, if present, is kept as one entry.
# NOTE(review): this needs the single-articles `df` (execution count 186 shows it
# ran before the weekly cells); in a top-to-bottom run `df` is the weekly frame.
all_distinct_image_header_urls = set(pd.unique(df['image_header_cleaned']))
len(all_distinct_image_header_urls)


1644

In [151]:
# Map a flat file-name stem to each image URL, skipping the None placeholder.
# The slice drops the first 51 characters, which is the length of
# 'https://dl-staging-website.ghost.io/content/images/' as seen in the sample
# URLs above; the remaining 'YYYY/MM/name' path is flattened with underscores.
images = {
    url[51:].replace('/', '_'): url
    for url in all_distinct_image_header_urls
    if url is not None
}
len(images)

1643

In [137]:
import os
import io
from PIL import Image
import requests

def save_images_to_local(dataset, output_folder="data/images/", size=(448, 448), timeout=30):
    """Download every image in ``dataset``, resize it and store it as PNG.

    Failures are reported and skipped so that one bad URL does not abort the
    whole batch.

    Args:
        dataset: Mapping of image name -> image URL. The name becomes part of
            the output file name ``image_<name>.png``.
        output_folder: Directory for the PNG files; created if it is missing.
        size: ``(width, height)`` every image is resized to.
        timeout: Seconds to wait for the server before giving up on a URL.

    Returns:
        None. Progress and failures are printed.
    """
    os.makedirs(output_folder, exist_ok=True)

    for image_name, image_url in dataset.items():
        output_path = os.path.join(output_folder, f"image_{image_name}.png")
        try:
            # Without a timeout a stalled connection would block the loop forever.
            response = requests.get(image_url, timeout=timeout)
            # Surface HTTP errors directly instead of handing an error page to PIL.
            response.raise_for_status()

            # The context manager releases the decoded image once it is saved.
            with Image.open(io.BytesIO(response.content)) as image:
                image.resize(size).save(output_path, format='PNG')

            print(f"Image saved in: {output_path}")
        except Exception as exc:
            # Best effort: report which URL failed and carry on with the rest.
            print(f"Failed to save {image_url}: {exc}")

# Download all collected header images into the function's default output folder.
save_images_to_local(dataset=images)

Image saved in: data/images/image_2024_11_unnamed--36-.jpg.png
Image saved in: data/images/image_2022_10_6ece12b2-8790-4ff2-bebe-2a7222de2be8--1-.png.png
Image saved in: data/images/image_2021_08_Retail-Surveillance-Revealed-1.gif.png
Image saved in: data/images/image_2022_09_LCLOUD_Slides_Revise_092822-1.gif.png
Image saved in: data/images/image_2023_04_Screen-Shot-2023-04-04-at-5.19.38-PM-2.png.png
Image saved in: data/images/image_2022_09_covid.gif.png
Image saved in: data/images/image_2021_08_AI-Trusths-AI-Falshoods.png.png
Image saved in: data/images/image_2021_06_europe_revised.gif.png
Image saved in: data/images/image_2024_02_unnamed---2024-01-31T154457.554.gif.png
Image saved in: data/images/image_2021_08_Tracking-the-Elusive-Stop-Sign-1.gif.png
Image saved in: data/images/image_2023_03_unnamed--44--1.gif.png
Image saved in: data/images/image_2021_07_Face-Recognition-Meets-Resistance-1.png.png
Image saved in: data/images/image_2024_04_unnamed---2024-04-24T134335.199-1.gif.png
I