# Data parser

In [1]:
import ast
import json
import re
from urllib.parse import unquote

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Let displayed frames show up to 100 characters per cell before truncating.
pd.set_option('display.max_colwidth', 100)

# Single Articles

In [23]:
# Load the single-article spreadsheet and report its (rows, columns) shape.
input_filename = 'data/single_articles.xlsx'
raw_df = pd.read_excel(input_filename)
raw_df.shape

(1748, 6)

In [24]:
# Remove the two scraper bookkeeping columns and 'image-src', give the link
# column a readable name, then discard every row that has a missing value.
# Rows whose URL contains '/issue-' are kept: no filter is applied to them here.
df = (
    raw_df
    .drop(columns=['web-scraper-order', 'web-scraper-start-url', 'image-src'])
    .rename(columns={'thebatch_root-href': 'article_url'})
    .dropna()
)
print(df.shape)
df.head()

(1747, 3)


Unnamed: 0,article_url,text,image
0,https://www.deeplearning.ai/the-batch/the-robots-are-winning/,Two prominent economists cast doubt on rosy predictions that automation will create more jobs th...,[]
1,https://www.deeplearning.ai/the-batch/automatic-annotation/,A new tool promises to speed up the laborious process of annotating computer-vision training dat...,[]
2,https://www.deeplearning.ai/the-batch/drones-go-commercial/,"Alphabet spin-out Wing launched its consumer drone delivery service, opening doors for specialis...",[]
3,https://www.deeplearning.ai/the-batch/vcs-bet-on-nlp/,Two startups specializing in NLP reported new financing in the past week as the field heats up.W...,[]
4,https://www.deeplearning.ai/the-batch/europe-tightens-the-screws/,"The European Commission pulled ahead of the geopolitical pack, issuing guidelines for ethical de...",[]


In [25]:
# Check column dtypes and non-null counts after dropping incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1747 entries, 0 to 1746
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   article_url  1747 non-null   object
 1   text         1747 non-null   object
 2   image        1747 non-null   object
dtypes: object(3)
memory usage: 54.6+ KB


In [26]:
# Route any missing value through the '[]' placeholder, then turn every '[]'
# (the value shown in the image column of the first rows above) into None.
df = df.fillna('[]')
df = df.replace('[]', None)
# Swap non-breaking spaces (U+00A0) in the article text for ordinary spaces.
df['text'] = df['text'].str.replace('\xa0', ' ')
df.head(10)

Unnamed: 0,article_url,text,image
0,https://www.deeplearning.ai/the-batch/the-robots-are-winning/,Two prominent economists cast doubt on rosy predictions that automation will create more jobs th...,
1,https://www.deeplearning.ai/the-batch/automatic-annotation/,A new tool promises to speed up the laborious process of annotating computer-vision training dat...,
2,https://www.deeplearning.ai/the-batch/drones-go-commercial/,"Alphabet spin-out Wing launched its consumer drone delivery service, opening doors for specialis...",
3,https://www.deeplearning.ai/the-batch/vcs-bet-on-nlp/,Two startups specializing in NLP reported new financing in the past week as the field heats up.W...,
4,https://www.deeplearning.ai/the-batch/europe-tightens-the-screws/,"The European Commission pulled ahead of the geopolitical pack, issuing guidelines for ethical de...",
5,https://www.deeplearning.ai/the-batch/attack-of-the-robot-dogs/,"Boston Dynamics' robot dog is straining at the leash. In a new promotional video, a pack of the ...","[{""image"":"""",""image-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%..."
6,https://www.deeplearning.ai/the-batch/un-redacting-mueller/,Last week’s release of the redacted Mueller Report prompted calls to fill in the blanks using th...,"[{""image"":"""",""image-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%..."
7,https://www.deeplearning.ai/the-batch/transparency-for-training-data/,"AI is only as good as the data it trains on, but there’s no easy way to assess training data’s q...","[{""image"":"""",""image-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%..."
8,https://www.deeplearning.ai/the-batch/our-new-dota-playing-overlords/,"A software agent from OpenAI crushed human players of Defense of The Ancients 2, a multiplayer o...","[{""image"":"""",""image-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%..."
9,https://www.deeplearning.ai/the-batch/smile-as-you-board/,"U.S. authorities, in a bid to stop aliens from overstaying their visas, aim to apply face recogn...","[{""image"":"""",""image-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%..."


In [27]:
# Re-check non-null counts now that the '[]' placeholders have been replaced.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1747 entries, 0 to 1746
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   article_url  1747 non-null   object
 1   text         1747 non-null   object
 2   image        1740 non-null   object
dtypes: object(3)
memory usage: 54.6+ KB


In [28]:
def clean_image(image: str) -> 'list[str] | None':
    """Extract the original image URL from one scraped ``image`` cell.

    The cell is JSON text holding a list of objects. The ``image-src`` value of
    the first object is searched for a ``url=`` query parameter that carries an
    ``https://`` address, either plain or percent-encoded.

    Args:
        image: JSON text such as
            ``[{"image": "", "image-src": "/_next/image/?url=https%3A%2F%2F...&w=3840"}]``.
            ``None``, NaN or any other non-string value is treated as "no image".

    Returns:
        A one-element list with the fully percent-decoded URL, or ``None`` when
        there is no image, the list is empty, ``image-src`` is missing, or no
        ``url=https://...`` parameter is present.

    Raises:
        json.JSONDecodeError: if ``image`` is a string that is not valid JSON.
    """
    # Covers None as well as the float NaN pandas uses for missing cells.
    if not isinstance(image, str):
        return None

    entries = json.loads(image)
    first = entries[0] if isinstance(entries, list) and entries else None
    img_src = first.get('image-src') if isinstance(first, dict) else None
    if not isinstance(img_src, str):
        return None

    # Accept the scheme separator both plain ("://") and encoded ("%3A%2F%2F");
    # the parameter value ends at the next '&'.
    match_link = re.search(
        r'url=(https(?::|%3[Aa])(?:/|%2[Ff]){2}[^&]+)',
        img_src.strip('"'),
    )
    # unquote decodes every escape, not only %3A and %2F, so values such as
    # "%2520" come back as the "%20" the real URL contains.
    return [unquote(match_link.group(1))] if match_link else None

# Turn every raw image cell into a one-element URL list (or None).
df['image_cleaned'] = df['image'].apply(clean_image)
# Show raw and cleaned values side by side as a sanity check.
df[['image', 'image_cleaned']].head(10)

Unnamed: 0,image,image_cleaned
0,,
1,,
2,,
3,,
4,,
5,"[{""image"":"""",""image-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%...",[https://dl-staging-website.ghost.io/content/images/2022/10/6b8f5b65-0b7e-495c-be3c-3613671651ed...
6,"[{""image"":"""",""image-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%...",[https://dl-staging-website.ghost.io/content/images/2022/10/6a7a5517-58c9-490a-a603-722e62484ad1...
7,"[{""image"":"""",""image-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%...",[https://dl-staging-website.ghost.io/content/images/2022/10/afe604b0-915a-4e9d-a0c6-5a728f219cc3...
8,"[{""image"":"""",""image-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%...",[https://dl-staging-website.ghost.io/content/images/2022/10/60ae2339-9012-4631-82d5-c99f911dacd5...
9,"[{""image"":"""",""image-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost.io%2Fcontent%...",[https://dl-staging-website.ghost.io/content/images/2022/10/50e3df2e-dbe2-4a2b-ab4b-623ace68e880...


In [29]:
# Compare the non-null counts of the raw and the cleaned image columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1747 entries, 0 to 1746
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   article_url    1747 non-null   object
 1   text           1747 non-null   object
 2   image          1740 non-null   object
 3   image_cleaned  1643 non-null   object
dtypes: object(4)
memory usage: 68.2+ KB


In [30]:
# The raw JSON column is no longer needed: the cleaned URLs take over its name.
df = df.drop(columns="image").rename(columns={"image_cleaned": "image"})
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1747 entries, 0 to 1746
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   article_url  1747 non-null   object
 1   text         1747 non-null   object
 2   image        1643 non-null   object
dtypes: object(3)
memory usage: 54.6+ KB


In [31]:
# Inspect the cleaned image column.
df.image

0                                                                                                      None
1                                                                                                      None
2                                                                                                      None
3                                                                                                      None
4                                                                                                      None
                                                       ...                                                 
1742                              [https://dl-staging-website.ghost.io/content/images/2025/02/UITARS-1.png]
1743                      [https://dl-staging-website.ghost.io/content/images/2025/02/FLASH2THINKING-1.png]
1744                               [https://dl-staging-website.ghost.io/content/images/2025/02/MOSHI-1.gif]
1745                        

### Write to CSV

In [32]:
# Save the cleaned single-article table without the index column.
# `image` holds Python lists (or None), so each list is written as its text form.
df[['article_url', 'text', 'image']].to_csv('data/single_articles_cleaned.csv',index=False)

# Weekly Articles

In [39]:
# Load the weekly-issue spreadsheet; this rebinds the `input_filename` name
# used earlier for the single-article file.
input_filename = 'data/weekly_articles.xlsx'
raw_df_weekly = pd.read_excel(input_filename)

In [40]:
# Remove the scraper bookkeeping columns, rename the link column and mark the
# `text` column as raw HTML, then discard rows with any missing value.
df_weekly = (
    raw_df_weekly
    .drop(columns=['web-scraper-order', 'web-scraper-start-url'])
    .rename(columns={'thebatch_root-href': 'article_url', 'text': 'raw_html'})
    .dropna()
)
print(df_weekly.shape)
df_weekly.head()

(302, 2)


Unnamed: 0,article_url,raw_html
0,https://www.deeplearning.ai/the-batch/issue-i/,"<p><em>Dear friends,</em></p><p><em>We're busily wrapping up the Machine Learning Yearning book...."
1,https://www.deeplearning.ai/the-batch/issue-ii/,"<p><em>Dear friends,</em><br><br><em>I spent my birthday last week thinking about how AI can be ..."
2,https://www.deeplearning.ai/the-batch/issue-iii/,"<p><em>Dear friends,</em><br><br><em>On Monday, I delivered a keynote via teleconference for Dub..."
3,https://www.deeplearning.ai/the-batch/issue-iv/,"<p><em>Dear friends,</em><br><br><em>A first for my three-month-old daughter Nova: an outing to ..."
4,https://www.deeplearning.ai/the-batch/issue-v/,"<p><em>Dear friends,</em></p><p><em>I’ve been thinking a lot about ""small data."" If you have an ..."


In [48]:
def extract_articles(html: str):
    """Split one issue's HTML into its individual articles.

    The markup is cut at every literal ``<hr>``. Each piece is parsed on its
    own and reduced to its visible text plus the ``src`` of every ``<img>`` tag
    that has one.

    Args:
        html: HTML of a whole issue.

    Returns:
        A list with one ``{'text': str, 'image': list}`` dict per piece, in
        document order. Pieces without text or images are still included.
    """
    def parse_fragment(fragment):
        # Parse a single <hr>-delimited piece into its text and image sources.
        soup = BeautifulSoup(fragment, 'html.parser')
        sources = []
        for tag in soup.find_all('img'):
            if tag.has_attr('src'):
                sources.append(tag['src'])
        return {
            'text': soup.get_text(separator=' ', strip=True),
            'image': sources,
        }

    return [parse_fragment(fragment) for fragment in html.split('<hr>')]

# Parse every issue into its list of article dicts (one list per row).
df_weekly['articles'] = df_weekly.raw_html.apply(extract_articles)
df_weekly['articles'].head()

0    [{'text': 'Dear friends, We're busily wrapping up the Machine Learning Yearning book. Meanwhile,...
1    [{'text': 'Dear friends, I spent my birthday last week thinking about how AI can be used to addr...
2    [{'text': 'Dear friends, On Monday, I delivered a keynote via teleconference for Dubai's AI Ever...
3    [{'text': 'Dear friends, A first for my three-month-old daughter Nova: an outing to the park. As...
4    [{'text': 'Dear friends, I’ve been thinking a lot about "small data." If you have an image class...
Name: articles, dtype: object

In [49]:
# One row per article: drop the raw HTML, expand each issue's article list and
# renumber the rows from 0 without keeping the old index.
df_exploded = (
    df_weekly
    .drop(columns='raw_html')
    .explode('articles')
    .reset_index(drop=True)
)
df_exploded.head()

Unnamed: 0,article_url,articles
0,https://www.deeplearning.ai/the-batch/issue-i/,"{'text': 'Dear friends, We're busily wrapping up the Machine Learning Yearning book. Meanwhile, ..."
1,https://www.deeplearning.ai/the-batch/issue-i/,{'text': 'News The Robots are Winning Two prominent economists cast doubt on rosy predictions th...
2,https://www.deeplearning.ai/the-batch/issue-i/,{'text': 'Automatic Annotation A new tool promises to speed up the laborious process of annotati...
3,https://www.deeplearning.ai/the-batch/issue-i/,{'text': 'Drones Go Commercial Alphabet spin-out Wing launched its consumer drone delivery servi...
4,https://www.deeplearning.ai/the-batch/issue-i/,{'text': 'A MESSAGE FROM DEEPLEARNING.AI Want to master Tensor Flow? Check out our new Tensor Fl...


In [53]:
# Expand each article dict into its own `text` and `image` columns.
df_normalized = pd.json_normalize(df_exploded['articles'])
#df_normalized.image = df_normalized.image.str.strip('"')
df_normalized.head()

Unnamed: 0,text,image
0,"Dear friends, We're busily wrapping up the Machine Learning Yearning book. Meanwhile, we'd like ...",[https://dl-staging-website.ghost.io/content/images/2022/09/dfbcdc70-a9a2-4967-95b8-6866a6a0a6bf...
1,News The Robots are Winning Two prominent economists cast doubt on rosy predictions that automat...,[]
2,Automatic Annotation A new tool promises to speed up the laborious process of annotating compute...,[]
3,"Drones Go Commercial Alphabet spin-out Wing launched its consumer drone delivery service, openin...",[]
4,A MESSAGE FROM DEEPLEARNING.AI Want to master Tensor Flow? Check out our new Tensor Flow Special...,[https://dl-staging-website.ghost.io/content/images/2022/09/Sin-t-tulo.png]


In [55]:
# Rebuild the `text` / `image` columns and attach them to the issue URLs.
# The column-wise concat relies on both frames having the same default
# integer index.
df_normalized = pd.json_normalize(df_exploded['articles'])
full_df = pd.concat([df_exploded.drop(columns=['articles']), df_normalized], axis=1)
# Keep only the rows that actually carry text.
full_df = full_df.loc[full_df['text'].str.len() > 0]
full_df.head()

Unnamed: 0,article_url,text,image
0,https://www.deeplearning.ai/the-batch/issue-i/,"Dear friends, We're busily wrapping up the Machine Learning Yearning book. Meanwhile, we'd like ...",[https://dl-staging-website.ghost.io/content/images/2022/09/dfbcdc70-a9a2-4967-95b8-6866a6a0a6bf...
1,https://www.deeplearning.ai/the-batch/issue-i/,News The Robots are Winning Two prominent economists cast doubt on rosy predictions that automat...,[]
2,https://www.deeplearning.ai/the-batch/issue-i/,Automatic Annotation A new tool promises to speed up the laborious process of annotating compute...,[]
3,https://www.deeplearning.ai/the-batch/issue-i/,"Drones Go Commercial Alphabet spin-out Wing launched its consumer drone delivery service, openin...",[]
4,https://www.deeplearning.ai/the-batch/issue-i/,A MESSAGE FROM DEEPLEARNING.AI Want to master Tensor Flow? Check out our new Tensor Flow Special...,[https://dl-staging-website.ghost.io/content/images/2022/09/Sin-t-tulo.png]


### Write to CSV

In [58]:
# Save the per-article rows extracted from the weekly issues, without the index.
# `image` holds Python lists, so each list is written as its text form.
full_df[['article_url', 'text', 'image']].to_csv('data/weekly_articles_cleaned.csv',index=False)

## Join Single and Weekly Articles

In [66]:
# Stack the single-article rows on top of the weekly-issue rows and renumber
# the result from 0. `image` is None or a one-element list for rows from `df`
# and a (possibly empty) list for rows from `full_df`.
all_articles_df = pd.concat([df, full_df], ignore_index=True)
all_articles_df.head(20)

Unnamed: 0,article_url,text,image
0,https://www.deeplearning.ai/the-batch/the-robots-are-winning/,Two prominent economists cast doubt on rosy predictions that automation will create more jobs th...,
1,https://www.deeplearning.ai/the-batch/automatic-annotation/,A new tool promises to speed up the laborious process of annotating computer-vision training dat...,
2,https://www.deeplearning.ai/the-batch/drones-go-commercial/,"Alphabet spin-out Wing launched its consumer drone delivery service, opening doors for specialis...",
3,https://www.deeplearning.ai/the-batch/vcs-bet-on-nlp/,Two startups specializing in NLP reported new financing in the past week as the field heats up.W...,
4,https://www.deeplearning.ai/the-batch/europe-tightens-the-screws/,"The European Commission pulled ahead of the geopolitical pack, issuing guidelines for ethical de...",
5,https://www.deeplearning.ai/the-batch/attack-of-the-robot-dogs/,"Boston Dynamics' robot dog is straining at the leash. In a new promotional video, a pack of the ...",[https://dl-staging-website.ghost.io/content/images/2022/10/6b8f5b65-0b7e-495c-be3c-3613671651ed...
6,https://www.deeplearning.ai/the-batch/un-redacting-mueller/,Last week’s release of the redacted Mueller Report prompted calls to fill in the blanks using th...,[https://dl-staging-website.ghost.io/content/images/2022/10/6a7a5517-58c9-490a-a603-722e62484ad1...
7,https://www.deeplearning.ai/the-batch/transparency-for-training-data/,"AI is only as good as the data it trains on, but there’s no easy way to assess training data’s q...",[https://dl-staging-website.ghost.io/content/images/2022/10/afe604b0-915a-4e9d-a0c6-5a728f219cc3...
8,https://www.deeplearning.ai/the-batch/our-new-dota-playing-overlords/,"A software agent from OpenAI crushed human players of Defense of The Ancients 2, a multiplayer o...",[https://dl-staging-website.ghost.io/content/images/2022/10/60ae2339-9012-4631-82d5-c99f911dacd5...
9,https://www.deeplearning.ai/the-batch/smile-as-you-board/,"U.S. authorities, in a bid to stop aliens from overstaying their visas, aim to apply face recogn...",[https://dl-staging-website.ghost.io/content/images/2022/10/50e3df2e-dbe2-4a2b-ab4b-623ace68e880...


In [67]:
# Location of the combined dataset; the download section reads it back from here.
all_articles_filename = 'data/all_articles.csv'

# Save the combined table without the index column.
all_articles_df[['article_url', 'text', 'image']].to_csv(all_articles_filename, index=False)

## Download Images

In [61]:
# Reload the combined dataset from disk, replacing the in-memory frame.
# After the round trip `image` has dtype object and is parsed as text below.
all_articles_df = pd.read_csv(all_articles_filename)
all_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3549 entries, 0 to 3548
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   article_url  3549 non-null   object
 1   text         3549 non-null   object
 2   image        3445 non-null   object
dtypes: object(3)
memory usage: 83.3+ KB


In [63]:
def _parse_image_list(raw):
    """Turn one CSV ``image`` cell back into a list of URL strings.

    The cell is expected to be the text form of a Python list, e.g.
    ``"['https://a/x.png', 'https://a/y.png']"``. Parsing it as a literal keeps
    URLs that contain commas intact and leaves no stray spaces or quotes on the
    items, which a plain ``split(',')`` does not guarantee.

    Args:
        raw: the cell value. Non-string values (NaN for "no image", or a list
            when this cell is run a second time) are returned unchanged.

    Returns:
        A list of whitespace-trimmed URL strings, or ``raw`` itself when it is
        not a string.
    """
    if not isinstance(raw, str):
        return raw
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(url).strip() for url in parsed]
    # Not a list literal: fall back to a comma split, trimming the spaces and
    # quotes that the split leaves around each item.
    return [part.strip().strip("'\"") for part in raw.strip('"').strip('[]').split(',')]


all_articles_df['image'] = all_articles_df['image'].apply(_parse_image_list)
# One row per URL; empty lists and missing cells explode to NaN and are dropped.
# The length check discards empty or fragmentary strings.
all_image_urls = {
    url
    for url in all_articles_df.explode('image')['image'].dropna().unique()
    if len(url) > 5
}
len(all_image_urls)

3642

In [64]:
# Map a flat name to every URL. The name is the URL without its first 51
# characters (the length of 'https://dl-staging-website.ghost.io/content/images/'),
# with '/' and '.' replaced by '_'.
images = {
    url[51:].replace('/', '_').replace('.', '_'): url
    for url in all_image_urls
    if url is not None
}
len(images)

3642

In [65]:
import json


# Save the name -> URL mapping as JSON.
with open('data/images_dataset.json', 'w') as file:
    file.write(json.dumps(images))

In [110]:
import os
import io
from PIL import Image
import requests

def save_images_to_local(dataset, output_folder="data/images/", size=(448, 448), timeout=30):
    """Download every image in ``dataset`` and store it as a resized PNG.

    Args:
        dataset: mapping of image name -> image URL.
        output_folder: directory for the PNG files; created when missing.
        size: ``(width, height)`` every image is resized to. The aspect ratio
            is not preserved.
        timeout: seconds to wait for the server before giving up on a single
            download, so one stalled connection cannot block the whole run.

    Returns:
        None. One line is printed per image. A failing image is reported with
        its name and URL and never aborts the remaining downloads.
    """
    os.makedirs(output_folder, exist_ok=True)

    for image_name, image_url in dataset.items():
        try:
            response = requests.get(image_url, timeout=timeout)
            # Reject 4xx/5xx responses instead of handing an error page to PIL.
            response.raise_for_status()

            image = Image.open(io.BytesIO(response.content))
            image = image.resize(size)

            output_path = os.path.join(output_folder, f"{image_name}.png")

            image.save(output_path, format='PNG')
            print(f"Image saved in: {output_path}")
        except Exception as e:
            # Best effort: report which image failed and carry on with the rest.
            print(f"Failed to save {image_name!r} from {image_url!r}: {e}")

# Download, resize and save every collected image; prints one line per image.
save_images_to_local(images)

Image saved in: data/images/2024_02_PRUNING--1--1_gif.png
Image saved in: data/images/2022_09_592a103a-d610-4433-b936-332fab9d7cdd_png.png
Image saved in: data/images/ploads_2021_01_DoubleDescent20ASPECT_png.png
Image saved in: data/images/2022_10_34bc1679-bbb1-411f-9e6a-c1674e4313cf--1-_gif.png
Image saved in: data/images/2024_10_unnamed--17--1_gif.png
Image saved in: data/images/2022_04_DEEPFAKE--1-_gif.png
Image saved in: data/images/2022_04_ACCIDENTS--1--2_gif.png
Image saved in: data/images/2022_09_aeb0f673-aaa9-45cd-a6c8-933b92ebd1ec_jpg.png
Image saved in: data/images/ploads_2021_01_Fawkes20ASPECT_png.png
Image saved in: data/images/2023_06_BENGIO-1_png.png
Image saved in: data/images/2022_10_ffd6f4c4-a3e6-43dc-b6ad-c6c340b14495--1--1_png.png
Image saved in: data/images/_gif?upscale=true&width=1200&upscale=true&name=INTERNET_gif.png
Image saved in: data/images/2023_12_AI-fuels-innovations-in-Pennsylvania-s-infrastructure_jpg.png
Image saved in: data/images/2021_08_Course-Name-Ba