# Data parser

In [1]:
import json
import re
from typing import Optional
from urllib.parse import parse_qs, urlsplit

import pandas as pd
from bs4 import BeautifulSoup

In [113]:
# Show up to 100 characters per cell so URLs and article text stay readable in previews.
pd.options.display.max_colwidth = 100

# Single articles

In [114]:
# Load the single-articles spreadsheet into `raw_df`.
input_filename = 'data/single_articles.xlsx'
raw_df = pd.read_excel(io=input_filename)

In [178]:
# Drop the scraper bookkeeping columns and the image columns that are not used below.
df = raw_df.drop(columns=[
    'web-scraper-order',
    'web-scraper-start-url',
    'image_header-src',
    'image_content',
    'image_content-src'])
df = df.rename(columns={'thebatch_root-href': 'article_url'})
# Keep only URLs without '/issue-'. `na=False` makes a missing URL count as
# "no match" instead of producing an NA mask that cannot be used for indexing
# (such rows are removed by the dropna() that follows); `regex=False` matches
# the substring literally.
df = df[~df.article_url.str.contains('/issue-', regex=False, na=False)]
df = df.dropna()
print(df.shape)
df.head()

(1747, 3)


Unnamed: 0,article_url,text,image_header
1,https://www.deeplearning.ai/the-batch/the-robots-are-winning/,Two prominent economists cast doubt on rosy predictions that automation will create more jobs th...,[]
2,https://www.deeplearning.ai/the-batch/automatic-annotation/,A new tool promises to speed up the laborious process of annotating computer-vision training dat...,[]
3,https://www.deeplearning.ai/the-batch/drones-go-commercial/,"Alphabet spin-out Wing launched its consumer drone delivery service, opening doors for specialis...",[]
4,https://www.deeplearning.ai/the-batch/vcs-bet-on-nlp/,Two startups specializing in NLP reported new financing in the past week as the field heats up.W...,[]
5,https://www.deeplearning.ai/the-batch/europe-tightens-the-screws/,"The European Commission pulled ahead of the geopolitical pack, issuing guidelines for ethical de...",[]


In [179]:
# Inspect column dtypes and non-null counts of the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1747 entries, 1 to 2048
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article_url   1747 non-null   object
 1   text          1747 non-null   object
 2   image_header  1747 non-null   object
dtypes: object(3)
memory usage: 54.6+ KB


In [180]:
# Turn the '[]' placeholder (and any remaining NaN) into a real missing value.
# The mapping form of `replace` is used because a bare `replace('[]', None)`
# is treated as a forward-fill by older pandas releases.
df = df.fillna('[]').replace({'[]': None})
# Swap non-breaking spaces for plain spaces; `regex=False` keeps this a literal
# substitution whatever the installed pandas uses as its default.
df['text'] = df.text.str.replace('\xa0', ' ', regex=False)
df.head(10)

Unnamed: 0,article_url,text,image_header
1,https://www.deeplearning.ai/the-batch/the-robots-are-winning/,Two prominent economists cast doubt on rosy predictions that automation will create more jobs th...,
2,https://www.deeplearning.ai/the-batch/automatic-annotation/,A new tool promises to speed up the laborious process of annotating computer-vision training dat...,
3,https://www.deeplearning.ai/the-batch/drones-go-commercial/,"Alphabet spin-out Wing launched its consumer drone delivery service, opening doors for specialis...",
4,https://www.deeplearning.ai/the-batch/vcs-bet-on-nlp/,Two startups specializing in NLP reported new financing in the past week as the field heats up.W...,
5,https://www.deeplearning.ai/the-batch/europe-tightens-the-screws/,"The European Commission pulled ahead of the geopolitical pack, issuing guidelines for ethical de...",
6,https://www.deeplearning.ai/the-batch/attack-of-the-robot-dogs/,"Boston Dynamics' robot dog is straining at the leash. In a new promotional video, a pack of the ...","[{""image_header"":"""",""image_header-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost..."
7,https://www.deeplearning.ai/the-batch/un-redacting-mueller/,Last week’s release of the redacted Mueller Report prompted calls to fill in the blanks using th...,"[{""image_header"":"""",""image_header-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost..."
8,https://www.deeplearning.ai/the-batch/transparency-for-training-data/,"AI is only as good as the data it trains on, but there’s no easy way to assess training data’s q...","[{""image_header"":"""",""image_header-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost..."
9,https://www.deeplearning.ai/the-batch/our-new-dota-playing-overlords/,"A software agent from OpenAI crushed human players of Defense of The Ancients 2, a multiplayer o...","[{""image_header"":"""",""image_header-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost..."
10,https://www.deeplearning.ai/the-batch/smile-as-you-board/,"U.S. authorities, in a bid to stop aliens from overstaying their visas, aim to apply face recogn...","[{""image_header"":"""",""image_header-src"":""/_next/image/?url=https%3A%2F%2Fdl-staging-website.ghost..."


In [153]:
# NOTE(review): the saved output of this cell (execution count 153: 2050 rows,
# 6 columns) predates the neighbouring cells and does not describe the `df`
# built above (1747 rows) - re-run the cell to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2050 entries, 0 to 2049
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   article_url            2050 non-null   object
 1   text                   2049 non-null   object
 2   image_header           1740 non-null   object
 3   image_content          309 non-null    object
 4   image_header_cleaned   1643 non-null   object
 5   image_content_cleaned  309 non-null    object
dtypes: object(6)
memory usage: 96.2+ KB


In [182]:
def clean_image_header(image_header: Optional[str]) -> Optional[str]:
    """Return the original image URL stored in a scraped ``image_header`` cell.

    The cell is expected to hold a JSON list of objects whose
    ``'image_header-src'`` value is a proxy path such as
    ``/_next/image/?url=<percent-encoded URL>&w=...``. The ``url`` query
    parameter of the first object is decoded and returned.

    Args:
        image_header: JSON text from the ``image_header`` column, or a missing
            value (``None`` / NaN).

    Returns:
        The decoded ``https://`` URL, or ``None`` when the cell is missing,
        holds an empty list, has no ``'image_header-src'`` entry, or its
        ``url`` parameter is absent or not ``https``.

    Raises:
        json.JSONDecodeError: If ``image_header`` is text but not valid JSON.
    """
    # Missing cells reach this function as None or NaN; neither can be parsed.
    if not isinstance(image_header, str):
        return None

    entries = json.loads(image_header)
    first_entry = entries[0] if isinstance(entries, list) and entries else None
    if not isinstance(first_entry, dict):
        return None

    img_src = first_entry.get('image_header-src')
    if not isinstance(img_src, str):
        return None

    # parse_qs percent-decodes the whole parameter value, so characters other
    # than ':' and '/' (an encoded space, '?', '=' ...) are restored as well.
    url_values = parse_qs(urlsplit(img_src).query).get('url', [])
    if url_values and url_values[0].startswith('https://'):
        return url_values[0]
    return None

# Derive the cleaned header-image URL for every row, then preview the first ten.
df['image_header_cleaned'] = df.image_header.map(clean_image_header)
df.image_header_cleaned.head(10)

1                                                                                                    None
2                                                                                                    None
3                                                                                                    None
4                                                                                                    None
5                                                                                                    None
6     https://dl-staging-website.ghost.io/content/images/2022/10/6b8f5b65-0b7e-495c-be3c-3613671651ed-...
7     https://dl-staging-website.ghost.io/content/images/2022/10/6a7a5517-58c9-490a-a603-722e62484ad1-...
8     https://dl-staging-website.ghost.io/content/images/2022/10/afe604b0-915a-4e9d-a0c6-5a728f219cc3-...
9     https://dl-staging-website.ghost.io/content/images/2022/10/60ae2339-9012-4631-82d5-c99f911dacd5-...
10    https://dl-staging-website.ghost.io/cont

In [184]:
# Check the non-null counts, including the new `image_header_cleaned` column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1747 entries, 1 to 2048
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   article_url           1747 non-null   object
 1   text                  1747 non-null   object
 2   image_header          1740 non-null   object
 3   image_header_cleaned  1643 non-null   object
dtypes: object(4)
memory usage: 68.2+ KB


### Write to CSV

In [None]:
# Write only the URL, text and cleaned image URL columns, without the index.
df.to_csv(
    'data/single_articles_cleaned.csv',
    columns=['article_url', 'text', 'image_header_cleaned'],
    index=False,
)

# Weekly Issues

In [188]:
# Load the weekly-articles spreadsheet; this rebinds `input_filename` and `raw_df`.
input_filename = 'data/weekly_articles.xlsx'
raw_df = pd.read_excel(io=input_filename)

In [209]:
# Remove the scraper bookkeeping columns, give the remaining two descriptive
# names, and discard rows with any missing value.
df = (
    raw_df
    .drop(columns=['web-scraper-order', 'web-scraper-start-url'])
    .rename(columns={'thebatch_root-href': 'article_url', 'text': 'raw_html'})
    .dropna()
)
print(df.shape)
df.head()

(302, 2)


Unnamed: 0,article_url,raw_html
0,https://www.deeplearning.ai/the-batch/issue-i/,"<p><em>Dear friends,</em></p><p><em>We're busily wrapping up the Machine Learning Yearning book...."
1,https://www.deeplearning.ai/the-batch/issue-ii/,"<p><em>Dear friends,</em><br><br><em>I spent my birthday last week thinking about how AI can be ..."
2,https://www.deeplearning.ai/the-batch/issue-iii/,"<p><em>Dear friends,</em><br><br><em>On Monday, I delivered a keynote via teleconference for Dub..."
3,https://www.deeplearning.ai/the-batch/issue-iv/,"<p><em>Dear friends,</em><br><br><em>A first for my three-month-old daughter Nova: an outing to ..."
4,https://www.deeplearning.ai/the-batch/issue-v/,"<p><em>Dear friends,</em></p><p><em>I’ve been thinking a lot about ""small data."" If you have an ..."


In [210]:
def extract_articles(html: str) -> list:
    """Split one issue's HTML into its articles.

    The markup is cut at every horizontal-rule tag and each piece is parsed
    on its own.

    Args:
        html: Raw HTML of a whole issue.

    Returns:
        One dict per piece, in document order, with keys ``'text'`` (the
        piece's text, whitespace-stripped and joined with single spaces) and
        ``'images'`` (the ``src`` of every ``<img>`` that has one). Pieces
        without any text are still included, with an empty ``'text'``.
    """
    # Match <hr>, <hr/>, <hr /> and <hr ...attrs> in any letter case; splitting
    # on the literal '<hr>' leaves articles merged when the tag is written
    # any other way.
    segments = re.split(r'<hr\b[^>]*>', html, flags=re.IGNORECASE)

    articles = []
    for segment in segments:
        segment_soup = BeautifulSoup(segment, 'html.parser')
        articles.append({
            'text': segment_soup.get_text(separator=' ', strip=True),
            'images': [
                img['src']
                for img in segment_soup.find_all('img')
                if 'src' in img.attrs
            ],
        })

    return articles

# Parse every issue into its list of article dicts and preview the result.
df['articles'] = df['raw_html'].map(extract_articles)
df['articles'].head()

0    [{'text': 'Dear friends, We're busily wrapping up the Machine Learning Yearning book. Meanwhile,...
1    [{'text': 'Dear friends, I spent my birthday last week thinking about how AI can be used to addr...
2    [{'text': 'Dear friends, On Monday, I delivered a keynote via teleconference for Dubai's AI Ever...
3    [{'text': 'Dear friends, A first for my three-month-old daughter Nova: an outing to the park. As...
4    [{'text': 'Dear friends, I’ve been thinking a lot about "small data." If you have an image class...
Name: articles, dtype: object

In [222]:
# One row per article: drop the raw markup, unpack each list of article dicts,
# and renumber the rows from zero without keeping the old index.
df_exploded = (
    df
    .drop(columns='raw_html')
    .explode('articles')
    .reset_index(drop=True)
)
df_exploded.head()

Unnamed: 0,article_url,articles
0,https://www.deeplearning.ai/the-batch/issue-i/,"{'text': 'Dear friends, We're busily wrapping up the Machine Learning Yearning book. Meanwhile, ..."
1,https://www.deeplearning.ai/the-batch/issue-i/,{'text': 'News The Robots are Winning Two prominent economists cast doubt on rosy predictions th...
2,https://www.deeplearning.ai/the-batch/issue-i/,{'text': 'Automatic Annotation A new tool promises to speed up the laborious process of annotati...
3,https://www.deeplearning.ai/the-batch/issue-i/,{'text': 'Drones Go Commercial Alphabet spin-out Wing launched its consumer drone delivery servi...
4,https://www.deeplearning.ai/the-batch/issue-i/,{'text': 'A MESSAGE FROM DEEPLEARNING.AI Want to master Tensor Flow? Check out our new Tensor Fl...


In [223]:
# Expand each article dict into its own 'text' and 'images' columns.
df_normalized = pd.json_normalize(df_exploded['articles'].tolist())

In [247]:
# Place the flattened 'text' / 'images' columns next to their source rows,
# then keep only the rows whose text is non-empty.
full_df = pd.concat(objs=[df_exploded, df_normalized], axis='columns')
full_df = full_df.loc[full_df['text'].str.len().gt(0)]
full_df.head()

Unnamed: 0,article_url,articles,text,images
0,https://www.deeplearning.ai/the-batch/issue-i/,"{'text': 'Dear friends, We're busily wrapping up the Machine Learning Yearning book. Meanwhile, ...","Dear friends, We're busily wrapping up the Machine Learning Yearning book. Meanwhile, we'd like ...",[https://dl-staging-website.ghost.io/content/images/2022/09/dfbcdc70-a9a2-4967-95b8-6866a6a0a6bf...
1,https://www.deeplearning.ai/the-batch/issue-i/,{'text': 'News The Robots are Winning Two prominent economists cast doubt on rosy predictions th...,News The Robots are Winning Two prominent economists cast doubt on rosy predictions that automat...,[]
2,https://www.deeplearning.ai/the-batch/issue-i/,{'text': 'Automatic Annotation A new tool promises to speed up the laborious process of annotati...,Automatic Annotation A new tool promises to speed up the laborious process of annotating compute...,[]
3,https://www.deeplearning.ai/the-batch/issue-i/,{'text': 'Drones Go Commercial Alphabet spin-out Wing launched its consumer drone delivery servi...,"Drones Go Commercial Alphabet spin-out Wing launched its consumer drone delivery service, openin...",[]
4,https://www.deeplearning.ai/the-batch/issue-i/,{'text': 'A MESSAGE FROM DEEPLEARNING.AI Want to master Tensor Flow? Check out our new Tensor Fl...,A MESSAGE FROM DEEPLEARNING.AI Want to master Tensor Flow? Check out our new Tensor Flow Special...,[https://dl-staging-website.ghost.io/content/images/2022/09/Sin-t-tulo.png]


In [248]:
# Write the URL, text and image-list columns, without the index.
full_df.to_csv(
    'data/weekly_articles_cleaned.csv',
    columns=['article_url', 'text', 'images'],
    index=False,
)

## Download Images

In [186]:
# Single articles
# NOTE(review): `df` is rebound to the weekly-issues frame in the "Weekly Issues"
# section, and that frame has no `image_header_cleaned` column. This cell only
# works if it is executed before that rebinding, so a top-to-bottom run will
# not find the column here.
all_distinct_image_header_urls = set(df['image_header_cleaned'].dropna())
len(all_distinct_image_header_urls)

1644

In [235]:
# Weekly_articles
# NOTE(review): this rebinds the name that held the single-article URLs in the
# previous cell, so only the weekly-issue image URLs are carried forward -
# confirm that is intended.
all_distinct_image_header_urls = set(full_df['images'].explode().dropna())
len(all_distinct_image_header_urls)


1999

In [236]:
# Map a flat file-name key to each image URL. The key is the URL with its first
# 51 characters removed and '/' turned into '_'.
# 51 == len('https://dl-staging-website.ghost.io/content/images/').
# NOTE(review): URLs that do not start with that prefix are cut at an arbitrary
# point, and query strings end up inside the key - confirm this is acceptable.
images = {
    url[51:].replace('/', '_'): url
    for url in all_distinct_image_header_urls
    if url is not None
}
len(images)

1999

In [237]:
import os
import io
from PIL import Image
import requests

def save_images_to_local(dataset, output_folder="data/images/", timeout=30):
    """Download each image, resize it to 448x448 and store it as a PNG.

    Args:
        dataset: Mapping of image name -> image URL.
        output_folder: Directory that receives the files; created if missing.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Each image is written to ``<output_folder>/image_<name>.png``.
        A failure on one image is printed and does not stop the loop.
    """
    os.makedirs(output_folder, exist_ok=True)

    for image_name, image_url in dataset.items():
        try:
            # Without a timeout a single stalled server would hang the whole run.
            response = requests.get(image_url, timeout=timeout)
            # Fail on HTTP errors here instead of handing an error page to PIL.
            response.raise_for_status()

            with Image.open(io.BytesIO(response.content)) as image:
                resized = image.resize((448, 448))

            output_path = os.path.join(output_folder, f"image_{image_name}.png")

            resized.save(output_path, format='PNG')
            print(f"Image saved in: {output_path}")
        except Exception as e:
            # Best-effort download: report which URL failed and carry on.
            print(f"Failed to save {image_url}: {e}")

# Download and store every image in `images` (performs one HTTP request per entry).
save_images_to_local(images)

Image saved in: data/images/image_2022_10_unnamed--4-.jpg.png
Image saved in: data/images/image_2022_09_Screen20Shot202020-12-2220at209.47.3220AM20copy--1-.png.png
Image saved in: data/images/image_ploads_2021_01_Place20at20bottom20of20TOILETS.png.png
Image saved in: data/images/image_ploads_2021_01_ezgif.com-optimize207.gif.png
Image saved in: data/images/image_2021_05_image-15.png.png
Image saved in: data/images/image_2022_04_GLIDEv2.gif.png
Image saved in: data/images/image_ploads_2021_01_Experience20Replay.gif.png
Image saved in: data/images/image_ploads_2021_01_TF-PT2.png.png
Image saved in: data/images/image_2022_10_CHIPS--1-.jpg.png
Image saved in: data/images/image_ploads_2021_02_ezgif.com-gif-maker-100.gif.png
Image saved in: data/images/image_2022_05_Virus-Animation.gif.png
Image saved in: data/images/image_Sv2-1.gif?upscale=true&width=1200&upscale=true&name=KEYPOINTSv2-1.gif.png
Image saved in: data/images/image_ploads_2021_01_1_Cropped20Roberta.png.png
Image saved in: data/