# Web Scraping

In this Jupyter notebook, the Deutsche Welle website is scraped using the package BeautifulSoup.\
The scraping is mainly achieved with three functions.\
First, the search overview page of Deutsche Welle is scraped using the function **scrape_dw_overview_page()**. The function takes a start and an end date and returns a DataFrame with the URLs of all articles published between the two dates.\
Secondly, more detailed information about each article is scraped with the function **get_detailed_text()**. This function takes the DataFrame returned by scrape_dw_overview_page() and returns a DataFrame that includes all the required information about each article in the input DataFrame.\
Finally, the function **scrape_dw()** combines the first two functions and returns the required DataFrame.


In [3]:
# Imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import datetime

In [4]:
def scrape_dw_overview_page(start_date, end_date):
    '''
    Collect the URLs of all articles listed on the DW search page for a date range.

    The search page is requested twice: once with a small result counter to
    read the total number of hits, and once with the result counter set to
    that total so that all hits are delivered on a single page.

    Args:
        start_date: value inserted into the `from` query parameter of the search URL
        end_date: value inserted into the `to` query parameter of the search URL

    Returns:
        pd.DataFrame with a single column "url" (one row per article link found),
        or None if a request failed or the page could not be parsed.
    '''
    # Everything except the trailing resultsCounter value is shared by both requests
    base_url = (
        'https://www.dw.com/search/?languageCode=en&contentType=ARTICLE'
        '&searchNavigationId=9097-30688'
        f'&from={start_date}&to={end_date}&sort=DATE&resultsCounter='
    )

    try:
        # First request: only used to read the total number of hits
        url = f'{base_url}10'
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        document_div = soup.find('div', attrs={'class': 'searchResults col4'})
        max_elements = int(document_div.find('span', attrs={'class': 'hits all'}).text.strip())

        # Short pause between the two requests to the same server
        time.sleep(1)

        # Second request: ask for all hits at once
        url = f'{base_url}{max_elements}'
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        document_div = soup.find('div', attrs={'class': 'searchResults col4'})
        all_search_results = document_div.find_all('div', attrs={'class': 'searchResult'})

        # Iterate over the results that are actually on the page instead of
        # range(max_elements): the page may contain fewer entries than the
        # reported hit count, which previously raised an IndexError.
        urls = []
        for search_result in all_search_results:
            link = search_result.find('a')
            href = link.get('href') if link is not None else None
            if href:
                urls.append(f"https://www.dw.com{href}")

        # Build the DataFrame in one step (no per-row concat, no per-row sleep:
        # the loop above does not send any requests)
        return pd.DataFrame({'url': urls}, columns=['url'])

    except Exception as e:
        # Boundary handler: callers rely on None to signal any failure
        # (network error, unexpected page layout, non-numeric hit count, ...)
        print(f"An error occurred: {str(e)}")
        return None

In [5]:
# Columns added by get_detailed_text(), in output order
_ARTICLE_COLUMNS = (
    "autor", "date", "category", "region", "title",
    "summary", "text", "subheadings", "related_topics",
)

# Class attribute of the div that holds the article body on a DW article page
_MAIN_CONTENT_CLASS = 'sc-ezGUZh sc-kMbQoj llLYdd itboQC sc-czCoYo iBsIdr rich-text has-italic'

# Raised when an expected element is absent (None has no .find/.text),
# a list of elements is too short, or the date has an unexpected format
_MISSING_ERRORS = (AttributeError, IndexError, ValueError)


def _parse_dw_article(soup, url):
    '''
    Extract the fields of one article from its parsed HTML page.

    Args:
        soup (BeautifulSoup): parsed HTML of the article page
        url (str): URL of the article, only used in the printed messages

    Returns:
        dict mapping every name in _ARTICLE_COLUMNS to the extracted value,
        or to None if the field could not be found (a message is printed).
    '''
    record = dict.fromkeys(_ARTICLE_COLUMNS)

    def report_missing(field):
        # The title is parsed first, so every later message can name the article
        print(f"No {field} found for article {record['title'] or ''}\n{url}\n")

    # Title (parsed first so that the messages below can include it)
    try:
        h1 = soup.find('h1', class_='sc-HjNCl wdGIM sc-iuWDFx cFfUdQ')
        record["title"] = h1.text.strip()
    except _MISSING_ERRORS:
        report_missing("title")

    # Author: name is in a span inside the author link
    try:
        a_author = soup.find('a', class_='author-link')
        record["autor"] = a_author.find('span', class_='sc-ezGUZh').text.strip()
    except _MISSING_ERRORS:
        report_missing("author")

    # Date: the page shows it as mm/dd/YYYY, it is stored as YYYY-mm-dd
    try:
        time_element = soup.find('span', class_='publication').find('time')
        raw_date = time_element.text.strip()
        record["date"] = datetime.datetime.strptime(raw_date, '%m/%d/%Y').strftime('%Y-%m-%d')
    except _MISSING_ERRORS:
        report_missing("date")

    # Category and region are the first and second span of the kicker div.
    # The spans are looked up per article, so a missing kicker can never
    # pick up the spans of a previously scraped article.
    div_kicker = soup.find('div', class_='kicker')
    kicker_spans = div_kicker.find_all('span') if div_kicker is not None else []

    try:
        record["category"] = kicker_spans[0].text
    except _MISSING_ERRORS:
        report_missing("category")

    try:
        record["region"] = kicker_spans[1].text
    except _MISSING_ERRORS:
        report_missing("region")

    # Summary
    try:
        record["summary"] = soup.select_one("p.sc-ezGUZh").text.strip()
    except _MISSING_ERRORS:
        report_missing("summary")

    # Text and subheadings both live in the main content element
    main_content = soup.find('div', class_=_MAIN_CONTENT_CLASS)
    if main_content is None:
        report_missing("text")
        report_missing("subheadings")
    else:
        # Join all paragraphs into one string, replace non-breaking spaces
        # and wrap the result in double quotes
        paragraphs = [p.text.replace('\xa0', ' ') for p in main_content.find_all('p')]
        record["text"] = '"' + " ".join(paragraphs) + '"'

        # Subheadings are the h2 elements of the main content
        record["subheadings"] = [
            h2.text.replace('\xa0', ' ') for h2 in main_content.find_all('h2')
        ]

    # Related topics: link texts inside the link-wrapper aside
    try:
        links = soup.find('aside', class_='link-wrapper').find_all('a')
        record["related_topics"] = [link.text for link in links]
    except _MISSING_ERRORS:
        report_missing("related topics")

    return record


def get_detailed_text(df):
    '''
    Scrape the details of every article listed in the given DataFrame.

    The DataFrame is modified in place (the columns listed in
    _ARTICLE_COLUMNS are added) and also returned.

    Args:
        df (pd.DataFrame): DataFrame with a column "url" holding one article
            URL per row; the index labels are expected to be unique
            (as in the DataFrame returned by scrape_dw_overview_page)

    Returns:
        pd.DataFrame: the same DataFrame with the additional columns autor,
        date, category, region, title, summary, text, subheadings and
        related_topics. Fields that could not be found are None; rows whose
        page could not be requested keep empty strings.
    '''
    # Add the empty columns with object dtype so that list values
    # (subheadings, related_topics) can be stored in single cells
    for column in _ARTICLE_COLUMNS:
        df[column] = pd.Series([""] * len(df), index=df.index, dtype=object)

    total = len(df)

    # Loop through all articles
    for position, (row_label, url) in enumerate(df["url"].items()):
        print(f"\nScraping article {position} of {total}")

        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException:
            # Skip this row; previously the code went on with an unbound or
            # stale response and could store another article's data here
            print(f"No article found for the url {url}\n")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        record = _parse_dw_article(soup, url)

        # Write each cell with .at instead of chained indexing (df[col][i] = ...),
        # which is not guaranteed to modify the original DataFrame
        for column, value in record.items():
            df.at[row_label, column] = value

        # Wait for a second before requesting the next article
        time.sleep(1)

    # Return the DataFrame
    return df


In [6]:
def date_transform(start, end): 
    '''
    Convert a start and an end date from YYYY-MM-DD to DD.MM.YYYY.

    Args:
        start (str): start date in the format YYYY-MM-DD
        end (str): end date in the format YYYY-MM-DD
    Return:
        tuple (start, end) with both dates as strings in the format DD.MM.YYYY
    '''
    iso_format = '%Y-%m-%d'
    dotted_format = '%d.%m.%Y'

    # Parse both inputs first (start before end), then render them in the target format
    parsed = [datetime.datetime.strptime(value, iso_format) for value in (start, end)]
    start_date_t, end_date_t = (day.strftime(dotted_format) for day in parsed)
    return start_date_t, end_date_t

In [7]:
def scrape_dw(start, end):
    '''
    Function to scrape articles from DW website

    If start is later than end, the two dates are swapped.

    Args:
        start (str): start date in format yyyy-mm-dd
        end (str): end date in format yyyy-mm-dd

    Returns: 
        df (pd.DataFrame): DataFrame with scraped articles,
        or None if the overview page could not be scraped

    Raises:
        ValueError: if start or end is not a valid date in format yyyy-mm-dd
    '''
    # Order the dates chronologically BEFORE converting them: the converted
    # dd.mm.yyyy strings do not sort chronologically
    # (e.g. '31.01.2023' > '01.02.2023' as strings)
    iso_format = '%Y-%m-%d'
    if datetime.datetime.strptime(start, iso_format) > datetime.datetime.strptime(end, iso_format):
        start, end = end, start

    # Transform date format from yyyy-mm-dd to dd.mm.yyyy
    start_date, end_date = date_transform(start, end)

    # Get overview page
    df_overview = scrape_dw_overview_page(start_date=start_date, end_date=end_date)

    # Check if scraping overview page was successful
    if df_overview is None:
        print('No articles found.')
        return None

    # Get detailed text
    return get_detailed_text(df_overview)
    

# Demonstration that the function scrape_dw() works

In [8]:
# Call scrape_dw for the date range 2023-10-08 to 2023-10-09
# (sends live HTTP requests to dw.com and prints a progress line per article)
df = scrape_dw( start='2023-10-08', end='2023-10-09')


Scraping article 0 of 27

Scraping article 1 of 27
No author found for article 
https://www.dw.com/en/scholz-and-macron-convene-for-strategic-retreat-in-hamburg/a-67047196


Scraping article 2 of 27
No author found for article 
https://www.dw.com/en/xi-to-senators-us-china-ties-impact-destiny-of-mankind/a-67046529


Scraping article 3 of 27
No author found for article 
https://www.dw.com/en/russia-uses-hamas-attacks-on-israel-for-domestic-propaganda/a-67042395


Scraping article 4 of 27

Scraping article 5 of 27

Scraping article 6 of 27
No author found for article 
https://www.dw.com/en/liberia-election-george-weah-seeks-reelection/a-67040415

No related topics found for article Liberia election: George Weah seeks reelection
https://www.dw.com/en/liberia-election-george-weah-seeks-reelection/a-67040415


Scraping article 7 of 27

Scraping article 8 of 27
No author found for article 
https://www.dw.com/en/afghanistan-earthquake-aid-agencies-appeal-for-help/a-67041833


Scraping articl

## Shape of the DataFrame

In [9]:
# Display the shape of the DataFrame as (number of rows, number of columns)
df.shape

(27, 10)

## Display the first 5 rows of the DataFrame

In [10]:
# Display the first 5 rows of the DataFrame (one row per article URL)
df.head(5)

Unnamed: 0,url,autor,date,category,region,title,summary,text,subheadings,related_topics
0,https://www.dw.com/en/hamas-attacks-on-israel-...,Lisa Hänel,2023-10-09,Conflicts,Germany,Hamas attacks on Israel triggers debate in Ger...,In the wake of the terrorist attack by Islamis...,"""It began with a tweet by the German Minister ...","[Diverse Muslim community, Central Council of ...","[Rhine River, Robert Habeck, Poverty in German..."
1,https://www.dw.com/en/scholz-and-macron-conven...,,2023-10-09,Politics,Germany,Scholz and Macron convene for 'strategic' retr...,Chancellor Olaf Scholz welcomed President Emma...,"""German Chancellor Olaf Scholz on Monday welco...",[Franco-German ties 'more important than ever'...,"[Emmanuel Macron, French elections, Rhine Rive..."
2,https://www.dw.com/en/xi-to-senators-us-china-...,,2023-10-09,Politics,United States of America,Xi to senators: US-China ties impact 'destiny ...,A US delegation met with Chinese President Xi ...,"""Chinese President Xi Jinping on Monday told a...","[Managing tensions in the South China Sea, Man...","[Uyghur community, Pentagon, Washington, White..."
3,https://www.dw.com/en/russia-uses-hamas-attack...,,2023-10-09,Conflicts,Russian Federation,Russia uses Hamas attacks on Israel for domest...,"Moscow, which has enjoyed good ties with Israe...","""The flag at the Israeli embassy in Moscow is ...","[Russia condemns escalation, Israel condemns R...","[Dmitry Medvedev, BRICS, Black Sea, Russia, Ru..."
4,https://www.dw.com/en/former-israeli-football-...,Stefan Nestler,2023-10-09,Soccer,Israel,Former Israeli football star Lior Asulin murde...,Israeli football fans are mourning the loss of...,"""A day after he had turned 43, Lior Asulin was...","[History maker, Nomadic career]","[Terrorism, Bayern Munich, Israel, Hamas]"


## Look where information is missing

In [11]:
# Count the missing values (None/NaN) per column; empty strings are not counted as missing
df.isnull().sum()

url                0
autor             19
date               0
category           0
region             0
title              0
summary            0
text               0
subheadings        0
related_topics     3
dtype: int64