# German news pages Webscraper

## Imports

In [1]:
import os
import csv
from datetime import date
import datetime
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import logging

import bs4
from bs4 import BeautifulSoup

import pandas as pd

import html.parser

import re

import schedule


## Get HTML of news pages

In [2]:
attempts=3
loadingWebPage_time_long=10
loadingWebPage_time_short=5
retry_time=5

def create_logfile():
    date_time = datetime.datetime.today().strftime('%d-%b-%y_%H:%M:%S')
    logfile = f"/Users/jan/Documents/Python_Projects/Bachelorthesis/log/{date_time}.log"
    logging.basicConfig(filename=logfile, filemode='w', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S', force=True)
    logging.info(f'Log file {logfile} created')
    return logging

def create_html_file(html, newsPage_name, logging):
    # create file
    filepath="/Users/jan/Documents/Python_Projects/Bachelorthesis/HTML_Backlog/"
    logging.info(f"{newsPage_name} Creating html file @ {filepath}")
    dateTime=datetime.datetime.now()
    
    filename=newsPage_name+"_"+dateTime.strftime("%d%m%Y_%H_%M_%S") + ".html"
    logging.info(f"{newsPage_name}: Creating html file @ {filepath}/{filename}")
    
    # delete file if it already exists
    logging.info(f"{newsPage_name}: Check if html already created")
    os.chdir(filepath)
    if os.path.exists(filename):
        os.remove(file)
        logging.info(f"{newsPage_name}: {filename} deleted")
    else:
        logging.info(f"{newsPage_name}: {filename} doesnt exist yet")
    
    with open(filepath+filename,"w+") as file:
        file.write(str(html))
        file.close()
    logging.info(f"{newsPage_name}: Successfully saved file @ {filepath}/{filename}")
    logging.info("_______________________________________________________________________")
                
        
    
          
def startChromedriver(startUpUrl,logging):

    logging.info(f"Starting Chromedriver with @ {startUpUrl} and loadingTime {loadingWebPage_time_long}s")
    ser = Service("/Applications/chromedriver")

    # start chrome driver
    driver = webdriver.Chrome(service=ser)
    driver.get(startUpUrl)

    # wait for page to load
    time.sleep(loadingWebPage_time_long)

    return driver
    
    
def open_WebPage_AcceptCookies(logging, newsPage_name, url, cookieWindowFrame_XPATH, cookieWindowAcceptButton_XPATH):
    loggingInfo=newsPage_name+": "
    logging.info(f"{loggingInfo}Start scraping {newsPage_name} news page...")

    logging.info(f"{loggingInfo}Scraping {newsPage_name} @ {url}")
    
    #cookieWindowFrame_XPATH= "//iframe[contains(@id,'sp_message_iframe_541484')]"
    #cookieWindowAcceptButton_XPATH = "//*[@id=\"notice\"]/div[3]/div[1]/button"
    
    # starting Chromedriver
    driver = startChromedriver(url,logging)
    
    # switch to CookieWindow (IFrame)
    logging.info(f"{loggingInfo}Switching to {newsPage_name} CookieWindow with XPATH: {cookieWindowFrame_XPATH}")
    
    # find iframe
    for attempt in range(attempts):        
        try:
            iframe = driver.find_element(By.XPATH, cookieWindowFrame_XPATH)
            logging.info(f"{loggingInfo}Found cookie window...")
            break
        except Exception as e:
            logging.error(f"{loggingInfo}Can't find cookie window for {newsPage_name} news page")
            logging.error(e)
            time.sleep(retry_time)
            
    for attempt in range(attempts):    
        try:
            # switch to iframe
            driver.switch_to.frame(iframe)
    
            # accept Cookies
            logging.info(f"{loggingInfo}Accepting Cookies with XPATH: {cookieWindowAcceptButton_XPATH}")
            driver.find_element(By.XPATH, cookieWindowAcceptButton_XPATH).click()
            
            # switch back to default frame
            driver.switch_to.default_content()           
            logging.info(f"{loggingInfo}Accepting cookies successfull")
            break
        except Exception as e:
            logging.error(f"{loggingInfo} Error while switching to frame and accepting cookies...")
            logging.error(e)
            time.sleep(retry_time)
            
            
    return driver
    
    
    
    
def get_Spiegel_HTML(logging, driver):
    
    # Scroll down to bottom of page
    logging.info(f"Spiegel: Scrolling down to bottom of page")
    driver.find_element(By.CSS_SELECTOR,"body").send_keys(Keys.CONTROL, Keys.END);
    
    # save HTML file
    create_html_file(driver.page_source,"Spiegel", logging)
    
    # Close the driver to avoid memory leak errors
    driver.quit()

    
    
def get_Sueddeutsche_HTML(logging,driver):
    ammountOfPages = 21
    # save first page 
    
    for page in range(ammountOfPages):
        logging.info("Sueddeutsche: Starting to get Sueddeutsche"+ str(page))
        create_html_file(driver.page_source,"Sueddeutsche"+ str(page),logging)
        
        driver.find_element(By.XPATH, "//*[@id=\"paging\"]/li[3]/a").click()
        time.sleep(loadingWebPage_time_short)
        
    # Close the driver to avoid memory leak errors        
    driver.quit()
        
def get_Bild_HTML(logging,driver):
    # Scroll down to bottom of page
    logging.info(f"Bild: Scrolling down to bottom of page")
    driver.find_element(By.CSS_SELECTOR,"body").send_keys(Keys.CONTROL, Keys.END);
    
    # save HTML file
    create_html_file(driver.page_source,"Bild", logging)
    
    # Close the driver to avoid memory leak errors
    driver.quit()
    
    

## Scrape news pages

In [3]:
csvBacklog_filepath="/Users/jan/Documents/Python_Projects/Bachelorthesis/CSV_Backlog"
htmlBacklog_filepath="/Users/jan/Documents/Python_Projects/Bachelorthesis/HTML_Backlog"

def saveAsCSV(all_news_articles,news_article_labels, filepath):
    with open(filepath + "csv", "a") as f:
        w = csv.writer(f)
        w.writerow(news_article_labels)
        w.writerows(all_news_articles)
        
    
    #df = pd.DataFrame(all_news_articles)
    #df.to_csv(filepath +"csv", index=True)

def start_Scraping(logging):   
    logging.info("*** Starting Scraper ***")
    
    #get list of csvBacklog and make them compareable to htmlBacklog
    logging.info("get list of csv Backlog")
    csv_files = os.listdir(csvBacklog_filepath)
    logging.info(str(len(csv_files))+ " files in csv Backlog...")
    csvBacklog_filenames=[]
    for file in csv_files:
        logging.info(f"FILENAME: from csv Files: {file}")
        file_name= file.replace("csv","html")
        csvBacklog_filenames.append(file_name)
    
    #get list of htmlBacklog    
    logging.info("get list of html Backlog")
    html_files = os.listdir(htmlBacklog_filepath)
    logging.info(str(len(html_files)) +" files in html Backlog...")

    # get all html files that are not scraped yet
    unscraped_html_files=list(set(html_files) - set(csvBacklog_filenames))
    logging.info(str(len(unscraped_html_files)) + " Html files are not scraped yet")
    
    
    logging.info("Start scraping files...")
    for unscraped_file in unscraped_html_files:
        logging.info(f"Scraping: {unscraped_file}")
        
        file_name = str(unscraped_file).replace(htmlBacklog_filepath,"")
        
        if file_name.startswith("Spiegel"):
            scrape_Spiegel_NewsPage(logging,unscraped_file)
            
        elif file_name.startswith("Sueddeutsche"):
            scrape_Sueddeutsche_NewsPage(logging,unscraped_file)
            
        elif file_name.startswith("Bild"):
            logging.error("BILD Scraper not implemented yet!")
        
def scrape_Spiegel_NewsPage(logging, fileName):
    logging.info(f"Starting to Scrape Spiegel file {fileName}")
    # open html file
    html_file=open(htmlBacklog_filepath + "/" +fileName)
    # read html file
    htmlSoup=BeautifulSoup(html_file.read(),"html.parser")
    #f ind all articles
    newsElements=htmlSoup.find_all(attrs={"data-block-el": "articleTeaser"})
    all_news_articles=[]
    # iterate over all news articles
    for newsElement in newsElements:
        article= newsElement.find("article")
        header=article.find("header").find("h2").find("a")
        URL=header.get("href")
        Titel=header.get("title")
        logging.info(f"Trying to scrape: {Titel}")
        header=article.find("header")
        h2=header.find("h2").find("a")
        try: 
            footer=article.find("div", {"class" : "mt-8 flex items-center justify-between"})               
            footer_Text=footer.text
            footer_lst=footer_Text.split("•")
            Date_Info=footer_lst[0].replace("Uhr","").replace("\n","")
            Kategorie=footer_lst[1].replace("\n","")
            
        except:
            footer=article.find("footer", {"class" : "mt-4 inline-block whitespace-nowrap font-sansUI font-normal text-s text-shade-dark dark:text-shade-light"})               
            footer_Text=footer.text
            footer_lst=footer_Text.split("•")
            Date_Info=footer_lst[0].replace("Uhr","").replace("\n","")
            Kategorie=footer_lst[1].replace("\n","")
            
        Zugriff_Datum=str(fileName).replace(htmlBacklog_filepath+ "/Spiegel_","").replace(".html","")

        if "Dezember" in Date_Info:
                Date_Info=Date_Info.replace("Dezember", "December")
        if "Januar" in Date_Info:
                Date_Info=Date_Info.replace("Januar", "January")
                
        file_dateName=fileName.replace("Spiegel_","")        
        
        if len(Date_Info.strip())==5 and "." in Date_Info:
            Date_Info=Date_Info.replace(".",":") 
            
            Date_Info=file_dateName[0:8] + "_"+Date_Info

        elif "," in Date_Info:
            Date_Info = Date_Info.replace(" ","")
            Date_Info = Date_Info.replace(",",file_dateName[4:8]+",",1)
            Date_Info = datetime.datetime.strptime(Date_Info, '%d.%B%Y,%H.%M')
            Date_Info=Date_Info.strftime('%d%m%Y_%H:%M')
            
            #Hotfix for wrong dates at year end
            if "31122022"  in Date_Info:
                Date_Info = Date_Info.replace("31122022","31122021")
            elif "30122022"  in Date_Info:
                Date_Info = Date_Info.replace("30122022","30122021")
            elif "29122022"  in Date_Info:
                Date_Info = Date_Info.replace("29122022","29122021")
            elif "28122022"  in Date_Info:
                Date_Info = Date_Info.replace("28122022","28122021")
            elif "27122022"  in Date_Info:
                Date_Info = Date_Info.replace("27122022","27122021")
            elif "26122022"  in Date_Info:
                Date_Info = Date_Info.replace("26122022","26122021")
            elif "25122022"  in Date_Info:
                Date_Info = Date_Info.replace("25122022","25122021")
                   
        News_page="Spiegel"
        news_article = [Titel,URL,Date_Info,News_page]
        Date_Info = Date_Info.strip()
        news_article_labels=["Titel","URL","Date_Info","News_page"]
        all_news_articles.append(news_article)
    new_csvFilePath=csvBacklog_filepath+"/"+fileName
    new_csvFilePath=new_csvFilePath.replace("html","")
    saveAsCSV(all_news_articles,news_article_labels,new_csvFilePath)
    
def scrape_Sueddeutsche_NewsPage(logging,fileName):
    logging.info(f"Starting to Scrape Sueddeutsche file {fileName}")
    all_news_articles=[]
    #open html file
    html_file=open(htmlBacklog_filepath + "/" +fileName)
    htmlSoup=BeautifulSoup(html_file.read(),"html.parser")
    newsElements=htmlSoup.find("div",class_="entrylist is-detail")
    newsElements_list=newsElements.find_all("div",class_="entrylist__entry")
    
    for newsElement in newsElements_list:
        timeInfo=newsElement.find("time", class_ ="entrylist__time")
        logging.info(timeInfo)
        Date_Info=timeInfo.text.replace("\n","").replace("vor","")
        newTime= timeInfo.text.replace("\n","").replace("vor","")

        content=newsElement.find("div", class_="entrylist__content")
        a=content.find("a",class_="entrylist__link")

        URL=a.get("href")
        detailedInformations=content.find("p",class_="entrylist__detail detailed-information")
        try:
            Overline = a.find("strong", class_="entrylist__overline").text
        except:
            logging.info("No Overline found")
            Overline=""
        
        try:
            Titel=a.find("em",class_="entrylist__title").text
        except:
            Titel=""
        try:
            singleBreadCrumbItem=content.find("span", class_="breadcrumb-list__item").text
            Breadcrumb=singleBreadCrumbItem
        except:
            logging.info("breadcrumb-List__Item not found")
            Breadcrumb=""
        try:
            BreadCrumb = content.find("ul", class_="breadcrumb-list")
            BreadCrumb_list=BreadCrumb.find_all("li",class_="breadcrumb-list__item")
            list1=[]
            for crumb in BreadCrumb_list:
                list1.append(crumb.text.replace("\n","").strip())
            breadcrumbs=",".join(list)
            Breadcrumb=breadcrumbs
        except:
            logging.info("Breadcrumb-list not found")

        try:
            author=detailedInformations.find("span", class_="entrylist__author").text
        except:
            logging.info("No author found")
            author=""

        try:
            detailed_informations=detailedInformations.text.strip()
        except:
            logging.info("No detailed informations found")
            detailed_informations=""


        # create regex to only get the data from filename
        pattern = "(?<=_)(\d{8})(?=_)"
        regex = re.search(pattern, fileName)
        
        # create access date
        Zugriff_Datum = regex.group().strip()

        #Fix Date_Info
        Date_Info=Date_Info.strip()
        if len(Date_Info) == 5 and ":" in Date_Info:
            Date_Info=Zugriff_Datum + "_" + Date_Info
            
        elif "|" in Date_Info:            
            Date_Info = datetime.datetime.strptime(Date_Info, "%d.%m.%Y | %H:%M")
            Date_Info=Date_Info.strftime('%d%m%Y_%H:%M')
        elif "Min." in Date_Info:
            Date_Info = Date_Info.replace(" Min.","")
            timeDelta = datetime.timedelta(minutes=int(Date_Info))
            
            Date_Span = regex.span()
            
            Date_Span = list(Date_Span)      
            Date_Span[1]=Date_Span[1]+9
            
            Date_Info_From_FileName=fileName[int(Date_Span[0]) : int(Date_Span[1])]
            Date_Info_From_FileName=datetime.datetime.strptime(Date_Info_From_FileName,'%d%m%Y_%H_%M_%S')
            Date_Info_From_FileName=Date_Info_From_FileName.strftime('%d%m%Y_%H:%M')
            Date_Info_From_FileName=datetime.datetime.strptime(Date_Info_From_FileName,'%d%m%Y_%H:%M')
            
            Date_Info = Date_Info_From_FileName - timeDelta
            Date_Info=Date_Info.strftime('%d%m%Y_%H:%M')
        elif "gerade eben" in Date_Info:
            Date_Span = regex.span()
            
            Date_Span = list(Date_Span)      
            Date_Span[1]=Date_Span[1]+9
            Date_Info_From_FileName=fileName[int(Date_Span[0]) : int(Date_Span[1])]
            Date_Info_From_FileName=datetime.datetime.strptime(Date_Info_From_FileName,'%d%m%Y_%H_%M_%S')
            Date_Info_From_FileName=Date_Info_From_FileName.strftime('%d%m%Y_%H:%M')
            Date_Info_From_FileName=datetime.datetime.strptime(Date_Info_From_FileName,'%d%m%Y_%H:%M')
            Date_Info=Date_Info_From_FileName.strftime('%d%m%Y_%H:%M')
        else:
            ErrorCounter.append(Date_Info)
        News_page="Sueddeutsche"   
        Date_Info = Date_Info.strip()
        news_article = [Titel, Date_Info, URL, Overline, Breadcrumb, author, detailed_informations, Zugriff_Datum,News_page]
        news_article_labels=["Titel", "Date_Info", "URL", "Overline", "Breadcrumb", "author", "detailed_informations", "Zugriff_Datum","News_page"]
        all_news_articles.append(news_article)
        
    new_csvFilePath=csvBacklog_filepath+"/"+fileName
    new_csvFilePath=new_csvFilePath.replace("html","")
    saveAsCSV(all_news_articles, news_article_labels, new_csvFilePath)

ErrorCounter=[]

    
    

In [4]:
def MAIN():
    spiegel_url="https://www.spiegel.de/schlagzeilen/"
    spiegel_cookieWindowFrame_XPATH = "//iframe[contains(@id,'sp_message_iframe_541484')]"
    spiegel_cookieAcceptButton_XPATH = "//*[@id=\"notice\"]/div[3]/div[1]/button"

    sueddeutsche_url="https://www.sueddeutsche.de/news"
    sueddeutsche_cookieWindowFrame_XPATH = "//iframe[contains(@id,'sp_message_iframe_596049')]"
    sueddeutsch_cookieAcceptButton_XPATH= "//*[@id=\"notice\"]/div[3]/div/div/button[1]"
    bild_url = "https://www.bild.de/home/newsticker/news/alle-news-54190636.bild.html"
    bild_cookieWindowFrame_XPATH = "//iframe[contains(@id,'sp_message_iframe_585666')]"
    bild_cookieAcceptButton_XPATH = "//*[@id=\"notice\"]/div[3]/div[2]/button"


    logging = create_logfile()
    # Spiegel
    driver = open_WebPage_AcceptCookies(logging, "Spiegel", spiegel_url, spiegel_cookieWindowFrame_XPATH,spiegel_cookieAcceptButton_XPATH)
    get_Spiegel_HTML(logging,driver)

    # Sueddeutsche
    driver = open_WebPage_AcceptCookies(logging,"Sueddeutsche", sueddeutsche_url,sueddeutsche_cookieWindowFrame_XPATH, sueddeutsch_cookieAcceptButton_XPATH)
    get_Sueddeutsche_HTML(logging,driver)

    # Bild
    driver =open_WebPage_AcceptCookies(logging,"Bild" ,bild_url,bild_cookieWindowFrame_XPATH,bild_cookieAcceptButton_XPATH)
    get_Bild_HTML(logging,driver)

    logging.info("###################### Downloading news pages HTML finished!... ######################")
    start_Scraping(logging)


In [13]:
schedule.every().day.at("12:00").do(MAIN)

while True:
    schedule.run_pending()
    time.sleep(60)


NoSuchWindowException: Message: no such window: window was already closed
  (Session info: chrome=98.0.4758.80)
Stacktrace:
0   chromedriver                        0x000000010505c280 chromedriver + 4833920
1   chromedriver                        0x0000000104ff0bf8 chromedriver + 4393976
2   chromedriver                        0x0000000104be6c84 chromedriver + 158852
3   chromedriver                        0x0000000104bd7b28 chromedriver + 97064
4   chromedriver                        0x0000000104bd24fc chromedriver + 75004
5   chromedriver                        0x0000000104c3f918 chromedriver + 522520
6   chromedriver                        0x0000000104c0b7e0 chromedriver + 309216
7   chromedriver                        0x000000010501e828 chromedriver + 4581416
8   chromedriver                        0x0000000105033450 chromedriver + 4666448
9   chromedriver                        0x0000000105037d1c chromedriver + 4685084
10  chromedriver                        0x0000000105033c28 chromedriver + 4668456
11  chromedriver                        0x0000000105014610 chromedriver + 4539920
12  chromedriver                        0x000000010504d82c chromedriver + 4773932
13  chromedriver                        0x000000010504d9a0 chromedriver + 4774304
14  chromedriver                        0x0000000105062e44 chromedriver + 4861508
15  libsystem_pthread.dylib             0x00000001b474d240 _pthread_start + 148
16  libsystem_pthread.dylib             0x00000001b4748024 thread_start + 8


In [17]:
MAIN()

ValueError: time data '15.Februar2022,23.09' does not match format '%d.%B%Y,%H.%M'