# Text analytics

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException

from selenium.common.exceptions import StaleElementReferenceException

from selenium.webdriver.common.keys import Keys  
from selenium.webdriver.common.action_chains import ActionChains  

import pyautogui  
import pandas as pd
import time
import random

### Datensammlung: Artikel

In [None]:
# Maps the numeric onvista id of each company (the value sent as `entityValue`
# in the news-finder URL) to a human-readable company name. The name is also
# used in progress messages and in the output file names.
#
# The entries are grouped by the one-letter tag (N / L / J / M) each line
# carried in the original list. NOTE(review): the meaning of these tags is not
# documented in this file -- presumably they mark how the companies were split
# into batches; confirm before relying on them.
dax_companies = {
    # --- tag "N" ---
    36714349: "Adidas",
    98641: "Airbus Group (EADS)",
    83219: "Allianz",
    34694526: "BASF",
    25272187: "Bayer",
    81500: "Beiersdorf",
    81490: "BMW",
    28975512: "Brenntag",
    83139: "Commerzbank",
    81967: "Continental",
    # --- tag "L" ---
    12587335: "Covestro",
    208894340: "Daimler Truck",
    81348: "Deutsche Bank",
    3459922: "Deutsche Börse",
    181029: "Deutsche Telekom",
    82088: "DHL Group (ex Deutsche Post)",
    21074892: "E.ON",
    82235: "Fresenius",
    182351: "Hannover Rück",
    82340: "Heidelberg Materials",
    # --- tag "J" ---
    82344: "Henkel",
    82561: "Infineon",
    82840: "Mercedes-Benz Group (Daimler)",
    82676: "Merck",
    12254991: "MTU Aero Engines",
    83258: "Münchener Rück",
    231291652: "Porsche AG (Vz.)",
    21178031: "Porsche Automobil Holding SE",
    90268: "Qiagen NV",
    82811: "Rheinmetall",
    # --- tag "M" ---
    82818: "RWE",
    82849: "SAP",
    82852: "Sartorius (Vz.)",
    82902: "Siemens",
    180455076: "Siemens Energy",
    134397957: "Siemens Healthineers",
    15630917: "Symrise",
    83057: "Volkswagen (Vz.)",
    62903083: "Vonovia",
    81388537: "Zalando",
}


# News categories offered by the news finder. The integer key (1..15) is the
# value sent as `idSubTypeGroups` in the request URL; the label is used for
# progress output and stored with every scraped article.
# The ids are consecutive, so the mapping is built by numbering the labels
# starting at 1 -- the order of this tuple therefore matters.
news_subtypes = dict(enumerate(
    (
        "Aktieneinstufung",
        "Analyse",
        "Directors Dealings",
        "Empfehlung",
        "Erklärstück",
        "Fundamentalanalyse",
        "Interview",
        "Kommentar",
        "Marktbericht",
        "News",
        "Pflichtmitteilung",
        "Pressemitteilung",
        "Ratgeber",
        "Technische Analyse",
        "Themen-Spezial",
    ),
    start=1,
))

# Strings that appear as <p> elements in the page markup but are not part of
# the article body; paragraphs equal to one of these are dropped from the
# collected text.
filtered_words = [
    "Top Aktien",
    "Top Märkte",
    "Ratgeber",
    "onvista media GmbH",
    "Rechtliche Hinweise",
    "App",
    "Social Media",
    "Veröffentlichung der Original-Studie",
    "Erstmalige Weitergabe der Original-Studie",
]

# When True, title, date, source, URL and content of every article are printed
# to the terminal; when False only the general progress is shown.
detailedFeedback = True

# When True, the browser is zoomed out (to 80%) via a keyboard shortcut.
zoom_out = True

# True on a Windows machine, False on a Mac; selects which modifier key is
# used for keyboard shortcuts.
windows = True

# ---------------------------------------------------------------------------
# Scraper: for every company and every news subtype, walk up to 10 overview
# pages of the onvista news finder, open each article teaser in a new tab,
# collect title / date / source / URL / text, and write one Excel and one CSV
# file per company into the 'data' directory.
#
# Reads the module-level settings `dax_companies`, `news_subtypes`,
# `filtered_words`, `detailedFeedback`, `zoom_out` and `windows`.
# ---------------------------------------------------------------------------


def _ensure_output_dir(path):
    """Create the output directory *path* if it does not exist yet."""
    import os  # local import: the file's import cell does not import os

    os.makedirs(path, exist_ok=True)


def _zoom_out_browser(use_ctrl):
    """Send the "zoom out" keyboard shortcut twice to the focused window.

    use_ctrl: True to hold 'ctrl' (Windows), False to hold 'command' (Mac).
    """
    modifier = 'ctrl' if use_ctrl else 'command'
    pyautogui.keyDown(modifier)
    try:
        pyautogui.press('-')
        pyautogui.press('-')
    finally:
        # Never leave the modifier key logically pressed.
        pyautogui.keyUp(modifier)
    time.sleep(1)


def _dismiss_cookie_banner(driver):
    """Click the consent button of the cookie pop-up if the pop-up is present.

    The button lives inside an iframe, so the driver focus is moved into the
    iframe for the click and is always moved back to the page afterwards.
    """
    try:
        # id of the div above the iframe containing the button on the cookie pop-up
        iframe = driver.find_element(By.CSS_SELECTOR, "#sp_message_container_800509 > iframe")
    except NoSuchElementException:
        return  # no banner on this page

    driver.switch_to.frame(iframe)
    try:
        driver.find_element(By.XPATH, "//button[@class='message-component message-button no-children focusable sp_choice_type_13']").click()
    except NoSuchElementException:
        pass  # banner without the expected button: nothing to click
    finally:
        # Without this, a missing button would leave the driver inside the
        # iframe and every later lookup on the page would fail.
        driver.switch_to.default_content()


def _is_ad_or_video(article):
    """Return True if the teaser is labelled 'Werbung' or 'Videoanalyse'."""
    # find_elements returns an empty list instead of raising, so a teaser that
    # carries only one of the two labels is detected as well.
    return bool(article.find_elements(By.XPATH, ".//div[text()='Werbung' or text()='Videoanalyse']"))


def _text_or_default(driver, xpath, default):
    """Return the stripped text of the first element matching *xpath*,
    or *default* if no such element exists."""
    try:
        return driver.find_element(By.XPATH, xpath).text.strip()
    except NoSuchElementException:
        return default


def _extract_text(driver, source, subtype, filtered_words):
    """Return the body text of the article shown in the current tab.

    Depending on the article structure, different elements hold the content:
      * dpa "Directors Dealings" (3) / "Pressemitteilung" (12): a <pre> tag,
        used only if it holds more than 100 characters;
      * sources containing "EQS": a <table>;
      * everything else (and dpa articles whose <pre> is too short): the <p>
        elements, joined with line breaks, without `filtered_words` entries.

    Returns "NO_TEXT" when no paragraph element is found at all.
    Raises NoSuchElementException if the expected <pre>/<table> is missing and
    StaleElementReferenceException if an element vanishes while being read;
    both are handled by the caller.
    """
    # Parenthesised on purpose: `a and b or c` would send *every* subtype-12
    # article down the <pre> path, regardless of its source.
    if "dpa" in source and subtype in (3, 12):
        pre = driver.find_element(By.TAG_NAME, "pre").text  # rarely occurring pre-tag
        if len(pre) > 100:
            return pre
        # <pre> too short to be the article: fall through to the paragraphs.
    elif "EQS" in source:  # Source that publishes public info in the form of tables
        return driver.find_element(By.TAG_NAME, "table").text

    # The content of the remaining articles is best accessed via the paragraphs.
    paragraphs = driver.find_elements(By.TAG_NAME, 'p')
    time.sleep(1)

    # find_elements yields an empty list (never None) when nothing matches, so
    # emptiness is what triggers the alternative selectors.
    if not paragraphs:
        paragraphs = driver.find_elements(By.CSS_SELECTOR, 'p.paragraph')
    if not paragraphs:
        paragraphs = driver.find_elements(By.XPATH, '//p[@class="paragraph Styles_ov-content-item-list__item__wK2EM"]')
    if not paragraphs:
        return "NO_TEXT"

    text_list = []  # one entry per kept paragraph
    for paragraph in paragraphs:
        paragraph.location_once_scrolled_into_view  # scrolls to the paragraph
        tx = paragraph.text
        time.sleep(0.5)
        # Navigation/footer words are declared as paragraphs in the HTML but
        # are not article content.
        if tx not in filtered_words:
            text_list.append(tx)

    return '\n'.join(text_list)


def _scrape_open_article(driver, company, subtype, subtype_name, filtered_words):
    """Collect the data of the article displayed in the currently focused tab.

    Returns a dict with the keys "Unternehmen", "Newstyp", "Titel", "Datum",
    "Quelle", "URL" and "Text", or None if the article has to be skipped
    (page did not load within 60 seconds, or a stale element was hit).
    """
    # Waiting time to ensure that page is fully loaded and to simulate natural user behaviour
    time.sleep(random.uniform(4.1, 6.4))

    title_xpath = '//h1[@class="headline headline--h1 headline--full-width outer-spacing--xxsmall-bottom"]'
    try:
        WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.XPATH, title_xpath)))
        title = driver.find_element(By.XPATH, title_xpath).text.strip()
    except NoSuchElementException:
        title = "NO_TITLE"
    except TimeoutException:
        print("[*] Seite konnte innerhalb von 60 Sekunden nicht geladen werden! Fahre fort mit nächsten Artikel.")
        return None

    date = _text_or_default(driver, '//time[@class="color--cd-anthracite"]', "NO_DATE")
    source = _text_or_default(driver, '//a[@class="link link--unstyled link--underline"]', "NO_SOURCE")

    time.sleep(1)
    try:
        text = _extract_text(driver, source, subtype, filtered_words)
    except NoSuchElementException:
        text = "NO_TEXT"
    except StaleElementReferenceException:
        print("[*] Stale Element Reference! Fahre fort mit nächsten Artikel.")
        return None

    return {
        "Unternehmen": company,
        "Newstyp": subtype_name,
        "Titel": title,
        "Datum": date,
        "Quelle": source,
        "URL": driver.current_url,
        "Text": text,
    }


# Fail early if the output directory cannot be created, before any scraping
# time has been spent.
_ensure_output_dir('data')

driver = webdriver.Firefox()
driver.set_window_size(1920, 1080)

try:
    # Iteration over all companies from the dictionary 'dax_companies'.
    for key, company in dax_companies.items():

        article_data = []

        # Iteration per company across all news types
        for subtype, subtype_name in news_subtypes.items():

            # Iteration per company and subtype over max. 10 page numbers
            for page in range(10):

                # Filtered pages can be called up via the company-specific key
                driver.get(f'https://www.onvista.de/news/finder?page={page}&entityType=STOCK&entityValue={key}&idSubTypeGroups={subtype}')

                time.sleep(random.uniform(4, 5))

                # Zoom out once, and only if requested. The modifier key is
                # chosen by platform.
                if zoom_out:
                    _zoom_out_browser(use_ctrl=windows)
                    zoom_out = False

                _dismiss_cookie_banner(driver)

                time.sleep(1)

                # Article overview
                articles = driver.find_elements(By.XPATH, '//div[@class="ArticleTeaser_ov-article-teaser__content__yC6QF flex-layout__grow--1"]')
                counter = 0  # Counter to give feedback on the progress of the scraping.

                if not articles:  # The current overview page is empty.
                    print("#"*100)
                    print(f"[o] Auf Seite {page+1} wurden keine Artikel gefunden.")
                    print("#"*100)
                    break  # Interrupts the iteration over the page numbers

                print("#"*100)
                print(f"[+] Es wurden {len(articles)} Artikel für {company} zum Newstyp '{subtype_name}' auf Seite {page+1} gefunden.")
                print("#"*100)

                # Remember the overview tab so the focus can always return to it.
                overview_handle = driver.current_window_handle

                # Iteration over all articles found
                for article in articles:

                    article.location_once_scrolled_into_view  # Automatic scrolling to each article
                    time.sleep(random.uniform(1, 2.5))

                    if _is_ad_or_video(article):
                        print("-"*100)
                        print(f"[*] Werbung gefunden")
                        print("-"*100)
                        counter += 1  # Since article is skipped, counter for next article must be adjusted
                        continue

                    article_link = article.find_element(By.XPATH, './/strong[@class="ov-display--block ov-word-wrap"]')
                    open_in_tab_key = Keys.CONTROL if windows else Keys.COMMAND
                    # Modifier + click opens the article in a new tab
                    ActionChains(driver).key_down(open_in_tab_key).click(article_link).key_up(open_in_tab_key).perform()

                    # The new tab may not be registered immediately after the
                    # click; wait for it instead of indexing blindly.
                    try:
                        WebDriverWait(driver, 10).until(lambda d: len(d.window_handles) > 1)
                    except TimeoutException:
                        print("[*] Artikel konnte nicht in neuem Tab geöffnet werden! Fahre fort mit nächsten Artikel.")
                        counter += 1
                        continue

                    article_handle = next(h for h in driver.window_handles if h != overview_handle)
                    driver.switch_to.window(article_handle)  # Focus of the webdriver is directed to new tab

                    try:
                        data = _scrape_open_article(driver, company, subtype, subtype_name, filtered_words)
                    finally:
                        # Always close the article tab and return to the
                        # overview, whether or not the article was usable.
                        driver.close()
                        driver.switch_to.window(overview_handle)

                    counter += 1

                    if data is None:  # article skipped (timeout / stale element)
                        continue

                    article_data.append(data)

                    print("-"*100)
                    print(f"[*] Artikel {counter} / {len(articles)} auf Seite {page+1} überprüft.")
                    print("-"*100)

                    if detailedFeedback:
                        print(f"Unternehmen: {company}")
                        print(f"Newstyp: {subtype_name}")
                        print(f"Titel: {data['Titel']}")
                        print(f"Datum: {data['Datum']}")
                        print(f"Quelle: {data['Quelle']}")
                        print(f"URL: {data['URL']}")
                        print(f"Inhalt:\n{data['Text']}")
                        print("-"*100)

        # One result file pair per company
        df = pd.DataFrame(article_data)

        df.to_excel(f'data/Onvista_Articles-{company}.xlsx', index=False)
        df.to_csv(f'data/Onvista_Articles-{company}.csv', index=False)

finally:
    # quit() ends the whole browser session (all windows and the driver
    # process), also when the run is aborted by an error.
    driver.quit()