In [3]:
# Webscraping with Python
# X Chapter 1 - Introduction
import os
os.chdir("C:/Users/Manue/Documents/Webscraping")

# Erste Website Öffnen
from urllib.request import urlopen
html = urlopen("http://pythonscraping.com/pages/page1.html")
print(html.read())

# Mehr Infos: https://docs.python.org/3/library/urllib.html

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


In [5]:
# X BeautifulSoup verwenden 
# Die Library beautifulsoup4 hilft dabei, hmtl Dateien besser darzustellen und in Python zu verwenden
# Mehr Infos auf:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/

# import sys
# !{sys.executable} -m pip install beautifulsoup4

from bs4 import BeautifulSoup
html = urlopen("http://pythonscraping.com/pages/page1.html")
bs = BeautifulSoup(html.read(), 'html.parser')
print(bs)
print(bs.h1)
print(bs.html.body.h1)

<html>
<head>
<title>A Useful Page</title>
</head>
<body>
<h1>An Interesting Title</h1>
<div>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</div>
</body>
</html>

<h1>An Interesting Title</h1>
<h1>An Interesting Title</h1>


In [7]:
# Exception Handling
# bei urlopen("url") können zwei Fehler auftreten
# a) Die Seite wurde beim Server nicht gefunden
# >> raised einen HTTPError

from urllib.error import HTTPError

try:
    html = urlopen("http://pythonscraping.com/pages/page100.html")

except HTTPError as e:
    print(e)

# b) Der Server wurde nicht erreicht
# >> raised einen URLError

from urllib.error import URLError

try:
    html = urlopen("http://pythonscrapinng.com/pages/page1.html")
except URLError as e:
    print("Server could not be found")
else:
    print("It worked")


HTTP Error 404: Not Found
Server could not be found


In [9]:
# Beispiel eines kleines Scraping-Programms, dass die Fehler abdeckt

def get_title(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        print(e)
        return None
    try:
        bs = BeautifulSoup(html.read(), "html.parser")
        title = bs.body.h1
    except AttributeError as e:
        return None
    else:
        return title

get_title("http://pythonscraping.com/pages/page1.html")


<h1>An Interesting Title</h1>

In [13]:
# Chapter 2: Advanded HTML-Parsing
# Der Advent von CSS unterstützt das Webscraper, da sonst gleichen Tags unterschiedliche Klassen zugeordnet werden, um diese zu unterschiedlich zu stylen
# eg. <span class="red"> und <span class="grün">
get_title("http://pythonscraping.com/pages/warandpeace.html")

bs = BeautifulSoup(urlopen("http://pythonscraping.com/pages/warandpeace.html"), "html.parser")

# Extrahieren alls spans mit class="grün" von der hmtl-Datei
namelist = bs.find_all("span", {"class":"green"})

for name in namelist[:4]:
    print(name.get_text())

taglist = bs.find_all(class_="green")

print("\n")
print(taglist[:4]) # Ergebnis als Liste gespeichert

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna


[<span class="green">Anna
Pavlovna Scherer</span>, <span class="green">Empress Marya
Fedorovna</span>, <span class="green">Prince Vasili Kuragin</span>, <span class="green">Anna Pavlovna</span>]


In [31]:
# Taglisten navigieren

html = urlopen("http://pythonscraping.com/pages/page3.html")
bs = BeautifulSoup(html.read(), "html.parser")

# Children sind direkte Nachfolger von einem tag und können mit .children ausgewählt werden
# Descendents hingegen sind entferntere Nachfolger eines Tags und können mit .descendents ausgewählt werden

for child in bs.find('table',{'id':'giftList'}).children:
    print(child)


# next_siblings gibt alle Tags außer das erste: Überspringt title Tag in einer Tabelle
print("\nXX Sibling Selection")
for sibling in bs.find('table', {'id': 'giftList'}).tr.next_siblings:
    print(sibling)

# genauso gibt es previous_siblings, was die gleichen vorherigen Tags unter dem parent-Tag auswählt
# next_sibling und previous_sibling geben hingegen nur ein Tag aus!


# Test: Geschenke finden
print("\nXX Test Geschenktitel")
for tag in bs.find_all('tr', {'class': 'gift'}):
    print(tag.find('td').get_text())




<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>


In [33]:
# Regular Expressions (regex)
# [A-Za-z0-9\._+]+@[A-Za-z]+\.(com|org|edu|net) # eine Regular Expression für Emails

import re

# Beispiel für die Verwendung von regular expressions
images = bs.find_all('img', {'src':re.compile('../img/gifts/img.*.jpg')})

for image in images: 
    print(image['src']) # Zugriff auf Attribut via [], alternativ können mit .attrs ein Dictionär der Attribute ausgegeben werden mit [key] accessable


# Lambda Funktionen 
# ... könnnen ebenfalls bei der Auswahl helfen

bs.find_all(lambda tag: len(tag.attrs) == 2)
# wählt tags mit genau 2 Attributen vorweisen

../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg


[<img src="../img/gifts/logo.jpg" style="float:left;"/>,
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005.00
 </td><td>
 <img src="../img/gifts/img3.jpg"/>
 </td>

In [35]:
# X Chapter 3: Writing a WebCrawler
# Biem Scrapen liegt das Interesse selten auf einer einzelnen statischen Website
# Häufig werden Seiten nach Links durchsucht, die zu neuen Websites führen, die wiederum gescraped werden usw.

# Beispiel: relevante Links bei Wikipedia finden
from urllib.request import urlopen
from bs4 import BeautifulSoup 

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find('div', {"id": "bodyContent"}).find_all('a'):
    if 'href' in link.attrs:
        print(link.attrs['href'])

# Problem, dass allgemeine Seiten von Wikibedia enthalten sind, wie About, Random Page, Privacy Policy etc.


/wiki/Wikipedia:Protection_policy#semi
/wiki/Kevin_Bacon_(disambiguation)
/wiki/File:Kevin_Bacon_in_2022.jpg
/wiki/Philadelphia
/wiki/Kevin_Bacon_filmography
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
http://baconbros.com
#cite_note-1
#cite_note-actor-2
/wiki/Leading_man
/wiki/Character_actor
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/National_Lampoon%27s_Animal_House
/wiki/Diner_(1982_film)
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Frost/Nixon_(film)
/wiki/Friday_the_13th_(1980_film)
/wiki/Tremors_(1990_film)
/wiki/The_River_Wild
/wiki/The_Woodsman_(2004_film)
/wiki/Crazy,_Stupid,_Love
/wiki/X-Men:_First_Class
/wiki/Patriots_Day_(film)
/wiki/Losing_Chase
/wiki/Loverboy_(2005_film)
/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Miniseries_or_Television_Film
/wiki/Screen_Actors_Guild_Award_for_Outstandin

In [37]:
# >> Eine RegularExpression verschafft Abhilfe

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$')):
    if 'href' in link.attrs:
        print(link.attrs['href'])

/wiki/Kevin_Bacon_(disambiguation)
/wiki/Philadelphia
/wiki/Kevin_Bacon_filmography
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Leading_man
/wiki/Character_actor
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/National_Lampoon%27s_Animal_House
/wiki/Diner_(1982_film)
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Frost/Nixon_(film)
/wiki/Friday_the_13th_(1980_film)
/wiki/Tremors_(1990_film)
/wiki/The_River_Wild
/wiki/The_Woodsman_(2004_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Patriots_Day_(film)
/wiki/Losing_Chase
/wiki/Loverboy_(2005_film)
/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Miniseries_or_Television_Film
/wiki/Screen_Actors_Guild_Award_for_Outstanding_Performance_by_a_Male_Actor_in_a_Miniseries_or_Television_Movie
/wiki/Michael_Strobl
/wiki/HBO
/wiki/Taking_Chance
/wiki/Fox_Broadcasting_Company
/wik

In [39]:
# Damit dieser Prozess sinnvoll eingesetzt werden kann, sollte er in einer Funktion festgehalten werden

import random

def get_links(articleURL: str):
    # article Tag in Link umwandeln
    try:
        html = urlopen("http://en.wikipedia.org{}".format(articleURL))
    
    except HTTPError as e:
        print(e)
        return None

    except URLError as e:
        print(e)
        return None
        
    bs = BeautifulSoup(html, "html.parser")
    
    # Links scrapen
    try:
        article_tags = [link.attrs['href'] for link in bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))]

    except AttributeError as e:
        print("Attribute Error detected: Change in tag structure?")
        return None
        
    return article_tags

def link_crawler(article_tags: list, num :int):
    while len(article_tags) < num:
        scrape_list = article_tags.copy()
        tag = random.sample(scrape_list, 1)[0]
        print(tag)

        scrape_list.remove(tag)
        
        scraped_tags = get_links(tag)
        
        if scraped_tags != None:
            scrape_list += scraped_tags
            article_tags += scraped_tags

    return article_tags
         
link_list = link_crawler(['/wiki/Kevin_Bacon'], 500)


/wiki/Kevin_Bacon
/wiki/Geoffrey_Rush


In [41]:
pages = set()

def getLinks(pageUrl):
    global pages
    if len(pages) <10:
        html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
        bs = BeautifulSoup(html, 'html.parser')
        try:
            print(bs.h1.get_text())
            print(bs.find(id ='mw-content-text').find_all('p')[0].get_text())
            
        except AttributeError:
            print('This page is missing something! Continuing.')
    
        for link in bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$')):
            if 'href' in link.attrs:
                if link.attrs['href'] not in pages:
                    #We have encountered a new page
                    newPage = link.attrs['href']
                    print('-'*20)
                    print(newPage)
                    pages.add(newPage)
                    getLinks(newPage)

getLinks('/wiki/Kevin_Bacon')

Kevin Bacon


--------------------
/wiki/Kevin_Bacon_(disambiguation)
Kevin Bacon (disambiguation)
Kevin Bacon (born 1958) is an American film and theater actor and musician.

--------------------
/wiki/Kevin_Bacon
Kevin Bacon


--------------------
/wiki/Philadelphia
Philadelphia


--------------------
/wiki/Geographic_coordinate_system
Geographic coordinate system


--------------------
/wiki/Spatial_reference_system
Spatial reference system
A spatial reference system (SRS) or coordinate reference system (CRS) is a framework used to precisely measure locations on the surface of Earth as coordinates. It is thus the application of the abstract mathematics of coordinate systems and analytic geometry to geographic space. A particular SRS specification (for example, "Universal Transverse Mercator WGS 84 Zone 16N") comprises a choice of Earth ellipsoid, horizontal datum, map projection (except in the geographic coordinate system), origin point, and unit of measure. Thousands of coordinate 

In [83]:
# Externe Websiten scrapen
from urllib.parse import urlparse

def getInternalLinks(bs, includeURL):
    # scheme ergänzt http oder https und netloc enthält die Basis URL
    includeURL = "{}://{}".format(urlparse(includeURL).scheme, 
                                  urlparse(includeURL).netloc)
    internalLinks = []
    # Alle internen Links finden
    for link in bs.find_all('a',
                           href = re.compile('^(/|.*'+includeURL+')')): #url soll im Link tnahlten sein
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                # überprüft, ob es sich um einen verkürzten Internen Link oder einen vollen link handelt
                if (link.attrs['href'].startswith('/')):
                    internalLinks.append(includeURL + link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme,
        urlparse(includeUrl).netloc)
    internalLinks = []
    #Finds all links that begin with a "/"
    for link in bs.find_all('a'):
        #href=re.compile('^(/|.*'+includeUrl+')')
        print(link)
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith('/')):
                    internalLinks.append(
                        includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks


def getExternalLinks(bs, excludeURL):
    externalLinks = []
    # finde alle Links, bei denen die aktuelle URL nicht enthalten ist
    for link in bs.find_all('a', href = re.compile('^(http|www)((?!'+ excludeURL+').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bs = BeautifulSoup(html, "html.parser")
    # Versuch External Links zu extrahieren
    externalLinks = getExternalLinks(bs,
                                    urlparse(startingPage).netloc)
    # Wenn keine externen Links auf der ersten Seite gefunden wurden, wird die Seite nach weiteren internen Links durchsucht
    if len(externalLinks) == 0:
        print("No external links have been found, Looking around the site for one")
        domain = '{}://{}'.format(urlparse(startingPage).scheme, urlparse(startingPage).netloc)
        internalLinks = getInternalLinks(bs, domain)
        print(internalLinks)
        # recursion bis external Links ausgegeben werden können
        return getRandomExternalLink(internalLinks[random.randint(0,
                                    len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]


# Mein Funktion, die externe Links verfolgt
def followExternalOnly(startingSite):
    try:
        externalLink = getRandomExternalLink(startingSite)
    except HTTPError as e:
        print('HTTP ERROR') 
        externalLink = getRandomExternalLink(startingSite)
    print('Random external link is: {}'.format(externalLink))
    followExternalOnly(externalLink) # recursion

followExternalOnly('http://oreilly.com/')


includeURL = "{}://{}".format(urlparse('http://en.wikipedia.org').scheme, urlparse('http://en.wikipedia.org').netloc)
print(includeURL)

Random external link is: https://www.youtube.com/user/OreillyMedia
Random external link is: https://support.google.com/youtube/contact/de_cancellation
Random external link is: https://policies.google.com/terms?hl=en
Random external link is: https://www.google.com/permissions?hl=en
Random external link is: https://www.blog.google/
Random external link is: https://blog.google/products/pixel/feature-drop-october-2024/
Random external link is: https://policies.google.com/privacy
Random external link is: https://myaccount.google.com/personal-info?utm_source=pp
Random external link is: https://accounts.google.com/TOS?loc=DE&hl=en-US
Random external link is: https://transparency.google/?hl=en_US
Random external link is: https://www.youtube.com/googleprivacy
Random external link is: https://support.google.com/youtube/contact/de_cancellation
Random external link is: https://myaccount.google.com/privacypolicy?hl=en
Random external link is: https://support.google.com/websearch?p=privpol_locserp&h

ValueError: empty range in randrange(0, 0)

In [81]:
# Alle Externen Links einer Seite suchen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
allExtLinks = set()
allIntLinks = set()

def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = '{}://{}'.format(urlparse(siteUrl).scheme,
        urlparse(siteUrl).netloc)
    bs = BeautifulSoup(html, 'html.parser')
    
    internalLinks = getInternalLinks(bs, domain)
    externalLinks = getExternalLinks(bs, domain)
    
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
            
    # for link in internalLinks:
    #    if link not in allIntLinks:
    #        allIntLinks.add(link)
    #        getAllExternalLinks(link)
            
allIntLinks.add('http://oreilly.com')
getAllExternalLinks('http://oreilly.com')

<a href="#maincontent"><span class="skipToMain-text">Skip to main content</span></a>
<a aria-current="page" class="logo" href="https://www.oreilly.com" title="home page"><img alt="O'Reilly home" onerror="this.src='https://cdn.oreillystatic.com/images/sitewide-headers/oreilly_logo_mark_red_@2x.png'; this.onerror=null;" src="https://cdn.oreillystatic.com/images/sitewide-headers/oreilly_logo_mark_red.svg"/></a>
<a href="https://www.oreilly.com/member/login/" id="nav-signIn">Sign In</a>
<a class="menuList-cta" href="https://www.oreilly.com/online-learning/try-now.html" id="nav-tryNow">Try Now</a>
<a class="menuList-cta" href="https://www.oreilly.com/member/login/" id="nav-platform">O’Reilly Platform</a>
<a href="https://www.oreilly.com/online-learning/teams.html">Teams</a>
<a href="https://www.oreilly.com/online-learning/teams.html">For business</a>
<a href="https://www.oreilly.com/online-learning/government.html">For government</a>
<a href="https://www.oreilly.com/online-learning/academic

In [147]:
## Chapt 4: Scraping Modelle
# Seiten mit unterschiedlichem Layout scrapen
# >> unterschiedliche Scraper für eine handvoll ausgewählter Seiten bauen

import requests


class Content:
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

def getPage(url):
    req = requests.get(url)
    return BeautifulSoup(req.text, 'html.parser')

def scrapeTagesschau(url):
    bs = getPage(url)
    title = bs.find("h1").text
    lines = bs.find_all("p", {"class", "textabsatz m-ten m-offset-one l-eight l-offset-two columns twelve"})
    body = '\n'.join([line.text for line in lines])
    return Content(url, title, body)

def scrapeBrookings(url):
    bs = getPage(url)
    title = bs.find("h1").text
    lines = bs.find_all("div",{"class","byo-block -narrow wysiwyg-block wysiwyg"})
    body = []
    for line in lines:
        for child in line.children:
            body.append(child.text)
    body = "\n". join(body)
    return Content(url, title, body)

# Example Brookings
url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'
content = scrapeBrookings(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

# Example NYTimes
url = "https://www.tagesschau.de/inland/innenpolitik/merz-union-aussenpolitik-gruene-100.html"
content2 = scrapeTagesschau(url)
print('Title: {}'.format(content2.title))
print('URL: {}\n'.format(content2.url))
print(content2.body)

Title: Delivering inclusive urban access: 3 uncomfortable truths
URL: https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/



The past few decades have been filled with a deep optimism about the role of cities and suburbs across the world. These engines of economic growth host a majority of world population, are major drivers of economic innovation, and have created pathways to opportunities for untold amounts of people.




But all is not well within our so-called Urban Century. Rapid urbanization, rising gentrification, concentrated poverty, and shortages of basic infrastructure have combined to create spatial inequity in cities and suburbs across the globe. The challenges of housing, moving, and employing so many people have led to longer travel times, rising housing costs, and unsustainable public spending. Moreover, policymakers are questioning traditional policies and approaches.


The past couple years, we’ve led 

In [209]:
# Alternative, um einen Crawler zu bauen
# Siehe, dass für die Datenextraktion vor allem die tags benötigt werden, hinter denen sich ein TItel bzw. der Body-Content befindet
# >> eine neue Website-Klasse, die diese Informationen speichert kann verwendet werden, um ein Crawler-Objekt zu bauen, dass auf Basis der CSS Infos I
#    Inhalte extrahiert

class Content:
    """
    Common base class for all articles/pages
    """
    
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body
        
    def print(self):
        """
        Flexible printing function controls output
        """
        print("URL: {}".format(self.url))
        print("TITLE: {}".format(self.title))
        print("BODY:\n{}".format(self.body))
        
class Website:
    """ 
    Contains information about website structure
    """
    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag

# >> Die Website-Klasse nethält nur Informationen darüber, wie auf die gewünschten zugegriffen werden kann
# >> Die Infos selbst werden in der Content Klasse gespeichert


class Crawler:
    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, "html.parser")

    def safeGet(self, pageObj, selector):
        """
        Utility Function um einen String von einem BS-Objekt und einem Selektor zu bekommen
        """
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return "\n".join([elem.get_text() for elem in selectedElems])
        return ''

    def parse(self, site, url):
        """
        Extract content from a given website
        """
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

crawler = Crawler()
siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'h1', 'div.content span div p'],
    ['Brookings', 'http://www.brookings.edu', 'h1.w-full', 'div.byo-block p'],
    ['Tagesschau', 'https://www.tagesschau.de', 'h1', 'p.textabsatz']
]
websites = []

# Liste von Websiten erstellen
for row in siteData:
    websites.append(Website(row[0], row[1], row[2], row[3]))

crawler.parse(websites[0], 'https://www.oreilly.com/library/view/learning-python-5th/9781449355722/')
crawler.parse(websites[1], 'https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')
crawler.parse(websites[2], "https://www.tagesschau.de/inland/innenpolitik/merz-union-aussenpolitik-gruene-100.html")

# >> Diese Vorgehensweise büßt etwas Flexibilität ein, da die Seiten klar strukturiert und jedes Element durch CSS-Tags gefunden werden können muss
# >> Gut ist jedoch, dass damit die Datenextraktion vfür viele Seiten von einer Person mit Frontenderfahrung vorbereitet und schließlich durchgeführt werden kann


URL: https://www.oreilly.com/library/view/learning-python-5th/9781449355722/
TITLE: Learning Python, 5th Edition
BODY:
Get a comprehensive, in-depth introduction to the core Python language with this hands-on book. Based on author Mark Lutz’s popular training course, this updated fifth edition will help you quickly write efficient, high-quality code with Python. It’s an ideal way to begin, whether you’re new to programming or a professional developer versed in other languages.
Complete with quizzes, exercises, and helpful illustrations,  this easy-to-follow, self-paced tutorial gets you started with both Python 2.7 and 3.3— the latest releases in the 3.X  and 2.X lines—plus all other releases in common use today. You’ll also learn some advanced language features that recently have become more common in Python code.
URL: https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/
TITLE: Idea to Retire: Old methods of policy education
BODY:
Public p

In [119]:
# X Crawler strukturieren
# ... erweitern der vorherigen Ansätze, um einen Crawler zu erstellen, der neue Seiten entdeckt und automatisch die gewünschten Inhalte extrahiert
# ... dazu muss ein Crawler zum Beispiel auch mit fehlenden/leeren Feldern umgehen können

import requests
# Drei Ansätze
# 1) Seiten durch die Suchleiste der Website crawlen
# ... die meisten Websites haben die folgende Struktur für die Suche: http://example.com?search=myTopic 
#     >> fürs crawlen kann der erste Teil als Basis dienen und unterschiedliche Keyworte angehangen werden
# ... Ergebnisse werden oft als eine Liste von <span> Objekten dargestellt
# ... die result Links werden entweder als relative URL oder absolute URL ausgegeben
# ... Wenn die URLs vereinheitlicht wurden, können die Seiten mit der vorherigen Vorgehensweise gescraped werden

# Klassische Content-Klasse, um Informationen zu speichern
class Content:
    """
    Common base class for all articles/pages
    """
    
    def __init__(self, topic, url, title, body):
        self.topic = topic
        self.title = title
        self.body = body
        self.url = url
        
    def print(self):
        """
        Flexible printing function controls output
        """
        print("New article found for topic: {}".format(self.topic))
        print("TITLE: {}".format(self.title))
        print("BODY:\n{}".format(self.body))
        print("URL: {}".format(self.url))    


# WEbsite entält einige weitere Informationen
class Website:
    """
    Contains information about website structure
    """
    def __init__(self, name, url, searchUrl, resultListing,
        resultUrl, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.searchUrl = searchUrl # Seite, auf der mit einem Zusatz eine Suche gestartet werden kann
        self.resultListing = resultListing # definiert "Box" in der Ergebnsise gespeichert werden
        self.resultUrl = resultUrl # Definiert Tag, mit dem Links extrahiert werden können
        self.absoluteUrl=absoluteUrl # Bool, das erfasst, ob Ergebnisse absolut oder relativ sind
        self.titleTag = titleTag
        self.bodyTag = bodyTag

from selenium import webdriver
# Die Crawler Klasse erhält nun eine search Methode, mit der Seiten zu einem gewissen Thema gescraped werden können
class Crawler:
    
    def getPage(self, url):
        try:
            browser = webdriver.Firefox()
            browser.get(url)
            req = browser.page_source
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req, 'lxml')
    
    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            return ''.join([child.get_text() for child in childObj])
        return ""
    
    def search(self, topic, site):
        """
        Searches a given website for a given topic and records all pages found
        """
        bs = self.getPage(site.searchUrl + topic +"&rows=100")
        print(bs)
        print(site.searchUrl + topic+"&rows=100")
        searchResults = bs.select(site.resultListing)
        # print(searchResults)
        for result in searchResults:
            url = result.select(site.resultUrl)[0].attrs["href"]
            # Check to see whether it's a relative or an absolute URL
            if(site.absoluteUrl):
                bs = self.getPage(url)
            else:
                bs = self.getPage(site.url + url)
            if bs is None:
                print("Something was wrong with that page or URL. Skipping!")
                return
            title = self.safeGet(bs, site.titleTag)
            print(title)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(topic, title, body, url)

crawler = Crawler()
siteData = [
      ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/?s=', 'li .ais-InfiniteHits-item', 'a.overlay-link', 
     True, 'h1.w-full', 'div.byo-block p'],
    ['O\'Reilly Media', 'http://oreilly.com', 'https://www.oreilly.com/search/?q=', 'article', 'h3 a.MuiTypography-root', 
     True, 'h1', 'div.content span div p'],
    ['Tagesschau', 'https://www.tagesschau.de', 'https://www.tagesschau.de/suche#/article/1/?searchText=', 'div.teaser-right', 'a.teaser-right__link',
     False, 'h1', 'p.textabsatz']
]

websites = []
for row in siteData:
    websites.append(Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))

topics = ['python', 'digital']

print(websites[0].resultListing)
for topic in topics:
    print("GETTING INFO ABOUT: " + topic)
    for targetSite in websites:
        print(f"tartget: {targetSite.url}, topic: {topic}")
        crawler.search(topic, targetSite)
        

li .ais-InfiniteHits-item
GETTING INFO ABOUT: python
tartget: http://www.brookings.edu, topic: python
<html dir="ltr" lang="en-US"><head><title>Nur einen Moment…</title><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><meta content="IE=Edge" http-equiv="X-UA-Compatible"/><meta content="noindex,nofollow" name="robots"/><meta content="width=device-width,initial-scale=1" name="viewport"/><style>*{box-sizing:border-box;margin:0;padding:0}html{line-height:1.15;-webkit-text-size-adjust:100%;color:#313131;font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica Neue,Arial,Noto Sans,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol,Noto Color Emoji}body{display:flex;flex-direction:column;height:100vh;min-height:100vh}.main-content{margin:8rem auto;max-width:60rem;padding-left:1.5rem}@media (width <= 720px){.main-content{margin-top:4rem}}.h2{font-size:1.5rem;font-weight:500;line-height:2.25rem}@media (width <= 720px){.h2{font-size:1.25rem;lin

IndexError: list index out of range

In [None]:
# 2) Durch Links crawlen

In [211]:
url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclus"ive-urban-access-3-uncomfortable-truths/'
# bs = getPage(url)
# title = bs.find("h1").text
# blocks = bs.find_all("div",{"class","byo-block -narrow wysiwyg-block wysiwyg"})

# for line in blocks:
#    for child in line.children:
#        print(child.text)

url = "https://www.tagesschau.de/inland/innenpolitik/merz-union-aussenpolitik-gruene-100.html"
bs = getPage(url)
title = bs.find("h1").text
lines = bs.find_all("p", {"class", "textabsatz m-ten m-offset-one l-eight l-offset-two columns twelve"})
body = "\n".join (line.text for line in lines)

print(body)


Die Union teilt nach Einschätzung ihres Kanzlerkandidaten Friedrich Merz außenpolitisch mehr Positionen mit den Grünen als mit der SPD. Auf die Frage, mit wem er nach einer erfolgreichen Bundestagswahl besser zusammenarbeiten könnte, sagte der CDU-Politiker der Bild, "in der Außen- und Sicherheitspolitik gibt es sicher mit den Grünen mehr Gemeinsamkeiten als mit der SPD".

Mit Blick auf die Wirtschaftspolitik der Grünen sei die Union aber ganz anderer Meinung, "da brauchen wir einen grundlegenden Kurswechsel", so Merz. Er betonte, zunächst hätten die Wähler das Wort. Die Union führe auch keinen Koalitionswahlkampf. "Wir kämpfen um jede Stimme. Und nach der Bundestagswahl sprechen wir dann mit den demokratischen Parteien der politischen Mitte, wie wir diesen Kurswechsel hinbekommen", sagte er. 

Am Wochenende hatte schon Grünen-Chefin Franziska Brantner im Konflikt mit Russland eine größere Nähe zu Merz als zu Bundeskanzler Olaf Scholz (SPD) signalisiert. Auf die Frage "Was können Sie 

In [255]:
req = requests.get("https://www.brookings.edu/?s=digital")
bs = BeautifulSoup(req.text, "html.parser")

elements = bs.select("div#algolia-hits")# letzter Versuch...
for lin in elements:
    print(lin)

<a class="overlay-link absolute w-full h-full top-0 left-0 z-10" href="https://www.brookings.edu/articles/who-has-to-leave-the-federal-reserve-next-2/">
<span class="sr-only"> Who has to leave the Federal Reserve next? </span>
</a>
<a class="overlay-link absolute w-full h-full top-0 left-0 z-10" href="https://www.brookings.edu/articles/the-four-working-class-votes/">
<span class="sr-only"> The four working-class votes </span>
</a>
<a class="overlay-link absolute w-full h-full top-0 left-0 z-10" href="https://www.brookings.edu/articles/hutchins-center-fiscal-impact-measure/">
<span class="sr-only"> Hutchins Center Fiscal Impact Measure </span>
</a>
<a class="overlay-link absolute w-full h-full top-0 left-0 z-10" href="https://www.brookings.edu/articles/pepfar-delivers-outsized-returns-it-deserves-more-funding/">
<span class="sr-only"> PEPFAR delivers outsized returns—it deserves more funding </span>
</a>
<a class="overlay-link absolute w-full h-full top-0 left-0 z-10" href="https://www.

In [115]:
import sys
!{sys.executable} -m pip install selenium





Collecting selenium
  Downloading selenium-4.27.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-24.3.0-py3-none-any.whl.metadata (11 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.27.1-py3-none-any.whl (9.7 MB)
   ---------------------------------------- 0.0/9.7 MB ? eta -:--:--
   ---------------------------------------- 0.1/9.7 MB 2.2 MB/s eta 0:00:05
   - -------------------------------------- 0.2/9.7 MB 3.7 MB/s eta 0:00:03
   --- ------------------------------------ 0.8/9.7 MB 7.2 MB/s 

In [3]:
conda install -c conda-forge scrapy

error: incomplete escape \U at position 28

In [11]:
import os
os.getcwd()

'C:\\Users\\Manue'