# Scrape works from hausarbeiten.de


In [5]:
from bs4 import BeautifulSoup
import requests
import time
from tqdm.notebook import tqdm
import re
from pprint import pprint
import pickle
import csv
from Homework import Homework
import sys

In [4]:
# Raise the interpreter's recursion limit far above the default.
# NOTE(review): presumably required so that pickle can serialize deeply nested objects - confirm.
sys.setrecursionlimit(100000)

In [3]:
#scraping from 03.05.2022

In [11]:
def save_obj(works, name):
    """Pickle ``works`` into the file at path ``name``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(name, mode="wb") as sink:
        pickle.dump(works, sink)

In [3]:
def load_obj(name):
    """Return the object that was pickled into the file at path ``name``.

    Only use this on files you created yourself: unpickling data from an
    untrusted source can execute arbitrary code.
    """
    with open(name, mode="rb") as source:
        return pickle.load(source)

In [7]:
def _nested_text(node, default, *steps):
    """Follow a chain of ``find`` calls starting at ``node`` and return the text of the last tag.

    Each step is a ``(tag_name, css_class)`` pair; ``css_class`` may be ``None``
    to match on the tag name only. ``default`` is returned as soon as one
    lookup of the chain finds nothing (or when ``node`` itself is ``None``).
    """
    for tag_name, css_class in steps:
        if node is None:
            return default
        if css_class is None:
            node = node.find(tag_name)
        else:
            node = node.find(tag_name, class_=css_class)
    return default if node is None else node.text


def listscraping(soup, option): #option 1 equals hausarbeiten.de and diplomarbeiten24.de option 2 equals grin.de
    """Collect the works listed on one result page of the database.

    As an example, a page listing the works looks like this:
    https://www.hausarbeiten.de/search?product=ebook&source_type=document&field=title%2Csubtitle%2Cdata&language_id=1&price_range_id=1000&sort=weight-desc%2Cdate-desc&display=100&page=0

    params:
        soup: the parsed result page (BeautifulSoup object)
        option: selects where the title is read from:
            1 -> the link inside <div class="heading1">
            2 -> the <h2> element
            any other value leaves the title at "none"

    returns:
        a list with one Homework(link, title, author, subject, category, price)
        per listed document. Values that cannot be found on the page are set
        to "none" (author: "anonymous"). The list is empty when the page has
        no <ul class="objectlist">.
    """
    homeworks = []

    objectlist = soup.find("ul", class_="objectlist") #find specific homework documents on page
    if objectlist is None:
        # page without a result list (e.g. an error page): nothing to collect
        return homeworks
    documentlist = objectlist.find_all("li", class_="objectlist-item document")

    # chain of lookups that leads to the title, depending on the page layout
    title_steps = {
        1: (("div", "heading1"), ("a", None)),
        2: (("h2", None),),
    }.get(option)

    for docs in documentlist:
        # A missing <a> makes find() return None and a missing href raises
        # KeyError on item access - neither is an AttributeError, so both
        # cases are handled explicitly here.
        anchor = docs.find("a")
        link = "none" if anchor is None else anchor.get("href", "none")

        if title_steps is None:
            title = "none"
        else:
            title = _nested_text(docs, "none", *title_steps)

        # all remaining metadata lives in the same <dl class="metalist">
        metalist = docs.find("dl", class_="metalist")
        author = _nested_text(metalist, "anonymous", ("dd", "author"), ("a", None)) #special name for non existing author
        subject = _nested_text(metalist, "none", ("dd", "subject"), ("a", None))
        category = _nested_text(metalist, "none", ("dd", "category"))
        price = _nested_text(metalist, "none", ("dd", "price"))

        homeworks.append(Homework(link,title,author,subject,category,price))

    return homeworks

In [8]:
def collecthomeworks(num, sec,option,url): #number of pages to request and waiting time between requests
    """Iterate over the result pages of a database search and call listscraping on each.

    params:
        num: is the number of pages to iterate through (pages 0 .. num-1)
        sec: is the amount of time (seconds) to wait after each request
        option: depends on the website to crawl for (hausarbeiten.de has a different html structure than grin.de)
        url: is the url of the search results of the database. It either
            contains a "{count}" placeholder that is replaced by the page
            number, or it ends where the page number belongs, in which case
            the page number is appended,
            e.g. for hausarbeiten.de: https://www.hausarbeiten.de/search?product=ebook&source_type=document&field=title%2Csubtitle%2Cdata&language_id=1&price_range_id=1000&sort=weight-desc%2Cdate-desc&display=100&page=

    returns:
        one list with the works of all requested pages, in page order
    """
    homeworks = []

    for count in tqdm(range(num)): #max range of pages in Hausarbeiten.de is 173 (180 grin.de, 15 diplomarbeiten24.de)
        page_url = url.format(count=count)
        if page_url == url:
            # No placeholder in the url: formatting changed nothing, so the
            # same page would be requested on every iteration. Append the
            # page number instead.
            page_url = url + str(count)

        source = get_data(page_url)
        soup = BeautifulSoup(source, "lxml")

        homeworks += listscraping(soup, option)

        time.sleep(sec)
    return homeworks




In [18]:

def extract_text(soup):
    """Return the text of the <div class="plain-preview"> element of a work page.

    The string "none" is returned when the lookup fails with an AttributeError
    (for example because the page has no such element).
    """
    try:
        return soup.find("div", class_="plain-preview").text
    except AttributeError:
        return "none"

In [19]:
def extract_metadata(soup):
    """Read the metadata fields of a work page into a dictionary.

    Returns a dict with the keys "grade", "pages", "institution", "isbn",
    "tags" and "pages_displayed". A field whose element cannot be found is
    set to the string "none".
    """

    def field(tag, css_class, pattern=None, replacement=""):
        # text of the first matching element, optionally cleaned with a regex
        try:
            value = soup.find(tag, class_=css_class).text
            if pattern is not None:
                value = re.sub(pattern, replacement, value)
        except AttributeError:
            return "none"
        return value

    whitespace = r"[\n\t\xa0]"

    shown = field("div", "page-numbers", whitespace)
    mark = field("dd", "grade")
    page_count = field("dd", "page_count")
    school = field("dd", "institution", whitespace)
    book_number = field("dd", "isbn")
    keywords = field("dd", "tags", r"[\n]", " ")  # line breaks become spaces

    return {
        "grade": mark,
        "pages": page_count,
        "institution": school,
        "isbn": book_number,
        "tags": keywords,
        "pages_displayed": shown,
    }
    

In [15]:
def retry(func, retries=100, delay=30):
    """Decorator that repeats a request function when the request fails.

    Used for the functions requesting the servers of grin.

    params:
        func: the function to wrap
        retries: maximum number of times func is called
        delay: seconds to wait between two attempts

    returns:
        a wrapper that returns the result of the first successful call.
        If every attempt fails with a requests.exceptions.RequestException,
        the last of these exceptions is raised instead of silently returning
        None. Other exception types are not caught. With retries <= 0 func
        is never called and the wrapper returns None.
    """
    import functools  # local import: only needed for this decorator

    @functools.wraps(func)  # keep name and docstring of the wrapped function
    def retry_wrapper(*args, **kwargs):
        last_error = None
        for attempt in range(1, retries + 1):
            try:
                return func(*args, **kwargs)
            except requests.exceptions.RequestException as e:
                last_error = e
                print(e)
                # no point in waiting after the final attempt
                if attempt < retries:
                    time.sleep(delay)
        if last_error is not None:
            raise last_error
        return None
    return retry_wrapper


In [16]:
@retry
def get_data(url, timeout=60):
    """Request ``url`` with HTTP GET and return the response body as text.

    params:
        url: the address to request
        timeout: seconds to wait for the server before giving up on this attempt

    The function is wrapped by ``retry``, which decides what happens when the
    request raises a requests exception.
    """
    # Without a timeout requests can wait forever on a stalled connection;
    # with it a stalled request raises requests.exceptions.Timeout instead.
    r = requests.get(url, timeout=timeout)
    return r.text

In [20]:
# Collect the listing of 173 result pages of hausarbeiten.de, pausing 3 seconds between requests.
# The "{count}" placeholder is what collecthomeworks replaces with the page number;
# without it every request would go to the same url.
lists = collecthomeworks(173,3,1,"https://www.hausarbeiten.de/search?product=ebook&source_type=document&field=title%2Csubtitle%2Cdata&language_id=1&price_range_id=1000&sort=weight-desc%2Cdate-desc&display=100&page={count}")

  0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
# Pickle the collected listing to disk.
save_obj(lists, r"C:\Users\Tim\Desktop\bachelorarbeit\lists_for_scraping\homeworks_list")

In [6]:
# Reload a pickled listing from disk.
# NOTE(review): this loads the "grins" file into the name `homeworks` - confirm that this is intended.
homeworks = load_obj(r"C:\Users\Tim\Desktop\bachelorarbeit\lists_for_scraping\grins")

In [8]:
def extract_all_data(homeworks, entry, name):
    """Fetch the detail page of every work from index ``entry`` on and pickle the results in chunks.

    For each work the detail page behind ``work.link`` is requested. The
    preview text is stored in ``work.text_html`` only when the price is
    "kostenlos" or "free" (case-insensitive), otherwise "none" is stored.
    Every entry returned by extract_metadata is set as an attribute on the work.

    After every 500 processed works the chunk is saved with save_obj to the
    path ``name`` followed by a running number that starts at
    ``int(entry / 500)``. Whatever is left after the loop (possibly an empty
    list) is saved under the next number. The function waits 2 seconds after
    each work.
    """
    chunk_no = int(entry / 500)
    chunk_template = name + "{num}"
    pending = []

    for processed, work in enumerate(tqdm(homeworks[entry:]), start=1):
        page = BeautifulSoup(get_data(work.link), "lxml")

        # only free works expose their text on the page
        is_free = work.price.lower() in ("kostenlos", "free")
        work.text_html = extract_text(page) if is_free else "none"

        for attribute, value in extract_metadata(page).items():
            setattr(work, attribute, value)
        pending.append(work)

        if processed % 500 == 0:
            # chunk is full: write it out and start the next one
            save_obj(pending, chunk_template.format(num=chunk_no))
            pending = []
            chunk_no += 1

        time.sleep(2)

    # remainder of the last, incomplete chunk
    save_obj(pending, chunk_template.format(num=chunk_no))

    

In [21]:
### save the objects in folder objects
extract_all_data(homeworks, 0, r"C:\Users\Tim\Desktop\bachelorarbeit\objects\homeworks")

  0%|          | 0/575 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Complement works from grin.de

In [22]:
def get_links(works):
    """Return, for every work, the part of its link after the last "/"."""
    tails = []
    for item in works:
        tails.append(item.link.rsplit("/", 1)[-1])
    return tails

In [67]:
# Reload the pickled listing from the file "homeworks_list" (path relative to the working directory).
homeworks = load_obj("homeworks_list")

In [None]:
# collecthomeworks takes (num, sec, option, url); the option argument was missing here,
# which makes the call fail with a TypeError. Option 2 selects the grin.de page layout.
# NOTE(review): the url host is diplomarbeiten24.de, while the page count (180) and the
# variable name point to grin.de - confirm the intended search url and option.
grin_list = collecthomeworks(180,0,2,"https://www.diplomarbeiten24.de/search?product=ebook&source_type=document&field=title%2Csubtitle%2Cdata&language_id=1&price_range_id=1000&sort=weight-desc%2Cdate-desc&display=100&page={count}")

In [None]:
# check if works on hausarbeiten.de and grin.de are different

# the helper is called get_links; a set makes each membership test constant-time
homework_links = set(get_links(homeworks))

##here are the works which are not on hausarbeiten.de
grins = [work for work in grin_list if work.link.split("/")[-1] not in homework_links]

In [155]:
grins = load_obj("grins") #these are all works which are on grin.de but not on hausarbeiten.de (575 in total)

In [156]:
# Scrape the detail pages of all works in grins, starting at index 0;
# the chunks are written to "objects_grin/grins" followed by the chunk number.
extract_all_data(grins, 0, "objects_grin/grins")

  0%|          | 0/575 [00:00<?, ?it/s]

("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))


# Works from diplomarbeiten24.de

In [142]:
# Collect the listing of 15 result pages of diplomarbeiten24.de (layout option 1, no pause between requests).
diploms = collecthomeworks(15,0,1,"https://www.diplomarbeiten24.de/search?product=ebook&source_type=document&field=title%2Csubtitle%2Cdata&language_id=1&price_range_id=1000&sort=weight-desc%2Cdate-desc&display=100&page={count}")

  0%|          | 0/15 [00:00<?, ?it/s]

In [154]:
# Inspect the attributes of a single scraped work (index 1000).
pprint(vars(diploms[1000]))

{'author': 'Günter Rodegast (Autor:in)',
 'category': 'Forschungsarbeit, 2007',
 'grade': '',
 'institution': '',
 'isbn': '',
 'link': 'https://www.diplomarbeiten24.de/document/110637',
 'pages': '',
 'price': 'Kostenlos',
 'subject': 'Geschichte Europa - Deutschland - Nationalsozialismus, II. '
            'Weltkrieg',
 'tags': '',
 'text_html': '',
 'title': 'Juden in der Region in und um Wittenberge'}


In [149]:
# Pickle the diplomarbeiten24.de listing to the file "diplom_list" (path relative to the working directory).
save_obj(diploms, "diplom_list")


In [26]:
# Reload the three pickled listings ...
homeworks = load_obj(r"C:\Users\Tim\Desktop\bachelorarbeit\lists_for_scraping\homeworks_list")
grins= load_obj(r"C:\Users\Tim\Desktop\bachelorarbeit\lists_for_scraping\grins")
diploms = load_obj(r"C:\Users\Tim\Desktop\bachelorarbeit\lists_for_scraping\diplom_list")

# ... and reduce the first two to the last path segment of each link
h = get_links(homeworks)
g = get_links(grins)

In [29]:
# Count the works of diploms whose id (last path segment of the link) appears
# neither in h nor in g. Looking the ids up in one set avoids scanning both
# lists again for every single work.
known_ids = set(h) | set(g)
count = sum(1 for work in diploms if work.link.split("/")[-1] not in known_ids)
print(count)

0
