# Erstellung des Datensatzes 
Texte: Blogbeiträge der Blogplattform Hypotheses.org
Labels: Die von den Wissenschaftlern gewählten Themen und Disziplinen

Autorin: Maria Hartmann

In [2]:
# Import libraries
import numpy as np
import csv # for csv output
import requests # HTTP for humans
from bs4 import BeautifulSoup # module for web scraping
import _thread 
from threading import Thread # to start parallel threads
import time # to get the processing time
import os
import shutil # to move files
from collections import Counter # to count element appearences in a list

Einlesen der Ausgangsdatei metadata.csv

In [4]:
# metadata.csv einlesen
folder = '../Preprocessing'
file = folder+'/metadata.csv'
lines = [] # all lines from metadata
de_lines = [] # german lines from metadata

with open(file, 'r', encoding='utf-8') as openfile:
    metadata = openfile.readlines()
    openfile.close()
    for i, line in enumerate(metadata):
        lines.append(line.replace('\n', '').split(";"))
        if lines[i][1] == "de":
            de_lines.append(lines[i])
        else:
            continue

    
# de_lines in numpy_array umgewandelt, weil Zugriff schneller geht, kann aber nicht verändert werden
np_lines = np.array(de_lines)
print(type(np_lines))

# Blogs ohne Disziplinen aus metadata.csv rausfiltern und in error_lines.csv schreiben
# Fehlerhafte Blogs (z.B. nicht mehr verfügbar oder einmalig andere Sprache) übergehen (wegschmeißen)
# die restlichen deutschen Daten in de_lines.csv schreiben
with open(folder+'/de_lines.csv', 'w', newline='', encoding="utf-8") as decsv, open(folder+'/unlabeled_lines.csv', 'w', newline='', encoding="utf-8") as unlabeledcsv, open(folder+'/error_lines.csv', 'w', newline='', encoding="utf-8") as errorcsv: 
    de = csv.writer(decsv, delimiter = ";")
    unlabeled = csv.writer(unlabeledcsv, delimiter = ";")
    errors = csv.writer(errorcsv, delimiter = ";")
    for i, line in enumerate(np_lines):
        if (np_lines[i][7] == "marginalie") or (np_lines[i][7] == "ciera"):
            # keine Disziplinen zugeordnet, 
            unlabeled.writerow(line)
        elif (np_lines[i][7] == "holocaustwebsites"):
        # holocaustwebsites rausgefiltert, weil diese Website  nicht mehr verfügbar ist
        # alles andere wird über den Blogpost-per-Blog-Index gefiltert
        #elif (np_lines[i][7] == "holocaustwebsites") or (np_lines[i][7] == "aleesp") or (np_lines[i][7] == "filstoria") or (np_lines[i][7] == "atb"):
            # aleesp rausgefiltert, weil es eigentlich ein spanischer Blog ist und im deutschen Korpus nur 1x vorkommt
            # filstoria rausgefiltert, weil es eigentlich ein italienischer Blog ist und im deutschen Korpus nur 1x vorkommt
            # atb rausgefiltert, weil Disciplines und Themes fehlerhaft sind (mit Doppelpunkt) und der Blog mehrheitlich englisch ist
            errors.writerow(line)
        else:
            de.writerow(line)


# de_lines.csv in data einlesen, um die Fehler nicht mit einlesen zu müssen 
data = [] # alle lines aus de_lines, ohne errors
bloglist = [] # alle blogs, die in de_lines vorkommen
with open(folder+'/de_lines.csv', 'r', encoding='utf-8') as openfile:
    de_csv = openfile.readlines()
    openfile.close()
    for i, line in enumerate(de_csv):
        data.append(line.replace('\n', '').split(";")) 
        bloglist.append(data[i][7])




<class 'numpy.ndarray'>


In [5]:
# remove blogs with less than 10 posts, damit anderssprachige blogs mit einzelnen deutschen Posts rausgefiltert werden

c = Counter(bloglist)
blog_select = []
counter = 0
for key in sorted(c): 
    if c[key] < 10:
        #print("%s: %s" % (key, c[key]))
        blog_select.append(key)
        counter += c[key]


trainset = [x for x in data if x[7] not in blog_select]


print(len(data))
print(len(trainset))
print(len(bloglist))
print(len(blog_select))
print(counter)

21695
21387
21695
119
308


Auslesen der Klassenbezeichnungen 

In [None]:
# crawl subjects and themes
errorlist = []
def get_disciplines(index):
    #print("\nline 1:", line)
    #print(i+1)
    #print(line)
    url = trainset[index][9]
    #print(i, "\npage:", url)
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")

    if(soup.find(title="Zum Hypotheses-Katalogeintrag")):
        element = soup.find(title="Zum Hypotheses-Katalogeintrag")
        link = element.get("href")
        #print("link:", link)
        
    elif(soup.find(title="Zum OpenEdition-Katalogeintrag")):
        element = soup.find(title="Zum OpenEdition-Katalogeintrag")
        link = element.get("href")
        #print("link:", link)

    elif(soup.find(title="Ce carnet dans le catalogue d'Hypothèses")):
        element = soup.find(title="Ce carnet dans le catalogue d'Hypothèses")
        link = element.get("href")
        #print("link:", link)
        
    elif(soup.find(title="Ce carnet dans le catalogue d'OpenEdition")):
        element = soup.find(title="Ce carnet dans le catalogue d'OpenEdition")
        link = element.get("href")
        #print("link:", link)    

    elif(soup.find(title="This blog in Hypotheses catalogue")):
        element = soup.find(title="This blog in Hypotheses catalogue")
        link = element.get("href")
        #print("link:", link)
        
    elif(soup.find(title="This blog in OpenEdition catalogue")):
        element = soup.find(title="This blog in OpenEdition catalogue")
        link = element.get("href")
        #print("link:", link)

    else:
        print("Kein Open-Edition-Link gefunden!", index, trainset[index])
        trainset[index].append("Kein Open-Edition-Link gefunden!")
        errorlist.append(line)
        return

    subpage = requests.get(link)
    #print(subpage)
    subsoup = BeautifulSoup(subpage.text, "html.parser")
    morelinks = subsoup.find(class_="more-links")
    disciplines = []
    for i, child in enumerate(morelinks.children):
        #print("disciplines:", i, child)
        disciplines.append(child)

    #print(disciplines[9])
    #print(disciplines[14])
    if len(disciplines) > 13:
        trainset[index].append(disciplines[9].replace("\n", "").strip())
        trainset[index].append(disciplines[14].replace("\n", "").strip())
    elif len(disciplines) > 8:
        trainset[index].append(disciplines[9].replace("\n", "").replace('"', '').strip())
    else:
        print("Keine Disziplinen gefunden!", index, trainset[index])
        trainset[index].append("Keine Disziplinen gefunden!")
        errorlist.append(trainset[index])
        
    #print("\nline 2:", line)
    #print("trainset[i]:", trainset[i])
    #print("FERTIG")


start = time.time()
# Create two threads as follows
threads = []
for i in range(0,len(trainset)):
    if (i % 100 == 0):
                print("Schon wieder 100 Threads gestartet:", i)
    try:
        t = Thread(target = get_disciplines, args=(i, ))
        t.start()
        threads.append(t)
    except:
        print ("Error: unable to start thread")
        
for t in threads:
    #  join() stellt sicher, dass das Hauptprogramm wartet, bis alle Threads terminiert haben
    t.join()

print("Laufzeit in Minuten:", (time.time() - start) / 60)


Schon wieder 100 Threads gestartet: 0
Schon wieder 100 Threads gestartet: 100
Schon wieder 100 Threads gestartet: 200
Schon wieder 100 Threads gestartet: 300
Schon wieder 100 Threads gestartet: 400
Schon wieder 100 Threads gestartet: 500
Schon wieder 100 Threads gestartet: 600
Schon wieder 100 Threads gestartet: 700
Schon wieder 100 Threads gestartet: 800
Schon wieder 100 Threads gestartet: 900
Schon wieder 100 Threads gestartet: 1000
Schon wieder 100 Threads gestartet: 1100
Schon wieder 100 Threads gestartet: 1200
Schon wieder 100 Threads gestartet: 1300
Schon wieder 100 Threads gestartet: 1400
Schon wieder 100 Threads gestartet: 1500
Schon wieder 100 Threads gestartet: 1600
Schon wieder 100 Threads gestartet: 1700
Schon wieder 100 Threads gestartet: 1800
Schon wieder 100 Threads gestartet: 1900
Schon wieder 100 Threads gestartet: 2000
Schon wieder 100 Threads gestartet: 2100
Schon wieder 100 Threads gestartet: 2200
Schon wieder 100 Threads gestartet: 2300
Schon wieder 100 Threads ges

Exception in thread Thread-1649:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-6-8e0ee8091118>", line 53, in get_disciplines
    for i, child in enumerate(morelinks.children):
AttributeError: 'NoneType' object has no attribute 'children'



Schon wieder 100 Threads gestartet: 10000
Schon wieder 100 Threads gestartet: 10100
Schon wieder 100 Threads gestartet: 10200
Kein Open-Edition-Link gefunden! 5217 ['musermeku_861', 'de', 'Angelika Schoder', '299', 'N/A', '2013-11-21', 'N/A', 'musermeku', '861', 'http://musermeku.hypotheses.org/861', 'Einblick in die Nachkriegszeit – Das AJR Journal']
Schon wieder 100 Threads gestartet: 10300
Kein Open-Edition-Link gefunden! 5258 ['zwopktnull_184', 'de', 'Christian Bunnenberg', '442', 'N/A', '2014-07-30', 'N/A', 'zwopktnull', '184', 'http://zwopktnull.hypotheses.org/184', 'Kaiserdom-App']
Schon wieder 100 Threads gestartet: 10400
Schon wieder 100 Threads gestartet: 10500
Schon wieder 100 Threads gestartet: 10600
Schon wieder 100 Threads gestartet: 10700
Schon wieder 100 Threads gestartet: 10800
Schon wieder 100 Threads gestartet: 10900
Kein Open-Edition-Link gefunden! 5672 ['musermeku_755', 'de', 'MusErMeKu Gast', '642', 'N/A', '2013-10-10', 'N/A', 'musermeku', '755', 'http://musermeku

Exception in thread Thread-8297:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _read

Kein Open-Edition-Link gefunden! 8339 ['musermeku_1518', 'de', 'Angelika Schoder', '54', 'N/A', '2014-06-12', 'N/A', 'musermeku', '1518', 'http://musermeku.hypotheses.org/1518', 'George Clooney im Museum – Was Kulturinstitutionen vom Clickbait -Effekt lernen können']
Schon wieder 100 Threads gestartet: 14600
Schon wieder 100 Threads gestartet: 14700
Schon wieder 100 Threads gestartet: 14800
Kein Open-Edition-Link gefunden! 8554 ['musermeku_910', 'de', 'Angelika Schoder', '695', 'N/A', '2013-12-26', 'N/A', 'musermeku', '910', 'http://musermeku.hypotheses.org/910', 'Von der Doktorarbeit zum Buch in 11 Schritten – Teil 1']
Schon wieder 100 Threads gestartet: 14900
Schon wieder 100 Threads gestartet: 15000
Schon wieder 100 Threads gestartet: 15100
Schon wieder 100 Threads gestartet: 15200
Schon wieder 100 Threads gestartet: 15300
Schon wieder 100 Threads gestartet: 15400


Exception in thread Thread-6359:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _read

Schon wieder 100 Threads gestartet: 15500
Schon wieder 100 Threads gestartet: 15600
Kein Open-Edition-Link gefunden! 9099 ['musermeku_2609', 'de', 'Angelika Schoder', '723', 'N/A', '2015-01-24', 'N/A', 'musermeku', '2609', 'http://musermeku.hypotheses.org/2609', 'Wissenschaftliches Bloggen mit Monty Python / #wbhyp']
Kein Open-Edition-Link gefunden! 9108 ['zwopktnull_87', 'de', 'Christian Bunnenberg', '514', 'N/A', '2013-10-02', 'N/A', 'zwopktnull', '87', 'http://zwopktnull.hypotheses.org/87', 'Stimmen der Kulturwissenschaften | Podcasts II']
Schon wieder 100 Threads gestartet: 15700
Schon wieder 100 Threads gestartet: 15800
Kein Open-Edition-Link gefunden! 9265 ['musermeku_311', 'de', 'MusErMeKu Gast', '685', 'N/A', '2013-07-01', 'N/A', 'musermeku', '311', 'http://musermeku.hypotheses.org/311', '„Gutes Wetter – Schlechtes Wetter“ – Eine Ausstellung entsteht (Teil 2)']
Schon wieder 100 Threads gestartet: 15900


Exception in thread Thread-6669:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _read

Schon wieder 100 Threads gestartet: 16000
Schon wieder 100 Threads gestartet: 16100
Schon wieder 100 Threads gestartet: 16200
Schon wieder 100 Threads gestartet: 16300
Schon wieder 100 Threads gestartet: 16400
Kein Open-Edition-Link gefunden! 9622 ['musermeku_1753', 'de', 'Angelika Schoder', '60', 'N/A', '2014-07-30', 'N/A', 'musermeku', '1753', 'http://musermeku.hypotheses.org/1753', 'Der Deutsche Museumsbund und die Frage: Brauchen Kulturinstitutionen eine Facebook-Seite?']
Schon wieder 100 Threads gestartet: 16500
Kein Open-Edition-Link gefunden! 9682 ['musermeku_132', 'de', 'Angelika Schoder', '479', 'N/A', '2013-05-09', 'N/A', 'musermeku', '132', 'http://musermeku.hypotheses.org/132', 'Vergangenheit erinnern – Zukunft gestalten: Der 36. Internationale Museumstag am 12. Mai 2013']


Exception in thread Thread-7000:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _read

Schon wieder 100 Threads gestartet: 16600
Kein Open-Edition-Link gefunden! 9772 ['musermeku_1659', 'de', 'Angelika Schoder', '53', 'N/A', '2014-07-10', 'N/A', 'musermeku', '1659', 'http://musermeku.hypotheses.org/1659', 'Infografik zum Thema: Wozu brauchen Kulturinstitutionen eigentlich Infografiken?']
Schon wieder 100 Threads gestartet: 16700
Kein Open-Edition-Link gefunden! 7167 ['musermeku_846', 'de', 'MusErMeKu Gast', '634', 'N/A', '2013-11-07', 'N/A', 'musermeku', '846', 'http://musermeku.hypotheses.org/846', 'Den Besuchern ein Gesicht geben – Besucherforschung als Beruf']
Schon wieder 100 Threads gestartet: 16800
Schon wieder 100 Threads gestartet: 16900
Kein Open-Edition-Link gefunden! 9989 ['musermeku_919', 'de', 'Angelika Schoder', '484', 'N/A', '2014-01-02', 'N/A', 'musermeku', '919', 'http://musermeku.hypotheses.org/919', 'Von der Doktorarbeit zum Buch in 11 Schritten – Teil 2']
Kein Open-Edition-Link gefunden! 10005 ['zwopktnull_1', 'de', 'Christian Bunnenberg', '180', 'N/A

Exception in thread Thread-7645:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _read

Schon wieder 100 Threads gestartet: 17600
Kein Open-Edition-Link gefunden! 13735 ['musermeku_5334', 'de', 'Angelika Schoder', '75', 'N/A', '2015-12-03', 'N/A', 'musermeku', '5334', 'http://musermeku.hypotheses.org/5334', '#allmymovies – Shia LaBeoufs Selbstinszenierung']
Kein Open-Edition-Link gefunden! 10501 ['musermeku_5162', 'de', 'Angelika Schoder', '59', 'N/A', '2015-11-11', 'N/A', 'musermeku', '5162', 'http://musermeku.hypotheses.org/5162', 'Hashtags als Stilelement: #allesistdesign im Vitra Design Museum']
Schon wieder 100 Threads gestartet: 17700
Schon wieder 100 Threads gestartet: 17800
Schon wieder 100 Threads gestartet: 17900
Schon wieder 100 Threads gestartet: 18000
Schon wieder 100 Threads gestartet: 18100
Schon wieder 100 Threads gestartet: 18200
Schon wieder 100 Threads gestartet: 18300
Schon wieder 100 Threads gestartet: 18400
Schon wieder 100 Threads gestartet: 18500
Kein Open-Edition-Link gefunden! 11156 ['musermeku_1166', 'de', 'Angelika Schoder', '932', 'N/A', '2014

Exception in thread Thread-8312:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _read

Schon wieder 100 Threads gestartet: 18700
Schon wieder 100 Threads gestartet: 18800
Kein Open-Edition-Link gefunden! 11362 ['zwopktnull_164', 'de', 'Christian Bunnenberg', '992', 'N/A', '2014-04-02', 'N/A', 'zwopktnull', '164', 'http://zwopktnull.hypotheses.org/164', 'Das digitale Schulbuch – eine Verknüpfung aller Vorzüge?']
Schon wieder 100 Threads gestartet: 18900
Schon wieder 100 Threads gestartet: 19000
Kein Open-Edition-Link gefunden! 11473 ['musermeku_4226', 'de', 'Angelika Schoder', '53', 'N/A', '2015-09-16', 'N/A', 'musermeku', '4226', 'http://musermeku.hypotheses.org/4226', 'Im Land der Einhörner – Das Mindestlohn- Praktikum']
Schon wieder 100 Threads gestartet: 19100
Kein Open-Edition-Link gefunden! 11545 ['musermeku_1011', 'de', 'Angelika Schoder', '435', 'N/A', '2014-02-20', 'N/A', 'musermeku', '1011', 'http://musermeku.hypotheses.org/1011', 'Jetzt erschienen: Die Vermittlung des Unbegreiflichen. Darstellungen des Holocaust im Museum']


Exception in thread Thread-15018:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _rea

Schon wieder 100 Threads gestartet: 19200
Schon wieder 100 Threads gestartet: 19300
Schon wieder 100 Threads gestartet: 19400
Schon wieder 100 Threads gestartet: 19500
Schon wieder 100 Threads gestartet: 19600
Schon wieder 100 Threads gestartet: 19700


Exception in thread Thread-9003:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _read

Exception in thread Thread-9004:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _read

Schon wieder 100 Threads gestartet: 19800
Schon wieder 100 Threads gestartet: 19900
Schon wieder 100 Threads gestartet: 20000
Schon wieder 100 Threads gestartet: 20100
Schon wieder 100 Threads gestartet: 20200
Kein Open-Edition-Link gefunden! 12325 ['zwopktnull_57', 'de', 'Christian Bunnenberg', '587', 'N/A', '2013-07-30', 'N/A', 'zwopktnull', '57', 'http://zwopktnull.hypotheses.org/57', 'Rezension | Anmerkungen zum historisch-geopolitischen mobilen Lernen']
Kein Open-Edition-Link gefunden! 12330 ['zwopktnull_152', 'de', 'Christian Bunnenberg', '954', 'N/A', '2014-02-24', 'N/A', 'zwopktnull', '152', 'http://zwopktnull.hypotheses.org/152', 'Eine Benutzeroberfläche für das digitale Schulgeschichtsbuch']
Schon wieder 100 Threads gestartet: 20300


Exception in thread Thread-9315:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _read

Schon wieder 100 Threads gestartet: 20400
Schon wieder 100 Threads gestartet: 20500
Schon wieder 100 Threads gestartet: 20600
Schon wieder 100 Threads gestartet: 20700
Schon wieder 100 Threads gestartet: 20800
Kein Open-Edition-Link gefunden! 12759 ['archivalia_13848', 'de', 'clairemueller', '263', 'N/A', '2011-02-24', 'N/A', 'archivalia', '13848', 'http://archivalia.hypotheses.org/13848', 'Vom Suchen und Finden in Medienarchiven']
Schon wieder 100 Threads gestartet: 20900


Exception in thread Thread-12795:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _rea

Schon wieder 100 Threads gestartet: 21000
Kein Open-Edition-Link gefunden! 16627 ['musermeku_144', 'de', 'Angelika Schoder', '442', 'N/A', '2013-05-16', 'N/A', 'musermeku', '144', 'http://musermeku.hypotheses.org/144', 'Der „Kindertransport“ 1938/39 in zeitgenössischer Perspektive']
Schon wieder 100 Threads gestartet: 21100
Schon wieder 100 Threads gestartet: 21200
Schon wieder 100 Threads gestartet: 21300


Exception in thread Thread-10039:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _rea

Kein Open-Edition-Link gefunden! 13234 ['musermeku_3882', 'de', 'Angelika Schoder', '172', 'N/A', '2015-08-19', 'N/A', 'musermeku', '3882', 'http://musermeku.hypotheses.org/3882', 'Viele Grüße aus der Filterblase … (MusErMeKu)']
Kein Open-Edition-Link gefunden! 13538 ['musermeku_3870', 'de', 'Damián Morán Dauchez', '68', 'N/A', '2015-07-08', 'N/A', 'musermeku', '3870', 'http://musermeku.hypotheses.org/3870', 'Stadtmuseum Fürth Ludwig Erhard']
Kein Open-Edition-Link gefunden! 13624 ['zwopktnull_93', 'de', 'Christian Bunnenberg', '331', 'N/A', '2013-11-07', 'N/A', 'zwopktnull', '93', 'http://zwopktnull.hypotheses.org/93', 'Die Novemberpogrome 1938 und das Microblogging-Projekt @9nov38']


Exception in thread Thread-13682:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _rea

Exception in thread Thread-18123:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _rea

Kein Open-Edition-Link gefunden! 14317 ['fnzinfo_788', 'de', 'Anton Tantner', '255', 'N/A', '2016-04-14', 'N/A', 'fnzinfo', '788', 'http://fnzinfo.hypotheses.org/788', 'Einladung zum Jour fixe, 11.5.2016: Alexander Cors –  Eine Frage der Loyalität. Spanische Herrschaft im multiethnischen Louisiana, 1762–1803']


Exception in thread Thread-14535:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _rea

Kein Open-Edition-Link gefunden! 14825 ['musermeku_512', 'de', 'Angelika Schoder', '510', 'N/A', '2013-08-08', 'N/A', 'musermeku', '512', 'http://musermeku.hypotheses.org/512', 'Nachtigall, ick hör dir bloggen']
Kein Open-Edition-Link gefunden! 15006 ['musermeku_2721', 'de', 'Angelika Schoder', '53', 'N/A', '2015-02-14', 'N/A', 'musermeku', '2721', 'http://musermeku.hypotheses.org/2721', 'Museumsmarketing: Von Hollywood-Stars und Internet-Memes']


Exception in thread Thread-15014:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _rea


Exception in thread Thread-11594:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _re

Kein Open-Edition-Link gefunden! 15431 ['musermeku_926', 'de', 'Angelika Schoder', '734', 'N/A', '2014-01-09', 'N/A', 'musermeku', '926', 'http://musermeku.hypotheses.org/926', 'Von der Doktorarbeit zum Buch in 11 Schritten – Teil 3']


Exception in thread Thread-19724:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _rea

Kein Open-Edition-Link gefunden! 15757 ['musermeku_882', 'de', 'Angelika Schoder', '270', 'N/A', '2013-11-28', 'N/A', 'musermeku', '882', 'http://musermeku.hypotheses.org/882', 'Vorankündigung: Die Vermittlung des Unbegreiflichen']
Kein Open-Edition-Link gefunden! 15781 ['resilienz_251', 'de', 'Michael Meyen', '795', 'N/A', '2014-09-12', 'N/A', 'resilienz', '251', 'http://resilienz.hypotheses.org/251', 'Resilienz-Werbung']
Kein Open-Edition-Link gefunden! 15804 ['musermeku_2372', 'de', 'Angelika Schoder', '69', 'N/A', '2015-01-21', 'N/A', 'musermeku', '2372', 'http://musermeku.hypotheses.org/2372', 'Der Holocaust Memorial Day 2015 und die MemoryMakers']
Kein Open-Edition-Link gefunden! 15833Kein Open-Edition-Link gefunden! ['pophistory_979', 'de', 'Henning Wellmann', '778', 'N/A', '2013-09-13', 'N/A', 'pophistory', '979', 'http://pophistory.hypotheses.org/979', 'CfP: Sounds, Klänge, Töne – Zur klanglichen Dimension von Musik und ihrer emotionalen Bedeutung und Wahrnehmung. Berlin, 24.-

Exception in thread Thread-15992:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _rea

Kein Open-Edition-Link gefunden! 16000 ['musermeku_1843', 'de', 'Angelika Schoder', '284', 'N/A', '2014-09-24', 'N/A', 'musermeku', '1843', 'http://musermeku.hypotheses.org/1843', 'Infografik: Warum bloggen Wissenschaftler eigentlich?']
Kein Open-Edition-Link gefunden! 16098 ['musermeku_2025', 'de', 'Angelika Schoder', '1034', 'N/A', '2014-11-19', 'N/A', 'musermeku', '2025', 'http://musermeku.hypotheses.org/2025', 'Rückblick auf den Workshop „Emotional Strategies in Museum Exhibitions“ (2)']
Kein Open-Edition-Link gefunden! 16103 ['musermeku_5662', 'de', 'Angelika Schoder', '53', 'N/A', '2016-02-03', 'N/A', 'musermeku', '5662', 'http://musermeku.hypotheses.org/5662', 'Von Verleugnung und Trauer – Toshio Hosokawas „Stilles Meer“ in der Staatsoper Hamburg']
Kein Open-Edition-Link gefunden! 16393 ['musermeku_5700', 'de', 'Angelika Schoder', '54', 'N/A', '2016-02-17', 'N/A', 'musermeku', '5700', 'http://musermeku.hypotheses.org/5700', 'Museumsarchitektur als Element der Stadtentwicklung – 

Exception in thread Thread-16521:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _rea

Kein Open-Edition-Link gefunden! 16867 ['musermeku_1812', 'de', 'Angelika Schoder', '55', 'N/A', '2014-08-27', 'N/A', 'musermeku', '1812', 'http://musermeku.hypotheses.org/1812', 'Das Medizinhistorische Museum Hamburg']


Exception in thread Thread-13195:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _rea

Kein Open-Edition-Link gefunden! 17054 ['musermeku_990', 'de', 'MusErMeKu Gast', '792', 'N/A', '2014-03-05', 'N/A', 'musermeku', '990', 'http://musermeku.hypotheses.org/990', 'Ellis Island: Island of Hope, Island of Tears – Ein Praktikumsbericht (Teil 2)']
Kein Open-Edition-Link gefunden! 17348 ['zwopktnull_38', 'de', 'Christian Bunnenberg', '1001', 'N/A', '2013-07-29', 'N/A', 'zwopktnull', '38', 'http://zwopktnull.hypotheses.org/38', 'Geocaching im Geschichts- und Sachunterricht | Teil I']
Kein Open-Edition-Link gefunden! 17457 ['musermeku_1769', 'de', 'Angelika Schoder', '263', 'conf', '2014-08-06', 'N/A', 'musermeku', '1769', 'http://musermeku.hypotheses.org/1769', 'Schulmaterialien des UN Outreach Programme und des USHMM zum Holocaust']


Exception in thread Thread-13647:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _rea

Kein Open-Edition-Link gefunden! 17563 ['musermeku_3136', 'de', 'MusErMeKu Gast', '1177', 'N/A', '2015-04-01', 'N/A', 'musermeku', '3136', 'http://musermeku.hypotheses.org/3136', 'Arbeitsmarkt für Geisteswissenschaftler – besser im Jahresabo?']
Kein Open-Edition-Link gefunden! 17866 ['zwopktnull_32', 'de', 'Christian Bunnenberg', '1021', 'N/A', '2013-07-27', 'N/A', 'zwopktnull', '32', 'http://zwopktnull.hypotheses.org/32', 'App | „The American Way“ | Haus der Geschichte']
Kein Open-Edition-Link gefunden! 18021 ['zwopktnull_132', 'de', 'Christian Bunnenberg', '93', 'N/A', '2014-02-20', 'N/A', 'zwopktnull', '132', 'http://zwopktnull.hypotheses.org/132', 'Hinweis | Blogaward 2014 Publikumspreis']


Exception in thread Thread-18062:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _rea

Kein Open-Edition-Link gefunden! 18202 ['musermeku_813', 'de', 'Angelika Schoder', '481', 'N/A', '2013-10-31', 'N/A', 'musermeku', '813', 'http://musermeku.hypotheses.org/813', 'Heute vor 75 Jahren – Ein Mikroblog über den 9. November 1938']
Kein Open-Edition-Link gefunden! 18329 ['musermeku_1985', 'de', 'Damián Morán Dauchez', '621', 'N/A', '2014-11-06', 'N/A', 'musermeku', '1985', 'http://musermeku.hypotheses.org/1985', 'Nürnberg Hauptbahnhof, Richtung Dokuzentrum']
Kein Open-Edition-Link gefunden! 18337 ['musermeku_1200', 'de', 'Angelika Schoder', '51', 'N/A', '2014-05-14', 'N/A', 'musermeku', '1200', 'http://musermeku.hypotheses.org/1200', 'Facebook hui / Twitter pfui? – Die Social Media Kommunikation zu kulturellen Veranstaltungen']
Kein Open-Edition-Link gefunden! 18500 ['musermeku_164', 'de', 'Angelika Schoder', '412', 'N/A', '2013-05-23', 'N/A', 'musermeku', '164', 'http://musermeku.hypotheses.org/164', 'Die Witzfigur mit dem Seitenscheitel']


Exception in thread Thread-14519:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _rea

Kein Open-Edition-Link gefunden! 18956 ['musermeku_2623', 'de', 'Damián Morán Dauchez', '482', 'N/A', '2015-01-28', 'N/A', 'musermeku', '2623', 'http://musermeku.hypotheses.org/2623', 'C’est le sens de la vie! / #wbhyp']


Exception in thread Thread-19157:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _rea

Kein Open-Edition-Link gefunden!Kein Open-Edition-Link gefunden! 19238 ['musermeku_5213', 'de', 'Angelika Schoder', '66', 'N/A', '2015-11-18', 'N/A', 'musermeku', '5213', 'http://musermeku.hypotheses.org/5213', 'Twitter-Dämmerung – MMAUVS und der #BayreuthFake']
 19275 ['zwopktnull_159', 'de', 'Christian Bunnenberg', '118', 'N/A', '2014-03-07', 'N/A', 'zwopktnull', '159', 'http://zwopktnull.hypotheses.org/159', 'Danke für den 3. Platz beim Blogaward 2014 Publikumspreis']


Exception in thread Thread-15452:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _rea

Kein Open-Edition-Link gefunden! 19699 ['musermeku_1091', 'de', 'Angelika Schoder', '1039', 'N/A', '2014-04-09', 'N/A', 'musermeku', '1091', 'http://musermeku.hypotheses.org/1091', 'Otto Dix – Bilder der Urkatastrophe']
Kein Open-Edition-Link gefunden! 19773 ['musermeku_4284', 'de', 'Angelika Schoder', '60', 'N/A', '2015-10-23', 'N/A', 'musermeku', '4284', 'http://musermeku.hypotheses.org/4284', 'Auf den Spuren von Freya und Helmuth James von Moltke / #artbookfriday']
Kein Open-Edition-Link gefunden! 19910 ['musermeku_482', 'de', 'MusErMeKu Gast', '1194', 'N/A', '2013-07-31', 'N/A', 'musermeku', '482', 'http://musermeku.hypotheses.org/482', '“From Lesser to Tanya Ury: German-Jewish Artists 1890-2010” – Ein (subjektiver) Bericht']
Kein Open-Edition-Link gefunden! 20033 ['zwopktnull_140', 'de', 'Christian Bunnenberg', '794', 'N/A', '2014-02-21', 'N/A', 'zwopktnull', '140', 'http://zwopktnull.hypotheses.org/140', 'App „KZ-Gedenkstätte Neuengamme“']


Exception in thread Thread-15913:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _rea

Exception in thread Thread-15911:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 258, in _rea

Kein Open-Edition-Link gefunden! 20664 ['musermeku_5675', 'de', 'Angelika Schoder', '52', 'N/A', '2016-02-10', 'N/A', 'musermeku', '5675', 'http://musermeku.hypotheses.org/5675', 'Tanz der Strichmännchen – Jean Dubuffets „Coucou Bazar“ in der Fondation Beyeler']
Kein Open-Edition-Link gefunden! 20805 ['zwopktnull_73', 'de', 'Christian Bunnenberg', '179', 'N/A', '2013-09-05', 'N/A', 'zwopktnull', '73', 'http://zwopktnull.hypotheses.org/73', 'Geschichtsdidaktisches Blogjournal | „Public History Weekly“']


Exception in thread Thread-16385:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _rea

Kein Open-Edition-Link gefunden! 20897 ['musermeku_3754', 'de', 'Angelika Schoder', '53', 'N/A', '2015-06-24', 'N/A', 'musermeku', '3754', 'http://musermeku.hypotheses.org/3754', 'Das 20. Jhd vor Gericht – Das Tribunal im Rahmen der ZKM-Globale']
Kein Open-Edition-Link gefunden! 21020 ['musermeku_3896', 'de', 'Angelika Schoder', '58', 'N/A', '2015-07-15', 'N/A', 'musermeku', '3896', 'http://musermeku.hypotheses.org/3896', 'Eliasson und Lampedusa – Über Wasser im Bucerius Kunst Forum']
Kein Open-Edition-Link gefunden! 21155 ['musermeku_2068', 'de', 'Angelika Schoder', '65', 'N/A', '2014-12-03', 'N/A', 'musermeku', '2068', 'http://musermeku.hypotheses.org/2068', 'Wie plant man eine (Kultur-) Bloggerreise? Möglichkeiten, Risiken und Nebenwirkungen']
Kein Open-Edition-Link gefunden! 21194 ['musermeku_292', 'de', 'MusErMeKu Gast', '464', 'N/A', '2013-06-27', 'N/A', 'musermeku', '292', 'http://musermeku.hypotheses.org/292', '„Gutes Wetter – Schlechtes Wetter“ – Eine Ausstellung wird geplant 

Exception in thread Thread-21358:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _rea

Kein Open-Edition-Link gefunden! 21386 ['musermeku_894', 'de', 'Angelika Schoder', '195', 'N/A', '2013-12-12', 'N/A', 'musermeku', '894', 'http://musermeku.hypotheses.org/894', 'Leseempfehlung: Aktuelle Rezensionen von Oliver Sukrow']


Exception in thread Thread-21361:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _rea

Exception in thread Thread-18592:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _rea

Exception in thread Thread-19225:
Traceback (most recent call last):
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
    httplib_response = conn.getresponse()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 1331, in getresponse
    response.begin()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "c:\users\hartmann\appdata\local\programs\python\python36\lib\http\client.py", line 266, in _rea

In [5]:
# show errors
print(len(errorlist))
print(errorlist)

0
[]


Speicherung der deutschen Blogbeiträge und ihrer Metadaten

In [6]:
# add ; subjects ; themes to de_labeled_metadata.csv
print(type(trainset))

trainset.sort()
np_lines = np.array(trainset)
#print(np_lines)

with open(folder+'/de_labeled_metadata.csv', 'w', newline='', encoding="utf-8") as labeledcsv:
    labeled = csv.writer(labeledcsv, delimiter = ";")
    labeled.writerow(["filename", "language", "author", "numwords", "category", "date", "licence", "blog", "post", "url", "title", "disciplines", "themes"])
    
    for i, line in enumerate(np_lines):
        labeled.writerow(line)
        

    



<class 'list'>


In [8]:
# move all german files to folder txt_de
newfolder = folder+'/txt_de'
if not os.path.exists(newfolder):
    os.makedirs(newfolder)


newfilelist = os.listdir(newfolder)
newfilelist.sort()
oldfolder = folder+'/txt'
filelist = os.listdir(oldfolder)
filelist.sort()
#print(trainset[0])
#print(len(trainset))
#trainset.sort()

for line in trainset:
    file = line[0] + '.txt'
    if (file in filelist) and (file not in newfilelist):
        shutil.copy2((oldfolder+'/'+file), (newfolder+'/'+file))
        #print("deutsch: ", (oldfolder+'/'+file))
    else:
        #print("Nicht deutsch")
        continue

In [9]:
# 100 missing files in folder 'txt'
missing = []
filelist = os.listdir(newfolder)
filelist.sort()
#trainset.sort()
for line in trainset:
    file = line[0] + '.txt'
    if file not in filelist:
        missing.append(file)
        #print("deutsch: ", (directory+'/'+file))
    else:
        #print("Nicht deutsch")
        continue
    
print(missing)
print(len(missing))

[]
0


In [10]:
# open german metadata file: de_labeled_metadata.csv
# and every Blogpost

filelist = os.listdir(newfolder)
filelist.sort()
lines = [] # alle in de_labeled_metadata.csv verzeichnete Blogposts
corpus = [] # deutschsprachige Blogposts
labels = [] # zugehörige Labels
errors = [] # in metadata.csv verzeichnete, aber in hypoposts-txt.zip nicht enthaltene Blogposts
filenames = [] # Blogposts ohne Fehler
onelabel = [] # Blogposts mit nur einer Art von Label (Thema oder Disziplin)

with open(folder+'/de_labeled_metadata.csv', 'r', encoding='utf-8') as openfile:
    metadata = openfile.readlines()
    openfile.close()
    #print(metadata[0])
    for i, line in enumerate(metadata[1:]):
        #print(i)
        lines.append(line.split(";"))
        #print("\nFile:", lines[i][0])
        #print("Themes:", lines[i][11])
        #print("Disciplines:", lines[i][12])
        file = (lines[i][0] + '.txt')
        #print("Filename:", file)
        
        if file in filelist:
            
            with open((newfolder+'/'+file), 'r', encoding='utf-8') as textfile:
                text = textfile.read()
                textfile.close()
                filenames.append(file)
                corpus.append(text)
              
            if len(lines[i]) > 12:
                labels.append(lines[i][11] + "; " + lines[i][12])
            elif len(lines[i]) > 10:
                labels.append(lines[i][11])
                onelabel.append(file)
            else:
                print("keine Disziplin gefunden!", lines[i])
                
        else:
            print("File nicht gefunden!", file)
            errors.append(file)
            continue

print("\n")
print(len(corpus))
print(len(labels))
print(len(filenames))
print(len(errors))
print(len(onelabel))

for blog in onelabel:
    print(blog)



21387
21387
21387
0
36
aes_1.txt
aes_105.txt
aes_126.txt
aes_128.txt
aes_13.txt
aes_138.txt
aes_165.txt
aes_182.txt
aes_198.txt
aes_204.txt
aes_207.txt
aes_214.txt
aes_228.txt
aes_232.txt
aes_240.txt
aes_245.txt
aes_257.txt
aes_26.txt
aes_273.txt
aes_288.txt
aes_29.txt
aes_315.txt
aes_317.txt
aes_322.txt
aes_327.txt
aes_329.txt
aes_334.txt
aes_337.txt
aes_349.txt
aes_365.txt
aes_37.txt
aes_47.txt
aes_51.txt
aes_80.txt
aes_88.txt
aes_99.txt


Erstellung der Datenbasis: Dateinamen, Blogbeiträge und zugehörige Klassen (Themen und Disziplinen)

In [11]:
# write csv-file de_labeled_corpus.csv: filename, classes, text

with open(folder+'/de_labeled_corpus.csv', 'w', newline='', encoding="utf-8") as labeledcsv:
    labeled = csv.writer(labeledcsv, delimiter = ";")
    labeled.writerow(["filename", "classes", "text"])
    
    for file, label, line in zip(filenames, labels, corpus):
        labeled.writerow([file.replace('\n', ' '), label.replace('\n', ''), line.replace('\n', ' ')])
        