In [6]:
import pandas as pd
import numpy as np
import time
import string
import re
from IPython.display import clear_output
import requests
import spacy
import gensim
from gensim import corpora
from nltk.corpus import stopwords
import concurrent.futures
#nltk.download("stopwords")

import warnings
warnings.filterwarnings("ignore")

In [7]:
def update_progress(progress):
    bar_length = 50
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1

    block = int(round(bar_length * progress))

    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

In [14]:
#!python3 -m spacy download de_core_news_md
nlp = spacy.load('de_core_news_md')

In [20]:
def preprocess(text, idx):
    text = text.lower()
    print("lower - %s" % (idx))
    
    # remove "&nbsp"
    text = re.sub(r"\&nbsp", "", text)
    # remove urls
    # source: url_extract_pattern from https://uibakery.io/regex-library/url-regex-python
    url_extract_pattern = "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)"
    text = re.sub(url_extract_pattern, '', text)
    # remvoe "\n"
    text = re.sub(r"[^ ]*\n", "", text)
    # remove file names with commom endings with 4 or 3 digits
    text = re.sub(r"[^ ]*\..{4}|[^ ]*\..{3}", "", text)
    # remove any refs
    text = re.sub(r"[^ ]*ref", "", text)
    # remove -
    text = re.sub(r"-", "", text)
    #remove punctuation thats left
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # remove stopwords
    text = text.split(" ")
    # source: https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
    text = [word for word in text if word not in german_stopwords]
    
    # lemmatization
    text_lemma = []

    for ix, word in enumerate(text):
        doc = nlp(word)
        result = ' '.join([x.lemma_ for x in doc]) 
        text_lemma.append(result)
    
    final = [gensim.utils.simple_preprocess(word, deacc = True) for word in text_lemma]
    
    for word in final:
        if len(word) == 0:
            final.remove(word)
    print("final - %s" % (idx))
    
    preprocessed_content.update({idx: final})
    #preprocessed_content.append(final)

---

In [16]:
URL = "https://de.wikipedia.org/w/api.php"

In [17]:
german_stopwords = stopwords.words("german")

In [18]:
# request excellent arictles from german wikipedia via wiki api (10 at a time)
S = requests.Session()

params = {
    "action": "query",
    "prop": "revisions",
    "rvprop": "content",
    "rvslots": "*",
    "format": "json",
    "formatversion": 2,
    "srsearch": "incategory:Wikipedia:Exzellent",
    "list": "search",
    "sroffset": 0
}

response = S.get(url = URL, params = params)
data = response.json()

# get ids from excellent articles
ids = []

for entry in data["query"]["search"]:
    ids.append(entry["pageid"])

while data.get("continue"):
    params.update({"sroffset": data["continue"]["sroffset"]})
    
    #print("\n%s" % (PARAMS))
    response = S.get(url = URL, params = params)
    data = response.json()
    
    for entry in data["query"]["search"]:
        ids.append(entry["pageid"])

print("Anzahl gesammelter Exzellenter Artikel: %s" %(len(ids)))

#if DATA['query']['search'][0]['title'] == SEARCHPAGE:
#    print("Your search page '" + SEARCHPAGE + "' exists on English Wikipedia")

Anzahl gesammelter Exzellenter Artikel: 2801


```python
# request data to every excellent article in german wikipedia via wikipedia api using pageid
params = {
    "action": "query",
    "prop": "revisions",
    "rvprop": "content",
    "rvslots": "*",
    "format": "json",
    "formatversion": 2,
    "pageids": 0
}

data = pd.DataFrame()
content = {}

for ix, id in enumerate(ids):
    update_progress(ix / len(ids))
    params.update({"pageids": id})
    response = S.get(url = URL, params = params)
    page = response.json()
    
    """preprocessed = preprocess(page["query"]["pages"][0]["revisions"][0]["slots"]["main"]["content"], german_stopwords)
    
    for word in preprocessed:
        if len(word) == 0:
            preprocessed.remove(word)
        
    content.update({id: preprocessed})"""
    content.update({id: page["query"]["pages"][0]["revisions"][0]["slots"]["main"]["content"]})
 ```

In [22]:
def get_pages_by_id(id):
    params.update({"pageids": id})
    response = S.get(url = URL, params = params)
    page = response.json()
    content.update({id: page["query"]["pages"][0]["revisions"][0]["slots"]["main"]["content"]})

In [23]:
content = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(get_pages_by_id, ids)

end = time.time()
dur = end - start
print("DUR: %s" % (dur))

DUR: 86.74837279319763


In [24]:
len(content)

2784

In [25]:
# save requested data in df
df = pd.DataFrame(content.items())
df = df.rename({0: "pageid", 1:"content"}, axis = 1)

In [26]:
df.head()

Unnamed: 0,pageid,content
0,1428,"[[Datei:Danimarca XIII secolo, plinio historia..."
1,1200964,{{Dieser Artikel|behandelt den Staat. Zu weite...
2,18559,{{Infobox Schrift\n|Schrift = Japanisc...
3,2677,{{Begriffsklärungshinweis}}\n{{Infobox Staat\n...
4,2391,{{Dieser Artikel|behandelt das Land – zu ander...


In [27]:
preprocessed_content = {}
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(preprocess, df["content"], ids)
    
end = time.time()
dur = end - start
print("DUR: %s" % (dur))

lower - 1428lower - 2677
lower - 1200964
lower - 880316

lower - 2391
lower - 16565
lower - 3221050
lower - 2996
lower - 290
lower - 1084
lower - 29938
lower - 490244
final - 1200964
lower - 18559
final - 16565
lower - 507246
final - 2996
lower - 639
final - 3221050
lower - 14896
final - 290
lower - 1192349
final - 2677
lower - 27231
final - 1428
lower - 11281
final - 1192349
lower - 13786
final - 880316
lower - 490101
final - 29938
lower - 2268
final - 27231
lower - 62162
final - 13786
lower - 8787
final - 1084
lower - 2501
final - 2391
lower - 60100
final - 490244
lower - 101972
final - 507246
lower - 106547
final - 11281
lower - 2655245
final - 18559
lower - 826909
final - 14896
lower - 874
final - 639
lower - 12049
final - 2655245
lower - 51610
final - 101972
lower - 3733
final - 490101
lower - 31818
final - 2501
lower - 8531
final - 2268
lower - 5716
final - 12049
lower - 19028
final - 60100
lower - 3466135
final - 106547
lower - 14174
final - 874
lower - 16651
final - 8787
lower 

final - 28868
lower - 12872
final - 1427151
lower - 18623
final - 62194
lower - 75805
final - 828
lower - 109042
final - 2751605
lower - 111667
final - 1329
lower - 53729
final - 173606
lower - 125410
final - 12872
lower - 11323
final - 205401
lower - 50280
final - 11724
lower - 13406
final - 92678
lower - 5163
final - 15718
lower - 33900
final - 75805
lower - 59775
final - 53729
lower - 5347
final - 3296
lower - 28430
final - 18623
lower - 3899
final - 125410
lower - 3840
final - 11323
lower - 4271380
final - 109042
lower - 37382
final - 5163
lower - 32983
final - 13406
lower - 11306
final - 111667
lower - 293484
final - 5347
lower - 605613
final - 33900
lower - 3291
final - 59775
lower - 27188
final - 32983
lower - 1314512
final - 4271380
lower - 3969
final - 293484
lower - 1064064
final - 11306
lower - 59730
final - 27188
lower - 22333
final - 3291
lower - 5096074
final - 28430
lower - 1312842
final - 605613
lower - 166528
final - 50280
lower - 40842
final - 37382
lower - 14642
fina

final - 1806970
lower - 613080
final - 2207
lower - 49986
final - 142787
lower - 85394
final - 13719
lower - 4787
final - 35911
lower - 52062
final - 5783
lower - 100459
final - 613080
lower - 20812
final - 52436
lower - 161171
final - 52062
lower - 928480
final - 4787
lower - 113858
final - 598895
lower - 1943
final - 25126
lower - 20063
final - 20812
lower - 790221
final - 49986
lower - 183504
final - 928480
lower - 57847
final - 100459
lower - 186079
final - 8830
lower - 4226
final - 183504
lower - 4944364
final - 161171
lower - 100013
final - 186079
lower - 39776
final - 57847
lower - 186296
final - 85394
lower - 46709
final - 123954
lower - 717865
final - 4944364
lower - 161949
final - 20063
lower - 357918
final - 1943
lower - 131140
final - 4819
lower - 4399308
final - 186296
lower - 52033
final - 113858
lower - 115894
final - 790221
lower - 5135479
final - 39776
lower - 7622
final - 46709
lower - 126315
final - 717865
lower - 45599
final - 161949
lower - 82997
final - 5135479
lo

final - 4722725
lower - 690608
final - 268172
lower - 1500883
final - 102000
lower - 40981
final - 112075
lower - 147968
final - 830369
lower - 1087178
final - 1211
lower - 1440515
final - 4696
lower - 1019349
final - 147968
lower - 955419
final - 1087178
lower - 5193014
final - 690608
lower - 10766
final - 40981
lower - 203183
final - 77807
lower - 431501
final - 955419
lower - 213055
final - 5193014
lower - 3262208
final - 1647329
lower - 161894
final - 1440515
lower - 54618
final - 449523
lower - 15747
final - 203183
lower - 104471
final - 431501
lower - 52460
final - 161894
lower - 735163
final - 3680626
lower - 217589
final - 1019349
lower - 64970
final - 15747
lower - 41453
final - 54618
lower - 5267361
final - 213055
lower - 73184
final - 64725
lower - 724600
final - 217589
lower - 131194
final - 104471
lower - 89384
final - 1500883
lower - 378192
final - 89384
lower - 10328
final - 3262208
lower - 826902
final - 10766
lower - 44485
final - 41453
lower - 120406
final - 52460
low

final - 103872
lower - 1131194
final - 89717
lower - 219004
final - 130586
lower - 1070894
final - 8772743
lower - 217723
final - 119571
lower - 61911
final - 521577
lower - 327474
final - 645872
lower - 185837
final - 1755487
lower - 220980
final - 15243
lower - 27660
final - 1070894
lower - 3818572
final - 1131194
lower - 95560
final - 219004
lower - 141371
final - 51237
lower - 1862610
final - 8383
lower - 4358
final - 772836
lower - 99123
final - 217723
lower - 287284
final - 27660
lower - 1199043
final - 185837
lower - 506197
final - 1862610
lower - 202923
final - 3818572
lower - 838910
final - 61911
lower - 175452
final - 141371
lower - 417302
final - 1199043
lower - 7427257
final - 95560
lower - 147300
final - 220980
lower - 4339337
final - 147300
lower - 988594
final - 327474
lower - 99877
final - 99123
lower - 602079
final - 287284
lower - 562674
final - 838910
lower - 72164
final - 506197
lower - 7515559
final - 4358
lower - 907151
final - 202923
lower - 356223
final - 72164


final - 77131
lower - 1754012
final - 3381266
lower - 1656311
final - 970248
lower - 205563
final - 600569
lower - 5481491
final - 136279
lower - 275555
final - 1656311
lower - 794869
final - 941122
lower - 1044307
final - 112421
lower - 507990
final - 5481491
lower - 5365398
final - 1673091
lower - 283247
final - 1754012
lower - 2613576
final - 102872
lower - 282241
final - 1652843
lower - 4140926
final - 280332
lower - 156031
final - 275555
lower - 5659797
final - 794869
lower - 98867
final - 283247
lower - 93318
final - 5365398
lower - 34460
final - 4751674
lower - 1139663
final - 205563
lower - 3330635
final - 282241
lower - 211244
final - 507990
lower - 1672774
final - 34460
lower - 1058306
final - 5659797
lower - 47464
final - 3330635
lower - 1629613
final - 1672774
lower - 272273
final - 98867
lower - 597105
final - 1629613
lower - 4190696
final - 2613576
lower - 2458221
final - 4140926
lower - 4397711
final - 597105
lower - 258910
final - 272273
lower - 10801374
final - 93318
l

final - 8857771
lower - 122741
final - 411602
lower - 97935
final - 1689257
lower - 185875
final - 748673
lower - 179375
final - 1133606
lower - 4433905
final - 1613965
lower - 201720
final - 255591
lower - 4939427
final - 495347
lower - 1138346
final - 122741
lower - 7181545
final - 201720
lower - 2113188
final - 3466790
lower - 7893877
final - 4669366
lower - 4274243
final - 6228091
lower - 189614
final - 7181545
lower - 368876
final - 185875
lower - 1287795
final - 97935
lower - 1850419
final - 1138346
lower - 504123
final - 4433905
lower - 899091
final - 3030951
lower - 1752157
final - 179375
lower - 3923062
final - 7893877
lower - 5094214
final - 2113188
lower - 385414
final - 368876
lower - 3808413
final - 4274243
lower - 259864
final - 3923062
lower - 463969
final - 504123
lower - 341217
final - 4939427
lower - 12079694
final - 5094214
lower - 256445
final - 385414
lower - 570760
final - 259864
lower - 1013435
final - 1287795
lower - 964464
final - 189614
lower - 4748711
final -

final - 593719
lower - 231864
final - 206917
lower - 564955
final - 309226
lower - 5459484
final - 1363763
lower - 2426768
final - 1238826
lower - 4462634
final - 3207910
lower - 5014324
final - 1003512
lower - 4629444
final - 742379
lower - 691783
final - 136968
lower - 163904
final - 4116541
lower - 507712
final - 9054252
lower - 73513
final - 5459484
lower - 155141
final - 4462634
lower - 735834
final - 691783
lower - 766619
final - 2426768
lower - 750746
final - 564955
lower - 4912189
final - 231864
lower - 2157155
final - 507712
lower - 5137180
final - 1012900
lower - 205462
final - 155141
lower - 195335
final - 735834
lower - 415125
final - 163904
lower - 2975676
final - 73513
lower - 555167
final - 205462
lower - 2312682
final - 5137180
lower - 6570628
final - 5014324
lower - 448376
final - 2157155
lower - 668682
final - 766619
lower - 203842
final - 750746
lower - 1551697
final - 4629444
lower - 1544307
final - 2975676
lower - 982490
final - 4912189
lower - 7437777
final - 5551

final - 3417209
lower - 2249643
final - 160005
lower - 148669
final - 127579
lower - 1221491
final - 2886456
lower - 3808930
final - 1297117
lower - 968956
final - 9017939
lower - 3837822
final - 439152
lower - 731965
final - 1894290
lower - 616225
final - 1344485
lower - 167367
final - 2249643
lower - 2664715
final - 167367
lower - 3474734
final - 11234344
lower - 426419
final - 2592289
lower - 1705125
final - 731965
lower - 5131460
final - 52680
lower - 5625488
final - 148669
lower - 5450512
final - 968956
lower - 3959276
final - 3808930
lower - 982080
final - 1221491
lower - 1287173
final - 3837822
lower - 1129422
final - 3474734
lower - 443302
final - 5131460
lower - 1103421
final - 616225
lower - 9024
final - 5625488
lower - 1053200
final - 426419
lower - 1881908
final - 2664715
lower - 9376375
final - 1287173
lower - 5866768
final - 1103421
lower - 5438165
final - 5450512
lower - 730217
final - 3959276
lower - 3987164
final - 1705125
lower - 2700395
final - 982080
lower - 3459748

final - 1704316
lower - 3818167
final - 1876060
lower - 2798997
final - 29369
lower - 327408
final - 9581767
lower - 4197564
final - 3444965
lower - 7360900
final - 3721530
lower - 2107289
final - 5697341
lower - 10767905
final - 565633
lower - 7408560
final - 2353003
lower - 7408560
final - 2844920
lower - 4205629
final - 2798997
lower - 1128416
final - 5864405
lower - 4338659
final - 7360900
lower - 18362
final - 7408560
lower - 2488001
final - 10767905
lower - 5121861
final - 7408560
lower - 1914664
final - 327408
lower - 12065164
final - 1128416
lower - 3171518
final - 18362
lower - 1730509
final - 2488001
lower - 4810509
final - 4338659
lower - 7452572
final - 4704731
lower - 1630422
final - 3818167
lower - 6920866
final - 4205629
lower - 274762
final - 5121861
lower - 9384894
final - 2107289
lower - 3191548
final - 12065164
lower - 1674773
final - 3171518
lower - 1326928
final - 274762
lower - 4760113
final - 9384894
lower - 8024026
final - 3191548
lower - 7294451
final - 1730509

final - 8379351
lower - 10835309
final - 10722349
lower - 12006691
final - 8390833
lower - 3783852
final - 1146395
lower - 7085452
final - 10729500
lower - 11161880
final - 11648350
lower - 4394682
final - 2460508
lower - 5251108
final - 2995935
lower - 8814851
final - 10835309
lower - 1028293
final - 4978644
lower - 9389305
final - 7085452
lower - 9282177
final - 10234086
lower - 9282177
final - 11161880
lower - 4344997
final - 5251108
lower - 10633936
final - 4394682
lower - 11797709
final - 2997540
lower - 11489161
final - 12006691final - 1028293
lower - 8005224

lower - 10720147
final - 9389305
lower - 7141894
final - 9282177
lower - 9678857
final - 10633936
lower - 320715
final - 8814851
lower - 8145609
final - 1714414
lower - 8452300
final - 9678857
lower - 8619585
final - 9282177
lower - 11362688
final - 11489161
lower - 9067187
final - 8005224
lower - 8481012
final - 7141894
lower - 11484644
final - 11797709
lower - 1158047
final - 3783852
lower - 11677895
final - 4344997
lower

DUR: 34655.781871795654

In [28]:
len(preprocessed_content)

2767

In [31]:
df_pp = pd.DataFrame(preprocessed_content.items())
df_pp = df_pp.rename({0: "pageid", 1:"preprocessed_content"}, axis = 1)

In [32]:
df_pp.head()

Unnamed: 0,pageid,preprocessed_content
0,1200964,"[[infobox], [schrift], [japanisch], [typ], [lo..."
1,16565,"[[dateila], [scuola], [di], [raffaelraffael], ..."
2,2996,"[[dateila], [scuola], [di], [raffaelraffael], ..."
3,3221050,"[[infobox], [ort], [deutsch], [name], [name], ..."
4,290,"[[weit], [bedeutung], [sehen], [sanken], [pete..."


In [33]:
df_pp.to_csv("./all_preprocessed_excellent_article.csv")

In [None]:
# save data to csv for faster loading
#df.to_csv("./preprocessed_excellent_article-109.csv")

In [None]:
final = [gensim.utils.simple_preprocess(word, deacc = True) for word in test_page_lemma]

id2word = corpora.Dictionary(final)

corpus = [id2word.doc2bow(word) for word in final]

In [35]:
34655.781871795654 / 60 / 60

9.626606075498792