## Imports

In [28]:
# Pandas is an open source data analysis and manipulation tool
import pandas as pd
from pandas import json_normalize

# os gives access to the operating system
import os
# The datetime module supplies classes for manipulating dates and times.
from datetime import datetime
import datetime
# This module provides various time-related functions.
import time

# Natural language toolkit
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# library to create visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.offsetbox import AnchoredText


# open source library for automating downloading of reports from Google Trends
from pytrends.request import TrendReq

# library to get html of website (wikipedia)
import requests
# json to use wikipedia return
import json

## Load data

In [29]:
# Load exploded dataframe
path = "C:/Users/Jan/Documents/Python_Projects/Bachelorthesis/Bachelorthesis/Analysis/DataFrames/keyWords.csv"

keyWord_list = pd.read_csv(path, index_col=None,header=0)

keyWord_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89807 entries, 0 to 89806
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   KeyWord    89807 non-null  object
 1   Occurence  89807 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.4+ MB


## Create list of keyWords with occurence above N

In [36]:
keyWord_list_n25 = keyWord_list[keyWord_list["Occurence"] >= 25]
Wikipedia_DataFrame = get_wikipedia_interest_over_time(keyWord_list_n25.KeyWord.tolist())

nan
nan
nan
nan
nan
nan
nan
nan
nan
409
11390
269
13318
192
219
446
190
310
89
420
95
98
1766
417
258
26
195
74
45
65
78
1023
15
nan
80
87
114
84
nan
36
18
155
28
217
792
444
212075
nan
5841
3207
16976
561
9
771
119
1902
18
46
48
175
nan
72
203
250
565
122
347
429
32
254
68
2952
107
7
125
1458
184
28
213
708
168
73
109
2729
397
35
758
111
1702
468
992
2383
207
234
51
333
1941
102
119
63
4
38
183
nan
208
nan
7503
1762
150
82
10
24
902
101
nan
690
203
57
29
nan
10372
242
28
nan
2583
4
221
nan
179
116
65
8286
1083
273
23
217
292
854
115
220
464
521
779
2175
203
791
147
128
111
844
1518
171
nan
52
nan
14
114
1387
302
127
20
40
209
nan
1739
18
nan
116
118
nan
103
181
457
nan
1564
7
261
72
463
202
29
162
59
80
10985
nan
152
nan
180
23
nan
198
nan
6
6311
9234
nan
29
nan
213
789
nan
11
nan
30
nan
22
97
3610
33
147
28
38
5
222
1518
282
1476
5906
616
268
210
784
169
23
134
8248
11
142
309
93
9207
184
1331
61
3178
151
198
3
1809
3
41
nan
40
29
202
17
654
17
51
263
1328
60
nan
nan
5613
nan
nan
nan

In [35]:
def get_wikipedia_interest_over_time(keyword_list):
    dateTime=datetime.datetime.now()
    currentDate = dateTime.strftime("%Y%m%d")
    return_list=[]
    # chunk keywords due to wikimedia policy
    for chunk in make_chunks(keyword_list, 200):
        for keyword in chunk:
            headers = {'user-agent': 'Influence_of_daily_political_news_on_the_use_of_wikipedia/1.0 (jan.schuckatt-online.de)'}
            url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/de.wikipedia.org/all-access/all-agents/{keyword}/daily/20211107/{currentDate}"
            r = requests.get(url, headers=headers)
            return_list.append(r.text)
        time.sleep(2)
    # deserialize json
    json_dict=[]
    for wiki_dict in return_list:
        data = json.loads(wiki_dict)
        json_dict.append(data)

    #json to dataframe
    lst_of_df=[]
    pages_found_counter = 0
    pages_not_found_counter = 0
    for counter in range(len(json_dict)):
        try:
            lst_of_df.append(pd.DataFrame.from_records(json_dict[counter]["items"]))
            pages_found_counter = pages_found_counter + 1
        except:
            pages_not_found_counter = pages_not_found_counter + 1

    wikipedia_DF = pd.concat(lst_of_df)

    wikipedia_DF = wikipedia_DF.rename(columns={"article" : "KeyWord", "views" : "Occurence_in_Wikipedia", "timestamp" : "date"})

    wikipedia_DF = setCorrectDataTypes_wikipedia(wikipedia_DF)
    wikipedia_DF = wikipedia_DF[["KeyWord", "date", "Occurence_in_Wikipedia"]]
    wikipedia_DF = normalize_column_by_keyword(wikipedia_DF,keyword_list,"Occurence_in_Wikipedia")
    print(f"Successfull: {pages_found_counter}")
    print(f"Unsuccessfull: {pages_not_found_counter}")
    return wikipedia_DF

# set set datetime format
def setCorrectDataTypes_wikipedia(dataframe):
    dataframe["date"] = pd.to_datetime(dataframe["date"], format='%Y%m%d%H')
    return dataframe

def make_chunks(data, chunk_size):
    while data:
        chunk, data = data[:chunk_size], data[chunk_size:]
        yield chunk

def normalize_column_by_keyword(dataframe, keyword_list, column):
    dataframe_list = []
    new_column_name = "normalized_" + column
    for keyword in keyword_list:
        working_df = dataframe[dataframe['KeyWord'] == keyword]
        max_occurence = working_df[column].max()
        print(max_occurence)
        df_copy = working_df.copy()
        df_copy[new_column_name] = working_df[column] /working_df[column].abs().max()
        dataframe_list.append(df_copy)
    return pd.concat(dataframe_list)

In [37]:
def saveCSV(dataframe, filename):
    dataframe.to_csv("C:/Users/Jan/Documents/Python_Projects/Bachelorthesis/Bachelorthesis/Analysis/DataFrames/"+ filename +".csv",index=False)

In [38]:
saveCSV(Wikipedia_DataFrame,"Wikipedia_DataFrame")