## Imports

In [13]:
# Pandas is an open source data analysis and manipulation tool
import pandas as pd
from pandas import json_normalize

# os gives access to the operating system
import os
# The datetime module supplies classes for manipulating dates and times.
from datetime import datetime
import datetime
# This module provides various time-related functions.
import time

# Natural language toolkit
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# library to create visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.offsetbox import AnchoredText


# open source library for automating downloading of reports from Google Trends
from pytrends.request import TrendReq

# library to get html of website (wikipedia)
import requests
# json to use wikipedia return
import json




## Load dataframes

In [9]:
# Directory and file names of the three CSV inputs used by this notebook
path = "C:/Users/Jan/Documents/Python_Projects/Bachelorthesis/Bachelorthesis/Analysis/DataFrames/"
filename_news = "All_news_articles_exploded.csv"
filename_Google = "Google_DataFrame.csv"
filename_wiki = "Wikipedia_DataFrame.csv"

# Read every file the same way: first row is the header, no column is used as index
News_DataFrame, Google_DataFrame, Wiki_DataFrame = (
    pd.read_csv(path + csv_name, index_col=None, header=0)
    for csv_name in (filename_news, filename_Google, filename_wiki)
)


  News_DataFrame = pd.read_csv(path+filename_news, index_col=None,header=0)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584887 entries, 0 to 584886
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Titel                  584887 non-null  object 
 1   Overline               485780 non-null  object 
 2   Date_Info              584887 non-null  object 
 3   Kategorie              576257 non-null  object 
 4   URL                    584887 non-null  object 
 5   detailed_informations  203986 non-null  object 
 6   bild_plus              125880 non-null  object 
 7   Zugriff_Datum          493280 non-null  float64
 8   News_page              584887 non-null  object 
 9   Breadcrumb             276660 non-null  object 
 10  author                 55600 non-null   object 
 11  Titel_and_Overline     584887 non-null  object 
 12  Tokens                 584887 non-null  object 
dtypes: float64(1), object(12)
memory usage: 58.0+ MB


## Only use keywords that occurred at least n = 50 times

In [60]:
# NOTE(review): in the visible notebook `df` is first assigned by the
# build_Occurence_df(...) call in a later cell, so on a fresh kernel this
# cell would raise NameError when run top to bottom.
df

Unnamed: 0,KeyWord,date,Occurence_in_Wikipedia,normalized_Occurence_in_Wikipedia,Occurence_in_Google,normalized_Occurence_in_Google
0,2G,2022-01-10,38,0.092910,97,0.97
1,2G,2022-01-11,39,0.095355,100,1.00
2,2G,2022-01-12,49,0.119804,86,0.86
3,2G,2022-01-13,49,0.119804,78,0.78
4,2G,2022-01-14,54,0.132029,83,0.83
...,...,...,...,...,...,...
219866,Überraschung,2022-07-02,21,0.265823,14,0.14
219867,Überraschung,2022-07-03,17,0.215190,14,0.14
219868,Überraschung,2022-07-04,26,0.329114,10,0.10
219869,Überraschung,2022-07-05,41,0.518987,12,0.12


In [73]:
# Build the combined news / Google / Wikipedia table for keywords that
# appear at least 50 times in the news data, then persist it as CSV.
df = build_Occurence_df(
    news_dataFrame=News_DataFrame,
    n_occurences=50,
    wikipedia_DataFrame=Wiki_DataFrame,
    google_DataFrame=Google_DataFrame,
)
saveCSV(dataframe=df, filename="Occurence_DataFrame")

In [67]:
def build_Occurence_df(news_dataFrame, n_occurences, wikipedia_DataFrame, google_DataFrame):
    """Combine daily keyword counts from news with Google and Wikipedia data.

    Parameters
    ----------
    news_dataFrame : pandas.DataFrame
        News articles; the columns ``Date_Info``, ``Tokens`` and ``Kategorie``
        are read.
    n_occurences : int
        Threshold passed to ``get_titles_with_minimum_occurence_N``.
    wikipedia_DataFrame : pandas.DataFrame
        Must provide ``KeyWord`` and ``date`` columns (used as merge keys).
    google_DataFrame : pandas.DataFrame
        Must provide ``KeyWord`` and ``date`` columns (used as merge keys).

    Returns
    -------
    pandas.DataFrame
        One row per (KeyWord, date) pair present in all three sources (both
        merges are inner joins), containing ``Occurence_in_News``,
        ``normalized_Occurence_in_News`` and the columns of the other two
        frames. ``Occurence_in_Google`` (presumably supplied by
        ``google_DataFrame`` - verify against the CSV) has NaN replaced by 0
        and is cast to int.
    """
    merge_keys = ['KeyWord', "date"]

    # Restrict the news data to tokens that clear the occurrence threshold.
    frequent_articles = get_titles_with_minimum_occurence_N(
        news_dataFrame[["Date_Info", "Tokens", "Kategorie"]], n_occurences
    )
    keywords = list(frequent_articles["Tokens"].drop_duplicates())

    # Work on copies so the caller's frames keep their original date column.
    google_frame = google_DataFrame.copy()
    google_frame['date'] = pd.to_datetime(google_frame.date)
    wiki_frame = wikipedia_DataFrame.copy()
    wiki_frame['date'] = pd.to_datetime(wiki_frame.date)

    frequent_articles = frequent_articles.rename(
        columns={"Date_Info": "date", "Tokens": "Occurence_in_news"}
    )
    # Reduce every timestamp to a 'YYYY-MM-DD' string so that all articles of
    # one calendar day fall into the same pivot row.
    frequent_articles['date'] = pd.to_datetime(frequent_articles.date).dt.strftime('%Y-%m-%d')

    # Wide table: one row per day, one column per keyword, cell = article count.
    daily_counts = (
        frequent_articles
        .pivot_table(index='date', columns='Occurence_in_news', aggfunc='size')
        .rename_axis(None, axis=1)
        .fillna(0)
        .astype(int)
    )

    # Back to long format: one row per (keyword, day).
    news_long = daily_counts.unstack().reset_index().rename(
        columns={"level_0": "KeyWord", 0: "Occurence_in_News"}
    )
    news_long["date"] = pd.to_datetime(news_long["date"], format='%Y-%m-%d')
    news_long = normalize_column_by_keyword(news_long, keywords, "Occurence_in_News")

    # Inner-join news with Google, then with Wikipedia.
    combined = news_long.merge(google_frame, on=merge_keys).merge(wiki_frame, on=merge_keys)

    combined["Occurence_in_Google"] = combined["Occurence_in_Google"].fillna(0).astype(int)
    return combined

In [70]:
def get_occurence_of_all_titles(data_frame, columnName):
    """Return how many rows share each value of ``columnName``.

    The result is a Series indexed by the distinct values of ``columnName``
    whose values are the row counts, ordered from most to least frequent.
    """
    return data_frame.groupby(columnName).size().sort_values(ascending=False)


def get_occurence_of_all_capital_titles(data_frame, columnName):
    """Count how often each title-cased value of ``columnName`` occurs.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing the column to count.
    columnName : str
        Column whose distinct values are counted.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title`` (the original, unmodified value) and ``Occurence``
        (number of rows holding that value), sorted by ``Occurence`` in
        descending order. Only values for which ``str.istitle()`` is true
        after stripping surrounding whitespace are kept.
    """
    counts = data_frame.groupby(columnName).size()

    # Check the key itself, not the string form of the (key, count) tuple:
    # the tuple's repr escapes characters such as a non-breaking space
    # ('\xa0' becomes the text "\xa0"), which made istitle() reject
    # otherwise valid keys.
    capital_titles = [
        (key, count)
        for key, count in counts.items()
        if str(key).strip().istitle()
    ]

    Titles = pd.DataFrame.from_records(capital_titles, columns=['Title', 'Occurence'])
    return Titles.sort_values(by=['Occurence'], ascending=False)


def get_titles_with_minimum_occurence_N(data_frame, N):
    """Keep only the rows whose ``Tokens`` value is frequent enough.

    A token qualifies when ``get_occurence_of_all_capital_titles`` reports it
    with an ``Occurence`` of at least ``N``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain a ``Tokens`` column.
    N : int
        Minimum number of occurrences (inclusive).

    Returns
    -------
    pandas.DataFrame
        The rows of ``data_frame`` whose ``Tokens`` value qualifies; empty
        when no token reaches ``N``.
    """
    all_titles = get_occurence_of_all_capital_titles(data_frame, "Tokens")

    # Vectorised threshold filter instead of collecting rows one by one.
    frequent_titles = all_titles.loc[all_titles["Occurence"] >= N, "Title"]

    return data_frame[data_frame["Tokens"].isin(frequent_titles)]


def normalize_column_by_keyword(dataframe, keyword_list, column):
    """Scale ``column`` per keyword by that keyword's largest absolute value.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``KeyWord`` column and the column named by ``column``.
    keyword_list : iterable
        Keywords to process. Rows whose ``KeyWord`` is not listed are not
        part of the result; the output follows the order of this iterable.
    column : str
        Name of the numeric column to normalise.

    Returns
    -------
    pandas.DataFrame
        The selected rows (original index preserved) with an additional
        column ``"normalized_" + column``. If a keyword's largest absolute
        value is 0 the division yields NaN for its rows. When
        ``keyword_list`` is empty an empty frame with the same columns plus
        the new one is returned.
    """
    new_column_name = "normalized_" + column

    parts = []
    for keyword in keyword_list:
        # .copy() so that adding the new column never touches the input frame.
        keyword_rows = dataframe[dataframe['KeyWord'] == keyword].copy()
        peak = keyword_rows[column].abs().max()
        keyword_rows[new_column_name] = keyword_rows[column] / peak
        parts.append(keyword_rows)

    if not parts:
        # pd.concat raises ValueError on an empty list; return an empty
        # frame with the expected columns instead.
        empty = dataframe.iloc[0:0].copy()
        empty[new_column_name] = pd.Series(dtype="float64")
        return empty

    return pd.concat(parts)


def saveCSV(dataframe, filename, directory="C:/Users/Jan/Documents/Python_Projects/Bachelorthesis/Bachelorthesis/Analysis/DataFrames/"):
    """Write ``dataframe`` to ``<directory>/<filename>.csv`` without the index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to persist.
    filename : str
        File name without the ``.csv`` extension.
    directory : str, optional
        Target directory. Defaults to the project's DataFrames folder, so
        existing two-argument calls write to the same location as before.
    """
    # os.path.join tolerates a directory given with or without a trailing separator.
    target = os.path.join(directory, filename + ".csv")
    dataframe.to_csv(target, index=False)