## Imports

In [1]:
import pandas as pd
import datetime

## Load dataframes

In [2]:
# Folder that holds the CSV exports, followed by the three file names.
path = "C:/Users/Jan/Documents/Python_Projects/Bachelorthesis/Bachelorthesis/Analysis/DataFrames/"
filename_news = "All_news_articles_exploded.csv"
filename_Google = "Google_DataFrame.csv"
filename_wiki = "Wikipedia_DataFrame.csv"

# News articles: read the file, then parse "Date_Info" (YYYY-MM-DD) into datetimes
# so the column can be compared against datetime boundaries later on.
News_DataFrame = pd.read_csv(f"{path}{filename_news}", header=0, index_col=None)
News_DataFrame["Date_Info"] = pd.to_datetime(News_DataFrame["Date_Info"], format='%Y-%m-%d')

# Google and Wikipedia tables are loaded without further processing.
Google_DataFrame = pd.read_csv(f"{path}{filename_Google}", header=0, index_col=None)
Wiki_DataFrame = pd.read_csv(f"{path}{filename_wiki}", header=0, index_col=None)


  News_DataFrame = pd.read_csv(path+filename_news, index_col=None,header=0)


# Build occurrence DataFrame

## 1. Occurrence DataFrame per media house and combined

In [5]:
# NOTE: build_Occurence_df and saveCSV are defined in cells further down in this
# notebook; those cells have to be executed before this one.
media_houses = ["Spiegel", "Bild", "Sueddeutsche"]

# Combined table over every article, regardless of media house.
df = build_Occurence_df(News_DataFrame, 50, Wiki_DataFrame, Google_DataFrame)
saveCSV(df, "Occurence_DataFrame")

# One additional table per media house, built from the matching "News_page" rows.
for media_house in media_houses:
    News_DataFrame_ = News_DataFrame.loc[News_DataFrame["News_page"] == media_house]
    df = build_Occurence_df(News_DataFrame_, 50, Wiki_DataFrame, Google_DataFrame)
    saveCSV(df, "Occurence_DataFrame_" + media_house)


## 2. Occurrence DataFrame by season

In [6]:
# NOTE: build_Occurence_df and saveCSV are defined in cells further down in this
# notebook; those cells have to be executed before this one.
#
# Season boundaries. Both ends are inclusive: the previous masks used an
# exclusive start (`>`) together with an inclusive end (`<=`), so articles dated
# exactly on a start day (2022-01-10, 2022-03-20, 2022-06-21) belonged to no
# season at all. Series.between() includes both bounds by default.
winter_start = datetime.datetime(2022, 1, 10)
winter_end = datetime.datetime(2022, 3, 19)
winter_mask = News_DataFrame['Date_Info'].between(winter_start, winter_end)

spring_start = datetime.datetime(2022, 3, 20)
spring_end = datetime.datetime(2022, 6, 20)
spring_mask = News_DataFrame['Date_Info'].between(spring_start, spring_end)

summer_start = datetime.datetime(2022, 6, 21)
summer_end = datetime.datetime(2022, 7, 7)
summer_mask = News_DataFrame['Date_Info'].between(summer_start, summer_end)

# Per-season slices of the news table.
News_DataFrame_Winter = News_DataFrame.loc[winter_mask]
News_DataFrame_Spring = News_DataFrame.loc[spring_mask]
News_DataFrame_Summer = News_DataFrame.loc[summer_mask]

season_masks = [[winter_mask, "winter"], [spring_mask, "spring"], [summer_mask, "summer"]]

# Build and store one occurrence table per season.
for season_mask, season in season_masks:
    df = build_Occurence_df(News_DataFrame.loc[season_mask], 50, Wiki_DataFrame, Google_DataFrame)
    saveCSV(df, f"Occurence_DataFrame_{season}")

In [3]:
def build_Occurence_df(news_dataFrame, n_occurences, wikipedia_DataFrame, google_DataFrame):
    """Join daily news counts with the Google and Wikipedia tables and smooth them.

    The news rows are reduced to the tokens returned by
    get_titles_with_minimum_occurence_N, counted per day and token, normalised
    per keyword, inner-joined with both external tables on ("KeyWord", "date")
    and finally passed through smoothen_timeseries once per keyword.

    Parameters
    ----------
    news_dataFrame : pandas.DataFrame
        Needs the columns "Date_Info", "Tokens" and "Kategorie".
    n_occurences : int
        Threshold handed to get_titles_with_minimum_occurence_N.
    wikipedia_DataFrame : pandas.DataFrame
        Needs "KeyWord" and "date" columns; only a copy is modified.
    google_DataFrame : pandas.DataFrame
        Needs "KeyWord", "date" and "Occurence_in_Google"; only a copy is modified.

    Returns
    -------
    pandas.DataFrame
        Result of ``groupby("KeyWord").apply(smoothen_timeseries)`` on the merged table.
    """
    frequent_tokens = get_titles_with_minimum_occurence_N(
        news_dataFrame[["Date_Info", "Tokens", "Kategorie"]], n_occurences
    )

    # Distinct tokens that survived the filter; used for the per-keyword normalisation.
    keyword_list = list(frequent_tokens["Tokens"].drop_duplicates())

    # Work on copies so the caller's Google / Wikipedia frames stay untouched.
    google_data = google_DataFrame.copy()
    wikipedia_data = wikipedia_DataFrame.copy()

    daily = frequent_tokens.rename(columns={"Date_Info": "date", "Tokens": "Occurence_in_news"})

    # All three tables get a datetime "date" column ...
    daily["date"] = pd.to_datetime(daily["date"])
    google_data["date"] = pd.to_datetime(google_data["date"])
    wikipedia_data["date"] = pd.to_datetime(wikipedia_data["date"])
    # ... and the news dates are then rendered as YYYY-MM-DD strings for the pivot.
    daily["date"] = daily["date"].dt.strftime("%Y-%m-%d")

    # Wide layout: one row per date, one column per token, cell = number of rows.
    counts = daily.pivot_table(index="date", columns="Occurence_in_news", aggfunc="size")
    counts = counts.rename_axis(None, axis=1).fillna(0).astype(int)

    # Back to long layout: one row per (KeyWord, date) pair.
    news_df = (
        counts.unstack()
        .reset_index()
        .rename(columns={"level_0": "KeyWord", 0: "Occurence_in_News"})
    )
    news_df["date"] = pd.to_datetime(news_df["date"], format="%Y-%m-%d")
    news_df = normalize_column_by_keyword(news_df, keyword_list, "Occurence_in_News")

    # Inner joins: only (KeyWord, date) pairs present in all three tables remain.
    news_and_google = pd.merge(news_df, google_data, on=["KeyWord", "date"])
    full_df = pd.merge(news_and_google, wikipedia_data, on=["KeyWord", "date"])

    # Missing Google values count as zero; the column is stored as integers.
    google_counts = full_df["Occurence_in_Google"].fillna(0)
    full_df["Occurence_in_Google"] = google_counts.astype(int)

    return full_df.groupby("KeyWord").apply(smoothen_timeseries)

In [4]:
def smoothen_timeseries(dataframe, window_size=3):
    """Return a copy of ``dataframe`` with rolling-mean versions of the occurrence columns.

    For each of the six raw and normalised occurrence columns a new column
    ``"smoothened_" + <name>`` is added that holds the rolling mean over
    ``window_size`` rows; positions without a full window are filled with 0.

    The input frame is left untouched. Earlier versions wrote the new columns
    into the caller's frame, which is unsupported when the function is used as
    a ``groupby(...).apply`` callback.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain "Occurence_in_News", "Occurence_in_Google",
        "Occurence_in_Wikipedia" and their "normalized_"-prefixed counterparts.
        Rows are smoothed in their current order.
    window_size : int, optional
        Number of consecutive rows per rolling window (default 3).

    Returns
    -------
    pandas.DataFrame
        Copy of the input with six additional "smoothened_*" columns.
    """
    smoothed = dataframe.copy()

    normalized_column_names = [
        "normalized_Occurence_in_News",
        "normalized_Occurence_in_Google",
        "normalized_Occurence_in_Wikipedia",
    ]
    raw_column_names = ["Occurence_in_News", "Occurence_in_Google", "Occurence_in_Wikipedia"]

    for column_name in normalized_column_names + raw_column_names:
        # Read from the untouched input, write to the copy.
        rolling_mean = dataframe[column_name].rolling(window_size).mean()
        smoothed["smoothened_" + column_name] = rolling_mean.fillna(0)

    return smoothed

def get_occurence_of_all_titles(data_frame, columnName):
    """Count how often each value of ``columnName`` occurs, most frequent first.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table to count in.
    columnName : str
        Column whose distinct values are counted.

    Returns
    -------
    pandas.Series
        Group sizes indexed by the column's values, sorted in descending order.
    """
    group_sizes = data_frame.groupby(columnName).size()
    return group_sizes.sort_values(ascending=False)


def get_occurence_of_all_capital_titles(data_frame, columnName):
    """Count the title-cased values of ``columnName``, most frequent first.

    A value is kept when its text, stripped of surrounding whitespace,
    satisfies ``str.istitle()``. The check runs on the value itself; it was
    previously run on the string form of the whole ``(value, count)`` tuple,
    which rejected values whose repr contains escape sequences (for example a
    non-breaking space between two words).

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table to count in.
    columnName : str
        Column whose distinct values are counted.

    Returns
    -------
    pandas.DataFrame
        Columns "Title" (the unmodified value) and "Occurence" (its count),
        sorted by "Occurence" in descending order.
    """
    all_titles = data_frame.groupby(columnName).size()

    # Keep the original (unstripped) value so it still matches the source column.
    capital_titles = [
        (title, occurence)
        for title, occurence in all_titles.items()
        if str(title).strip().istitle()
    ]

    Titles = pd.DataFrame.from_records(capital_titles, columns=['Title', 'Occurence'])
    return Titles.sort_values(by=['Occurence'], ascending=False)


def get_titles_with_minimum_occurence_N(data_frame, N):
    """Keep the rows of ``data_frame`` whose token is a frequent title-cased term.

    The "Tokens" column is counted with get_occurence_of_all_capital_titles;
    every row whose token occurs at least ``N`` times according to that count
    is returned.

    This function used to be defined twice with identical bodies; the two
    definitions are merged here.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table with a "Tokens" column.
    N : int
        Minimum number of occurrences (inclusive).

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``data_frame`` with all of their columns.
    """
    all_Titles = get_occurence_of_all_capital_titles(data_frame, "Tokens")

    # Vectorised threshold filter instead of collecting rows via iterrows().
    is_frequent = all_Titles["Occurence"].astype(int) >= N
    frequent_titles = all_Titles.loc[is_frequent, "Title"]

    return data_frame[data_frame["Tokens"].isin(frequent_titles)]


def normalize_column_by_keyword(dataframe, keyword_list, column):
    """Scale ``column`` separately for every keyword by its largest absolute value.

    For each keyword in ``keyword_list`` (in that order) the rows with a
    matching "KeyWord" are copied and extended by a ``"normalized_" + column``
    column holding ``column / max(|column|)`` of that keyword. Rows whose
    keyword is not listed are not part of the result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with a "KeyWord" column and the column to normalise.
    keyword_list : list
        Keywords to process. An empty list yields an empty frame that still
        carries the additional column (it used to raise inside ``pd.concat``).
    column : str
        Name of the numeric column to normalise.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-keyword copies, input columns plus the
        normalised one.
    """
    new_column_name = "normalized_" + column

    dataframe_list = []
    for keyword in keyword_list:
        working_df = dataframe[dataframe['KeyWord'] == keyword].copy()
        # A maximum of 0 produces NaN/inf here, exactly as plain division does.
        peak = working_df[column].abs().max()
        working_df[new_column_name] = working_df[column] / peak
        dataframe_list.append(working_df)

    if not dataframe_list:
        # pd.concat refuses an empty list; hand back an empty, correctly shaped frame.
        empty = dataframe.iloc[0:0].copy()
        empty[new_column_name] = pd.Series(dtype="float64")
        return empty

    return pd.concat(dataframe_list)


def saveCSV(dataframe, filename, directory="C:/Users/Jan/Documents/Python_Projects/Bachelorthesis/Bachelorthesis/Analysis/DataFrames/"):
    """Write ``dataframe`` to ``<directory>/<filename>.csv`` without the index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to store.
    filename : str
        File name without the ".csv" extension.
    directory : str, optional
        Target folder. Defaults to the project's DataFrames folder, so existing
        two-argument calls write to the same location as before.
    """
    import os  # local import: the notebook's import cell does not provide os

    target = os.path.join(directory, filename + ".csv")
    dataframe.to_csv(target, index=False)