# Setup

In [1]:
import os

import pandas as pd
import plotly.express as px


In [2]:
def test_occurence(query, directory, filename):
    with open(f'{directory}/{filename}', 'r', encoding='utf-8') as file:

        text = file.read()

    
    for q in query.split(', '):
        if q.lower() in text.lower():
            return True
    
    return False

# Parameters

In [3]:
# Input and output locations; the sub-directories are rooted at DATA_DIRECTORY.
DATA_DIRECTORY = 'data'
SOURCE_TABLE = 'source_table.csv'
TEXT_DIRECTORY = '/'.join((DATA_DIRECTORY, 'texts'))
OUTPUT_DIRECTORY = '/'.join((DATA_DIRECTORY, 'output'))

# Search expression; several terms are separated by ', '.
query = 'Klima'

# Preparation

In [4]:
df = pd.read_csv(f'{DATA_DIRECTORY}/{SOURCE_TABLE}', sep=';')

# 'partei' holds comma-separated party names. Turn every cell into a list of
# stripped, non-empty names so parties can be compared by exact name; missing
# values become an empty list.
party_lists = df['partei'].fillna('').str.split(',').map(
    lambda names: [name.strip() for name in names if name.strip()]
)
# Distinct party names in order of first appearance.
parties = party_lists.explode().dropna().unique()

# Cast to int so range() below also works when 'jahr' was read as float.
min_year = int(df['jahr'].min())
max_year = int(df['jahr'].max())

# A row matches when its text file contains one of the query terms. The text
# file name is 'file_name' with its last three characters replaced by 'txt'.
df['match'] = [
    test_occurence(query, TEXT_DIRECTORY, file_name[:-3] + 'txt')
    for file_name in df['file_name']
]

occurences = []

for party in parties:
    # Exact-name membership: a substring/regex test would also select rows of
    # any other party whose name merely contains this one, and would
    # misinterpret names containing regex metacharacters.
    is_of_party = party_lists.map(lambda names: party in names)
    df_party = df[is_of_party & df['match']]

    # Emit a record for every year in the range, including years with zero hits.
    for year in range(min_year, max_year + 1):
        occurences.append({
            'partei': party,
            'jahr': year,
            'anz_tracktanden': len(df_party[df_party['jahr'] == year]['traktandum_guid'].unique()),
        })

df_viz = pd.DataFrame.from_records(occurences)

# Visualization

In [5]:
# One line per party: number of matching agenda items per year.
fig = px.line(df_viz, x='jahr', y='anz_tracktanden', color='partei', markers=True)

fig.update_layout(
    title=f'Anzahl Traktanden, welche einen der Ausdrücke "{query}" enthalten nach Partei und Jahr',
    xaxis_title='Jahr',
    # Spelling aligned with the title ("Traktanden").
    yaxis_title='Anzahl Traktanden',
    legend_title='Partei',
    template='plotly_white',
)

# write_html does not create missing parent directories, so make sure the
# output directory exists before exporting.
os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)
fig.write_html(f'{OUTPUT_DIRECTORY}/{query.replace(", ", "_")}.html')

fig.show()