In [1]:
import re

In [2]:
# Path (relative to the working directory) of the text file read by the next cell.
filename='data/AFD_Wahlprogramm_2021.txt'

In [3]:
# Read the whole input file. The encoding is stated explicitly so that
# non-ASCII characters decode the same way on every platform.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open(filename, encoding="utf-8") as f:
    text = f.read()

# Regex patterns are raw strings so that `\d` reaches the regex engine as-is
# instead of being an invalid string escape.
# A space followed by digits at the end of a line becomes a sentence break.
text = re.sub(r" \d+\n", ".", text)
# Digits at the start of a line are replaced by a single space.
text = re.sub(r"\n\d+", " ", text)
# Remaining line breaks are flattened into spaces.
text = re.sub("\n", " ", text)

# Split on ., ? or ! (optionally followed by closing quotes/brackets),
# swallowing the spaces around the terminator.
sentences = re.split(r' *[\.\?!][\'"\)\]]* *', text)

In [6]:
import yaml
import pandas as pd

# Base directory for data files; joined with further path parts when loading files.
DATA_DIR = "data"


def load_yaml_file(file_path):
    """
    Loads a yaml file and returns its parsed contents.
    Args:
        file_path (str):
            Path to the yaml file
    Returns:
        yaml_dict:
            Whatever `yaml.safe_load` produces for the file; a dictionary
            when the document's top level is a mapping.
    """
    # Decode explicitly as UTF-8 so keywords with non-ASCII characters are
    # read identically regardless of the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as stream:
        return yaml.safe_load(stream)


def load_markdown_file(file_path):
    """
    Loads a markdown file and returns a string with the contents.
    Args:
        file_path (str):
            Path to the markdown file
    Returns:
        markdown_str:
            String with the contents of the markdown file
    """
    # Decode explicitly as UTF-8 so non-ASCII characters are read identically
    # regardless of the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as stream:
        return stream.read()


def get_counts(df: pd.DataFrame, label_col: str = "label") -> pd.DataFrame:
    """
    Get the labels count in a dataframe.
    Args:
        df (pd.DataFrame):
            Dataframe to get the counts from.
        label_col (str):
            Column name of the label column.
    Returns:
        count_df:
            Dataframe with one row per distinct label and the columns
            `label_col`, "sentence_count" and "percent" (share of all
            counted rows, 0-100).
    """
    count_df = df[label_col].value_counts().to_frame().reset_index()
    # Rename positionally: first column holds the labels, second the counts.
    count_df.columns = [label_col, "sentence_count"]
    count_df["percent"] = count_df.sentence_count / count_df.sentence_count.sum() * 100
    # The result has to be returned; callers index into it.
    return count_df

In [13]:
from typing import Dict, List
import spacy
from string import punctuation
import pandas as pd
from tqdm.auto import tqdm
from transformers import pipeline


def _add_sentence_to_list(sentence: str, sentences_list):
    """
    Add a sentence to the list of sentences.
    Args:
        sentence (str):
            Sentence to be added.
        sentences (List[str]):
            List of sentences.
    """
    while sentence.startswith(" "):
        # remove leading space
        sentence = sentence[1:]
    if all(c in punctuation for c in sentence) or len(sentence) == 1:
        # skip sentences with only punctuation
        return
    sentences_list.append(sentence)


def get_sentences(text: str):
    """
    Split a text into sentences.
    The text is cut into paragraphs at line breaks (empty lines are dropped).
    A paragraph starting with "#" is kept whole. Any other paragraph is cut
    after every "!" or "?" and after every "." that is not directly followed
    by a digit; whatever remains at the end of the paragraph becomes the
    last sentence.
    Args:
        text (str):
            Text to be processed.
    Returns:
        List[str]:
            List of sentences.
    """
    collected = []
    for paragraph in filter(None, text.split("\n")):
        # heading-like lines are never split
        if paragraph.startswith("#"):
            _add_sentence_to_list(paragraph, collected)
            continue
        last_position = len(paragraph) - 1
        start = 0
        for position, char in enumerate(paragraph):
            if position == last_position:
                # end of paragraph: flush the remainder
                _add_sentence_to_list(paragraph[start:], collected)
                break
            ends_with_period = char == "." and not paragraph[position + 1].isdigit()
            if ends_with_period or char in "!?":
                _add_sentence_to_list(paragraph[start : position + 1], collected)
                start = position + 1
    return collected


def get_words(text: str) -> List[str]:
    """
    Extract the content words of a text.
    A token is kept when it is neither a stopword nor punctuation and its
    part of speech is noun, adjective, verb, interjection or "X" (other)
    (based on the [universal POS tags](https://universaldependencies.org/u/pos/)).
    Kept tokens are stripped of line breaks, "*" and spaces, lower-cased,
    and dropped when they are in the exclusion list or shorter than
    3 characters.
    Args:
        text (str):
            Text to be processed.
    Returns:
        List[str]:
            List of words.
    """
    kept_pos_tags = ("NOUN", "ADJ", "VERB", "INTJ", "X")
    excluded_words = ("se", "há", "política", "político", "políticos", "políticas")

    nlp = spacy.load("de_core_news_lg")
    # raise the pipeline's max_length setting so long texts are accepted
    nlp.max_length = 8000000

    words = []
    for token in nlp(text):
        if token.is_stop or token.is_punct or token.pos_ not in kept_pos_tags:
            continue
        # remove new line and bold symbols
        cleaned = token.text.replace("\n", "").replace("*", "")
        if cleaned == "":
            continue
        cleaned = cleaned.replace(" ", "").lower()
        if cleaned in excluded_words or len(cleaned) <= 2:
            continue
        words.append(cleaned)
    return words


def get_topical_sentences(
    sentences, topics
) :
    """
    Get lists of sentences per topic, based on the presence of
    words that are a part of the topic.
    Matching is a case-insensitive substring test: both the sentence and
    the topic words are lower-cased before comparing, so a keyword written
    with a capital letter still matches.
    Args:
        sentences (List[str]):
            List of sentences to analyse.
        topics (Dict[str, List[str]]):
            Dictionary of words per topic.
    Returns:
        Dict[str, List[str]]:
            Dictionary of sentences per topic. Every topic is present, with
            an empty list when no sentence matched. A sentence can appear
            under several topics.
    """
    # Lower-case the keywords once up front; the sentence is lower-cased too,
    # so comparing against the raw keywords would miss capitalised ones.
    keywords_by_topic = {
        topic: [str(word).lower() for word in (words or [])]
        for topic, words in topics.items()
    }
    topical_sentences = {topic: [] for topic in topics}
    for sentence in sentences:
        lowered_sentence = sentence.lower()
        for topic, keywords in keywords_by_topic.items():
            if any(keyword in lowered_sentence for keyword in keywords):
                topical_sentences[topic].append(sentence)
    return topical_sentences


def get_sentiment(sentences):
    """
    Get the sentiment of a list of sentences.
    Each sentence is classified on its own and the top prediction is kept.
    Args:
        sentences (List[str]):
            List of sentences to analyse.
    Returns:
        pd.DataFrame:
            One row per sentence with the columns "label" ("positivo",
            "negativo" or "neutro"; missing when the model label is not
            recognised), "score" and "sentence".
    """
    sentiment_model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
    sentiment_task = pipeline(
        "sentiment-analysis", model=sentiment_model_path, tokenizer=sentiment_model_path
    )
    sentiments_dict = dict(label=[], score=[], sentence=[])
    for sentence in tqdm(sentences, desc="Sentiment analysis"):
        # the pipeline returns a list of predictions; keep the first one
        prediction = sentiment_task(sentence)[0]
        sentiments_dict["label"].append(prediction["label"])
        sentiments_dict["score"].append(prediction["score"])
        sentiments_dict["sentence"].append(sentence)
    sentiment_df = pd.DataFrame(sentiments_dict)
    # Translate case-insensitively so "Positive" and "positive" map alike.
    # NOTE(review): label casing may differ between model revisions -- confirm.
    label_translation = dict(positive="positivo", negative="negativo", neutral="neutro")
    sentiment_df["label"] = (
        sentiment_df["label"].astype(str).str.lower().map(label_translation)
    )
    return sentiment_df


def get_hate_speech(
    sentences, sentiment_df, label_col: str = "label"
):
    """
    Get the hate speech of a list of sentences.
    A sentence is labelled "ódio" only when the hate speech model flags it
    AND its sentiment is "negativo"; every other sentence is "neutro".
    Args:
        sentences (List[str]):
            List of sentences to analyse.
        sentiment_df (pd.DataFrame):
            Sentiment of the sentences. It is compared row by row with the
            hate speech predictions.
            NOTE(review): presumably expected to be the output of the
            sentiment step for the same sentences, in the same order and
            with the same index -- confirm.
        label_col (str):
            Column of the sentiment dataframe that contains the sentiment.
    Returns:
        pd.DataFrame:
            One row per sentence with the columns "label", "score" and
            "sentence".
    """
    # NOTE(review): this is the Portuguese checkpoint although other cells in
    # this notebook load German resources -- confirm the intended model.
    hate_model_path = "Hate-speech-CNERG/dehatebert-mono-portugese"
    hate_task = pipeline(
        "text-classification", model=hate_model_path, tokenizer=hate_model_path
    )
    hate_dict = dict(label=[], score=[], sentence=[])
    for sentence in tqdm(sentences, desc="Hate speech analysis"):
        # the pipeline returns a list of predictions; keep the first one
        prediction = hate_task(sentence)[0]
        hate_dict["label"].append(prediction["label"])
        hate_dict["score"].append(prediction["score"])
        hate_dict["sentence"].append(sentence)
    hate_df = pd.DataFrame(hate_dict)
    hate_df["label"] = hate_df.label.map(dict(HATE="ódio", NON_HATE="neutro"))
    # keep the hate label only where the sentiment is negative as well
    hate_condition = (hate_df.label == "ódio") & (sentiment_df[label_col] == "negativo")
    hate_df.loc[hate_condition, "label"] = "ódio"
    hate_df.loc[~hate_condition, "label"] = "neutro"
    # The result has to be returned; callers use it for plotting.
    return hate_df

In [4]:
from typing import Dict, List
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from plotly.graph_objects import Figure
import os
import pandas as pd



def get_word_cloud(
    words: List[str],
    max_words: int = 500,
    image_path: str = None,
    image_name: str = None,
    font_path: str = "/Library/Fonts/Arial Unicode.ttf",
):
    """
    Create a word cloud based on a set of words.
    The cloud is drawn in black on a white background on a new matplotlib
    figure, and saved to disk when both `image_path` and `image_name`
    are given.
    Args:
        words (List[str]):
            List of words to be included in the word cloud.
        max_words (int):
            Maximum number of words to be included in the word cloud.
        image_path (str):
            Path to the image file where to save the word cloud.
        image_name (str):
            Name of the image where to save the word cloud.
        font_path (str):
            Font file used to render the words. When the file does not
            exist (the default path is macOS specific), the word cloud
            library's own default font is used instead.
    """
    # change the value to black
    def black_color_func(
        word, font_size, position, orientation, random_state=None, **kwargs
    ):
        return "hsl(0,100%, 1%)"

    # fall back to the library default rather than failing on machines
    # that do not have the requested font installed
    if font_path is not None and not os.path.isfile(font_path):
        font_path = None

    # set the wordcloud background color to white
    # set width and height to higher quality, 3000 x 2000
    wordcloud = WordCloud(
        font_path=font_path,
        background_color="white",
        width=3000,
        height=2000,
        max_words=max_words,
    ).generate(" ".join(words))
    # set the word color to black
    wordcloud.recolor(color_func=black_color_func)
    # set the figsize
    plt.figure(figsize=[15, 10])
    # plot the wordcloud
    plt.imshow(wordcloud, interpolation="bilinear")
    # remove plot axes
    plt.axis("off")
    if image_path is not None and image_name is not None:
        # save the image
        plt.savefig(os.path.join(image_path, image_name), bbox_inches="tight")


def plot_topical_presence(
    sentences: List[str],
    topics: Dict[str, List[str]],
    title: str = None,
    color: str = "blue",
    height: int = 300,
) :
    """
    Draw a horizontal bar chart with the share of sentences per topic.
    The share of a topic is the number of sentences matched to it divided
    by the total number of sentences, in percent.
    Args:
        sentences (List[str]):
            List of sentences to analyse.
        topics (Dict[str, List[str]]):
            Dictionary of words per topic.
        title (str):
            Title of the plot.
        color (str):
            Color of the bars in the plot.
        height (int):
            Height of the plot.
    Returns:
        Figure:
            Plotly figure with the percentage of sentences per topic.
    """
    matched_by_topic = get_topical_sentences(sentences, topics)
    counts = {topic: len(matched) for topic, matched in matched_by_topic.items()}
    # one row per topic, a single "sentence_count" column
    presence = pd.DataFrame(counts, index=["sentence_count"]).T
    presence["sentence_percentage"] = presence["sentence_count"] / len(sentences) * 100
    presence.index.name = "topic"
    presence = presence.sort_index()

    fig = px.bar(presence, x="sentence_percentage", orientation="h")
    layout_options = {
        "title": title,
        "xaxis_title": "Percentagem de frases topicais no texto",
        "yaxis_title": "Tópico",
        "yaxis": {"categoryorder": "category descending"},
        "margin": {"l": 0, "r": 0, "b": 0, "t": 0, "pad": 0},
        "height": height,
    }
    fig.update_layout(**layout_options)
    fig.update_traces(marker_color=color)
    return fig


def plot_approaches(
    sentences: List[str],
    approaches: Dict[str, List[str]],
    title: str = None,
    height: int = 300,
):
    """
    Plot the approaches taken to language and policy.
    Draws one stacked horizontal bar showing how the sentences matched to
    any approach split between "rationality" and "intuition", in percent.
    When no sentence matches any approach, both shares are 0.
    Args:
        sentences (List[str]):
            List of sentences to analyse.
        approaches (Dict[str, List[str]]):
            Dictionary of words per approach. Must contain the keys
            "rationality" and "intuition".
        title (str):
            Title of the plot.
        height (int):
            Height of the plot.
    Returns:
        Figure:
            Plotly figure with the percentage of sentences per approach.
    Raises:
        KeyError:
            If "rationality" or "intuition" is missing from `approaches`.
    """
    approach_sentences = get_topical_sentences(sentences, approaches)
    total_num_sentences_in_approaches = sum(
        len(matched) for matched in approach_sentences.values()
    )
    approach_sentence_count = dict()
    for approach in approaches:
        if total_num_sentences_in_approaches == 0:
            # nothing matched at all: report 0% instead of dividing by zero
            approach_sentence_count[approach] = 0.0
        else:
            approach_sentence_count[approach] = (
                len(approach_sentences[approach])
                / total_num_sentences_in_approaches
                * 100
            )
    fig = go.Figure()
    # (key in `approaches`, displayed name, bar color)
    bar_specs = (
        ("rationality", "racionalidade", "green"),
        ("intuition", "intuição", "red"),
    )
    for approach_key, display_name, bar_color in bar_specs:
        fig.add_trace(
            go.Bar(
                x=[approach_sentence_count[approach_key]],
                name=display_name,
                orientation="h",
                marker=dict(color=bar_color),
                hovertemplate=display_name + ": %{x:.1f}%<extra></extra>",
            )
        )
    fig.update_layout(
        title=title,
        barmode="stack",
        xaxis=dict(
            showgrid=False,  # thin lines in the background
            zeroline=False,  # thick line at x=0
            visible=False,  # numbers below
        ),
        yaxis=dict(
            showgrid=False,  # thin lines in the background
            zeroline=False,  # thick line at y=0
            visible=False,  # labels on the side
        ),
        margin=dict(l=0, r=0, b=0, t=0, pad=0),
        height=height,
    )
    return fig


def plot_sentiment(
    df: pd.DataFrame, title: str = None, height: int = 300, label_col: str = "label"
):
    """
    Plot the predicted sentiment of the sentences.
    Draws one bar per sentiment ("neutro", "positivo", "negativo") with the
    percentage of sentences carrying it. A sentiment that does not occur in
    `df` is drawn as 0%.
    Args:
        df (pd.DataFrame):
            Dataframe with the outputs of a sentiment analysis model.
        title (str):
            Title of the plot.
        height (int):
            Height of the plot.
        label_col (str):
            Column name of the sentiment.
    Returns:
        Figure:
            Plotly figure with the percentage of sentences per sentiment.
    """
    sentiments_count = get_counts(df, label_col=label_col)
    # Look percentages up by label so an absent label yields 0 instead of
    # failing on the conversion of an empty selection to float.
    percent_by_label = dict(
        zip(sentiments_count[label_col], sentiments_count["percent"])
    )
    labels_order = ["neutro", "positivo", "negativo"]
    fig = px.bar(
        x=labels_order,
        y=[float(percent_by_label.get(label, 0.0)) for label in labels_order],
        title=title,
    )
    fig.update_traces(
        marker_color=["gray", "green", "red"],  # same order as labels_order
        hovertemplate="%{y:.1f}%<extra></extra>",
    )
    fig.update_layout(
        xaxis_title="Sentimento",
        yaxis_title="Percentagem de frases",
        margin=dict(l=0, r=0, b=0, t=0, pad=0),
        height=height,
    )
    return fig


def plot_hate_speech(
    df: pd.DataFrame, title: str = None, height: int = 300, label_col: str = "label"
) :
    """
    Display the share of sentences labelled as hate speech as a single number.
    The share is the "percent" value of the "ódio" label in the label
    counts of `df`, or 0 when that label does not occur.
    Args:
        df (pd.DataFrame):
            Dataframe with the outputs of a hate speech model.
        title (str):
            Title of the plot.
        height (int):
            Height of the plot.
        label_col (str):
            Column name of the hate speech.
    Returns:
        Figure:
            Plotly figure with the percentage of hate speech.
    """
    label_counts = get_counts(df, label_col=label_col)
    hate_rows = label_counts[label_counts[label_col] == "ódio"]
    hate_percentages = hate_rows.percent.values
    # no "ódio" row means no hate speech was found
    hate_percent = hate_percentages[0] if len(hate_percentages) > 0 else 0

    indicator = go.Indicator(
        mode="number",
        value=hate_percent,
        title=title,
        number={"suffix": "%", "valueformat": ".2f"},
        delta={"position": "top", "reference": 320},
        domain={"x": [0, 1], "y": [0, 1]},
    )
    fig = go.Figure(indicator)
    fig.update_layout(
        paper_bgcolor="darkred",
        font_color="white",
        margin={"l": 0, "r": 0, "b": 0, "t": 0, "pad": 0},
        height=height,
    )
    return fig

In [11]:
import os
import sys
from ipywidgets import interact
import plotly.io as pio
from string import punctuation

In [12]:
# Load the topic definitions from YAML and list the topic names.
# NOTE(review): the path is relative to the working directory and does not use DATA_DIR -- confirm.
topics = load_yaml_file('topic_modeling/topic_g.yml')
topics.keys()


dict_keys(['wirtschaft', 'Klima', 'Bildung', 'Gesundheit', 'Wissenschaft', 'soziale Ursachen', 'Politik und Ideologie', 'Infrastruktur'])

In [None]:
# Preview the first ten keywords of the 'wirtschaft' topic.
topics['wirtschaft'][0:10]

['Wirtschaft',
 'Volkswirtschaft',
 'Marktwirtschaft',
 'Planwirtschaft',
 'Soziale Marktwirtschaft',
 'Kapitalismus',
 'Kommunismus',
 'Sozialismus',
 'BIP (Bruttoinlandsprodukt)',
 'BIP pro Kopf']

In [None]:
# Load the approach definitions from DATA_DIR/<data_name>/approaches.yml and list their names.
# NOTE(review): `data_name` is not defined in any cell visible here; this cell
# fails on a fresh kernel unless it is defined elsewhere -- confirm.
approaches = load_yaml_file(os.path.join(DATA_DIR, data_name, "approaches.yml"))
approaches.keys()