In [1]:
import os

# Directory that holds the raw program texts, one ``<name>.txt`` file each.
DATA_DIR = 'raw'

# Base names (without extension) of the text files to analyse.
# Only ``.txt`` entries are kept: blindly chopping four characters off every
# directory entry turned things like ``.ipynb_checkpoints`` or ``notes.md``
# into bogus names that failed later when ``raw/<name>.txt`` was opened.
# ``.DS_Store`` is excluded by the same suffix check. Sorting makes the
# processing order reproducible, since ``os.listdir`` order is arbitrary.
data_names = sorted(
    name[: -len(".txt")]
    for name in os.listdir(DATA_DIR)
    if name.endswith(".txt") and name != ".txt"
)

In [2]:
from string import punctuation
import re
from transformers import pipeline, AutoTokenizer
import pandas as pd
from tqdm import tqdm



def load_markdown_file(file_path):
    """
    Read a text file and return its full content.
    Args:
        file_path (str):
            Path of the file to read; it is decoded as UTF-8.
    Returns:
        str:
            The complete content of the file.
    """
    with open(file_path, mode="r", encoding='UTF-8') as handle:
        return handle.read()

def _add_sentence_to_list(sentence: str, sentences_list):
    """
    Add a sentence to the list of sentences.
    Args:
        sentence (str):
            Sentence to be added.
        sentences (List[str]):
            List of sentences.
    """
    while sentence.startswith(" "):
        # remove leading space
        sentence = sentence[1:]
    if all(c in punctuation for c in sentence) or len(sentence) == 1:
        # skip sentences with only punctuation
        return
    sentences_list.append(sentence)

def get_sentences(text: str):
    """
    Split a text into sentence-like fragments.

    The text is first normalised (line breaks and numbers are removed) and
    then split at every '.', '?' or '!'. The terminating punctuation is not
    part of the returned fragments.
    Args:
        text (str):
            Text to be processed.
    Returns:
        List[str]:
            Fragments in order of appearance, without leading spaces.
            Fragments that are empty, a single character, or made up only of
            punctuation are dropped (see ``_add_sentence_to_list``).
    """
    # Raw strings are used for all patterns: the previous non-raw literals
    # relied on invalid escape sequences such as "\d", which emit a warning
    # on current Python versions and are slated to become an error.

    # a number at the end of a line becomes a sentence break
    text = re.sub(r" \d+\n", ".", text)
    # a number at the start of a line becomes a space
    text = re.sub(r"\n\d+", " ", text)
    # remaining line breaks become spaces
    text = re.sub(r"\n", " ", text)
    # NOTE(review): the "." is unescaped, so this removes every run of digits
    # together with whatever single character follows it (not only "1."-style
    # markers). Behaviour is kept as is; confirm whether r"\d+\." was intended.
    text = re.sub(r"\d+.", "", text)

    # split at sentence punctuation, swallowing closing quotes/brackets and
    # the spaces around the punctuation mark
    fragments = re.split(r' *[\.\?!][\'"\)\]]* *', text)

    sentences = list()
    for fragment in fragments:
        if fragment == "":
            continue
        # The split above consumes every '.', '?' and '!', so a fragment can
        # never contain another sentence boundary. The former per-character
        # scan (and its special case for '#' headings) therefore always added
        # the whole fragment exactly once, which is what happens here.
        _add_sentence_to_list(fragment, sentences)
    return sentences

def get_hate_speech(
    sentences, sentiment_df, label_col: str = "label"
):
    """
    Classify sentences as hate speech and combine the result with sentiment.

    A sentence is labelled "negativ" only if the hate-speech model flags it
    as HATE and the matching row of ``sentiment_df`` is "negativ" as well;
    every other sentence is labelled "neutral".
    Args:
        sentences (List[str]):
            Sentences to classify, one model call per sentence.
        sentiment_df (pd.DataFrame):
            Sentiment results for the same sentences; rows are matched to
            the hate-speech results by index.
        label_col (str):
            Column of ``sentiment_df`` that holds the sentiment label.
    Returns:
        pd.DataFrame:
            Columns "label", "score" and "sentence", one row per sentence.
    """
    hate_model_path = "Hate-speech-CNERG/dehatebert-mono-german"
    classifier = pipeline(
        "text-classification", model=hate_model_path, tokenizer=hate_model_path
    )

    predictions = []
    for sentence in tqdm(sentences, desc="Hate speech analysis"):
        predictions.append(classifier(sentence))

    # only the first entry of each pipeline result is used
    hate_df = pd.DataFrame(
        {
            "label": [prediction[0]["label"] for prediction in predictions],
            "score": [prediction[0]["score"] for prediction in predictions],
            "sentence": [sentences[pos] for pos in range(len(predictions))],
        }
    )

    # translate the model labels into the label vocabulary used for sentiment
    label_translation = {"HATE": "negativ", "NON_HATE": "neutral"}
    hate_df["label"] = hate_df.label.map(label_translation)

    # keep "negativ" only where hate speech and negative sentiment coincide
    flagged_by_both = (hate_df.label == "negativ") & (
        sentiment_df[label_col] == "negativ"
    )
    hate_df.loc[flagged_by_both, "label"] = "negativ"
    hate_df.loc[~flagged_by_both, "label"] = "neutral"
    return hate_df

def get_sentiment(sentences):
    """
    Run a pretrained sentiment model over a list of sentences.
    Args:
        sentences (List[str]):
            Sentences to classify, one model call per sentence.
    Returns:
        pd.DataFrame:
            Columns "label", "score" and "sentence", one row per sentence.
            The model labels positive/negative/neutral are translated to
            "positiv"/"negativ"/"neutral".
    """
    sentiment_model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
    # the slow (non-fast) tokenizer is requested explicitly
    slow_tokenizer = AutoTokenizer.from_pretrained(
        sentiment_model_path, use_fast=False
    )
    classifier = pipeline(
        "sentiment-analysis", model=sentiment_model_path, tokenizer=slow_tokenizer
    )

    predictions = []
    for sentence in tqdm(sentences, desc="Sentiment analysis"):
        predictions.append(classifier(sentence))

    # only the first entry of each pipeline result is used
    sentiment_df = pd.DataFrame(
        {
            "label": [prediction[0]["label"] for prediction in predictions],
            "score": [prediction[0]["score"] for prediction in predictions],
            "sentence": [sentences[pos] for pos in range(len(predictions))],
        }
    )

    label_translation = {
        "positive": "positiv",
        "negative": "negativ",
        "neutral": "neutral",
    }
    sentiment_df["label"] = sentiment_df.label.map(label_translation)
    return sentiment_df

In [3]:
# Create the output directory up front: without it, to_csv would only fail
# after the (slow) model inference for the first file has already finished.
os.makedirs("data", exist_ok=True)

# Run sentiment and hate-speech analysis for every input text and store the
# results as '§'-separated CSV files. Empty names are skipped.
for element in filter(None, data_names):
    # read from DATA_DIR instead of repeating the directory name literally
    program_txt = load_markdown_file(os.path.join(DATA_DIR, f"{element}.txt"))
    sentences = get_sentences(program_txt)
    sentiment_df1 = get_sentiment(sentences)
    # the hate-speech labels are combined with the sentiment labels
    hate_df1 = get_hate_speech(sentences, sentiment_df1, label_col="label")
    sentiment_df1.to_csv(f"data/{element}_sentiment.csv", sep='§')
    hate_df1.to_csv(f"data/{element}_hate.csv", sep='§')


Sentiment analysis: 100%|██████████| 4880/4880 [04:46<00:00, 17.05it/s]


Downloading:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/669M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/152 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]