In [None]:
import os

# Directory holding the raw documents (one "<name>.txt" file per document).
DATA_DIR = 'raw'
# Document names are the file names without their ".txt" suffix. Only ".txt"
# entries are kept: the loader later re-appends exactly that suffix, so any
# other entry (".DS_Store", sub-directories, files with another extension)
# would be mangled by the 4-character slice and fail to load.
# Sorted so the processing order does not depend on the file system.
data_names = sorted(
    name[:-4] for name in os.listdir(DATA_DIR) if name.endswith(".txt")
)

In [None]:
import yaml
from tqdm import tqdm

def load_yaml_file(file_path):
    """
    Parse a YAML file with the safe loader.
    Args:
        file_path (str):
            Path of the YAML file; it is opened as UTF-8 text.
    Returns:
        The parsed YAML content. For the topic files used here this is
        presumably a dict whose keys are topics and whose values are lists
        of elements (verify against the YAML file).
    """
    with open(file_path, mode="r", encoding='UTF-8') as yaml_stream:
        parsed_content = yaml.safe_load(yaml_stream)
    return parsed_content

# Load the topic definitions from the YAML file (expected: topic -> list of elements).
topics = load_yaml_file('topic_modeling/topic_g.yml')

In [None]:
from string import punctuation
import re
import pandas as pd


def load_markdown_file(file_path):
    """
    Read a text file and return its complete content.
    Args:
        file_path (str):
            Path of the file; it is opened as UTF-8 text.
    Returns:
        str:
            Everything the file contains.
    """
    with open(file_path, mode="r", encoding='UTF-8') as text_stream:
        content = text_stream.read()
    return content

def _add_sentence_to_list(sentence: str, sentences_list):
    """
    Add a sentence to the list of sentences.
    Args:
        sentence (str):
            Sentence to be added.
        sentences (List[str]):
            List of sentences.
    """
    while sentence.startswith(" "):
        # remove leading space
        sentence = sentence[1:]
    if all(c in punctuation for c in sentence) or len(sentence) == 1:
        # skip sentences with only punctuation
        return
    sentences_list.append(sentence)

def get_sentences(text: str):
    """
    Split a raw text into sentence-like fragments.

    The text is normalised first (numbers at line ends become sentence
    breaks, line breaks become spaces, remaining numbers are dropped) and is
    then split at ".", "?" and "!". The terminators themselves are not part
    of the returned fragments. Fragments starting with "#" get no special
    treatment; they are added like any other fragment.
    Args:
        text (str):
            Text to be processed.
    Returns:
        List[str]:
            List of sentences, filtered by ``_add_sentence_to_list``.
    """
    # Raw strings: "\d" in a normal literal is an invalid escape sequence.
    # A number closing a line (presumably a page/footnote number) ends a sentence.
    text = re.sub(r" \d+\n", ".", text)
    # A number opening a line is replaced by a space.
    text = re.sub(r"\n\d+", " ", text)
    text = text.replace("\n", " ")
    # NOTE(review): the "." is unescaped, so every run of digits is removed
    # together with the ONE character after it (e.g. "2020." loses its full
    # stop). Kept as-is to preserve the output; confirm whether r"\d+\."
    # (numbered-list markers only) was intended.
    text = re.sub(r"\d+.", "", text)
    # Split at terminators, swallowing closing quotes/brackets and spaces.
    fragments = re.split(r' *[\.\?!][\'"\)\]]* *', text)
    sentences = list()
    for fragment in fragments:
        if fragment == "":
            continue
        # The split removed every ".", "?" and "!", so a fragment can never
        # contain a further sentence boundary: it is added as one sentence.
        _add_sentence_to_list(fragment, sentences)
    return sentences



def get_topical_sentences(
    sentences, topics, df_y=0
):
    """
    Assign every sentence to the topic whose keywords it matches most.

    A keyword counts once per sentence when its string form occurs as a
    substring of the lower-cased sentence. Ties go to the topic that comes
    first in ``topics``.
    NOTE(review): a sentence that matches no keyword at all is therefore
    also assigned to the first topic; confirm that this is intended.
    Args:
        sentences (Iterable[str]):
            Sentences to classify.
        topics (Mapping):
            Maps each topic to an iterable of keywords.
        df_y:
            If truthy, return a DataFrame instead of a dict.
    Returns:
        dict or pandas.DataFrame:
            By default a dict mapping each topic to its list of sentences;
            with ``df_y`` a DataFrame with the columns 'text' and 'topic',
            one row per sentence in input order.
    Raises:
        ValueError:
            If ``topics`` is empty while there are sentences to classify.
    """
    topic_names = list(topics)
    topical_sentences = {name: list() for name in topic_names}
    rows = []
    for sentence in sentences:
        # lower-case once per sentence instead of once per keyword
        lowered = sentence.lower()
        hits = [
            sum(1 for topical_word in topics[name] if str(topical_word) in lowered)
            for name in topic_names
        ]
        # index() returns the first maximum, i.e. the earliest topic on a tie
        best_topic = topic_names[hits.index(max(hits))]
        topical_sentences[best_topic].append(sentence)
        if df_y:
            rows.append([sentence, best_topic])
    if df_y:
        return pd.DataFrame(data=rows, columns=['text', 'topic'])

    return topical_sentences


In [None]:
!python -m spacy download de_core_news_lg

In [None]:
import spacy

# Shared pipeline used by `text_lemma`; the parser, NER and tagger components
# are switched off at load time.
nlp = spacy.load("de_core_news_lg",disable=['parser', 'ner','tagger'])

def text_lemma(lsit):
    """
    Lemmatise a text with the shared ``nlp`` pipeline.
    Args:
        lsit (str):
            Text to be processed.
    Returns:
        str:
            Lower-cased lemmas of all tokens that are neither stop words
            nor punctuation, joined by single spaces.
    """
    kept_lemmas = [
        token.lemma_.lower()
        for token in nlp(lsit)
        if not (token.is_stop or token.is_punct)
    ]
    return ' '.join(kept_lemmas)


# Cache for the full (no component disabled) pipeline used by `list_lemma`.
_FULL_NLP = None


def _get_full_nlp():
    """Load the full "de_core_news_lg" pipeline on first use and reuse it afterwards."""
    global _FULL_NLP
    if _FULL_NLP is None:
        _FULL_NLP = spacy.load("de_core_news_lg")
    return _FULL_NLP


def list_lemma(lsit):
    """
    Lemmatise a list of entries and return the unique lemmas.

    All entries are joined with spaces and processed as ONE text, so an
    entry made of several words contributes several lemmas.
    Args:
        lsit (Iterable):
            Entries to lemmatise; each one is converted with ``str`` first,
            so non-string entries (e.g. numbers) are accepted.
    Returns:
        List[str]:
            Lower-cased lemmas of all tokens that are neither stop words nor
            punctuation, without duplicates, in order of first appearance.
    """
    joined = ' '.join(str(item) for item in lsit)
    # The model is loaded once and cached instead of once per call.
    doc = _get_full_nlp()(joined)
    lemmas = [
        token.lemma_.lower()
        for token in tqdm(doc)
        if not token.is_stop and not token.is_punct
    ]
    # dict.fromkeys de-duplicates while keeping the first-seen order
    return list(dict.fromkeys(lemmas))
# Reload the topic definitions from the YAML file before lemmatising them, so
# re-running this cell never lemmatises already-lemmatised lists.
topics = load_yaml_file('topic_modeling/topic_g.yml')

# Replace each topic's entries with the unique, lower-cased lemmas from `list_lemma`.
for topic in topics.keys():
    topics[topic]=list_lemma(topics[topic])

In [None]:
def automated_topical_sentences(data_names, topics, data_dir=None):
    """
    Classify the sentences of every document and write one CSV per document.

    For each name the file "<data_dir>/<name>.txt" is read, split into
    sentences, lemmatised and classified by topic. The result is written to
    "<name>_topic_class.csv" in the current working directory.
    Args:
        data_names (Iterable[str]):
            Document names without the ".txt" suffix; empty names are skipped.
        topics (Mapping):
            Maps each topic to its keywords.
        data_dir (str, optional):
            Directory containing the ".txt" files. Defaults to ``DATA_DIR``.
    """
    if data_dir is None:
        # Resolved at call time so a changed DATA_DIR is picked up.
        data_dir = DATA_DIR
    for element in data_names:
        if not element:
            continue
        program_txt = load_markdown_file(f"{data_dir}/{element}.txt")
        sentences = get_sentences(program_txt)
        lemmatised = [text_lemma(sentence) for sentence in tqdm(sentences)]
        topic_frame = get_topical_sentences(lemmatised, topics, df_y=1)
        topic_frame.to_csv(f'{element}_topic_class.csv', index=True)

In [None]:
# Classify every document listed in `data_names`; writes one CSV per document.
automated_topical_sentences(data_names, topics)