In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import nltk
import re

import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
# Read the whole file as text. The encoding is given explicitly because the
# content is split on the non-ASCII character '§' below; relying on the
# platform default encoding would make the result machine-dependent.
# NOTE(review): assumes the file was saved as UTF-8 -- confirm.
with open('data/DIE_GRUENEN_Wahlprogramm_2021_sent_lemma_list.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# Split the raw text into its '§'-delimited segments.
sent = content.split('§')

In [7]:
# Number of '§'-delimited segments held in `sent`.
len(sent)

14956

In [5]:
def init_data(path: str, rename_dict, drop_list=None, normalize_cols=None):
    """Load a CSV file and return a cleaned, text-normalized DataFrame.

    Processing steps, in order:
      1. read the CSV at ``path``;
      2. rename columns according to ``rename_dict``;
      3. drop the columns listed in ``drop_list``;
      4. drop every row that contains a missing value;
      5. renumber the index from 0;
      6. lower-case the text columns and replace every literal
         backslash-n character pair inside them with a period.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    rename_dict : dict
        Mapping ``old column name -> new column name``.
    drop_list : list-like, optional
        Column names (after renaming) to remove. ``None`` drops nothing.
    normalize_cols : list-like, optional
        Columns to normalize. ``None`` (default) normalizes every column
        that holds strings.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with a fresh 0..n-1 index.
    """

    def normalize(data, columns):
        for col in columns:
            # `.str.replace` substitutes inside each string; plain
            # `Series.replace` would only match cells whose entire value
            # equals the pattern. `regex=False` keeps the backslash literal.
            data[col] = (
                data[col]
                .str.lower()
                .str.replace("\\n", ".", regex=False)
            )
        return data

    data = pd.read_csv(path)
    data = data.rename(columns=rename_dict)
    if drop_list is not None:
        data = data.drop(columns=drop_list)
    data = data.dropna().reset_index(drop=True)

    if normalize_cols is None:
        # Default to all string-typed columns so numeric columns are left alone.
        normalize_cols = [
            c for c in data.columns if pd.api.types.is_string_dtype(data[c])
        ]

    return normalize(data, normalize_cols)

In [12]:
def sentence_tokenization(data, col, language="english"):
    """Split the text in ``data[col]`` into sentences and count them.

    Two columns are added to ``data`` (the frame is modified in place and
    also returned):

    * ``<col>_sent_tok``    -- list of sentences for each row
    * ``<col>_num_of_sent`` -- number of sentences for each row

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column.
    col : str
        Name of the column containing raw text.
    language : str, optional
        Language passed to ``nltk.sent_tokenize``. The default
        ``"english"`` is NLTK's own default.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the two new columns.
    """
    # This file does `import tqdm`, which binds the module; the progress-bar
    # callable is `tqdm.tqdm`.
    sentences = [
        nltk.sent_tokenize(entry, language=language)
        for entry in tqdm.tqdm(data[col])
    ]

    # Insert results to dataframe
    data[col + "_sent_tok"] = sentences
    data[col + "_num_of_sent"] = [len(row) for row in sentences]

    return data

In [10]:
# Quick check of nltk.sent_tokenize on two short example texts.
demo_texts = (
    "Hallo, ich bin Niklas. Wer genau bist du? Wie kommst du hier her? Ich hoffe, es geht dir gut!",
    "Ja hi, mir gehts gut! Und dir?",
)
for entry in demo_texts:
    # One list of sentences is printed per example text.
    sentence = nltk.sent_tokenize(entry)
    print(sentence)

['Hallo, ich bin Niklas.', 'Wer genau bist du?', 'Wie kommst du hier her?', 'Ich hoffe, es geht dir gut!']
['Ja hi, mir gehts gut!', 'Und dir?']


In [13]:
def word_tokenization(data, col):
    """Split the text in ``data[col]`` into word tokens and count them.

    Tokens are the matches of the regular expression ``\\w+``, so
    punctuation and whitespace are discarded. Two columns are added to
    ``data`` (the frame is modified in place and also returned):

    * ``<col>_word_tok``    -- list of word tokens for each row
    * ``<col>_num_of_word`` -- number of tokens for each row

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column.
    col : str
        Name of the column containing raw text.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the two new columns.
    """
    # Iterate over the column's values rather than `data[col][i]`, which is a
    # label lookup and fails when the index is not 0..n-1.
    # This file does `import tqdm`, so the progress-bar callable is `tqdm.tqdm`.
    words = [
        nltk.regexp_tokenize(text, pattern=r'\w+')
        for text in tqdm.tqdm(data[col])
    ]

    # Insert results to dataframe
    data[col + "_word_tok"] = words
    data[col + "_num_of_word"] = [len(tokens) for tokens in words]

    return data

In [14]:
def stemming_lemming(data, col):
    """Stem and lemmatize every token in ``data[col]``.

    Each cell of ``data[col]`` is iterated element by element, so it is
    expected to hold a list of word tokens (a plain string would be
    processed character by character). Two columns are added to ``data``
    (the frame is modified in place and also returned):

    * ``<col>_stems`` -- Porter stem of each token
    * ``<col>_lemms`` -- WordNet lemma of each token

    NOTE(review): NLTK's Porter stemmer and WordNet lemmatizer target
    English; confirm they are appropriate for the text being processed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the token column.
    col : str
        Name of the column containing token lists.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the two new columns.
    """
    # Initialize Stemmer and Lemmatizer
    pst = nltk.PorterStemmer()
    wlem = nltk.WordNetLemmatizer()

    # Lists to store results
    stems = []
    lemms = []

    # Iterate over the column's values rather than `data[col][i]`, which is a
    # label lookup and fails when the index is not 0..n-1.
    # This file does `import tqdm`, so the progress-bar callable is `tqdm.tqdm`.
    for tokens in tqdm.tqdm(data[col]):
        stems.append([pst.stem(tok) for tok in tokens])
        lemms.append([wlem.lemmatize(tok) for tok in tokens])

    # Insert results to dataframe
    data[col + "_stems"] = stems
    data[col + "_lemms"] = lemms

    return data

In [15]:
def stopwords_count_and_removal(data, col, language="english"):
    """Count and remove stopwords from the token lists in ``data[col]``.

    Each cell of ``data[col]`` is iterated element by element, so it is
    expected to hold a list of word tokens. Matching is exact and
    case-sensitive. Two columns are added to ``data`` (the frame is
    modified in place and also returned):

    * ``<col>_num_of_stopwords``  -- number of stopword tokens for each row
    * ``<col>_without_stopwords`` -- remaining tokens, original order kept

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the token column.
    col : str
        Name of the column containing token lists.
    language : str, optional
        Language of the NLTK stopword list to use (default ``"english"``).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the two new columns.
    """
    # The stopword corpus reader is `nltk.corpus.stopwords`; the top-level
    # `nltk` package has no `stopwords` attribute. A set gives constant-time
    # membership tests in the loop below.
    stoplist = set(nltk.corpus.stopwords.words(language))

    number_of_stopwords = []
    text_without_stopwords = []

    # Iterate over the column's values rather than `data[col][i]`, which is a
    # label lookup and fails when the index is not 0..n-1.
    # This file does `import tqdm`, so the progress-bar callable is `tqdm.tqdm`.
    for tokens in tqdm.tqdm(data[col]):
        kept = [word for word in tokens if word not in stoplist]
        text_without_stopwords.append(kept)
        number_of_stopwords.append(len(tokens) - len(kept))

    data[col + "_num_of_stopwords"] = number_of_stopwords
    data[col + "_without_stopwords"] = text_without_stopwords

    return data