In [None]:
import spacy
import collections
from collections import defaultdict
import csv
import glob
import pandas as pd
from os import path

# Stripped-down pretrained German spaCy pipeline: "tok2vec", "ner", "parser" and
# "attribute_ruler" are excluded at load time because the pipeline is only used
# for tokenizing and lemmatizing in this notebook.
# NOTE(review): the components that remain are presumably tokenizer, tagger,
# morphologizer and lemmatizer -- confirm with `nlp.pipe_names` for the installed
# model version, and verify that they still work as intended without "tok2vec".
nlp = spacy.load(
    "de_core_news_lg", exclude=["tok2vec", "ner", "parser", "attribute_ruler"]
)

In [None]:
def create_vocab(file_name, mode=False):
    """Build a frequency vocabulary from a CSV file of tweets and write it to disk.

    The third column of ``file_name`` (which must be named ``text``) is run
    through the module-level spaCy pipeline ``nlp``. Every token (or lemma) is
    counted, entries seen fewer than 3 times are dropped, and the result is
    written to ``<input stem>_vocab_token.csv`` / ``<input stem>_vocab_lemma.csv``
    in the current working directory with the columns ``token``, ``count`` and
    ``rank`` (rank 1 = most frequent).

    Parameters
    ----------
    file_name : str
        Path of the CSV file containing the tweets.
    mode : bool, default False
        If truthy, count lemmas (``token.lemma_``) instead of surface forms
        (``token.text``).

    Returns
    -------
    str
        A status message naming the written file and the vocabulary size.
    """
    # dtype=str + keep_default_na=False: an empty text cell stays "" instead of
    # becoming NaN. Previously str(NaN) was tokenized and every empty tweet was
    # counted as the bogus token "nan".
    tweets = pd.read_csv(
        file_name,
        sep=",",
        quoting=csv.QUOTE_NONE,
        usecols=[2],
        dtype=str,
        keep_default_na=False,
    )

    # The only differences between token mode and lemma mode are the attribute
    # that is counted and the suffix of the output file.
    if mode:
        attribute, suffix = "lemma_", "_vocab_lemma.csv"
    else:
        attribute, suffix = "text", "_vocab_token.csv"
    # splitext strips the real extension instead of assuming it is 4 characters.
    title = path.splitext(path.basename(file_name))[0] + suffix

    # nlp.pipe processes the texts in batches; the resulting docs are the same
    # as calling nlp() on each text individually.
    vocab_count = defaultdict(int)
    for doc in nlp.pipe(tweets["text"]):
        for token in doc:
            vocab_count[getattr(token, attribute)] += 1

    # sort by descending count (stable: ties keep first-seen order) and keep
    # only entries that occur at least 3 times
    vocab_count = {
        k: v
        for k, v in sorted(vocab_count.items(), key=lambda item: item[1], reverse=True)
        if v > 2
    }

    # adding rank-column
    df = pd.DataFrame.from_dict(data=vocab_count, orient="index", columns=["count"])
    df["rank"] = range(1, len(df) + 1)

    # writing to file
    df.to_csv(title, header=True, index_label="token")
    return "{title} create with {count} words.".format(
        title=title, count=len(vocab_count)
    )

In [None]:
# Build both vocabulary variants (surface tokens first, then lemmas) for every
# cleaned party file and print the status message of each run.
for party in glob.glob("../cleaned-data/*.csv"):
    for use_lemma in (False, True):
        print(create_vocab(party, use_lemma))

In [None]:
def _read_vocab_counts(file_name):
    """Read a vocab CSV (header + ``token,count,...`` rows) into a ``{token: count}`` dict."""
    # DataFrame.to_csv writes UTF-8 by default, so read with the same encoding
    # rather than the platform default; newline="" is what the csv module
    # expects so that quoted fields are parsed correctly.
    with open(file_name, mode="r", encoding="utf-8", newline="") as infile:
        reader = csv.reader(infile)
        # skip the header line (tolerates an empty file)
        next(reader, None)
        # `if row` skips blank lines, which csv.reader yields as empty lists
        return {row[0]: int(row[1]) for row in reader if row}


def create_vocab_union(mode=False):
    """Merge the CDU and CSU vocab files into one "union" vocab file.

    Reads ``CDU_vocab_<kind>.csv`` and ``CSU_vocab_<kind>.csv`` from the current
    working directory (the files produced by ``create_vocab``), sums the counts
    per entry, keeps entries with a total count of at least 3 and writes
    ``union_vocab_<kind>.csv`` with the columns ``token``, ``count``, ``rank``.

    Parameters
    ----------
    mode : bool, default False
        If truthy, merge the lemma vocabularies (``<kind>`` = ``lemma``),
        otherwise the token vocabularies (``<kind>`` = ``token``).

    Returns
    -------
    str
        A status message naming the written file and the vocabulary size.
    """
    kind = "lemma" if mode else "token"
    title = "union_vocab_{kind}.csv".format(kind=kind)

    # merging: CDU first, then CSU, so ties keep the CDU file's ordering
    vocab_count = defaultdict(int)
    for party in ("CDU", "CSU"):
        party_counts = _read_vocab_counts(
            "{party}_vocab_{kind}.csv".format(party=party, kind=kind)
        )
        for word, count in party_counts.items():
            vocab_count[word] += count

    # sorting by value and filter for min_word_count >= 3
    vocab_count = {
        k: v
        for k, v in sorted(vocab_count.items(), key=lambda item: item[1], reverse=True)
        if v > 2
    }

    # adding rank-column
    df = pd.DataFrame.from_dict(data=vocab_count, orient="index", columns=["count"])
    df["rank"] = range(1, len(df) + 1)

    # writing to file
    df.to_csv(title, header=True, index_label="token")
    return "{title} create with {count} words.".format(
        title=title, count=len(vocab_count)
    )

In [None]:
# Merge the CDU and CSU vocabularies: token-based first, then lemma-based.
for use_lemma in (False, True):
    print(create_vocab_union(use_lemma))

In [None]:
# Load the tweet text (third CSV column, named "text") of every party.
# dtype=str + keep_default_na=False keeps empty tweets as "" instead of NaN, so
# they are not tokenized as the literal word "nan" below.
read_options = dict(
    sep=",", quoting=csv.QUOTE_NONE, usecols=[2], dtype=str, keep_default_na=False
)

afd = pd.read_csv('../cleaned-data/AfD.csv', **read_options)
cdu = pd.read_csv('../cleaned-data/CDU.csv', **read_options)
csu = pd.read_csv('../cleaned-data/CSU.csv', **read_options)
fdp = pd.read_csv('../cleaned-data/FDP.csv', **read_options)
gru = pd.read_csv('../cleaned-data/GRÜNE.csv', **read_options)
lin = pd.read_csv('../cleaned-data/LINKE.csv', **read_options)
# Fixed copy-paste bug: this previously re-read GRÜNE.csv, so the Greens were
# counted twice and the SPD tweets were missing from the combined data.
spd = pd.read_csv('../cleaned-data/SPD.csv', **read_options)

# Stack all parties into one frame with a fresh 0..n-1 index. pd.concat replaces
# the repeated DataFrame.append calls (append was removed in pandas 2.0 and
# copied the whole frame on every call).
data = pd.concat([afd, cdu, csu, fdp, gru, lin, spd], ignore_index=True)

# Replace every raw text by the result of running it through the spaCy pipeline.
data["text"] = data["text"].apply(lambda x: nlp(str(x)))

In [None]:
# Show the combined frame; its "text" column holds the values returned by nlp(...).
display(data)

In [None]:
# Lemma vocabulary over the combined tweets in `data`.
title = "all_lemma_vocab_token.csv"

# Count every lemma of every processed tweet.
vocab_count = collections.Counter(
    tok.lemma_ for doc in data["text"] for tok in doc
)

# most_common() orders by descending count (ties keep first-seen order);
# keep only lemmas that occur at least 3 times.
vocab_count = {
    lemma: n for lemma, n in vocab_count.most_common() if n >= 3
}

# One row per lemma, plus a 1-based rank column (1 = most frequent).
df = pd.DataFrame.from_dict(data=vocab_count, orient="index", columns=["count"])
df["rank"] = range(1, len(vocab_count) + 1)

df.to_csv(title, header=True, index_label="token")
"{title} create with {count} words.".format(count=len(vocab_count), title=title)

In [None]:
def remove_user(seq):
    """Return the items of ``seq`` as a list, keeping only the first "user" entry.

    Every item that is not equal to "user" is kept in its original position;
    of the items equal to "user" only the first one survives.
    """
    kept = []
    user_already_kept = False
    for item in seq:
        if item == "user":
            if user_already_kept:
                # second or later "user" entry: drop it
                continue
            user_already_kept = True
        kept.append(item)
    return kept
    

In [None]:
# For every processed tweet, take the token texts and collapse repeated "user"
# entries down to the first one.
data["reduced_user"] = data["text"].apply(
    lambda doc: remove_user([tok.text for tok in doc])
)

In [None]:
# Show the frame again, now including the "reduced_user" column.
display(data)

In [None]:
# Token vocabulary over the "reduced_user" token lists in `data`.
title = "reduced_user_vocab_token.csv"

# Count every token string of every tweet.
vocab_count = collections.Counter(
    word for words in data["reduced_user"] for word in words
)

# most_common() orders by descending count (ties keep first-seen order);
# keep only tokens that occur at least 3 times.
vocab_count = {
    word: n for word, n in vocab_count.most_common() if n >= 3
}

# One row per token, plus a 1-based rank column (1 = most frequent).
df = pd.DataFrame.from_dict(data=vocab_count, orient="index", columns=["count"])
df["rank"] = range(1, len(vocab_count) + 1)

df.to_csv(title, header=True, index_label="token")
"{title} create with {count} words.".format(count=len(vocab_count), title=title)

In [None]:
def remove_party(
    seq,
    parties=("afd", "cdu", "csu", "linke", "fdp", "gruene", "grüne", "spd"),
):
    """Return the items of ``seq`` whose lower-cased form is not a party name.

    Parameters
    ----------
    seq : iterable of str
        Token strings to filter.
    parties : iterable of str, optional
        Party names to remove (compared case-insensitively). The default covers
        all seven parties handled in this notebook; "spd" and the umlaut
        spelling "grüne" were missing from the original hard-coded list.

    Returns
    -------
    list of str
        The remaining items in their original order.
    """
    # Build the lookup set once per call instead of scanning a list per token.
    blocked = frozenset(name.lower() for name in parties)
    return [x for x in seq if x.lower() not in blocked]

In [None]:
# For every processed tweet, take the token texts and drop the party names.
data["remove_party"] = data["text"].apply(
    lambda doc: remove_party([tok.text for tok in doc])
)

In [None]:
# Token vocabulary over the "remove_party" token lists in `data`.
title = "remove_party_vocab_token.csv"

# Count every token string of every tweet.
vocab_count = collections.Counter(
    word for words in data["remove_party"] for word in words
)

# most_common() orders by descending count (ties keep first-seen order);
# keep only tokens that occur at least 3 times.
vocab_count = {
    word: n for word, n in vocab_count.most_common() if n >= 3
}

# One row per token, plus a 1-based rank column (1 = most frequent).
df = pd.DataFrame.from_dict(data=vocab_count, orient="index", columns=["count"])
df["rank"] = range(1, len(vocab_count) + 1)

df.to_csv(title, header=True, index_label="token")
"{title} create with {count} words.".format(count=len(vocab_count), title=title)