In [7]:
import re
import warnings
from ast import literal_eval

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sb
import tqdm
from nltk.corpus import stopwords
# The cells below call `tqdm(...)` directly, so bind the name to the progress-bar
# callable rather than to the module imported on the line above.
from tqdm.auto import tqdm

warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
def init_data(path: str, rename_dict, drop_list=None, normalize_cols=None):
    """Load a CSV file and return a cleaned DataFrame with normalized text.

    The frame is read from ``path``, its columns are renamed with
    ``rename_dict``, the columns in ``drop_list`` are removed, every row that
    contains a missing value is dropped and the index is rebuilt as 0..n-1.
    Finally the selected text columns are lower-cased and every literal
    two-character sequence backslash + "n" inside them is replaced by ".".

    Args:
        path: Location of the CSV file, passed to ``pd.read_csv``.
        rename_dict: Mapping of old column names to new column names.
        drop_list: Column label(s) to remove after renaming. ``None`` (the
            default) removes nothing.
        normalize_cols: Columns to normalize. ``None`` (the default) selects
            every column that holds strings once the rows with missing values
            have been removed.

    Returns:
        The cleaned and normalized DataFrame.
    """

    def normalize(data, columns):
        for col in columns:
            # `.str.replace` substitutes inside each string; `Series.replace`
            # with a plain string would only match cells that are exactly "\n".
            data[col] = data[col].str.lower().str.replace("\\n", ".", regex=False)
        return data

    data = pd.read_csv(path)
    data = data.rename(columns=rename_dict)
    data = data.drop(drop_list if drop_list is not None else [], axis=1)
    # drop=True discards the old index instead of materialising it as a column
    data = data.dropna().reset_index(drop=True)

    if normalize_cols is None:
        normalize_cols = [
            col for col in data.columns if pd.api.types.is_string_dtype(data[col])
        ]

    return normalize(data, normalize_cols)

In [12]:
def sentence_tokenization(data, col, language="english"):
    """Split the text in ``col`` into sentences and count them per row.

    Two columns are written to ``data`` in place:
    ``<col>_sent_tok`` (list of sentences) and ``<col>_num_of_sent`` (its length).

    Args:
        data: DataFrame holding the text column.
        col: Name of the column whose entries are passed to ``nltk.sent_tokenize``.
        language: Language name forwarded to ``nltk.sent_tokenize``. Defaults to
            "english", which is also nltk's own default.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    # Names of the result columns
    col_name = col + "_sent_tok"
    col_name_length = col + "_num_of_sent"

    # One list of sentences per row, in row order
    sentences = [
        nltk.sent_tokenize(entry, language=language) for entry in tqdm(data[col])
    ]

    # Insert results to dataframe
    data[col_name] = sentences
    data[col_name_length] = [len(row_sentences) for row_sentences in sentences]

    return data

In [10]:
# Quick check of nltk's sentence splitter on two short sample texts
sample_texts = [
    "Hallo, ich bin Niklas. Wer genau bist du? Wie kommst du hier her? Ich hoffe, es geht dir gut!",
    "Ja hi, mir gehts gut! Und dir?",
]
for sample in sample_texts:
    print(nltk.sent_tokenize(sample))

['Hallo, ich bin Niklas.', 'Wer genau bist du?', 'Wie kommst du hier her?', 'Ich hoffe, es geht dir gut!']
['Ja hi, mir gehts gut!', 'Und dir?']


In [13]:
def word_tokenization(data, col):
    """Split the text in ``col`` into word tokens and count them per row.

    Tokens are the matches of the regular expression ``\\w+`` as returned by
    ``nltk.regexp_tokenize``. Two columns are written to ``data`` in place:
    ``<col>_word_tok`` (list of tokens) and ``<col>_num_of_word`` (its length).

    Args:
        data: DataFrame holding the text column.
        col: Name of the column containing the text to tokenize.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    # Names of the result columns
    col_name = col + "_word_tok"
    col_name_length = col + "_num_of_word"

    # Iterate over the values themselves rather than looking rows up by the
    # labels 0..n-1, so frames with a non-default index work as well.
    words = [
        nltk.regexp_tokenize(text, pattern=r'\w+') for text in tqdm(data[col])
    ]

    # Insert results to dataframe
    data[col_name] = words
    data[col_name_length] = [len(word_list) for word_list in words]

    return data

In [14]:
def stemming_lemming(data, col):
    """Stem and lemmatize every element of each entry in ``col``.

    Each entry of ``col`` is iterated element by element; every element is
    passed to ``nltk.PorterStemmer().stem`` and to
    ``nltk.WordNetLemmatizer().lemmatize``. The results are written to ``data``
    in place as ``<col>_stems`` and ``<col>_lemms``.

    Args:
        data: DataFrame holding the column to process.
        col: Name of the column to process.
            NOTE(review): presumably a column of token lists (for example the
            output of ``word_tokenization``); a plain string entry would be
            processed character by character. Confirm against the caller.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    # Initialize Stemmer and Lemmatizer
    pst = nltk.PorterStemmer()
    wlem = nltk.WordNetLemmatizer()

    # Names of the result columns
    col_name_stems = col + "_stems"
    col_name_lemms = col + "_lemms"

    stems = []
    lemms = []

    # Iterate over the values themselves rather than looking rows up by the
    # labels 0..n-1, so frames with a non-default index work as well.
    for tokens in tqdm(data[col]):
        stems.append([pst.stem(token) for token in tokens])
        lemms.append([wlem.lemmatize(token) for token in tokens])

    # Insert results to dataframe
    data[col_name_stems] = stems
    data[col_name_lemms] = lemms

    return data

In [15]:
def stopwords_count_and_removal(data, col, language="english"):
    """Count and remove stop words in each entry of ``col``.

    Every element of an entry that appears in nltk's stop-word list for
    ``language`` is counted and left out of the filtered copy. The results are
    written to ``data`` in place as ``<col>_num_of_stopwords`` and
    ``<col>_without_stopwords``.

    Args:
        data: DataFrame holding the column to process.
        col: Name of the column to process.
            NOTE(review): presumably a column of token lists — confirm against
            the caller.
        language: Name of the nltk stop-word list to use. Defaults to "english".

    Returns:
        The same DataFrame object, with the two columns added.
    """
    # The stop-word corpus is exposed as nltk.corpus.stopwords; a set gives
    # constant-time membership tests inside the loop.
    stoplist = set(nltk.corpus.stopwords.words(language))

    col_name_number_of_stopwords = col + "_num_of_stopwords"
    col_name_without_stopwords = col + "_without_stopwords"

    number_of_stopwords = []
    text_without_stopwords = []

    # Iterate over the values themselves rather than looking rows up by the
    # labels 0..n-1, so frames with a non-default index work as well.
    for tokens in tqdm(data[col]):
        found = 0
        kept = []
        for word in tokens:
            if word in stoplist:
                found += 1
            else:
                kept.append(word)

        number_of_stopwords.append(found)
        text_without_stopwords.append(kept)

    data[col_name_number_of_stopwords] = number_of_stopwords
    data[col_name_without_stopwords] = text_without_stopwords

    return data

In [None]:
def _count_tokens(counts, tokens, excluded):
    """Increment ``counts[token]`` for every token that is not in ``excluded``."""
    for token in tokens:
        if token not in excluded:
            counts[token] = counts.get(token, 0) + 1


def create_word_dict(data, without_stopwords=True, with_stopwords=False):
    """Build word-frequency dictionaries over all rows of ``data``.

    Each cell of the columns read here is parsed with ``literal_eval``, i.e. it
    is expected to hold the string form of a list of tokens.

    Args:
        data: DataFrame with a "lemma_list" column (read when
            ``without_stopwords`` is true) and/or a "word_tokenize" column
            (read when ``with_stopwords`` is true).
        without_stopwords: Count the entries of "lemma_list", skipping English
            stop words and a fixed set of punctuation/placeholder tokens.
        with_stopwords: Count the entries of "word_tokenize", skipping only a
            fixed set of punctuation/placeholder tokens.

    Returns:
        A tuple ``(without_stopwords_dict, with_stopwords_dict)`` when both
        flags are set, a single dict when exactly one flag is set, and ``0``
        (after printing a notice) when neither flag is set.
    """
    if not without_stopwords and not with_stopwords:
        print("Nothing to return selected")
        return 0

    # Tokens that are never counted
    ignored_tokens = {",", ".", ")", "(", "{", "}", "[", "]", ":", ";", "\"\"", "...", "I", "-PRON-", "-", "'"}

    word_dict_without_stopwords = {}
    word_dict_with_stopwords = {}

    if without_stopwords:
        # The lemma variant additionally ignores "'s", "urllink" and stop words.
        # Built once as a set so the per-word test inside the loop is cheap.
        ignored_lemmas = ignored_tokens | {"'s", "urllink"} | set(stopwords.words('english'))
        lemma_cells = data["lemma_list"]
    if with_stopwords:
        token_cells = data["word_tokenize"]

    # Rows are addressed by position (.iloc) so frames with a non-default
    # index work as well.
    for i in tqdm(range(len(data))):
        if without_stopwords:
            _count_tokens(word_dict_without_stopwords, literal_eval(lemma_cells.iloc[i]), ignored_lemmas)
        if with_stopwords:
            _count_tokens(word_dict_with_stopwords, literal_eval(token_cells.iloc[i]), ignored_tokens)

    if without_stopwords and with_stopwords:
        return word_dict_without_stopwords, word_dict_with_stopwords
    if without_stopwords:
        return word_dict_without_stopwords
    return word_dict_with_stopwords

In [None]:
def visualize_specific_words(dictionary, limit=10, save_fig=True, save_name="standard_save_name"):
    """Draw a bar plot of the most frequent words in ``dictionary``.

    Args:
        dictionary: Mapping of word to number of appearances.
        limit: Number of words to show, taken from the top of the
            descending-by-count ordering.
        save_fig: When true, the figure is also written to
            ``images/gender/<save_name>.jpg``.
        save_name: File name (without extension) used when saving.

    Returns:
        The figure that contains the bar plot, whether or not it was saved.
    """
    data_as_df = pd.DataFrame(list(dictionary.items()), columns=["word", "number"])
    sorted_df = data_as_df.sort_values(by="number", ascending=False)

    axes = sb.barplot(data=sorted_df[:limit], x="word", y="number", palette="deep")
    axes.set_xlabel("Words")
    axes.set_ylabel("Appearances of the word")

    # Fetch the figure before the save branch so it is always available to return
    fig = axes.get_figure()
    if save_fig:
        fig.savefig("images/gender/" + str(save_name + ".jpg"))

    return fig

In [None]:
# Word frequencies with stop words filtered out; the stop-word-inclusive dictionary is not built.
# NOTE(review): `data` is not created in any cell shown here — presumably loaded earlier
# (e.g. with init_data); confirm before running on a fresh kernel.
words_without_stopwords_all = create_word_dict(
    data, without_stopwords=True, with_stopwords=False
)

In [None]:
def getNumbers(data):
    """Count NOUN, VERB and CCONJ tags in each row's "pos_list".

    Each cell of "pos_list" is parsed with ``literal_eval``, i.e. it is
    expected to hold the string form of a list of tags. The counts are written
    to ``data`` in place as "num_nouns", "num_verbs" and "num_conjs".

    Args:
        data: DataFrame with a "pos_list" column.

    Returns:
        The same DataFrame object, with the three count columns added.
    """
    num_nouns, num_verbs, num_conjs = [], [], []

    # Iterate over the values themselves rather than looking rows up by the
    # labels 0..n-1, so frames with a non-default index work as well.
    for cell in tqdm(data["pos_list"]):
        tags = list(literal_eval(cell))
        num_nouns.append(sum(1 for tag in tags if tag == "NOUN"))
        num_verbs.append(sum(1 for tag in tags if tag == "VERB"))
        num_conjs.append(sum(1 for tag in tags if tag == "CCONJ"))

    data["num_nouns"] = num_nouns
    data["num_verbs"] = num_verbs
    data["num_conjs"] = num_conjs

    return data

def getRatios(data, word_count_col="word_count"):
    """Add the share of nouns, verbs and conjunctions per row.

    Divides "num_nouns", "num_verbs" and "num_conjs" by the word-count column
    and writes the results to ``data`` in place as "ratio_nouns",
    "ratio_verbs" and "ratio_conjs".

    Args:
        data: DataFrame with the three "num_*" columns and the word-count column.
        word_count_col: Name of the column holding the number of words per
            row. Defaults to "word_count".

    Returns:
        The same DataFrame object, with the three ratio columns added. Rows
        whose word count is 0 get NaN ratios.
    """
    word_counts = data[word_count_col]
    # Mask zero counts as NaN so the division yields NaN instead of failing
    # or producing infinities for empty texts.
    nonzero_counts = word_counts.where(word_counts != 0)

    for kind in ("nouns", "verbs", "conjs"):
        data["ratio_" + kind] = data["num_" + kind] / nonzero_counts

    return data

In [None]:
# Add the part-of-speech counts first, then the ratios that are derived from them.
data = getRatios(getNumbers(data))