In [None]:
#Exercise A.1: Preprocessing

import re
import string
import unicodedata

# Name of the raw input text file that is opened and read further down in this cell.
file = "Taylor_Swift_Quotes.txt"

def clean(text):
    """Normalize raw text so it can be tokenized.

    Steps, in order: lowercase the text, replace every run of digits with a
    space, delete punctuation, then collapse whitespace runs into a single
    space and strip the ends.

    Punctuation covers every character in ``string.punctuation`` plus every
    character whose Unicode general category starts with "P" (for example
    curly quotes, en/em dashes and the ellipsis character), so typographic
    punctuation does not leak into the tokens.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    text = text.lower()                                                 # Lowercase
    text = re.sub(r'\d+', " ", text)                                    # Replace digit runs with a space
    text = text.translate(str.maketrans('', '', string.punctuation))    # Remove ASCII punctuation and symbols
    # string.punctuation is ASCII-only; also drop Unicode punctuation.
    text = "".join(ch for ch in text if not unicodedata.category(ch).startswith("P"))
    text = re.sub(r'\s+', ' ', text).strip()                            # Collapse redundant whitespace
    return text

# Read the whole input file. The handle gets its own name so that `file`
# keeps holding the file name instead of being rebound to a closed handle.
with open(file, "r", encoding="utf-8") as source_file:
    text = source_file.read()

# Normalize the raw text; the result is the input of the tokenization step.
cleaned_text = clean(text)

print(cleaned_text)

#store cleaned text in new file
# with open('cleaned_text.txt', 'w', encoding="utf-8") as file:
#    file.write(cleaned_text)


In [None]:
#Exercise A.2: Tokenization

import nltk
from nltk.tokenize import word_tokenize
# word_tokenize needs the Punkt sentence-tokenizer data. Newer NLTK releases
# load it from the 'punkt_tab' package, older ones from 'punkt', so request
# both; nltk.download skips packages that are already up to date.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Split the cleaned text into a list of word tokens.
tokenized_text = word_tokenize(cleaned_text)
print(tokenized_text)

#store tokenized text in new file
# with open('tokenized_text.txt', 'w', encoding="utf-8") as file:
#    file.write(tokenized_text)


In [None]:
#Exercise A.2: StopWords

from spacy.lang.en.stop_words import STOP_WORDS             #removes 'is'. 'the' etc

# Show how many stop words spaCy ships and list them alphabetically.
print(f"Number of Stopwords: {len(STOP_WORDS)}")
print(sorted(STOP_WORDS), "\n")

# Keep only the tokens whose lowercase form is not a stop word.
filtered_text = list(
    filter(lambda token: token.lower() not in STOP_WORDS, tokenized_text)
)

print(filtered_text)

#store filtered text in new file
# with open('filtered_text.txt', 'w', encoding="utf-8") as file:
#    file.write(filtered_text)

In [None]:
#Exercise A.3: Stemming and lemmatization

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

nltk.download('wordnet')        # database used for lemmatizing
nltk.download('omw-1.4')

# Make sure a part-of-speech tagger is available; without POS tags every
# word would be lemmatized as a noun.
try:
    nltk.data.find('taggers/averaged_perceptron_tagger_eng')
except LookupError:
    # nltk.download reports failure through its return value (False), so
    # check it instead of relying on an exception to trigger the fallback.
    # Catch Exception rather than using a bare except so KeyboardInterrupt
    # and SystemExit still propagate.
    try:
        tagger_ready = nltk.download('averaged_perceptron_tagger_eng')
    except Exception:
        tagger_ready = False
    if not tagger_ready:
        # Fall back to the tagger package name used by older NLTK releases.
        nltk.download('averaged_perceptron_tagger')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the format the WordNet lemmatizer understands.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively;
    every other tag falls back to ``wordnet.NOUN``.

    Args:
        tag: POS tag string as produced by the NLTK tagger.

    Returns:
        The matching WordNet part-of-speech constant.
    """
    prefix_table = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, constant_name in prefix_table:
        if tag.startswith(prefix):
            # Resolve the constant only for the matching prefix.
            return getattr(wordnet, constant_name)
    return wordnet.NOUN

# Stem every remaining token with the Porter stemmer.
stemmed_text = list(map(stemmer.stem, filtered_text))

# Determine the part of speech of each token, then lemmatize each token
# using the WordNet POS that corresponds to its tag.
pos_tags = nltk.pos_tag(filtered_text)
lemmatized_text = [
    lemmatizer.lemmatize(token, get_wordnet_pos(tag))
    for token, tag in pos_tags
]

print(stemmed_text, "\n")
print(lemmatized_text)


In [None]:
#Exercise A.4: Vocabulary

# Open question from the author: which text variant should be the basis here?
dictionary = lemmatized_text

# Unique words of the chosen text.
vocab_set = set(dictionary)

# Number the words 1..N in alphabetical order.
vocab_dict = dict(zip(sorted(vocab_set), range(1, len(vocab_set) + 1)))

print(f"Vocab size: {len(vocab_set)}")
for w, idx in vocab_dict.items():
    print(w, '→', idx)


In [11]:
import numpy as np

#np.set_printoptions(threshold=np.inf)  # disables truncation

# One row per token, one column per vocabulary entry.
oneHotEncoder = np.zeros((len(lemmatized_text), len(vocab_dict)), dtype=int)

# Collect (row, column) positions for every token found in the vocabulary;
# vocabulary ids start at 1, column indices at 0.
hits = [
    (row, vocab_dict[token] - 1)
    for row, token in enumerate(lemmatized_text)
    if token in vocab_dict
]
if hits:
    rows, cols = zip(*hits)
    oneHotEncoder[list(rows), list(cols)] = 1

print(oneHotEncoder[:10])  # [:10] shows the first 10 words

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
