# Opdracht
Bij deze opdracht maken we gebruik van de library keras; misschien moet je die nog even met pip installeren. Verder gebruiken we een aantal beschrijvingen van kanker die we van deze site hebben gedownload. De beschrijvingen kun je hier vinden. Het stappenplan staat hieronder:

1. laad de data in één lange string
2. preprocess de data
3. maak de vectoren x en y en one-hot-encode deze
4. maak en train het model
5. maak een methode die op basis van een seed een nieuwe sequentie genereert

In [4]:
import numpy as np
from keras.api.layers import LSTM, Dense, Embedding, Input
from keras.api.models import Sequential

# Preprocess the sentences
def preprocess_sentence(sentence, stopwords=None):
    """Replace punctuation and digits with spaces and drop stop words.

    Every character listed in the exclusion string (punctuation and the
    digits 0-9) is replaced by a space. The result is split on whitespace
    and each word whose lowercase form is a stop word is removed. The
    remaining words keep their original casing and are joined with single
    spaces.

    Args:
        sentence: Raw input text.
        stopwords: Optional iterable of lowercase stop words. When omitted,
            the module-level ``stopwoorden`` collection is used, so that
            name must be defined before the first such call.

    Returns:
        The cleaned sentence as a single string (possibly empty).
    """
    to_exclude = "/.%-,'\":;()[]0123456789"
    if stopwords is None:
        stopwords = stopwoorden
    # A set gives O(1) membership tests instead of scanning a list per word.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)

    # One C-level pass that maps every excluded character to a space.
    blank_out = str.maketrans(to_exclude, " " * len(to_exclude))
    cleaned = sentence.translate(blank_out)
    return " ".join(
        word for word in cleaned.split() if word.lower() not in stopwords
    )

# Build (context window, next item) training pairs
def create_pairs(corpus, sequence_length):
    """Slide a fixed-size window over every entry of *corpus*.

    Each entry is turned into a list of its items (characters, when the
    entry is a string). For every start position that leaves room for one
    following item, the window of ``sequence_length`` items becomes an
    input and the item directly after it becomes the matching target.

    Args:
        corpus: Iterable of sequences (e.g. strings).
        sequence_length: Number of items per input window.

    Returns:
        A tuple ``(X, y)``: ``X`` is a list of windows (each a list) and
        ``y`` the list of items that follow each window.
    """
    contexts, targets = [], []
    for entry in corpus:
        symbols = list(entry)  # a string becomes a list of characters
        starts = range(len(symbols) - sequence_length)
        contexts.extend(symbols[s:s + sequence_length] for s in starts)
        targets.extend(symbols[s + sequence_length] for s in starts)
    return contexts, targets

# Load and preprocess the data.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open("wiki.txt", "r", encoding="utf-8") as file:
    # Keep only lines that contain at least 10 whitespace-separated tokens.
    wiki_text = [line.strip() for line in file if len(line.strip().split()) >= 10]

with open("stopwoorden.txt", "r", encoding="utf-8") as file:
    # Strip before filtering: a raw line still carries its newline, so a
    # plain truthiness test on it never rejects blank lines.
    stopwoorden = [word for word in (line.strip() for line in file) if word]

# Preprocess every kept line and combine the results into one long string
processed_sentences = [preprocess_sentence(sentence) for sentence in wiki_text]
text = " ".join(processed_sentences)

# Vocabulary: every distinct character of the corpus in sorted order,
# plus lookup tables in both directions
chars = sorted(set(text))
char_to_idx = dict(zip(chars, range(len(chars))))
idx_to_char = dict(enumerate(chars))

# Cut the corpus into fixed-length input windows and the character that
# follows each window
sequence_length = 40  # length of one input sequence
X, y = create_pairs([text], sequence_length)

# One-hot encode inputs (sample, timestep, character) and targets
# (sample, character)
X_encoded = np.zeros((len(X), sequence_length, len(chars)), dtype=np.bool_)
y_encoded = np.zeros((len(y), len(chars)), dtype=np.bool_)

for i, (sequence, target) in enumerate(zip(X, y)):
    for t, char in enumerate(sequence):
        X_encoded[i, t, char_to_idx[char]] = True
    y_encoded[i, char_to_idx[target]] = True

# Build the model: one LSTM layer over the one-hot character windows,
# followed by a softmax over the vocabulary.
# The input shape is declared with an explicit Input layer; passing
# `input_shape=` to the first layer is deprecated in Keras 3 and emits a
# UserWarning.
model = Sequential([
    Input(shape=(sequence_length, len(chars))),
    LSTM(128),
    Dense(len(chars), activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_encoded, y_encoded, batch_size=64, epochs=20)

# Generate a new sequence from a seed
def generate_sequence(seed, length=200):
    """Extend *seed* by *length* characters, one greedy prediction at a time.

    The seed is one-hot encoded into a window of ``sequence_length``
    timesteps. At every step the model is asked for a prediction, the index
    with the highest score is taken as the next character, and the window
    is shifted one position so the new character becomes its last timestep.

    Only the last ``sequence_length`` characters of the seed are encoded.
    A shorter seed is right-aligned (the leading timesteps stay all-zero),
    so the seed and the generated characters stay contiguous in the window.

    Args:
        seed: Starting text; it is returned unchanged as the prefix of the
            result.
        length: Number of characters to generate.

    Returns:
        ``seed`` followed by ``length`` generated characters.

    Raises:
        KeyError: If the encoded part of the seed contains a character that
            is not a key of ``char_to_idx``.
    """
    input_sequence = np.zeros((1, sequence_length, len(chars)), dtype=np.bool_)

    # Keep the tail of the seed that fits and place it at the end of the
    # window.
    context = seed[max(len(seed) - sequence_length, 0):]
    offset = sequence_length - len(context)
    for t, char in enumerate(context, start=offset):
        input_sequence[0, t, char_to_idx[char]] = True

    generated = []
    for _ in range(length):
        prediction = model.predict(input_sequence, verbose=0)
        next_char_idx = int(np.argmax(prediction))
        generated.append(idx_to_char[next_char_idx])

        # Slide the window: drop the oldest timestep and write the new
        # character into the last one.
        input_sequence = np.roll(input_sequence, -1, axis=1)
        input_sequence[0, -1, :] = False
        input_sequence[0, -1, next_char_idx] = True

    return seed + "".join(generated)

# Smoke test: seed the generator with the first window of the corpus and
# print the seed together with its generated continuation
seed_text = text[0:sequence_length]
generated_sequence = generate_sequence(seed=seed_text)
print("Generated sequence:", generated_sequence)


2024-12-05 23:35:21.879535: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-05 23:35:21.883284: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-05 23:35:21.893906: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733438121.914985    2448 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733438121.921437    2448 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-05 23:35:21.944396: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

Epoch 1/20


W0000 00:00:1733438125.003990    2448 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
  super().__init__(**kwargs)
2024-12-05 23:35:25.116493: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 25634880 exceeds 10% of free system memory.


[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - accuracy: 0.1568 - loss: 3.2105
Epoch 2/20
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 31ms/step - accuracy: 0.2061 - loss: 2.8602
Epoch 3/20
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 31ms/step - accuracy: 0.2535 - loss: 2.6355
Epoch 4/20
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 30ms/step - accuracy: 0.2678 - loss: 2.5049
Epoch 5/20
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 31ms/step - accuracy: 0.2939 - loss: 2.3828
Epoch 6/20
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - accuracy: 0.3234 - loss: 2.2852
Epoch 7/20
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 31ms/step - accuracy: 0.3392 - loss: 2.2453
Epoch 8/20
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 35ms/step - accuracy: 0.3589 - loss: 2.1781
Epoch 9/20
[1m186/186[0m [32m━━━━━━━━━━━