# Opdracht
Bij deze opdracht maken we gebruik van de library Keras; misschien moet je die eerst nog installeren met pip (`pip install keras`). Verder gebruiken we een aantal beschrijvingen van kanker die we van deze site hebben gedownload. De beschrijvingen kun je hier vinden. Het stappenplan staat hieronder:

1. laad de data in één lange string
2. preprocess de data
3. maak de vectoren x en y en one-hot-encode deze
4. maak en train het model
5. maak een methode die op basis van een seed een nieuwe sequentie genereert

In [19]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Input


# Preprocess a sentence
def preprocess_sentence(sentence, stopwords=None):
    """Replace punctuation and digits with spaces and drop stopwords.

    Args:
        sentence: Raw text to clean.
        stopwords: Optional iterable of words to remove. Each word of the
            sentence is lowercased before the lookup, so entries should be
            lowercase to match. When omitted, the module-level
            ``stopwoorden`` collection is used.

    Returns:
        The remaining words joined by single spaces. The casing of the words
        that are kept is left untouched.
    """
    to_exclude = "/.%-,'\":;()[]0123456789"
    if stopwords is None:
        stopwords = stopwoorden
    # A set gives O(1) membership tests instead of scanning a list per word.
    blocked = set(stopwords)
    # One pass that maps every excluded character to a space.
    cleaned = sentence.translate(str.maketrans(to_exclude, " " * len(to_exclude)))
    return " ".join(word for word in cleaned.split() if word.lower() not in blocked)

# Build sliding-window (context, next item) training pairs
def create_pairs(corpus, sequence_length):
    """Slide a fixed-size window over every sentence in the corpus.

    Args:
        corpus: Iterable of sentences; each sentence is split into its
            individual items (characters, for a string).
        sequence_length: Number of consecutive items in each context window.

    Returns:
        A tuple ``(contexts, targets)`` where ``contexts[k]`` is a list of
        ``sequence_length`` items and ``targets[k]`` is the item that directly
        follows that window.
    """
    contexts, targets = [], []
    for sentence in corpus:
        symbols = list(sentence)  # one entry per character
        starts = range(len(symbols) - sequence_length)
        # Context (input): the window starting at each position
        contexts.extend(symbols[start:start + sequence_length] for start in starts)
        # Target (output): the item right after each window
        targets.extend(symbols[start + sequence_length] for start in starts)
    return contexts, targets

# Load the raw data
# The encoding is pinned so the result does not depend on the platform default.
# Only descriptions of at least 10 whitespace-separated words are kept.
with open("cancers.txt", "r", encoding="utf-8") as file:
    stripped_lines = (line.strip() for line in file)
    cancer_text = [line for line in stripped_lines if len(line.split()) >= 10]

# Lines read from a file still carry their newline, so a bare `if line` is
# always true; test the stripped line to really skip blank lines.
with open("stopwoorden.txt", "r", encoding="utf-8") as file:
    stopwoorden = [line.strip() for line in file if line.strip()]

# Clean every description and merge them into one long string
processed_sentences = list(map(preprocess_sentence, cancer_text))
text = " ".join(processed_sentences)

# Vocabulary: the unique characters, with lookups in both directions
chars = sorted(set(text))
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = dict(enumerate(chars))

# Sliding windows over the text: each context is followed by its target character
sequence_length = 40  # Length of the input sequence
X, y = create_pairs([text], sequence_length)

# One-hot encode the contexts and the targets
X_encoded = np.zeros((len(X), sequence_length, len(chars)), dtype=np.bool_)
y_encoded = np.zeros((len(y), len(chars)), dtype=np.bool_)

for row, (window, target) in enumerate(zip(X, y)):
    for step, symbol in enumerate(window):
        X_encoded[row, step, char_to_idx[symbol]] = True
    y_encoded[row, char_to_idx[target]] = True

# Build the model: two stacked LSTM layers followed by a softmax over the vocabulary
model = Sequential()
model.add(Input(shape=(sequence_length, len(chars))))
# The first LSTM returns the full sequence so the second LSTM can consume it
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(chars), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_encoded, y_encoded, epochs=10, batch_size=64)

# Generate a new sequence from a seed
def generate_sequence(seed, length=200, temperature=0.8):
    """Generate text by repeatedly sampling the next character from the model.

    Relies on the module-level ``model``, ``sequence_length``, ``chars``,
    ``char_to_idx`` and ``idx_to_char``.

    Args:
        seed: Starting text. Only its last ``sequence_length`` characters are
            fed to the model; a shorter seed is right-aligned in the input
            window with all-zero rows in front of it.
        length: Number of characters to generate. Values <= 0 return the seed
            unchanged.
        temperature: Divisor applied to the log-probabilities before they are
            renormalised and sampled. ``0`` skips sampling and always picks
            the most probable character.

    Returns:
        The full seed followed by the generated characters.

    Raises:
        KeyError: If the part of the seed that is fed to the model contains a
            character that is missing from ``char_to_idx``.
    """
    if length <= 0:
        return seed

    input_sequence = np.zeros((1, sequence_length, len(chars)), dtype=np.bool_)

    # Keep only the most recent characters and right-align them, so a seed of
    # any length fits the window and the newest character sits in the last row.
    window = seed[max(len(seed) - sequence_length, 0):]
    offset = sequence_length - len(window)
    for t, char in enumerate(window):
        input_sequence[0, offset + t, char_to_idx[char]] = True

    generated = []
    for _ in range(length):
        # float64 avoids the rounding error that makes np.random.multinomial
        # reject float32 probabilities whose sum ends up slightly above 1.
        probabilities = np.asarray(
            model.predict(input_sequence, verbose=0)[0], dtype=np.float64
        )

        if temperature == 0:
            # Greedy decoding: dividing by zero is undefined, take the argmax.
            next_char_idx = int(np.argmax(probabilities))
        else:
            # Rescale the log-probabilities by the temperature and renormalise.
            # Subtracting the maximum first keeps np.exp from overflowing.
            logits = np.log(probabilities + 1e-7) / temperature
            logits -= np.max(logits)
            scaled = np.exp(logits)
            scaled /= np.sum(scaled)
            next_char_idx = int(np.argmax(np.random.multinomial(1, scaled)))

        generated.append(idx_to_char[next_char_idx])

        # Slide the input window: drop the oldest row, append the new character
        input_sequence = np.roll(input_sequence, -1, axis=1)
        input_sequence[0, -1, :] = False
        input_sequence[0, -1, next_char_idx] = True

    return seed + "".join(generated)


# Generate a test sequence, seeded with the first window of the corpus
seed_text = text[0:sequence_length]
print(seed_text)
generated_sequence = generate_sequence(seed_text)

print(f"Generated sequence: {generated_sequence}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 

In [18]:
# Generate a new sequence from a seed
def generate_sequence(seed, length=200, temperature=0.8):
    """Generate text by repeatedly sampling the next character from the model.

    Relies on the module-level ``model``, ``sequence_length``, ``chars``,
    ``char_to_idx`` and ``idx_to_char``.

    Args:
        seed: Starting text. Only its last ``sequence_length`` characters are
            fed to the model; a shorter seed is right-aligned in the input
            window with all-zero rows in front of it.
        length: Number of characters to generate. Values <= 0 return the seed
            unchanged.
        temperature: Divisor applied to the log-probabilities before they are
            renormalised and sampled. ``0`` skips sampling and always picks
            the most probable character.

    Returns:
        The full seed followed by the generated characters.

    Raises:
        KeyError: If the part of the seed that is fed to the model contains a
            character that is missing from ``char_to_idx``.
    """
    if length <= 0:
        return seed

    input_sequence = np.zeros((1, sequence_length, len(chars)), dtype=np.bool_)

    # Keep only the most recent characters and right-align them, so a seed of
    # any length fits the window and the newest character sits in the last row.
    window = seed[max(len(seed) - sequence_length, 0):]
    offset = sequence_length - len(window)
    for t, char in enumerate(window):
        input_sequence[0, offset + t, char_to_idx[char]] = True

    generated = []
    for _ in range(length):
        # float64 avoids the rounding error that makes np.random.multinomial
        # reject float32 probabilities whose sum ends up slightly above 1.
        probabilities = np.asarray(
            model.predict(input_sequence, verbose=0)[0], dtype=np.float64
        )

        if temperature == 0:
            # Greedy decoding: dividing by zero is undefined, take the argmax.
            next_char_idx = int(np.argmax(probabilities))
        else:
            # Rescale the log-probabilities by the temperature and renormalise.
            # Subtracting the maximum first keeps np.exp from overflowing.
            logits = np.log(probabilities + 1e-7) / temperature
            logits -= np.max(logits)
            scaled = np.exp(logits)
            scaled /= np.sum(scaled)
            next_char_idx = int(np.argmax(np.random.multinomial(1, scaled)))

        generated.append(idx_to_char[next_char_idx])

        # Slide the input window: drop the oldest row, append the new character
        input_sequence = np.roll(input_sequence, -1, axis=1)
        input_sequence[0, -1, :] = False
        input_sequence[0, -1, next_char_idx] = True

    return seed + "".join(generated)


# Generate a test sequence, seeded with the first window of the corpus
seed_text = text[0:sequence_length]
print(seed_text)
generated_sequence = generate_sequence(seed_text)

print(f"Generated sequence: {generated_sequence}")


Breast cancer one the most common cancer
Generated sequence: Breast cancer one the most common cancer thate canceros matrem broatmer treatment cancer treatment bnot theh tates cancers then blvode thA ther are sthe wirviun mand breat afcer ther than cancer the cancar suriar breaide ghrowing her asels 
