In [3]:
# import benodigde modules
import re 
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.models import Sequential

In [4]:
# Functie om gegevens te laden uit een tekstbestand
def load_data(file_path):
    with open (file_path, "r", encoding="utf-8") as f:
        text = f.read()
    return text

file_path = 'Deel_B/deel_4.txt'
data = load_data(file_path)


In [5]:
# Functie om tekst voor te bewerken naar nodige indeling
def preprocess_text(text):
    text = re.sub(r"[^\w\s]", "", text)  
    text = re.sub(r"\s+", " ", text.strip().lower()) 

    return text

cleaned_data = preprocess_text(data)

# Vind alle unieke tekens in de opgeschoonde tekst en sorteer ze alfabetisch
unique_chart = sorted(set(cleaned_data))

# mapping van unieke teken naar index en andersom
char_to_index = {char:index for index,char in enumerate(unique_chart)}
index_to_char = {index:char for char,index in char_to_index.items()}

print(index_to_char)

print(f"Raw data: {data[:100]}")  
print(f"Cleaned data: {cleaned_data[:100]}") 
print(f"Unique characters (unique_chart): {unique_chart}")
print(f"Length of unique characters: {len(unique_chart)}")


{0: ' ', 1: '0', 2: '1', 3: '2', 4: '3', 5: '4', 6: '5', 7: '8', 8: '9', 9: 'a', 10: 'b', 11: 'c', 12: 'd', 13: 'e', 14: 'f', 15: 'g', 16: 'h', 17: 'i', 18: 'j', 19: 'k', 20: 'l', 21: 'm', 22: 'n', 23: 'o', 24: 'p', 25: 'q', 26: 'r', 27: 's', 28: 't', 29: 'u', 30: 'v', 31: 'w', 32: 'x', 33: 'y', 34: 'z', 35: 'â'}
Raw data: Breast Cancer
Breast cancer is one of the most common cancers that affects women and people assigned
Cleaned data: breast cancer breast cancer is one of the most common cancers that affects women and people assigned
Unique characters (unique_chart): [' ', '0', '1', '2', '3', '4', '5', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'â']
Length of unique characters: 36


In [6]:

# Elke karakter wordt vervangen door zijn bijbehorende index in de `char_to_index` mapping
data_as_int = np.array([char_to_index[char]for char in cleaned_data])

sequence_length = 10
step = 1

sequence = []
targets = []

#  Loop door de data om sequenties en targets te genereren
for i in range (0, len(data_as_int)- sequence_length,step):
    # Voeg een sequentie van lengte `sequence_length` toe
    sequence.append(data_as_int[i:i+sequence_length])
    # Voeg het volgende karakter na de sequentie toe als target
    targets.append(data_as_int[i+sequence_length])

x = np.array(sequence)
y = np.array(targets)



print(x.shape)
print(y.shape)

# Eén-hots encodeer de invoersequenties en doelen
x_encoded = np.array([to_categorical(n,num_classes=len(unique_chart)) for n in x])
y_encoded = np.array(to_categorical(y,num_classes=len(unique_chart)))

(14433, 10)
(14433,)


In [7]:
from tensorflow.keras import Input

# defineer model
model = Sequential()
model.add(Input(shape=(sequence_length, len(unique_chart))))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(unique_chart), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
 

2024-12-05 08:42:44.680110: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [8]:
# Fit model
# history = model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=64,epochs=1)
history = model.fit(x_encoded, y_encoded, batch_size=64,epochs=20)

Epoch 1/20
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.1538 - loss: 3.0385
Epoch 2/20
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.2328 - loss: 2.6495
Epoch 3/20
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.3262 - loss: 2.2767
Epoch 4/20
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.3830 - loss: 2.0984
Epoch 5/20
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.4310 - loss: 1.9387
Epoch 6/20
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.4779 - loss: 1.7773
Epoch 7/20
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.5236 - loss: 1.6367
Epoch 8/20
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.5509 - loss: 1.5368
Epoch 9/20
[1m226/226[0m [32m

In [9]:
import numpy as np
import random


def generate_sequence(seed_text, model, char_to_index, index_to_char, sequence_length, num_chars_to_generate):

    generated = seed_text
    # Converteer de seed-text naar een lijst van indices
    input_sequence = [char_to_index[char] for char in seed_text]
    print(index_to_char)



    for _ in range(num_chars_to_generate):
        # Zorg ervoor dat de invoer de juiste lengte heeft
        input_sequence_padded = np.array([to_categorical(input_sequence, num_classes=len(unique_chart))])
        
        # Maak een voorspelling
        predicted_probs = model.predict(input_sequence_padded, verbose=0)
        predicted_index = np.argmax(predicted_probs)
        
        # Voeg het voorspelde karakter toe
        predicted_char = index_to_char[predicted_index]
        generated += predicted_char
        input_sequence.append(predicted_index)
    
    return generated


# De seed text5 die gebruikt word en het aantal karakters dat voorspelt moet worden.
seed_text = "breast cancer"  
num_chars_to_generate = 200  
generated_text = generate_sequence(seed_text, model, char_to_index, index_to_char, sequence_length, num_chars_to_generate)

print("\nGenerated Text:")
print(generated_text)



{0: ' ', 1: '0', 2: '1', 3: '2', 4: '3', 5: '4', 6: '5', 7: '8', 8: '9', 9: 'a', 10: 'b', 11: 'c', 12: 'd', 13: 'e', 14: 'f', 15: 'g', 16: 'h', 17: 'i', 18: 'j', 19: 'k', 20: 'l', 21: 'm', 22: 'n', 23: 'o', 24: 'p', 25: 'q', 26: 'r', 27: 's', 28: 't', 29: 'u', 30: 'v', 31: 'w', 32: 'x', 33: 'y', 34: 'z', 35: 'â'}



Generated Text:
breast cancer types of myeroma in the may at to ath a a a a cancergine andery spending maplyeact of a devers are show arenate cancer arousedeces mar camester blood cancertid and stagly that a frear researe as eve 
