In [26]:
import re 
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.models import Sequential

In [27]:
def load_data(file_path):
    """Read a text file and return its entire contents as one string.

    Args:
        file_path: Path of the file to read. It is opened in text mode
            and decoded as UTF-8.

    Returns:
        The full contents of the file as a single ``str``.
    """
    with open(file_path, mode="r", encoding="utf-8") as corpus_file:
        return corpus_file.read()

# Location of the training corpus.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists. Consider a configurable data directory.
file_path = '/homes/jrgommers/year 3/Deel_B/copilot.txt'
# Entire corpus loaded into memory as a single string.
data = load_data(file_path)


In [28]:
def preprocess_text(text):
    """Normalise raw text for character-level modelling.

    Steps, in order:
      1. Delete every character that is neither a word character (``\\w``:
         letters, digits, underscore) nor whitespace.
      2. Trim leading/trailing whitespace and lowercase the text.
      3. Collapse each run of whitespace (including newlines) to one space.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lowercased, single-spaced string.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    lowered = without_punctuation.strip().lower()
    return re.sub(r"\s+", " ", lowered)

# Normalised corpus: punctuation removed, lowercased, whitespace collapsed.
cleaned_data = preprocess_text(data)

# Character vocabulary: every distinct character of the cleaned text, sorted
# so that the index assigned to each character is stable between runs.
# NOTE(review): the saved output shows a stray 'â' in the vocabulary,
# presumably mojibake residue in the source text -- verify the corpus encoding.
unique_chart = sorted(set(cleaned_data))

# Forward lookup: character -> integer index (used to encode the text).
char_to_index = {char: index for index, char in enumerate(unique_chart)}
# Inverse lookup: integer index -> character (used to decode predictions).
# It must be keyed by the index; unpacking char_to_index.items() as
# (index, char) would silently rebuild the char -> index mapping instead.
index_to_char = dict(enumerate(unique_chart))

print(f"Raw data: {data[:100]}")
print(f"Cleaned data: {cleaned_data[:100]}")
print(f"Unique characters (unique_chart): {unique_chart}")
print(f"Length of unique characters: {len(unique_chart)}")


Raw data: Breast Cancer
Breast cancer is one of the most common cancers that affects women and people assigned
Cleaned data: breast cancer breast cancer is one of the most common cancers that affects women and people assigned
Unique characters (unique_chart): [' ', '0', '1', '2', '3', '4', '5', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'â']
Length of unique characters: 36


In [29]:
# Encode the cleaned corpus as an array of vocabulary indices.
data_as_int = np.array([char_to_index[symbol] for symbol in cleaned_data])

# Each training example is `sequence_length` consecutive characters; the
# target is the character that immediately follows the window. Windows start
# every `step` characters.
sequence_length = 100
step = 1

# Start offsets of all windows that still have a following target character.
window_starts = range(0, len(data_as_int) - sequence_length, step)

sequence = [
    data_as_int[start:start + sequence_length] for start in window_starts
]
targets = [data_as_int[start + sequence_length] for start in window_starts]

# x: one row of character indices per window.
x = np.array(sequence)
# y: one-hot encoded next-character target, one column per vocabulary entry.
y = to_categorical(np.array(targets), num_classes=len(unique_chart))

print(x.shape)
print(y.shape)

(14343, 100)
(14343, 36)


In [30]:

# Hold out 20% of the windows for validation; the fixed seed keeps the split
# reproducible.
# NOTE(review): windows are built with step = 1, so neighbouring windows share
# almost all of their characters. A shuffled split therefore places
# near-duplicates of training windows in the test set -- treat the validation
# metrics as optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:


# Character-level next-character predictor, assembled layer by layer.
model = Sequential()
# Map each character index to a dense 256-dimensional vector.
model.add(Embedding(input_dim=len(unique_chart), output_dim=256))
# First recurrent layer returns the full sequence so it can feed a second LSTM.
model.add(LSTM(256, return_sequences=True))
# Second recurrent layer returns only its final output.
model.add(LSTM(256))
# Probability distribution over the vocabulary for the next character.
model.add(Dense(len(unique_chart), activation='softmax'))

# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()


In [32]:
# Train for a single epoch, evaluating on the held-out windows after it;
# `history` keeps the per-epoch loss and accuracy values.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=64,
    epochs=1,
)

[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 425ms/step - accuracy: 0.1577 - loss: 2.9867 - val_accuracy: 0.2802 - val_loss: 2.4436
