In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import numpy as np

# --- Preprocessing parameters -------------------------------------------
# Length (in characters) of the longest word; all sequences are padded to it
max_length = max(len(word) for word in data['word'])
# Count of distinct characters found across every word
vocab_size = len(set(''.join(data['word'])))
# Width of each character embedding vector
embedding_dim = 50

# --- Character-level tokenization ---------------------------------------
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(data['word'])
sequences = tokenizer.texts_to_sequences(data['word'])

# Pad on the right so every row has exactly max_length entries
X = pad_sequences(sequences, maxlen=max_length, padding='post')

# Binary target: 1 where the label is 'correct', 0 for anything else
y = np.where(data['label'] == 'correct', 1, 0)

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Model: embedding -> dropout -> bidirectional LSTM -> dropout -> sigmoid
model = Sequential([
    # +1 on input_dim leaves room for index 0, which the padding uses
    Embedding(input_dim=vocab_size + 1, output_dim=embedding_dim, input_length=max_length),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

# --- Callbacks ----------------------------------------------------------
# Stop once val_loss has not improved for 3 epochs in a row
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
# Keep only the best-scoring model on disk
model_checkpoint = ModelCheckpoint(filepath='best_model.h5', save_best_only=True)

# --- Training -----------------------------------------------------------
model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, model_checkpoint],
)


In [32]:
# Read the CSV into the `data` DataFrame
file_path = 'dataset kata benar dan typo.csv'  # Replace with your file path
data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Saving is disabled; uncomment the next line to persist the current model as 'first_model.h5'.
#model.save('first_model.h5')

In [12]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Restore the previously saved model from disk
model = load_model(filepath='first_model.h5')









Word: aku, Correct: True
Word: ingin, Correct: True
Word: bicara, Correct: True
Word: dengan, Correct: True
Word: kamu, Correct: True
Word: kkamu, Correct: False


In [30]:
import string
# Assuming 'tokenizer' is your character-level tokenizer used during training
def preprocess_sentence(sentence, tokenizer, max_length):
    """Split a sentence into words and encode each word for the model.

    The sentence is lower-cased, stripped of every punctuation character
    except the hyphen, and split on whitespace. Each resulting word is then
    converted with ``tokenizer.texts_to_sequences`` and post-padded to
    ``max_length``.

    Args:
        sentence: Raw input text.
        tokenizer: Object exposing ``texts_to_sequences`` (the tokenizer
            used during training).
        max_length: Target length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        A tuple ``(words, padded_words)``: the list of cleaned words and the
        padded sequences, one row per word.
    """
    # Hyphens are kept so that words such as "anak-anak" stay in one piece
    removable = string.punctuation.replace('-', '')
    cleaned = sentence.lower().translate(str.maketrans("", "", removable))
    words = cleaned.split()
    padded_words = pad_sequences(
        tokenizer.texts_to_sequences(words),
        maxlen=max_length,
        padding='post',
    )
    return words, padded_words

sentence = 'Ia mengatakan hal itu antara lain karena anak-anak SD yang kehilangan masa belajar selama 6 bulan sama dengan kehilangan 2 tahun pengalaman belajarnya berdasarkan riset yang pernah dibacanya.'
words, preprocessed_sentence = preprocess_sentence(sentence, tokenizer, max_length)

# One prediction row per word of the sentence
predictions = model.predict(preprocessed_sentence)

# A word is reported as correct when its score is strictly above this cut-off
threshold = 0.35
correctness = predictions > threshold

# Each row of `correctness` holds a single boolean; print it next to its word
for word, is_correct in zip(words, correctness):
    flag = is_correct[0]
    print(f"Word: {word}, Correct: {flag}")

Word: ia, Correct: False
Word: mengatakan, Correct: True
Word: hal, Correct: True
Word: itu, Correct: True
Word: antara, Correct: True
Word: lain, Correct: False
Word: karena, Correct: True
Word: anak-anak, Correct: True
Word: sd, Correct: False
Word: yang, Correct: True
Word: kehilangan, Correct: True
Word: masa, Correct: True
Word: belajar, Correct: True
Word: selama, Correct: True
Word: 6, Correct: True
Word: bulan, Correct: True
Word: sama, Correct: True
Word: dengan, Correct: True
Word: kehilangan, Correct: True
Word: 2, Correct: True
Word: tahun, Correct: True
Word: pengalaman, Correct: True
Word: belajarnya, Correct: True
Word: berdasarkan, Correct: True
Word: riset, Correct: True
Word: yang, Correct: True
Word: pernah, Correct: False
Word: dibacanya, Correct: True


In [26]:
# Display the raw model outputs that are compared against `threshold`
predictions

array([[0.55805874],
       [0.60184956],
       [0.62456554],
       [0.31416276],
       [0.51999176],
       [0.7241107 ],
       [0.55941284],
       [0.00215949],
       [0.54350924],
       [0.34861234],
       [0.00636769],
       [0.28181365],
       [0.19518377],
       [0.69913644],
       [0.9074866 ],
       [0.4565683 ],
       [0.6757054 ],
       [0.59095037],
       [0.55003476],
       [0.66215694],
       [0.381959  ],
       [0.43490946],
       [0.5821152 ],
       [0.7402198 ],
       [0.67783976],
       [0.5810329 ],
       [0.62728345],
       [0.7248004 ],
       [0.38043228],
       [0.8226982 ],
       [0.9074866 ],
       [0.78651905],
       [0.38902643],
       [0.56354904],
       [0.26841888],
       [0.66474503],
       [0.46698517]], dtype=float32)

In [33]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

# --- Data ---------------------------------------------------------------
file_path = 'path_to_your_dataset.csv'  # Replace with your file path
data = pd.read_csv(filepath_or_buffer=file_path)

# --- Preprocessing parameters -------------------------------------------
# Length (in characters) of the longest word; all sequences are padded to it
max_length = max(len(word) for word in data['word'])
# Count of distinct characters found across every word
vocab_size = len(set(''.join(data['word'])))
# Width of each character embedding vector
embedding_dim = 50

# --- Character-level tokenization ---------------------------------------
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(data['word'])
sequences = tokenizer.texts_to_sequences(data['word'])

# Pad on the right so every row has exactly max_length entries
X = pad_sequences(sequences, maxlen=max_length, padding='post')

# Binary target: 1 where the label is 'correct', 0 for anything else
y = np.where(data['label'] == 'correct', 1, 0)

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Model: embedding -> single LSTM -> sigmoid --------------------------
model = Sequential([
    # +1 on input_dim leaves room for index 0, which the padding uses
    Embedding(input_dim=vocab_size + 1, output_dim=embedding_dim, input_length=max_length),
    LSTM(64),  # the number of LSTM units can be tuned
    Dense(1, activation='sigmoid'),  # single probability for binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

# --- Training (adjust epochs as needed) ---------------------------------
model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_val, y_val),
)


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 32, 50)            1550      
                                                                 
 dropout (Dropout)           (None, 32, 50)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 128)              58880     
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 60,559
Trainable params: 60,559
Non-trainable params: 0
__________________________________________________

<keras.callbacks.History at 0x241c84cbbb0>

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping
# Tokenizer and pad_sequences are used below; import them here so the cell
# does not depend on an earlier cell having been run first.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Load the dataset
file_path = 'path_to_your_dataset.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Parameters for preprocessing
max_length = max([len(word) for word in data['word']]) # Maximum length of a word
vocab_size = len(set(''.join(data['word']))) # Number of unique characters
embedding_dim = 100 # Size of the embedding vector

# Tokenizing the characters in the words
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(data['word'])
sequences = tokenizer.texts_to_sequences(data['word'])

# Padding sequences on the right up to max_length
X = pad_sequences(sequences, maxlen=max_length, padding='post')

# Encoding labels (1 for correct, 0 for incorrect)
y = np.where(data['label'] == 'correct', 1, 0)

# Splitting the dataset into training and testing sets (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Building the model: embedding -> BiLSTM (full sequence) -> dropout -> BiLSTM -> sigmoid
model = Sequential()
# +1 on input_dim leaves room for index 0, which the padding uses
model.add(Embedding(input_dim=vocab_size + 1, output_dim=embedding_dim, input_length=max_length))
# return_sequences=True so the second recurrent layer receives one vector per character
model.add(Bidirectional(LSTM(128, return_sequences=True)))  # Bidirectional LSTM
model.add(Dropout(0.5))  # Dropout for regularization
model.add(Bidirectional(LSTM(64)))  # Another Bidirectional LSTM layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping callback: stop after 3 epochs without val_loss improvement and
# roll the model back to the best epoch, so that the weights evaluated afterwards
# are the best ones seen rather than those of the last (worse) epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with early stopping; 20% of the training split is used for validation
model.fit(X_train, y_train, epochs=20, validation_split=0.2, callbacks=[early_stopping])




In [None]:
# Score the model on the held-out test split and report loss and accuracy
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

In [4]:
import pandas as pd
import random
import csv

def generate_typo(word, rng=None):
    """Generate up to four synthetic misspellings of ``word``.

    The typo types are produced in this order, and a type is omitted when
    the word offers no valid position for it:

    1. Duplication - one non-hyphen character (any position) is doubled.
       Requires at least one character.
    2. Transposition - two adjacent characters are swapped. The first
       character is never moved, neither character may be a hyphen, and the
       two characters must differ (otherwise the "typo" would be the
       original word). Requires at least three characters.
    3. Deletion - one non-hyphen character other than the first is removed.
       Requires at least three characters.
    4. Substitution - one non-hyphen character other than the first is
       replaced by a different lowercase ASCII letter. Requires at least two
       characters.

    Positions are drawn from explicit candidate lists, so the function always
    terminates, even for words such as ``"a-b"`` or ``"-"`` where repeatedly
    re-drawing a random index would never find a valid one.

    Args:
        word: The correctly spelled word.
        rng: Optional random source exposing ``choice`` (for example a
            ``random.Random`` instance) for reproducible output. Defaults to
            the module-level ``random`` generator.

    Returns:
        A list with between zero and four misspelled strings.
    """
    if rng is None:
        rng = random

    lowercase_letters = 'abcdefghijklmnopqrstuvwxyz'
    size = len(word)
    generatedTypos = []

    # Type 1: duplicate a randomly chosen character (hyphens are never doubled)
    candidates = [i for i in range(size) if word[i] != '-']
    if candidates:
        idx = rng.choice(candidates)
        generatedTypos.append(word[:idx + 1] + word[idx] + word[idx + 1:])

    # Type 2: swap the characters at idx and idx + 1
    if size >= 3:
        candidates = [
            i for i in range(1, size - 1)
            if word[i] != '-' and word[i + 1] != '-' and word[i] != word[i + 1]
        ]
        if candidates:
            idx = rng.choice(candidates)
            generatedTypos.append(word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:])

    # Type 3: delete a randomly chosen character (never the first, never a hyphen)
    if size >= 3:
        candidates = [i for i in range(1, size) if word[i] != '-']
        if candidates:
            idx = rng.choice(candidates)
            generatedTypos.append(word[:idx] + word[idx + 1:])

    # Type 4: replace a randomly chosen character (never the first, never a hyphen)
    # with a lowercase letter that differs from the one being replaced
    if size >= 2:
        candidates = [i for i in range(1, size) if word[i] != '-']
        if candidates:
            idx = rng.choice(candidates)
            typo_char = rng.choice([c for c in lowercase_letters if c != word[idx]])
            generatedTypos.append(word[:idx] + typo_char + word[idx + 1:])

    return generatedTypos

# Read the correct words from 'dataset kata benar.csv'.
# newline='' is what the csv module expects for file objects it reads from.
with open('dataset kata benar.csv', 'r', newline='') as wordData:
    reader = csv.reader(wordData)
    next(reader, None)  # skip the first row; tolerate an empty file
    data = list(reader)

# Generate typos for every word and label each of them as "incorrect"
typos = []
for row in data:
    # Blank lines come back from csv.reader as empty lists
    if not row:
        continue

    word = row[0]

    # Two-letter words are skipped entirely. Without this early exit the loop
    # below would re-append the typos generated for the previous word.
    if len(word) == 2:
        continue

    typo_list = generate_typo(word)
    for typo in typo_list:
        typos.append([typo, "incorrect"])


# Write the generated typos to a new file (no header row)
with open('dataset kata typo.csv', 'w', newline='') as output_file:
    writer = csv.writer(output_file)
    writer.writerows(typos)
    

In [5]:
# First file is read with pandas' default header handling; the second is read
# as headerless, with the column names supplied explicitly
df2 = pd.read_csv('dataset kata benar.csv')
df1 = pd.read_csv('dataset kata typo.csv', names=['word', 'label'], header=None)

# Stack df2 on top of df1 with a fresh 0..n-1 index, then save without the index
df = pd.concat(objs=[df2, df1], ignore_index=True)
df.to_csv('all_word_dataset.csv', index=False)

In [63]:
my_list = list2

# Collect every entry whose length is exactly two
new_list = [item for item in my_list if len(item) == 2]

print(new_list)


['cm', 'fi', 'gu', 'kk', 'we', 'es', 'ia', 'di', 'ke', 'ya', 'oh', 'ku']
