<a href="https://colab.research.google.com/github/Janani-Withana/Sinhala-Chatbot/blob/main/Sinhala_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import regex as re

In [2]:
def file_to_sentence_list(file_path, encoding='utf-8'):
    """Read a text file and split its contents into sentence-like fragments.

    The text is split at any run of whitespace that directly follows one of
    the characters '.', '!', '?' or ','. The punctuation mark stays attached
    to the fragment it terminates. Each fragment is stripped of surrounding
    whitespace and empty fragments are discarded.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to
            'utf-8' so that non-ASCII (e.g. Sinhala) text is decoded the
            same way regardless of the platform's locale encoding.

    Returns:
        A list of non-empty, stripped strings in file order. An empty or
        whitespace-only file yields an empty list.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        text = file.read()

    # The lookbehind keeps the delimiter on the fragment it ends; only the
    # whitespace run after it is consumed by the split.
    fragments = re.split(r'(?<=[.!?,])\s+', text)

    return [fragment.strip() for fragment in fragments if fragment.strip()]

In [4]:
# Corpus location on the runtime's filesystem and its sentence-level split.
file_path = '/content/sinhala_farming_data.csv'
text_data = file_to_sentence_list(file_path=file_path)

In [9]:
# Build the word index from the sentence fragments.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)

# Number of classes used by the embedding and output layers: one more than
# the number of entries in the tokenizer's word index.
total_words = 1 + len(tokenizer.word_index)

In [10]:
# Build the training sequences: for every sentence, emit each prefix of its
# token ids that has at least two tokens (so every prefix has a final token
# to serve as the label and at least one token before it).
input_sequences = []
for sentence in text_data:
    token_ids = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(
        token_ids[:end] for end in range(2, len(token_ids) + 1)
    )

In [11]:
# Pad every prefix at the front to the length of the longest one, then use
# the final column as the label and all columns before it as predictors.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [12]:
# Convert target data to one-hot encoding.
# The integer labels are read from the last column of `input_sequences`
# instead of from `y` itself, so re-running this cell produces the same
# result rather than one-hot encoding an already-encoded array.
y = tf.keras.utils.to_categorical(
    input_sequences[:, -1], num_classes=total_words)

In [13]:
# Define the model: token ids -> 10-dimensional embeddings -> LSTM(128) ->
# softmax over `total_words` classes.
# The sequence length is declared with an explicit Input layer instead of
# Embedding's `input_length` argument, which is deprecated in Keras 3.
model = Sequential([
    tf.keras.Input(shape=(max_sequence_len - 1,)),
    Embedding(total_words, 10),
    LSTM(128),
    Dense(total_words, activation='softmax'),
])
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])



In [None]:
# Fit the model on the predictor/label arrays for 100 epochs, printing a
# progress bar for each epoch.
model.fit(x=X, y=y, epochs=100, verbose=1)

Epoch 1/100
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 109ms/step - accuracy: 0.0417 - loss: 6.2504
Epoch 2/100
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 108ms/step - accuracy: 0.1007 - loss: 5.2876
Epoch 3/100
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 109ms/step - accuracy: 0.1633 - loss: 4.7295
Epoch 4/100
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 109ms/step - accuracy: 0.2092 - loss: 4.3346
Epoch 5/100
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 114ms/step - accuracy: 0.2509 - loss: 4.0349
Epoch 6/100
[1m255/933[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m1:14[0m 109ms/step - accuracy: 0.2769 - loss: 3.7642