<a href="https://colab.research.google.com/github/Janani-Withana/Sinhala-Chatbot/blob/main/Paddy_Advisor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader

# Load the dataset
data = pd.read_csv('sinhala_farming_data.csv')  # Replace with your dataset file name
questions = data['Question'].values
answers = data['Answer'].values

# Split the dataset into training and testing sets
train_questions, test_questions, train_answers, test_answers = train_test_split(
    questions, answers, test_size=0.2, random_state=42)


In [5]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import torch.nn.functional as F

nltk.download('punkt')
nltk.download('punkt_tab')

# Tokenize questions and answers
def tokenize(text):
    return word_tokenize(text)

train_questions = [tokenize(q) for q in train_questions]
train_answers = [tokenize(a) for a in train_answers]

# Build vocabulary
def build_vocab(sentences):
    counter = Counter()
    for sentence in sentences:
        counter.update(sentence)
    return {word: idx+1 for idx, (word, _) in enumerate(counter.items())}  # idx+1 to reserve 0 for padding

question_vocab = build_vocab(train_questions)
answer_vocab = build_vocab(train_answers)

# Add <PAD>, <SOS>, <EOS>
PAD_TOKEN = 0
SOS_TOKEN = len(answer_vocab) + 1
EOS_TOKEN = len(answer_vocab) + 2

answer_vocab['<SOS>'] = SOS_TOKEN
answer_vocab['<EOS>'] = EOS_TOKEN


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [6]:
class ChatDataset(Dataset):
    def __init__(self, questions, answers, question_vocab, answer_vocab, max_len=20):
        self.questions = questions
        self.answers = answers
        self.question_vocab = question_vocab
        self.answer_vocab = answer_vocab
        self.max_len = max_len

    def encode(self, sentence, vocab, max_len):
        encoded = [vocab.get(word, PAD_TOKEN) for word in sentence]
        return encoded[:max_len] + [PAD_TOKEN] * (max_len - len(encoded))

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        q_encoded = self.encode(self.questions[idx], self.question_vocab, self.max_len)
        a_encoded = [SOS_TOKEN] + self.encode(self.answers[idx], self.answer_vocab, self.max_len) + [EOS_TOKEN]
        return torch.tensor(q_encoded), torch.tensor(a_encoded)


In [7]:
train_dataset = ChatDataset(train_questions, train_answers, question_vocab, answer_vocab)
test_dataset = ChatDataset(test_questions, test_answers, question_vocab, answer_vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [8]:
import torch.nn as nn

class Seq2Seq(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(Seq2Seq, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=PAD_TOKEN)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, target):
        embedded_input = self.embedding(input)
        _, (hidden, cell) = self.encoder(embedded_input)

        embedded_target = self.embedding(target)
        decoder_output, _ = self.decoder(embedded_target, (hidden, cell))
        output = self.fc(decoder_output)
        return output


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Seq2Seq(
    vocab_size=len(question_vocab) + len(answer_vocab) + 3,  # +3 for PAD, SOS, EOS
    embedding_dim=128,
    hidden_dim=256,
    output_dim=len(answer_vocab) + 3
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train_epoch(model, loader, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for questions, answers in loader:
        questions, answers = questions.to(device), answers.to(device)
        optimizer.zero_grad()
        outputs = model(questions, answers[:, :-1])  # Shift target by 1
        loss = criterion(outputs.view(-1, outputs.size(-1)), answers[:, 1:].reshape(-1))  # Ignore <SOS>
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(loader)

# Train the model
for epoch in range(100):  # Adjust epochs as necessary
    loss = train_epoch(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")


Epoch 1, Loss: 6.3378
Epoch 2, Loss: 5.1640
Epoch 3, Loss: 4.7441
Epoch 4, Loss: 4.3999
Epoch 5, Loss: 4.1107
Epoch 6, Loss: 3.8728
Epoch 7, Loss: 3.6368
Epoch 8, Loss: 3.4105
Epoch 9, Loss: 3.2441
Epoch 10, Loss: 3.0656
Epoch 11, Loss: 2.8980
Epoch 12, Loss: 2.7476
Epoch 13, Loss: 2.6090
Epoch 14, Loss: 2.4789
Epoch 15, Loss: 2.3497
Epoch 16, Loss: 2.2229
Epoch 17, Loss: 2.1084
Epoch 18, Loss: 2.0009
Epoch 19, Loss: 1.9000
Epoch 20, Loss: 1.8027
Epoch 21, Loss: 1.7106
Epoch 22, Loss: 1.6252
Epoch 23, Loss: 1.5258
Epoch 24, Loss: 1.4479
Epoch 25, Loss: 1.3693
Epoch 26, Loss: 1.2951
Epoch 27, Loss: 1.2303
Epoch 28, Loss: 1.1622
Epoch 29, Loss: 1.1065
Epoch 30, Loss: 1.0468
Epoch 31, Loss: 0.9955
Epoch 32, Loss: 0.9401
Epoch 33, Loss: 0.8945
Epoch 34, Loss: 0.8433
Epoch 35, Loss: 0.7995
Epoch 36, Loss: 0.7622
Epoch 37, Loss: 0.7211
Epoch 38, Loss: 0.6834
Epoch 39, Loss: 0.6488
Epoch 40, Loss: 0.6155
Epoch 41, Loss: 0.5867
Epoch 42, Loss: 0.5556
Epoch 43, Loss: 0.5287
Epoch 44, Loss: 0.50

In [10]:
import torch

def preprocess_question(question, vocab, max_len):
    """
    Tokenize and encode the question using the provided vocabulary and max length.
    """
    tokens = word_tokenize(question)
    encoded = [vocab.get(token, PAD_TOKEN) for token in tokens]  # Default to PAD_TOKEN for unknown words
    return torch.tensor(encoded[:max_len] + [PAD_TOKEN] * (max_len - len(encoded)), dtype=torch.long)

def generate_response(model, question, question_vocab, answer_vocab, max_len=20):
    """
    Generate a response for a given question using the trained Seq2Seq model.
    """
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        # Preprocess the question
        input_tensor = preprocess_question(question, question_vocab, max_len).unsqueeze(0).to(device)  # Add batch dimension

        # Encode the question
        embedded_input = model.embedding(input_tensor)
        _, (hidden, cell) = model.encoder(embedded_input)

        # Start decoding with <SOS> token
        decoder_input = torch.tensor([[SOS_TOKEN]], dtype=torch.long).to(device)
        response_tokens = []

        for _ in range(max_len):
            # Decode the next token
            embedded_decoder_input = model.embedding(decoder_input)
            decoder_output, (hidden, cell) = model.decoder(embedded_decoder_input, (hidden, cell))
            output_token_logits = model.fc(decoder_output[:, -1, :])  # Get the last time-step output
            predicted_token = output_token_logits.argmax(1).item()  # Get the token with highest probability
            response_tokens.append(predicted_token)

            if predicted_token == EOS_TOKEN:  # Stop decoding if <EOS> token is generated
                break
            decoder_input = torch.tensor([[predicted_token]], dtype=torch.long).to(device)

        # Convert tokens back to words
        idx_to_word = {idx: word for word, idx in answer_vocab.items()}
        response = [idx_to_word[token] for token in response_tokens if token not in {PAD_TOKEN, SOS_TOKEN, EOS_TOKEN}]
        return ' '.join(response)


In [20]:
# Test the chatbot
input_question = "වල් පැලෑටි පාලනය කරන්නේ කෙසේද?"  # Replace with a Sinhala question
response = generate_response(model, input_question, question_vocab, answer_vocab)
print("Chatbot Response:", response)

Chatbot Response: වල් පැලෑටි රහිත බීජ භාවිතා කරන්න , නිසි ලෙස බිම් මට්ටම් කිරීමට පුරුදු වන්න , සහ පෙර මතුවීමට පෙර වල්


In [13]:
torch.save(model.state_dict(), 'sinhala_chatbot_model.pth')

In [14]:
# Load the trained model
model = Seq2Seq(
    vocab_size=len(question_vocab) + len(answer_vocab) + 3,  # +3 for PAD, SOS, EOS
    embedding_dim=128,
    hidden_dim=256,
    output_dim=len(answer_vocab) + 3
).to(device)

model.load_state_dict(torch.load('sinhala_chatbot_model.pth', map_location=device))

  model.load_state_dict(torch.load('sinhala_chatbot_model.pth', map_location=device))


<All keys matched successfully>