In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, RepeatVector
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Load the dataset
dataset = pd.read_csv('intern_screening_dataset.csv')

In [3]:
# Converting the dataset into strings
dataset['question'] = dataset['question'].astype(str)
dataset['answer'] = dataset['answer'].astype(str)

In [4]:
# Downloading necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hites\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hites\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def clean_text(text):
    # Lowercasing the text
    text = text.lower()
    # Removing special characters and digits
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenizing the data
    tokens = word_tokenize(text)
    # Removing Stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [6]:
# Applying the cleaning function to the dataset
dataset['question'] = dataset['question'].apply(clean_text)
dataset['answer'] = dataset['answer'].apply(clean_text)

In [7]:
# Tokenizing the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset['question'].values)
questions_seq = tokenizer.texts_to_sequences(dataset['question'].values)
answers_seq = tokenizer.texts_to_sequences(dataset['answer'].values)


In [8]:
# Pad the sequences
max_seqlen = 150
questions_padded = pad_sequences(questions_seq, maxlen=max_seqlen)
answers_padded = pad_sequences(answers_seq, maxlen=max_seqlen)

In [9]:
# Splitting the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(questions_padded, answers_padded, test_size=0.2, random_state=42)

In [10]:
# Define the model architecture
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
hidden_units = 128

In [11]:
# Encoder
encoder_inputs = Input(shape=(max_seqlen,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(hidden_units)(encoder_embedding)

In [12]:
# Decoder
decoder_inputs = RepeatVector(max_seqlen)(encoder_lstm)
decoder_lstm = LSTM(hidden_units, return_sequences=True)(decoder_inputs)
decoder_dense = Dense(vocab_size, activation='softmax')(decoder_lstm)

In [13]:
# Model
model = Model(encoder_inputs, decoder_dense)

In [14]:
# Compile model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

In [15]:
# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [16]:
# Train the model
epochs = 20
batch_size = 32
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val), callbacks=[early_stopping])

Epoch 1/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m281s[0m 672ms/step - accuracy: 0.6402 - loss: 3.8489 - val_accuracy: 0.6606 - val_loss: 2.8565
Epoch 2/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 647ms/step - accuracy: 0.6496 - loss: 2.9403 - val_accuracy: 0.6606 - val_loss: 2.8614
Epoch 3/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 663ms/step - accuracy: 0.6495 - loss: 2.9415 - val_accuracy: 0.6606 - val_loss: 2.8624
Epoch 4/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m286s[0m 697ms/step - accuracy: 0.6492 - loss: 2.9429 - val_accuracy: 0.6606 - val_loss: 2.8596


In [23]:
# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss:.4f}, Validation Accuracy: {accuracy:.4f}')

[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 243ms/step - accuracy: 0.6603 - loss: 2.8526
Validation Loss: 2.8565, Validation Accuracy: 0.6606


In [None]:
# Function to generate answers
def generate_answer(question):
    question_seq = tokenizer.texts_to_sequences([question])
    question_padded = pad_sequences(question_seq, maxlen=max_seqlen)
    print(f'Question sequence before padding: {question_seq}')
    print(f'Padded question sequence: {question_padded}')
    answer_seq = model.predict(question_padded)
    print(f'Answer sequence predicted: {answer_seq}')
    answer = ' '.join([tokenizer.index_word[word] for word in np.argmax(answer_seq, axis=-1)[0] if word != 0])
    return answer

# Example questions
questions = [
    "What causes heart failure?",
    "How to prevent Bronchitis?",
    "What is Kidney Disease?"
]

# Generate answers
for question in questions:
    generated_answer = generate_answer(question)
    print(f'Question: {question}')
    print(f'Generated Answer: {generated_answer}\n')