In [1]:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Load the dataset
# The CSV location defaults to the original local path, but it can be
# overridden through the TRAIN_DATA_CSV environment variable so the notebook
# can run on another machine without editing this cell.
import os

file_path = os.environ.get(
    "TRAIN_DATA_CSV", "C:/Users/ghada/DL/data/train_data_from_txt.csv"
)
data = pd.read_csv(file_path)

# Fail fast if the columns read by the later preprocessing cell are absent,
# instead of surfacing a bare KeyError several cells further down.
required_columns = {"question", "target"}
missing_columns = required_columns - set(data.columns)
if missing_columns:
    raise KeyError(
        f"{file_path} is missing required column(s): {sorted(missing_columns)}"
    )


In [2]:
# 1. Text Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Make sure the NLTK corpora used below are available locally.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Shared helpers for preprocess_text: the WordNet lemmatizer and a set of
# English stop words (a set gives constant-time membership checks).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a raw question into a space-separated string of lemmas.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed, the result is split on whitespace, tokens found in
    the module-level ``stop_words`` set are dropped, and the remaining tokens
    are lemmatized with the module-level ``lemmatizer``.

    Args:
        text: The raw text. Any non-string value (for example ``None`` or the
            float NaN that pandas uses for an empty CSV cell) is treated as
            an empty text.

    Returns:
        str: The cleaned tokens joined by single spaces; ``''`` when nothing
        survives the cleaning or when ``text`` is not a string.
    """
    # Missing values reach this function through Series.apply as non-string
    # objects; calling .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove special characters (digits and punctuation are deleted, not
    # replaced by a space, so "don't" becomes "dont")
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize and lemmatize
    # NOTE(review): the NLTK English stop-word list appears to include
    # interrogatives such as "what", "who", "where" and "how"; dropping them
    # may discard the strongest cue for question-type labels - confirm and
    # consider keeping them.
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ghada\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ghada\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
# Clean every raw question with preprocess_text and keep the result in a new column.
data['processed_question'] = data['question'].apply(preprocess_text)

# Labels: learn the string -> integer class mapping, then apply it.
label_encoder = LabelEncoder().fit(data['target'])
data['encoded_target'] = label_encoder.transform(data['target'])
labels = np.array(data['encoded_target'])

# 3. Tokenization and Padding
# Fixed length every sequence is padded / truncated to.
max_length = 50  # Adjust as needed

# Build the word -> index vocabulary from the cleaned questions.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=10000)
tokenizer.fit_on_texts(data['processed_question'])
word_index = tokenizer.word_index

# Map each cleaned question to its list of word indices.
sequences = tokenizer.texts_to_sequences(data['processed_question'])

# Pad (or cut) at the end so every row has exactly max_length entries.
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)



In [7]:
# Hold out 20% of the padded sequences (and their labels) for validation;
# the fixed random_state keeps the split reproducible.
split_arrays = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_arrays

# Report the shape of each feature matrix.
for caption, features in (
    ("Training Data Shape:", X_train),
    ("Validation Data Shape:", X_val),
):
    print(caption, features.shape)

Training Data Shape: (12361, 50)
Validation Data Shape: (3091, 50)


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Hyperparameters
# Taken from the fitted tokenizer so the embedding size cannot drift away from
# the tokenizer's num_words; falls back to the full vocabulary size (+1 for
# the reserved padding index 0) when num_words is unset.
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1
embedding_dim = 128  # Dimension of word embeddings
num_classes = len(label_encoder.classes_)  # Number of unique labels

# Build the CNN model
model = Sequential([
    # Declaring the input shape builds the model immediately, so
    # model.summary() can report output shapes and parameter counts even
    # before the first call to fit().
    Input(shape=(max_length,), dtype='int32'),

    # Embedding layer: word index -> dense vector
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),

    # Convolutional layer: 128 filters over windows of 3 consecutive tokens
    Conv1D(filters=128, kernel_size=3, activation='relu'),

    # Pooling layer: keep the strongest response of each filter
    GlobalMaxPooling1D(),

    # Dense layers
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')  # Softmax for multi-class classification
])

In [12]:
# The labels are integer class ids (produced by LabelEncoder), hence the
# sparse variant of categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [17]:
# Display the model's layer-by-layer summary.
model.summary()

In [14]:
# Train the model
# Stop once the validation loss has not improved for `patience` epochs and
# roll back to the best weights seen, instead of always running all 50 epochs
# and keeping the weights of the last one.
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,  # upper bound; early stopping may end training sooner
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)


Epoch 1/50
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.6701 - loss: 0.8430 - val_accuracy: 0.9867 - val_loss: 0.0331
Epoch 2/50
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9961 - loss: 0.0150 - val_accuracy: 0.9948 - val_loss: 0.0176
Epoch 3/50
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9995 - loss: 0.0028 - val_accuracy: 0.9955 - val_loss: 0.0158
Epoch 4/50
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9999 - loss: 8.3542e-04 - val_accuracy: 0.9958 - val_loss: 0.0182
Epoch 5/50
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 1.0000 - loss: 3.0193e-04 - val_accuracy: 0.9958 - val_loss: 0.0196
Epoch 6/50
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 1.0000 - loss: 1.7205e-04 - val_accuracy: 0.9958 - val_loss: 0.0190
Epoch 7/50
[1m3

In [15]:
def preprocess_input_text(text, tokenizer, max_length):
    """
    Convert one raw text into a padded index sequence.

    The text is cleaned with preprocess_text, mapped to word indices by
    `tokenizer`, and then padded / truncated at the end to `max_length`.
    Returns the pad_sequences result for this one-item batch.
    """
    cleaned = preprocess_text(text)
    # texts_to_sequences expects a list of texts, so wrap the single text.
    token_ids = tokenizer.texts_to_sequences([cleaned])
    return pad_sequences(
        token_ids,
        maxlen=max_length,
        padding='post',
        truncating='post',
    )

def predict_category(text, model, tokenizer, max_length, label_encoder):
    """
    Return the category label the model assigns to `text`.

    The text goes through preprocess_input_text, the model scores it, and the
    index of the highest score is mapped back to its original label with
    `label_encoder`.
    """
    model_input = preprocess_input_text(text, tokenizer, max_length)
    class_scores = model.predict(model_input)
    # One row per input text; only the first (and only) row is of interest.
    best_index = np.argmax(class_scores, axis=1)[0]
    decoded = label_encoder.inverse_transform([best_index])
    return decoded[0]


In [16]:
# A few hand-written questions for a quick check of predict_category.
sample_questions = [
    "What is the capital of France?",
    "How do I install Python on Windows?",
    "Who won the 2020 Olympics?",
    "Explain the theory of relativity.",
]

print("Sample Predictions:")
for question in sample_questions:
    predicted_category = predict_category(
        question, model, tokenizer, max_length, label_encoder
    )
    # Three lines per question: the question, its prediction, a separator.
    print(
        f"Question: {question}",
        f"Predicted Category: {predicted_category}",
        "-" * 50,
        sep="\n",
    )


Sample Predictions:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
Question: What is the capital of France?
Predicted Category: LOC
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Question: How do I install Python on Windows?
Predicted Category: ENTY
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Question: Who won the 2020 Olympics?
Predicted Category: LOC
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Question: Explain the theory of relativity.
Predicted Category: ENTY
--------------------------------------------------
