In [5]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read data from Excel file using pandas
data = pd.read_excel('D:/Document Classification/Preprocessed data.xlsx')  # Replace 'your_file.xlsx' with the actual file path

texts = data['Text']  # Assuming the text data is in a column named 'text_column'
labels = data['Category']  # Assuming the labels/categories are in a column named 'label_column'

# Convert categorical labels to numerical values using LabelEncoder
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Tokenizing and padding sequences
max_words = 1000
maxlen = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=maxlen)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(data, labels_encoded, test_size=0.2, random_state=42)

# Building LSTM model
model = Sequential()
model.add(Embedding(max_words, 128, input_length=maxlen))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(np.max(labels_encoded) + 1, activation='softmax'))  # Output layer for multi-class classification

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluating the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 69.50%


In [6]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read data from Excel file using pandas
data = pd.read_excel('D:/Document Classification/Preprocessed data.xlsx')  # Replace 'your_file.xlsx' with the actual file path

texts = data['Text']  # Assuming the text data is in a column named 'text_column'
labels = data['Category']  # Assuming the labels/categories are in a column named 'label_column'

# Convert categorical labels to numerical values using LabelEncoder
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Tokenizing and padding sequences
max_words = 1000
maxlen = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=maxlen)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(data, labels_encoded, test_size=0.2, random_state=42)

# Building CNN model
cnn_model = Sequential()
cnn_model.add(Embedding(max_words, 128, input_length=maxlen))
cnn_model.add(Conv1D(32, 7, activation='relu'))
cnn_model.add(MaxPooling1D(5))
cnn_model.add(Conv1D(32, 7, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(np.max(labels_encoded) + 1, activation='softmax'))  # Output layer for multi-class classification

cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training the model
cnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluating the model
loss, accuracy = cnn_model.evaluate(X_test, y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 77.50%
