In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# NLTK Downloads
# word_tokenize on recent NLTK releases looks up the 'punkt_tab' resource
# instead of the older pickled 'punkt' models, so both are fetched here to keep
# the notebook runnable on a fresh kernel regardless of the installed version.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Preprocess the data
# Stop words are held in a set so the membership test in preprocess_text is cheap.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(sentence):
    """Lower-case, tokenize, filter and lemmatize a sentence.

    Tokens that are not purely alphabetic, or that appear in the module-level
    ``stop_words`` set, are dropped; the remaining tokens are lemmatized with
    the module-level ``lemmatizer`` and joined back with single spaces.

    Args:
        sentence: The raw text to clean (must support ``.lower()``).

    Returns:
        The cleaned sentence as a single space-separated string.
    """
    kept_tokens = []
    for token in word_tokenize(sentence.lower()):
        # Skip punctuation/numbers and stop words before lemmatizing.
        if not token.isalpha() or token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Load data
def load_data(file_path):
    """Read a header-less, ';'-separated file into a cleaned DataFrame.

    The two columns are named ``sentence`` and ``label``; every sentence is
    passed through ``preprocess_text`` before the frame is returned.

    Args:
        file_path: Path of the file to read.

    Returns:
        A DataFrame with the cleaned ``sentence`` column and the ``label`` column.
    """
    raw_frame = pd.read_csv(file_path, sep=';', header=None, names=['sentence', 'label'])
    cleaned_sentences = raw_frame['sentence'].apply(preprocess_text)
    return raw_frame.assign(sentence=cleaned_sentences)

# Load the train / validation / test splits (in that order)
train_data, val_data, test_data = (
    load_data(split_path)
    for split_path in ('/content/cleaned_train.txt', '/content/val.txt', '/content/test.txt')
)

# Label encoding
# The encoder is fitted on the training labels only; every split is then
# mapped to integer ids with that same fitted encoder.
label_encoder = LabelEncoder().fit(train_data['label'])
for split_frame in (train_data, val_data, test_data):
    split_frame['label'] = label_encoder.transform(split_frame['label'])

# Tokenization and Vectorization
max_tokens = 12000
output_sequence_length = 100

# Vocabulary is learned from the training sentences only; unseen words map to "<OOV>".
tokenizer = Tokenizer(num_words=max_tokens, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['sentence'])


def get_sequences(tokenizer, sentences):
    """Turn texts into fixed-length integer sequences.

    This helper was called but not defined in any visible cell, so the notebook
    failed with NameError on a fresh kernel. The padding settings mirror the
    ones used by the inference helper in this notebook (post-padding to a fixed
    length) so that training and inference inputs share one layout.
    NOTE(review): if an earlier, removed cell defined this with different
    padding/truncation, confirm these settings match the exported model.

    Args:
        tokenizer: A fitted Keras ``Tokenizer``.
        sentences: An iterable of text strings.

    Returns:
        A 2-D array with one row per sentence and ``output_sequence_length`` columns.
    """
    sequences = tokenizer.texts_to_sequences(sentences)
    return pad_sequences(sequences, maxlen=output_sequence_length, padding='post')


train_sequences = get_sequences(tokenizer, train_data['sentence'])
val_sequences = get_sequences(tokenizer, val_data['sentence'])
test_sequences = get_sequences(tokenizer, test_data['sentence'])

# Model setup
# The softmax width is taken from the fitted label encoder instead of a literal
# 6, so the output layer always matches the integer class ids that
# sparse_categorical_crossentropy receives.
num_classes = len(label_encoder.classes_)

model = models.Sequential([
    # Token ids -> 128-d vectors for sequences of output_sequence_length tokens.
    layers.Embedding(input_dim=max_tokens + 1, output_dim=128, input_length=output_sequence_length),
    layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    # Collapse the time axis by keeping the strongest activation per filter.
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),  # Additional ANN layer
    layers.Dense(num_classes, activation='softmax')
])

# Labels are integer ids (from LabelEncoder), hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train_sequences, train_data['label'], epochs=10, validation_data=(val_sequences, val_data['label']))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b45324b76a0>

IMPORTING AND TESTING THE TFLITE MODEL WITH NEW TEXT INPUT

In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json

# Load the TFLite model and allocate its tensors before querying I/O details
interpreter = tf.lite.Interpreter(model_path='/content/modelV2.tflite')
interpreter.allocate_tensors()

# Descriptors of the model's input and output tensors (index, dtype, ...),
# used later to feed data in and read predictions out.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()


# Load your saved tokenizer
# Note: this rebinds `tokenizer`, replacing any tokenizer fitted earlier in the session.
with open('tokenizerV2.json') as f:
    data = json.load(f)
# The file handle is no longer needed once the JSON payload is in memory.
tokenizer = tokenizer_from_json(data)

# Function to preprocess text
def preprocess_input_text(text, tokenizer, max_length):
    """Encode one text as a post-padded integer sequence.

    Args:
        text: The text to encode.
        tokenizer: Object exposing ``texts_to_sequences`` (e.g. a Keras Tokenizer).
        max_length: Length every encoded sequence is padded to.

    Returns:
        The result of ``pad_sequences`` for a one-element batch containing ``text``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences([text]),  # wrap in a list: a batch of one
        maxlen=max_length,
        padding='post',
    )

In [21]:
# Example text input
input_text = "Today I will be presenting my research results, Im so nervous that Im feeling nauseous"

max_length = 100
# NOTE(review): the training cell cleans sentences with preprocess_text
# (stop-word removal + lemmatization) before tokenizing; this path does not.
# Confirm whether the exported model/tokenizer expect cleaned text.
preprocessed_text = preprocess_input_text(input_text, tokenizer, max_length)

# Set the tensor to point to the input data to be inferred.
# Cast to the dtype the model itself declares instead of assuming float32:
# set_tensor rejects arrays whose dtype differs from the input tensor's.
input_tensor = preprocessed_text.astype(input_details[0]['dtype'])
interpreter.set_tensor(input_details[0]['index'], input_tensor)

# Run the inference
interpreter.invoke()

# Extract the output
output_data = interpreter.get_tensor(output_details[0]['index'])

# Assuming the output is a probability distribution over labels
predicted_label_index = np.argmax(output_data, axis=1)

# Convert predicted_label_index to the corresponding label
# (relies on the label_encoder fitted in the training cell)
predicted_label = label_encoder.inverse_transform(predicted_label_index)
print(f"Predicted Emotion: {predicted_label[0]}")

Predicted Emotion: fear
