In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score
import pickle

In [3]:
# Load datasets
keystroke_data = pd.read_csv("../data/text_samples/KeystrokeData.csv")
stress_data = pd.read_csv("../data/text_samples/Stress.csv")

In [9]:
# Merge datasets
keystroke_data['source'] = 'keystroke'
stress_data['source'] = 'text'
data = pd.concat([keystroke_data, stress_data], ignore_index=True)

In [10]:
# Ensure text column has no NaN values and convert all to string
data['text'] = data['text'].fillna("").astype(str)

In [11]:
# Text preprocessing
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(data['text'])
X = tokenizer.texts_to_sequences(data['text'])
X = pad_sequences(X, maxlen=100)

In [12]:
# Save tokenizer
with open('../model/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

In [13]:
# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])

In [14]:
# Save label encoder
with open('../model/text_label_encoder.pkl', 'wb') as file:
    pickle.dump(label_encoder, file)


In [15]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_shape=(100,)),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(32)),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(len(np.unique(y)), activation='softmax')
])

  super().__init__(**kwargs)


In [18]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [19]:
# Train Model
history = model.fit(X_train, y_train, epochs=20, batch_size=16, validation_data=(X_test, y_test))

Epoch 1/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 48ms/step - accuracy: 0.6466 - loss: 0.6465 - val_accuracy: 0.7019 - val_loss: 0.4422
Epoch 2/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 47ms/step - accuracy: 0.6825 - loss: 0.4695 - val_accuracy: 0.6995 - val_loss: 0.4335
Epoch 3/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 46ms/step - accuracy: 0.7172 - loss: 0.4606 - val_accuracy: 0.7864 - val_loss: 0.3969
Epoch 4/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 47ms/step - accuracy: 0.8429 - loss: 0.3605 - val_accuracy: 0.8192 - val_loss: 0.3517
Epoch 5/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 48ms/step - accuracy: 0.8837 - loss: 0.2624 - val_accuracy: 0.8169 - val_loss: 0.3555
Epoch 6/20
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 47ms/step - accuracy: 0.9326 - loss: 0.1833 - val_accuracy: 0.8298 - val_loss: 0.4522
Epoch 7/20
[1m2

In [22]:
# Evaluate Model
y_pred = np.argmax(model.predict(X_test), axis=1)
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("Classification Report:")

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Model Accuracy: 0.8204225352112676
Classification Report:


In [23]:
print(classification_report(y_test, y_pred, target_names=[str(cls) for cls in label_encoder.classes_]))

              precision    recall  f1-score   support

         0.0       0.70      0.71      0.70       255
         1.0       0.73      0.72      0.72       278
         nan       1.00      1.00      1.00       319

    accuracy                           0.82       852
   macro avg       0.81      0.81      0.81       852
weighted avg       0.82      0.82      0.82       852



In [25]:
# Save Model
model.save('../model/text_model1.h5')



ValueError: Unable to synchronously create dataset (name already exists)

In [26]:
# Save training history
with open('../model/text_model_history.pkl', 'wb') as file:
    pickle.dump(history.history, file)

In [27]:
print("Model training complete and saved!")

Model training complete and saved!
