In [48]:
# !pip install gensim

In [49]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import gensim

In [50]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/cleaned_hm.csv')

In [52]:
df.head(5)

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,affection
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,affection


## Data Cleaning

In [53]:
df['predicted_category'].value_counts()

Unnamed: 0_level_0,count
predicted_category,Unnamed: 1_level_1
affection,34168
achievement,33993
enjoy_the_moment,11144
bonding,10727
leisure,7458
nature,1843
exercise,1202


In [54]:
df['num_sentence'].value_counts()

Unnamed: 0_level_0,count
num_sentence,Unnamed: 1_level_1
1,83711
2,9542
3,3847
4,1624
5,821
6,336
7,183
8,107
10,68
9,61


In [55]:
df = df.loc[df['num_sentence'] <= 10]

In [56]:
df['predicted_category'].value_counts()

Unnamed: 0_level_0,count
predicted_category,Unnamed: 1_level_1
affection,34020
achievement,33966
enjoy_the_moment,11115
bonding,10700
leisure,7458
nature,1839
exercise,1202


In [57]:
encode= {
    "affection" : 0,
    "achievement" : 1,
    "enjoy_the_moment" : 2,
    "bonding" : 3,
    "leisure" : 4,
    "nature" : 5,
    "exercise" : 6
}

In [58]:
#df["predicted_category"] = df["predicted_category"].apply(lambda x: encode[x])
df.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,affection
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,affection


## Data Preprocessing

In [59]:
import nltk
import re
from nltk.corpus import stopwords

In [60]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [61]:
lines = df["cleaned_hm"].values.tolist()

In [62]:
## Main Logic of Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

processed_lines=[]
for line in lines:
  review = re.sub('[^a-zA-Z0-9]', ' ', line)
  review = review.lower()
  review = review.split()

  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  processed_lines.append(review)

In [63]:
len(processed_lines)

100300

In [64]:
## vocabulary size
voc_size= 128

## Embeddeding Representation

In [65]:
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot

In [66]:
onehot_rep = [one_hot(words, voc_size) for words in processed_lines]
onehot_rep

[[7, 6, 100, 17, 59, 60, 124],
 [56, 65, 27, 50, 64, 112],
 [7, 109, 39, 116],
 [13, 122, 21, 27, 78, 98, 104, 106, 15],
 [7, 26, 96, 113, 12, 33],
 [119, 102, 16],
 [125, 2, 84, 26, 81, 53, 54],
 [27, 14, 108, 69, 96, 46],
 [115, 15, 118, 28],
 [6, 20, 121, 48, 54, 28],
 [53, 79, 47, 21, 42, 25, 114],
 [41, 108, 30, 55, 1, 90, 81, 37, 102],
 [7, 27, 21, 14],
 [85, 44, 125, 62, 59],
 [89,
  88,
  43,
  107,
  99,
  99,
  56,
  28,
  94,
  10,
  99,
  37,
  93,
  49,
  56,
  65,
  89,
  53,
  78,
  74,
  49,
  8,
  58,
  104],
 [65, 20, 108, 126, 39],
 [41, 110, 82, 114, 81],
 [113, 27, 55, 26, 114, 77],
 [127, 9, 108, 21, 102, 16, 125, 83],
 [2, 127, 53, 7, 61, 7, 93, 23, 61, 81, 4, 56],
 [27, 12],
 [57, 21, 2, 39, 53, 98, 56, 100, 73, 93],
 [35, 125, 66, 53, 54, 35, 54, 108],
 [3, 84, 76, 58, 84, 56, 84, 24, 65, 96, 80, 56, 4, 26, 70, 76, 39, 96],
 [1, 27, 9, 1, 53, 10],
 [26, 121, 78, 6, 24, 117],
 [39, 60, 1, 118, 22, 22, 48],
 [112, 90, 27, 77, 10, 50, 1, 88, 100, 90, 87],
 [65, 44

In [67]:
processed_lines[2]

'went gym morn yoga'

In [68]:
onehot_rep[2]

[7, 109, 39, 116]

In [69]:
max_length = 0
for sentence in processed_lines:
    max_length = max(max_length, len(sentence.split()))

print(f"Maximum sentence length: {max_length}")

Maximum sentence length: 131


In [70]:
sent_length = 132
embedded_docs = pad_sequences(onehot_rep, padding='post', maxlen = sent_length)
print(embedded_docs)

[[  7   6 100 ...   0   0   0]
 [ 56  65  27 ...   0   0   0]
 [  7 109  39 ...   0   0   0]
 ...
 [ 22  56 102 ...   0   0   0]
 [109  54 115 ...   0   0   0]
 [109  29 102 ...   0   0   0]]


In [71]:
embedding_vector_features = 40

In [72]:
model_glove = Sequential()
model_glove.add(Embedding(voc_size, embedding_vector_features))
model_glove.add(LSTM(units=32, dropout=0.2, recurrent_dropout=0.25))
model_glove.add(Dense(7, activation='softmax'))

model_glove.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [73]:
print(model_glove.summary())

None


In [76]:
from sklearn.model_selection import train_test_split

# Assuming your target variable is 'predicted_category' and features are in 'embedded_docs'
X = embedded_docs
y = df['predicted_category']

# Convert the target variable to one-hot encoding
y = pd.get_dummies(y).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [77]:
history_glove = model_glove.fit(X_train, y_train, batch_size=32, epochs=15, validation_data=(X_test, y_test), verbose=1)

Epoch 1/15
[1m 111/2351[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m22:48[0m 611ms/step - accuracy: 0.3463 - loss: 1.6847

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

plt.plot(history_glove.history['loss'])
plt.plot(history_glove.history['val_loss'])

In [None]:
plt.plot(history_glove.history['accuracy'])
plt.plot(history_glove.history['val_accuracy'])