In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, GlobalAveragePooling1D, Embedding, Conv1D
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras import Input
from tensorflow.keras.callbacks import EarlyStopping, History
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from nltk.stem import WordNetLemmatizer
import pandas as pd
import neattext.functions as nfx
from sklearn.preprocessing import LabelEncoder
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import pickle
import keras
from tensorflow.keras import Model
from tensorflow.keras.models import load_model
import keras_nlp
from keras_nlp.tokenizers import WordPieceTokenizer
from keras_nlp.layers import TokenAndPositionEmbedding, TransformerEncoder

In [2]:
def _read_lines(path):
    """Return every line of the text file at `path`, newline characters included."""
    with open(path, 'r') as handle:
        return handle.readlines()


def _split_records(lines):
    """Split 'sentence;label' lines into two parallel lists.

    Each line is split on its LAST ';', so a sentence that itself contains a
    semicolon stays in one piece, and a final line without a trailing newline
    is parsed like any other line. Blank lines are skipped.

    Returns:
        (sentences, labels): two lists of equal length; labels have their
        surrounding whitespace (including the newline) removed.

    Raises:
        ValueError: if a non-blank line contains no ';' separator.
    """
    sentences, labels = [], []
    for line in lines:
        record = line.rstrip('\n')
        if not record.strip():
            continue
        sentence, separator, emotion = record.rpartition(';')
        if not separator:
            raise ValueError(f"expected 'text;label' but got {line!r}")
        sentences.append(sentence)
        labels.append(emotion.strip())
    return sentences, labels


# The train and test files are pooled into one list of raw lines.
df = _read_lines('train.txt')
c = _read_lines('test.txt')
df.extend(c)

# The validation file is kept separate.
valid = _read_lines('val.txt')

text, label = _split_records(df)
text_valid, label_valid = _split_records(valid)

# [sentence, label] pairs, kept under their original names for any cell that
# still refers to them.
new_df = [[sentence, emotion] for sentence, emotion in zip(text, label)]
validate = [[sentence, emotion] for sentence, emotion in zip(text_valid, label_valid)]

data = pd.DataFrame({'text': text, 'label': label})
data_valid = pd.DataFrame({'text_valid': text_valid, 'label_valid': label_valid})


In [3]:
# Display the pooled train+test frame (columns: text, label) as parsed from the files.
data

Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
17995,i just keep feeling like someone is being unki...,anger
17996,im feeling a little cranky negative after this...,anger
17997,i feel that i am useful to my people and that ...,joy
17998,im feeling more comfortable with derby i feel ...,joy


In [4]:
lem = WordNetLemmatizer()


def _lemmatize_sentence(sentence):
    """Lemmatize every whitespace-separated word and re-join with single spaces."""
    words = sentence.split()
    return ' '.join(lem.lemmatize(word) for word in words)


# Apply the same normalisation to the pooled data and to the validation data.
data['text'] = data['text'].apply(_lemmatize_sentence)
data_valid['text_valid'] = data_valid['text_valid'].apply(_lemmatize_sentence)

In [5]:
# Strip stop words from both text columns with neattext, then show the result.
for frame, column in ((data, 'text'), (data_valid, 'text_valid')):
    frame[column] = frame[column].apply(nfx.remove_stopwords)
data

Unnamed: 0,text,label
0,didnt feel humiliated,sadness
1,feeling hopeless damned hopeful care awake,sadness
2,im grabbing minute post feel greedy wrong,anger
3,feeling nostalgic fireplace know property,love
4,feeling grouchy,anger
...,...,...
17995,feeling like unkind wrong think people close,anger
17996,im feeling little cranky negative doctor appoi...,anger
17997,feel useful people great feeling achievement,joy
17998,im feeling comfortable derby feel start step s...,joy


In [6]:
# Number of rows per emotion label, most frequent first.
data['label'].value_counts()

joy         6057
sadness     5247
anger       2434
fear        2161
love        1463
surprise     638
Name: label, dtype: int64

In [7]:
# Integer class id for each emotion name.
emotions = {'joy': 0,
            'sadness': 1,
            'anger': 2,
            'fear': 3,
            'love': 4,
            'surprise': 5}


def _encode_labels(labels, mapping):
    """Map a Series of label names to their integer ids.

    `Series.map` turns every value missing from `mapping` into NaN without
    complaint. That happens for a misspelt label, and also when this cell is
    run a second time on already-encoded labels. Both cases are reported here
    instead of letting NaN flow into the one-hot encoding.

    Raises:
        ValueError: if any value of `labels` is not a key of `mapping`.
    """
    encoded = labels.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        offenders = sorted(set(map(repr, labels[unmapped])))
        raise ValueError(
            'labels without an entry in the emotion mapping: ' + ', '.join(offenders)
        )
    return encoded


# Encode first, assign second: a failed check leaves the frames untouched.
data['label'] = _encode_labels(data['label'], emotions)
data_valid['label_valid'] = _encode_labels(data_valid['label_valid'], emotions)

In [8]:
# Display the frame again now that `label` holds the integer ids from `emotions`.
data

Unnamed: 0,text,label
0,didnt feel humiliated,1
1,feeling hopeless damned hopeful care awake,1
2,im grabbing minute post feel greedy wrong,2
3,feeling nostalgic fireplace know property,4
4,feeling grouchy,2
...,...,...
17995,feeling like unkind wrong think people close,2
17996,im feeling little cranky negative doctor appoi...,2
17997,feel useful people great feeling achievement,0
17998,im feeling comfortable derby feel start step s...,0


In [9]:
# Fit a Keras Tokenizer on the cleaned text to collect the words it contains.
token = Tokenizer(num_words=4000)
token.fit_on_texts(data['text'])

# Vocabulary for the WordPiece tokenizer: '[UNK]' at index 0, followed by every
# key of `word_index` in its stored order.
# NOTE(review): `num_words=4000` does not appear to limit `word_index` — the
# embedding parameter count in the model summary implies a vocabulary well
# above 4000 entries. Confirm whether a cap was intended.
vocab = ['[UNK]'] + list(token.word_index)

In [10]:
# WordPiece tokenizer over `vocab`; every output row has exactly 20 token ids
# (the tokenized training tensor is displayed with shape (14400, 20)).
tokenizer = WordPieceTokenizer(vocabulary=vocab, sequence_length=20)

In [11]:
# Short aliases for the feature column and the encoded target column.
X, y = data['text'], data['label']



In [12]:
# Hold out 20% of the pooled rows for testing; the seed keeps the split repeatable.
features, targets = data['text'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# The validation split comes from its own file and is used as-is.
X_valid, y_valid = data_valid['text_valid'], data_valid['label_valid']

In [13]:
# Turn each split's sentences into fixed-length rows of token ids.
X_train_ = tokenizer(X_train)
X_test_ = tokenizer(X_test)
X_valid_ = tokenizer(X_valid)

# One-hot encode the targets with an explicit width. Without `num_classes`,
# to_categorical sizes each matrix from the largest id present in that split,
# so a split missing the highest class would come out narrower than the
# model's 6-unit output layer.
NUM_CLASSES = len(emotions)
y_train_ = to_categorical(y_train, num_classes=NUM_CLASSES)
y_test_ = to_categorical(y_test, num_classes=NUM_CLASSES)
y_valid_ = to_categorical(y_valid, num_classes=NUM_CLASSES)

In [14]:
# Inspect the tokenized training tensor (one row of token ids per sentence).
X_train_

<tf.Tensor: shape=(14400, 20), dtype=int32, numpy=
array([[ 120,   26,    3, ...,    0,    0,    0],
       [   1,    3,    4, ...,    0,    0,    0],
       [   5, 4280, 7946, ...,    0,    0,    0],
       ...,
       [   1,  640,    0, ...,    0,    0,    0],
       [   5,  162,    1, ...,    0,    0,    0],
       [   1,   87,  140, ...,    0,    0,    0]])>

In [15]:
# Adam optimizer configured with a learning rate of 0.1.
# NOTE(review): the compile call in this notebook passes optimizer='adam' (the
# string), not `opt`, so this object looks unused — confirm before relying on it.
opt = Adam(learning_rate=0.1)

# One more than the number of distinct words recorded by the Keras Tokenizer.
vocab_size = len(token.word_index) + 1 
vocab_size

In [16]:
# Token + position embedding -> one transformer encoder block -> max-pool over
# the sequence axis -> 6-way softmax (one unit per emotion).
model = Sequential([
    Input(shape=(None,)),
    TokenAndPositionEmbedding(vocabulary_size=len(vocab), sequence_length=20, embedding_dim=128),
    TransformerEncoder(num_heads=8, intermediate_dim=128, dropout=0.2, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(6, activation='softmax'),
])

# Metrics are reported by fit/evaluate in this order, after the loss.
# NOTE(review): 'CategoricalAccuracy' and 'accuracy' show identical values in the
# evaluation outputs of this notebook — one of them is probably redundant.
reported_metrics = ['CategoricalAccuracy', 'accuracy', 'AUC', 'Precision', 'Recall']
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=reported_metrics,
)
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_and_position_embeddin  (None, None, 128)        1813120   
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_encoder (Transf  (None, None, 128)        99584     
 ormerEncoder)                                                   
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 6)                 774       
                                                                 
Total params: 1,913,478
Trainable params: 1,913,478
Non-

In [17]:

#estimator = KerasClassifier(build_fn=model(), epochs=200, batch_size=5, verbose=0)

In [18]:
# Stop training once validation loss has failed to improve for 5 consecutive
# epochs, and roll the model back to the best weights seen.
# `verbose` is documented as 0 (silent) or 1 (print a message); the previous
# value 6 behaved like 1, so the documented value is used instead.
callbacks = EarlyStopping(monitor='val_loss',
                          patience=5,
                          verbose=1,
                          restore_best_weights=True,
                          mode='auto')

In [19]:
# Train for at most 100 epochs in mini-batches of 20, validating on the
# separate validation split; early stopping may end the run sooner.
history = model.fit(
    x=X_train_,
    y=y_train_,
    batch_size=20,
    epochs=100,
    validation_data=(X_valid_, y_valid_),
    callbacks=[callbacks],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 7: early stopping


In [26]:
# Export only the trained weights (no architecture) as an HDF5 file.
# NOTE(review): this is an absolute, machine-specific path — the cell will fail
# on any machine where this directory does not exist.
weights_path = 'C:/Users/lenovo/DATA SCIENCE PROJECTS/MODEL/EMOTION DETECTION/weights.h5'
model.save_weights(weights_path, save_format='HDF5')

In [27]:
# Save the whole model to 'emotions.h5' in the current working directory.
model.save('emotions.h5')

In [22]:
# Model outputs for the held-out test split; the final layer is a 6-unit
# softmax, so each row holds one score per emotion class.
y_pred = model.predict(X_test_)



In [23]:
# Score the held-out test split. The returned list is the loss followed by the
# compiled metrics in order: CategoricalAccuracy, accuracy, AUC, Precision, Recall.
model.evaluate(X_test_, y_test_)



[0.2786557376384735,
 0.89083331823349,
 0.89083331823349,
 0.9910821914672852,
 0.8987020254135132,
 0.8847222328186035]

In [24]:
# Score the training split for comparison with the test figures. Same layout:
# loss, then CategoricalAccuracy, accuracy, AUC, Precision, Recall.
model.evaluate(X_train_, y_train_)



[0.0774773359298706,
 0.9727083444595337,
 0.9727083444595337,
 0.9990627765655518,
 0.9747700095176697,
 0.9712499976158142]

In [25]:
# Scratch check: joining a list of words with single spaces.
lis = ['for', 'to']
new = ' '.join(lis)
new

'for to'

# Persist the fitted Keras Tokenizer (`token`) so its word index can be reloaded.
# NOTE(review): the object saved is `token`, not the WordPiece `tokenizer` that
# produced the model inputs — confirm this is the one inference code expects.
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(obj=token, file=f)
    