In [31]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from keras.layers import Dropout
from keras.models import Sequential
from keras import layers
from keras import optimizers
from tensorflow.keras.optimizers import Adam

In [32]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath,encoding='utf-8') as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [33]:
df_train=pd.read_csv("dataset1.csv")
df_train_labels = df_train[["anger", "fear", "joy", "love", "sadness", "surprise", "thankfulness",
                            "disgust", "guilt"]]

In [34]:
X = df_train['text'].values
X= X.astype(str)
y = df_train_labels.values

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [36]:
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(X_train)

In [37]:
num_words=9000
tokenizer.word_index = {e:i for e,i in tokenizer.word_index.items() if i <= num_words} # <= because tokenizer is 1 indexed
tokenizer.word_index[tokenizer.oov_token] = num_words + 1

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

maxlen = 50
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [38]:
input_dim = X_train.shape[1] 

embedding_dim = 300
embedding_matrix = create_embedding_matrix('glove.6B.300d.txt', tokenizer.word_index, embedding_dim)

nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
embedding_accuracy = nonzero_elements / vocab_size
print('embedding accuracy: ' + str(embedding_accuracy))

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=maxlen, trainable=True))
model.add(layers.Conv1D(256, 3, activation='relu'))
model.add(Dropout(0.2))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(28, activation='sigmoid'))
opt = Adam(lr=0.0002)
model.compile(optimizer=opt, loss='binary_crossentropy')
model.summary()

embedding accuracy: 0.8636969562319484
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 50, 300)           2700600   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 48, 256)           230656    
_________________________________________________________________
dropout_5 (Dropout)          (None, 48, 256)           0         
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 28)                7196      
Total params: 2,938,452
Trainable params: 2,938,452
Non-trainable params: 0
_________________________________________________________________
