In [24]:
import os

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [25]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

In [26]:
import pandas as pd

In [27]:
import numpy as np

In [28]:
# Location of the annotated-sentences workbook. The default is the original
# machine-specific path; set the ANNOTATION_XLSX environment variable to point
# at the file on another machine instead of editing the notebook.
annotation_path = os.environ.get("ANNOTATION_XLSX", "D:/github/Python/Deep Learning/annotation.xlsx")
df = pd.read_excel(annotation_path)

In [29]:
# Preview the first rows of the loaded annotation table.
df.head()

Unnamed: 0,sentence,indicator,factor,source
0,"Where superiority of numbers is overwhelming, ...",superiority of numbers,physical,"Clausewitz, 1989, p.196​"
1,Grand strategy should calculate and develop ec...,"economic resources, man-power",physical,"Hart, 1991, 322"
2,"Beyond geography, money has always been the gr...",finance,physical,"Smith, 2019, 19."
3,War is not so much a matter of armaments as of...,finance,physical,"Thucydides, 1972"
4,Moral elements are among the most important in...,Spirit and will,moral,"Clausewitz, 1989, p.184​"


In [30]:
# Class balance: how many rows carry each factor label.
df.factor.value_counts()

moral         7
physical      6
strategy      5
conceptual    4
outcome       4
Name: factor, dtype: int64

In [37]:
# Integer class id for every factor label, numbered from 1 in the order listed.
factor_labels = ("physical", "conceptual", "moral", "strategy", "outcome")
my_dict = dict(zip(factor_labels, range(1, len(factor_labels) + 1)))
my_dict

{'physical': 1, 'conceptual': 2, 'moral': 3, 'strategy': 4, 'outcome': 5}

In [39]:
# Translate each row's factor label into its integer class id.
# A label missing from my_dict raises KeyError rather than passing silently.
df["num_factor"] = [my_dict[label] for label in df["factor"]]
df.head()

Unnamed: 0,sentence,indicator,factor,source,num_factor
0,"Where superiority of numbers is overwhelming, ...",superiority of numbers,physical,"Clausewitz, 1989, p.196​",1
1,Grand strategy should calculate and develop ec...,"economic resources, man-power",physical,"Hart, 1991, 322",1
2,"Beyond geography, money has always been the gr...",finance,physical,"Smith, 2019, 19.",1
3,War is not so much a matter of armaments as of...,finance,physical,"Thucydides, 1972",1
4,Moral elements are among the most important in...,Spirit and will,moral,"Clausewitz, 1989, p.184​",3


**1. Converting sentences into integer-encoded word sequences (Keras `one_hot`)**

In [8]:
# Size of the index space that words are mapped into; also used as the
# Embedding layer's input dimension further down.
vocab_size = 100

In [9]:
# Quick demo: despite its name, one_hot returns one integer index per word
# (here for an index space of size 30), not one-hot vectors.
one_hot("hi men!", 30)

[12, 27]

In [18]:
# Encode every sentence as a list of integer word indices in [1, vocab_size).
# NOTE(review): one_hot relies on Python's built-in hash by default, which is
# not guaranteed stable across kernel restarts; indices may differ between runs.
encoded_pers = [one_hot(sentence, vocab_size) for sentence in df["sentence"]]

In [19]:
# Integer encoding of the sentence in row 1 (one index per word).
encoded_pers[1]

[6, 55, 1, 79, 69, 26, 19, 68, 69, 23, 98, 61, 22, 33, 39, 38, 34, 56, 69, 55]

In [20]:
# Sanity check: one encoded sequence per sentence in df.
len(encoded_pers)

26

**2. Padding to a fixed length**

We make every encoded sentence the same length (`max_length`) with the `pad_sequences` function.

In [21]:
# Every encoded sentence is brought to exactly this many indices.
max_length = 50

# Zero-fill after the last word ("post") so real tokens stay at the front.
# NOTE(review): sequences longer than max_length get truncated by
# pad_sequences; confirm no sentence exceeds 50 words.
padded_pers = pad_sequences(sequences=encoded_pers, maxlen=max_length, padding="post")
padded_pers[1]

array([ 6, 55,  1, 79, 69, 26, 19, 68, 69, 23, 98, 61, 22, 33, 39, 38, 34,
       56, 69, 55,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [22]:
# Sanity check: padding keeps one row per sentence.
len(padded_pers)

26

In [23]:
embedded_vector_size= 10

model = Sequential()

#1st layer is embedding layer-one-hot encoded matrix
model.add(Embedding(vocab_size, embedded_vector_size, input_length=max_length, name="word_embedding_tf_keras_pax"))

#2nd layer: Flattened version of embedded vectors-product of embedding layer-random weighted and one-hot encoded vector of word
model.add(Flatten())

#3th layer is one nueron of sigmoid AF function. 
model.add(Dense(1, activation = "softmax"))

In [40]:
X = padded_pers
y = np.array(df.num_factor)

In [41]:
model.compile(optimizer = "adam", loss="binary_crossentropy", metrics= ["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 word_embedding_tf_keras_pa  (None, 50, 10)            1000      
 x (Embedding)                                                   
                                                                 
 flatten (Flatten)           (None, 500)               0         
                                                                 
 dense (Dense)               (None, 1)                 501       
                                                                 
Total params: 1501 (5.86 KB)
Trainable params: 1501 (5.86 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [42]:
# Train for 50 epochs on the full padded dataset; verbose=0 silences the
# per-epoch progress log.
model.fit(x=X, y=y, epochs=50, verbose=0)

<keras.src.callbacks.History at 0x21eb5f98b50>

In [43]:
# Score the model on the same X, y it was fitted on, so these are
# training-set figures rather than a held-out estimate.
loss, accuracy = model.evaluate(x=X, y=y)



**Getting Word Embeddings**

In [44]:
# Pull the learned embedding matrix out of the named Embedding layer:
# one row per word index, so its length equals vocab_size.
embedding_layer = model.get_layer("word_embedding_tf_keras_pax")
weights = embedding_layer.get_weights()[0]
len(weights)

100

In [45]:
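# Word indices of the first two words of sentence 1 (see encoded_pers[1] above):
# grand: 6
# strategy: 55
# These values come from this session's hashing and may differ after a kernel restart.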

In [None]:
# While training the model, we obtain word embeddings as a by-product.
# The embeddings are simply the learned weights of the Embedding layer.

In [46]:
# Embedding vector for "grand", the first word of sentence 1. The row index is
# read from the encoded sentence instead of being hard-coded: the hashed index
# (6 in the recorded run) is not guaranteed to be the same in another session.
grand_index = encoded_pers[1][0]
a = weights[grand_index]
a

array([-0.03765959,  0.08546515,  0.04203944,  0.00911331, -0.00683076,
       -0.00536414,  0.01121998,  0.10315671, -0.03254436, -0.08932387],
      dtype=float32)

In [47]:
# Embedding vector for "strategy", the second word of sentence 1. The row index
# is read from the encoded sentence instead of being hard-coded: the hashed
# index (55 in the recorded run) is not guaranteed to be the same in another session.
strategy_index = encoded_pers[1][1]
b = weights[strategy_index]
b

array([-0.05860563,  0.04341277, -0.06366133,  0.02189979,  0.09917662,
        0.07977039,  0.10405131, -0.07183259,  0.00242503,  0.01796985],
      dtype=float32)

In [48]:
from numpy.linalg import norm

In [49]:
# Cosine similarity = dot product divided by the product of the vector lengths.
norm_product = norm(a) * norm(b)
cosine = np.dot(a, b) / norm_product
print("Cosine Similarity: ", cosine)

Cosine Similarity:  -0.15566348
