### One-hot Encoding

In [1]:
# %pip install qalsadi
from qalsadi.lemmatizer import Lemmatizer

In [2]:
# Shared Lemmatizer instance, created lazily on the first call.
_lemmatizer = None


def arabic_lemmatize(word):
    """Return the lemma of a single Arabic word.

    Building a new ``Lemmatizer`` for every word is wasteful, so one instance
    is created on first use and reused by all later calls.

    Args:
        word: The word to lemmatize.

    Returns:
        Whatever ``Lemmatizer.lemmatize`` returns for ``word``.
    """
    global _lemmatizer
    if _lemmatizer is None:
        _lemmatizer = Lemmatizer()
    return _lemmatizer.lemmatize(word)

In [3]:
# Toy corpus: two short Arabic sentences used to build the vocabulary and
# to demonstrate the one-hot encoding below.
ar_docs = ["وقبر حربٍ بمكان قفرٍ", "وليس قرب قبر حربٍ قبرٌ"]

In [4]:
# Build the vocabulary: each distinct lemma gets a 1-based index in order of
# first appearance across the corpus.
vocab = {}
for doc in ar_docs:
    # split() without arguments ignores repeated, leading and trailing
    # whitespace, so no empty token can end up in the vocabulary.
    for w in doc.split():
        w = arabic_lemmatize(w)
        if w not in vocab:
            # The next free index is always one past the current size.
            vocab[w] = len(vocab) + 1

vocab

{'قبر': 1, 'حرب': 2, 'مكان': 3, 'قفر': 4, 'ليس': 5, 'قرب': 6}

In [5]:
def get_onehot_vector(text):
    """Encode a sentence as a list of one-hot vectors, one per token.

    The text is split on whitespace, each token is lemmatized with
    ``arabic_lemmatize`` and looked up in the module-level ``vocab`` mapping
    (lemma -> 1-based index).

    Args:
        text: Sentence to encode.

    Returns:
        A list with one row per token. Every row has ``len(vocab)`` entries
        and a single 1 at position ``vocab[lemma] - 1``. A token whose lemma
        is not in ``vocab`` yields an all-zero row. Empty or whitespace-only
        text yields an empty list.
    """
    size = len(vocab)
    encodedVector = []
    # split() without arguments collapses runs of whitespace, so repeated
    # spaces, tabs or newlines do not produce empty pseudo-tokens.
    for token in text.split():
        row = [0] * size
        index = vocab.get(arabic_lemmatize(token))
        if index is not None:
            row[index - 1] = 1
        encodedVector.append(row)
    return encodedVector

In [6]:
# Show every corpus sentence followed, on the next line, by its encoding.
for sentence in ar_docs:
    print(sentence, get_onehot_vector(sentence), sep="\n")

وقبر حربٍ بمكان قفرٍ
[[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0]]
وليس قرب قبر حربٍ قبرٌ
[[0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0]]


##### One-hot Encoding with Scikit-learn

In [7]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [8]:
# Toy English corpus for the scikit-learn encoders.
d = ['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

# One token list per sentence, plus a flat list of all tokens in corpus order.
sentences = [sentence.split() for sentence in d]
words = [token for tokens in sentences for token in tokens]

print(words)
print(sentences)

['dog', 'bites', 'man', 'man', 'bites', 'dog', 'dog', 'eats', 'meat', 'man', 'eats', 'food']
[['dog', 'bites', 'man'], ['man', 'bites', 'dog'], ['dog', 'eats', 'meat'], ['man', 'eats', 'food']]


In [9]:
encodedData = LabelEncoder().fit_transform(words)
print(encodedData)

onehotEncoded = OneHotEncoder().fit_transform(sentences)
onehotEncodedArr = onehotEncoded.toarray()
print(onehotEncodedArr)
print(onehotEncoded)

[1 0 4 4 0 1 1 2 5 4 2 3]
[[1. 0. 1. 0. 0. 0. 1. 0.]
 [0. 1. 1. 0. 1. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 1. 0. 0.]]
  (0, 0)	1.0
  (0, 2)	1.0
  (0, 6)	1.0
  (1, 1)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (2, 0)	1.0
  (2, 3)	1.0
  (2, 7)	1.0
  (3, 1)	1.0
  (3, 3)	1.0
  (3, 5)	1.0
