In [2]:
import string

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# One-Hot Encoding

In [3]:
# Four-sentence toy corpus; every document is paired with an identifier tag.
d1, d2, d3, d4 = (
    "Dog bites man.",
    "Man bites dog.",
    "Dog eats meat.",
    "Man eats food.",
)

text = [d1, d2, d3, d4]
tag = [f"D{position}" for position in range(1, len(text) + 1)]

# One row per document: its tag and its raw text.
toy_df = pd.DataFrame({"tags": tag, "text": text})

In [4]:
toy_df

Unnamed: 0,tags,text
0,D1,Dog bites man.
1,D2,Man bites dog.
2,D3,Dog eats meat.
3,D4,Man eats food.


In [5]:
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def pre_processing(text: str) -> str:
    """Normalise a sentence for the toy vectorisation examples.

    The text is lowercased and every ASCII punctuation character (not only
    the full stop) is removed, so tokens such as "man," and "man" map to the
    same vocabulary entry. Whitespace is left untouched; no tokenisation is
    performed here.

    Args:
        text: Raw sentence.

    Returns:
        The lowercased sentence with ASCII punctuation stripped.
    """
    return text.lower().translate(_PUNCTUATION_TABLE)

In [6]:
# Store the cleaned version of each document (as returned by pre_processing) next to the raw text.
toy_df["text_transformed"] = list(map(pre_processing, toy_df["text"]))

In [7]:
toy_df

Unnamed: 0,tags,text,text_transformed
0,D1,Dog bites man.,dog bites man
1,D2,Man bites dog.,man bites dog
2,D3,Dog eats meat.,dog eats meat
3,D4,Man eats food.,man eats food


In [8]:
# Build the vocabulary: each distinct token gets a 1-based id, assigned in
# order of first appearance across the cleaned documents.
vocab = {}
for doc in toy_df["text_transformed"]:
    for word in doc.split():
        # setdefault inserts only when the word is new, so existing ids never change.
        vocab.setdefault(word, len(vocab) + 1)
count = len(vocab)  # number of distinct tokens seen

print(vocab)

{'dog': 1, 'bites': 2, 'man': 3, 'eats': 4, 'meat': 5, 'food': 6}


In [9]:
def get_onehot_vector(somestring: str, vocabulary=None) -> list:
    """One-hot encode every whitespace-separated token of a string.

    Args:
        somestring: Text to encode; it is split on whitespace.
        vocabulary: Optional mapping of token -> 1-based index. When omitted,
            the notebook-level ``vocab`` dictionary is used, which keeps
            existing single-argument calls working unchanged.

    Returns:
        A list with one vector per token. Each vector has
        ``len(vocabulary)`` entries, with a 1 at the token's position and 0
        elsewhere. A token missing from the vocabulary yields an all-zero
        vector.
    """
    if vocabulary is None:
        vocabulary = vocab  # fall back to the vocabulary built earlier in the notebook

    width = len(vocabulary)
    onehot_encoded = []
    for word in somestring.split():
        row = [0] * width
        position = vocabulary.get(word)
        if position is not None:
            # Vocabulary ids start at 1, list indices at 0.
            row[position - 1] = 1
        onehot_encoded.append(row)
    return onehot_encoded


In [10]:
# Encode each cleaned document as a list of per-token one-hot vectors.
toy_df["one_hot_encoding"] = list(map(get_onehot_vector, toy_df["text_transformed"]))

In [11]:
toy_df

Unnamed: 0,tags,text,text_transformed,one_hot_encoding
0,D1,Dog bites man.,dog bites man,"[[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0..."
1,D2,Man bites dog.,man bites dog,"[[0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0..."
2,D3,Dog eats meat.,dog eats meat,"[[1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0..."
3,D4,Man eats food.,man eats food,"[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0..."


In [12]:
# The same toy corpus, already lowercased and without punctuation.
S1 = 'dog bites man'
S2 = 'man bites dog'
S3 = 'dog eats meat'
S4 = 'man eats food'

# Tokenise every sentence, then flatten into a single list of word occurrences.
data = [sentence.split() for sentence in (S1, S2, S3, S4)]
print(data)
values = [token for tokens in data for token in tokens]
print("The data: ",values)

# Label encoding: each distinct word is mapped to an integer id.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print("Label Encoded:",integer_encoded)

# One-hot encoding: the encoder is fed a single-column 2-D array
# (one row per word occurrence); the sparse result is densified for printing.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
values_transformed = np.array(values).reshape(-1, 1)
onehot_sparse = onehot_encoder.fit_transform(values_transformed)
onehot_encoded = onehot_sparse.toarray()
print("Onehot Encoded Matrix:\n",onehot_encoded)

[['dog', 'bites', 'man'], ['man', 'bites', 'dog'], ['dog', 'eats', 'meat'], ['man', 'eats', 'food']]
The data:  ['dog', 'bites', 'man', 'man', 'bites', 'dog', 'dog', 'eats', 'meat', 'man', 'eats', 'food']
Label Encoded: [1 0 4 4 0 1 1 2 5 4 2 3]
Onehot Encoded Matrix:
 [[0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]]


# Bag of Words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Fit a bag-of-words model on the cleaned corpus.
count_vect = CountVectorizer()

# bow_rep is the sparse document-term matrix: one row per document.
bow_rep = count_vect.fit_transform(toy_df["text_transformed"].tolist())

In [15]:
# Token -> column-index mapping learned by the vectorizer during fitting.
count_vect.vocabulary_

{'dog': 1, 'bites': 0, 'man': 4, 'eats': 2, 'meat': 5, 'food': 3}

In [16]:
# Show every cleaned document next to its dense bag-of-words row.
for doc, doc_vector in zip(toy_df["text_transformed"].to_list(), bow_rep):
    print(doc, doc_vector.toarray())

dog bites man [[1 1 0 0 1 0]]
man bites dog [[1 1 0 0 1 0]]
dog eats meat [[0 1 1 0 0 1]]
man eats food [[0 0 1 1 1 0]]


In [17]:
# Encode an unseen sentence with the fitted vocabulary; repeated words are counted.
unseen_docs = ["dog and dog and dog are man"]
temp = count_vect.transform(unseen_docs)
print(temp.toarray())

[[0 3 0 0 1 0]]


In some cases we don't care how often a word appears, only whether it appears at all.

In [18]:
# binary=True records presence/absence (0 or 1) instead of occurrence counts.
count_vect_bin = CountVectorizer(binary=True)

bow_rep_bin = count_vect_bin.fit_transform(toy_df["text_transformed"].tolist())

In [19]:
# Same unseen sentence as before, encoded with the binary vectorizer.
unseen_docs = ["dog and dog and dog are man"]
temp = count_vect_bin.transform(unseen_docs)
print(temp.toarray())

[[0 1 0 0 1 0]]


# Bag of N-Grams

In [20]:
# ngram_range=(1, 3): build features from unigrams, bigrams and trigrams.
# NOTE(review): this rebinds `count_vect`, replacing the unigram vectorizer fitted earlier.
count_vect = CountVectorizer(ngram_range=(1, 3))

In [21]:
# Fit the current `count_vect` on the cleaned corpus and build its document-term matrix.
# NOTE(review): `bow_rep` is rebound here, so the matrix built earlier is no longer reachable under this name.
bow_rep = count_vect.fit_transform(toy_df["text_transformed"].to_list())

In [23]:
# Every 1-, 2- and 3-gram found in the corpus, mapped to its column index.
count_vect.vocabulary_

{'dog': 3,
 'bites': 0,
 'man': 12,
 'dog bites': 4,
 'bites man': 2,
 'dog bites man': 5,
 'man bites': 13,
 'bites dog': 1,
 'man bites dog': 14,
 'eats': 8,
 'meat': 17,
 'dog eats': 6,
 'eats meat': 10,
 'dog eats meat': 7,
 'food': 11,
 'man eats': 15,
 'eats food': 9,
 'man eats food': 16}

# TF-IDF

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
# Fit a TF-IDF model on the cleaned corpus.
tfidf = TfidfVectorizer()

# Sparse document-term matrix holding TF-IDF weights, one row per document.
bow_rep_tfidf = tfidf.fit_transform(toy_df["text_transformed"].tolist())

In [26]:
# Inverse-document-frequency weight learned for each vocabulary term (one value per column).
tfidf.idf_

array([1.51082562, 1.22314355, 1.51082562, 1.91629073, 1.22314355,
       1.91629073])

In [28]:
# Vocabulary terms in column order; position i labels column i of the TF-IDF matrix.
tfidf.get_feature_names_out()

array(['bites', 'dog', 'eats', 'food', 'man', 'meat'], dtype=object)

In [34]:
# Weight an unseen sentence with the fitted TF-IDF model.
unseen_docs = ["dog and man are meat"]
temp = tfidf.transform(unseen_docs)
print(temp.toarray())

[[0.         0.47380449 0.         0.         0.47380449 0.74230628]]


In [30]:
# Inspect the sparse matrix returned by the most recent `transform` call.
# NOTE(review): the recorded output below (2 stored elements, In [30]) does not
# match the preceding cell (In [34]), whose printed row has 3 non-zero weights;
# the output looks stale — re-run the cells in order.
temp

<1x6 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [None]:
import gzip
import os
import shutil

import wget

# Locate the pre-trained GoogleNews word2vec binary, downloading and/or
# extracting it only when no usable copy exists. Lookup order:
#   1. ./GoogleNews-vectors-negative300.bin
#   2. ../models/GoogleNews-vectors-negative300.bin
#   3. ../models/GoogleNews-vectors-negative300.bin.gz  (extracted to 1.)
#   4. ./GoogleNews-vectors-negative300.bin.gz          (extracted to 1.)
#   5. download the archive to 4., then extract to 1.
gn_vec_path = "GoogleNews-vectors-negative300.bin"
if not os.path.exists("GoogleNews-vectors-negative300.bin"):
    if not os.path.exists("../models/GoogleNews-vectors-negative300.bin"):
        # Point gn_vec_zip_path at the archive that really exists on disk.
        # (Previously the working-directory / freshly downloaded archive was
        # ignored and the non-existent ../models path was opened instead.)
        if os.path.exists("../models/GoogleNews-vectors-negative300.bin.gz"):
            gn_vec_zip_path = "../models/GoogleNews-vectors-negative300.bin.gz"
        else:
            gn_vec_zip_path = "GoogleNews-vectors-negative300.bin.gz"
            if not os.path.exists(gn_vec_zip_path):
                # Downloading the required model into the working directory
                wget.download(
                    "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz",
                    out=gn_vec_zip_path,
                )
        # Extracting the required model. Write to a temporary name first so an
        # interrupted extraction never leaves a truncated file under the final
        # name (which would pass the existence check on the next run).
        gn_vec_partial_path = gn_vec_path + ".part"
        with gzip.open(gn_vec_zip_path, 'rb') as f_in:
            with open(gn_vec_partial_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.replace(gn_vec_partial_path, gn_vec_path)
    else:
        gn_vec_path = "../models/" + gn_vec_path

print(f"Model at {gn_vec_path}")

In [3]:
import os

from gensim.models import Word2Vec, KeyedVectors

# Prefer the archive location this notebook was originally run against; when
# it is absent, fall back to the model path resolved by the download/extract
# cell (`gn_vec_path`) instead of failing on a hard-coded path.
pretrainedpath = "../../models/GoogleNews-vectors-negative300.bin.gz"
if not os.path.exists(pretrainedpath):
    pretrainedpath = gn_vec_path

# Load the vectors only (no trainable model) from the binary word2vec format.
w2v_model = KeyedVectors.load_word2vec_format(pretrainedpath, binary=True)
print("Done loading Word2vec")
print(f"Numbers of words in the vocab: {len(w2v_model.key_to_index)}")
print(w2v_model.most_similar(["beautiful"]))
w2v_model["beautiful"]

Done loading Word2vec
Numbers of words in the vocab: 3000000


TypeError: 'method' object is not subscriptable

In [6]:
# Words closest to "beautiful" in the embedding space, as (word, similarity score) pairs.
w2v_model.most_similar(["beautiful"])

[('gorgeous', 0.8353005051612854),
 ('lovely', 0.8106936812400818),
 ('stunningly_beautiful', 0.7329413294792175),
 ('breathtakingly_beautiful', 0.7231340408325195),
 ('wonderful', 0.6854086518287659),
 ('fabulous', 0.6700063943862915),
 ('loveliest', 0.6612576246261597),
 ('prettiest', 0.6595001816749573),
 ('beatiful', 0.6593326330184937),
 ('magnificent', 0.6591402888298035)]

In [7]:
# Indexing the model by a word returns that word's embedding:
# the array below is the vector representation of "beautiful".
w2v_model["beautiful"]

array([-0.01831055,  0.05566406, -0.01153564,  0.07275391,  0.15136719,
       -0.06176758,  0.20605469, -0.15332031, -0.05908203,  0.22851562,
       -0.06445312, -0.22851562, -0.09472656, -0.03344727,  0.24707031,
        0.05541992, -0.00921631,  0.1328125 , -0.15429688,  0.08105469,
       -0.07373047,  0.24316406,  0.12353516, -0.09277344,  0.08203125,
        0.06494141,  0.15722656,  0.11279297, -0.0612793 , -0.296875  ,
       -0.13378906,  0.234375  ,  0.09765625,  0.17773438,  0.06689453,
       -0.27539062,  0.06445312, -0.13867188, -0.08886719,  0.171875  ,
        0.07861328, -0.10058594,  0.23925781,  0.03808594,  0.18652344,
       -0.11279297,  0.22558594,  0.10986328, -0.11865234,  0.02026367,
        0.11376953,  0.09570312,  0.29492188,  0.08251953, -0.05444336,
       -0.0090332 , -0.0625    , -0.17578125, -0.08154297,  0.01062012,
       -0.04736328, -0.08544922, -0.19042969, -0.30273438,  0.07617188,
        0.125     , -0.05932617,  0.03833008, -0.03564453,  0.24