In [23]:
# Mount Google Drive at /content/drive; the dataset paths used by this
# notebook point below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras


In [2]:
def get_conll_sentences(conll_file, encoding="utf-8"):
    """Read a CoNLL-style file and return its sentences as plain strings.

    Every non-blank line must split into exactly four whitespace-separated
    columns; the first column is taken as the token. Blank lines end the
    current sentence. Lines whose first column is ``-DOCSTART-`` are skipped.

    Args:
        conll_file: Path of the file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        A list of sentences, each being its tokens joined by single spaces.

    Raises:
        IndexError: If a non-blank line does not have exactly four columns.
    """
    sent = []
    temp_sent = []

    # Stream the file line by line instead of buffering it completely.
    with open(conll_file, encoding=encoding) as f:
        for raw_line in f:
            line = raw_line.strip()

            if line == '':
                # A blank line closes the sentence collected so far (if any).
                if temp_sent:
                    sent.append(' '.join(temp_sent))
                    temp_sent = []
                continue

            split_line = line.split()
            if len(split_line) != 4:
                raise IndexError('Line split length does not equal 4.')
            if split_line[0] != '-DOCSTART-':
                temp_sent.append(split_line[0])

    # The file may end without a trailing blank line; without this flush the
    # final sentence would be silently dropped.
    if temp_sent:
        sent.append(' '.join(temp_sent))

    return sent


In [3]:
# Locations of the CoNLL-2003 English splits on the mounted Google Drive.
train_file = r"/content/drive/MyDrive/Dataset/NER_Dataset/CoNLL2003/eng.train"
testa_file = r"/content/drive/MyDrive/Dataset/NER_Dataset/CoNLL2003/eng.testa"
testb_file = r"/content/drive/MyDrive/Dataset/NER_Dataset/CoNLL2003/eng.testb"
testc_file = r"/content/drive/MyDrive/Dataset/NER_Dataset/CoNLL2003/eng.testc"

# Parse each split into a list of space-joined sentences.
train_sentences = get_conll_sentences(train_file)
testa_sentences = get_conll_sentences(testa_file)
testb_sentences = get_conll_sentences(testb_file)
testc_sentences = get_conll_sentences(testc_file)


In [4]:
from tensorflow.keras.layers import TextVectorization

# Text-to-integer layer: max_tokens=20000 and output_sequence_length=200 are
# the vocabulary-size and sequence-length settings passed to Keras.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
# Feed the training sentences in batches of 128 to build the vocabulary.
text_ds = tf.data.Dataset.from_tensor_slices(train_sentences).batch(128)
vectorizer.adapt(text_ds)


In [5]:
# Show the first five vocabulary entries of the adapted vectorizer.
vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'the', 'of', 'in']

In [6]:
# Sanity check: vectorize a single sentence and display the integer ids.
output = vectorizer([["the cat sat on the mat"]])
output.numpy()


array([[    2, 17174, 12423,     8,     2,     1,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [7]:
# Vocabulary learned by the vectorizer, plus a token -> position lookup.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}


In [9]:
# Look up a few tokens directly in word_index and display their ids.
test = ["the", "cat", "sat", "on", "the"]
[word_index[w] for w in test]


[2, 17174, 12423, 8, 2]

In [10]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2022-07-30 16:30:39--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-07-30 16:30:39--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-07-30 16:30:40--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’

gl

In [16]:
import os

# Path of the 100-dimensional GloVe vectors extracted into /content.
# Joining the home directory with an absolute path would discard the home
# directory, so the absolute location is built directly.
path_to_glove_file = os.path.join("/content", "glove.6B.100d.txt")

# Maps each GloVe token to its coefficient vector (float32 NumPy array).
embeddings_index = {}
# Decode explicitly instead of relying on the platform default encoding
# (the file is presumably UTF-8 text -- confirm if a different dump is used).
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<token> <c1> <c2> ...": split off the token only.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))


Found 400000 word vectors.


In [17]:
embedding_dim = 100
num_tokens = len(voc) + 2
hits, misses = 0, 0

# Rows start as zeros, so vocabulary entries that have no pre-trained vector
# keep an all-zero embedding.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    # Copy the pre-trained vector into the row of this vocabulary entry.
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))


Converted 16901 words (3099 misses)


In [18]:
from tensorflow.keras.layers import Embedding

# Frozen embedding layer whose weights are initialised from embedding_matrix.
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)


In [19]:
from tensorflow.keras import layers

# Sentence encoder: integer token ids -> frozen embeddings -> max over the
# sequence axis, giving one embedding_dim-sized vector per input row.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
# NOTE(review): padded positions carry id 0, and row 0 of embedding_matrix
# stays all-zero unless '' has a pre-trained vector, so this max presumably
# never drops below 0 for padded inputs -- confirm that clamping is intended.
vector = layers.GlobalMaxPooling1D()(embedded_sequences)
model = keras.Model(int_sequences_input, vector)
model.summary()


Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 100)         2000200   
                                                                 
 global_max_pooling1d_1 (Glo  (None, 100)              0         
 balMaxPooling1D)                                                
                                                                 
Total params: 2,000,200
Trainable params: 0
Non-trainable params: 2,000,200
_________________________________________________________________


In [20]:
# Chain the vectorizer and the pooling model so raw strings can be embedded
# in a single predict() call.
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

# Embed one example sentence and display the resulting vector.
vec = end_to_end_model.predict(
    [["this message is about computer graphics and 3D modeling"]]
)

vec

array([[0.66039 , 0.63888 , 1.0322  , 0.27455 , 0.76628 , 0.73879 ,
        0.43258 , 0.48177 , 0.57747 , 0.36375 , 0.62227 , 0.49926 ,
        0.50891 , 0.58327 , 0.90674 , 0.52753 , 0.43823 , 0.96185 ,
        0.20282 , 0.67362 , 0.70398 , 0.13269 , 1.1202  , 0.36973 ,
        0.44757 , 0.3022  , 0.18077 , 1.0597  , 0.16311 , 0.093723,
        0.81497 , 1.0962  , 0.59378 , 0.58384 , 0.66365 , 0.48879 ,
        0.46967 , 0.50765 , 0.89442 , 0.36136 , 0.      , 0.15095 ,
        0.56338 , 0.      , 0.12398 , 0.23831 , 0.79365 , 0.087721,
        0.43205 , 0.      , 0.6238  , 0.44283 , 0.57329 , 1.6618  ,
        0.42358 , 0.      , 0.27182 , 0.23949 , 2.5458  , 0.81925 ,
        0.64183 , 1.5991  , 0.      , 0.43507 , 1.1876  , 0.22765 ,
        1.0917  , 0.77839 , 1.0585  , 1.1262  , 0.11698 , 0.71972 ,
        0.33884 , 0.21796 , 0.74056 , 1.063   , 0.51691 , 0.52784 ,
        0.      , 0.3587  , 1.2576  , 0.5134  , 0.      , 0.69895 ,
        0.      , 0.452   , 0.66945 , 0.      , 

In [21]:
from keras.models import load_model, save_model

# Persist `model` (the integer-input embedding + pooling network). The
# vectorizer is not part of `model`, so it is not written to this file.
MODEL_FILE_PATH = 'embedding_model.h5'
save_model(model, MODEL_FILE_PATH)



In [22]:
# Reload the model from disk, rebinding the name `model` to the loaded copy.
model = load_model(MODEL_FILE_PATH)



In [28]:
def get_embedding(model, text_array):
    """Embed raw strings with the module-level vectorizer and ``model``.

    A string-input Keras model is assembled on every call by feeding the
    output of the global ``vectorizer`` into ``model``; ``text_array`` is
    then passed to that model's ``predict``.

    Args:
        model: Keras model that accepts the vectorizer's integer output.
        text_array: Strings to embed, in a form accepted by ``predict``.

    Returns:
        Whatever ``predict`` returns for ``text_array``.
    """
    string_input = keras.Input(shape=(1,), dtype="string")
    end_to_end_model = keras.Model(string_input, model(vectorizer(string_input)))
    return end_to_end_model.predict(text_array)

In [24]:
# Embed a single example sentence and check the shape of the result.
text_array = [["this message is about computer graphics and 3D modeling"]]
emb = get_embedding(model, text_array)
emb.shape

(1, 100)

In [26]:
# !pip install swifter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting swifter
  Downloading swifter-1.3.3.tar.gz (821 kB)
[K     |████████████████████████████████| 821 kB 4.5 MB/s 
Collecting psutil>=5.6.6
  Downloading psutil-5.9.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (281 kB)
[K     |████████████████████████████████| 281 kB 52.3 MB/s 
Collecting partd>=0.3.10
  Downloading partd-1.2.0-py3-none-any.whl (19 kB)
Collecting fsspec>=0.6.0
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 49.7 MB/s 
Collecting locket
  Downloading locket-1.0.0-py2.py3-none-any.whl (4.4 kB)
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py) ... [?25l[?25hdone
  Created wheel for swifter: filename=swifter-1.3.3-py3-none-any.whl size=16253 sha256=7cd25fba0f1eb59cf80a7262bfa605487bf6f6fb60c102b33c1dffda36f106ef
 

In [35]:
import pandas as pd

# One row per training sentence, held in a single "text" column.
df = pd.DataFrame(data=train_sentences, columns=["text"])

In [49]:
import swifter

# Embed every training sentence; row order follows df["text"].
map_embeddings = get_embedding(model, df["text"].values)
# Name the columns "0", "1", ... after the embedding dimension they hold.
df_embs = pd.DataFrame(
    data=map_embeddings,
    columns=list(map(str, range(map_embeddings.shape[1]))),
)

print(df_embs.shape)

(14041, 100)


In [50]:
# Display the sentence-embedding table.
df_embs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.507190,0.70729,1.25610,0.83070,0.066019,0.511360,0.056818,0.61049,0.79658,0.39258,...,0.30297,0.46625,0.53040,1.124000,0.096275,0.55114,0.67108,0.76333,1.007700,0.55767
1,0.000000,0.30836,0.00000,0.00000,0.283460,0.149200,0.312730,0.00000,0.00000,0.00000,...,0.26925,0.13370,0.52366,1.200500,0.161140,0.00000,0.17408,1.09500,0.076023,0.00000
2,0.000000,0.00000,0.44020,0.39898,0.241560,0.000000,0.000000,0.47325,0.68013,0.00000,...,0.43749,0.00000,0.00000,0.083909,0.000000,0.95382,0.26317,0.13556,0.702810,0.43946
3,0.767080,0.80871,1.35920,0.67101,0.584430,1.102500,0.581390,1.58110,0.94021,0.47500,...,1.06230,0.43911,0.66408,1.141500,0.551160,0.83568,0.38815,0.65996,1.381600,1.08580
4,0.674060,1.24720,0.96268,0.89124,0.675720,0.434460,0.736300,1.01210,0.75047,0.50189,...,1.08640,0.94463,0.82345,1.174500,0.734190,0.85880,0.38815,0.65996,1.381600,1.17340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14036,0.018958,0.00000,0.51960,0.10313,0.580450,0.023816,0.053620,0.75174,0.00000,0.00000,...,0.74710,0.11358,0.00000,0.870430,0.000000,0.77438,0.17787,0.11834,1.381600,0.18084
14037,0.000000,0.32739,0.50195,0.00000,0.441100,0.335990,0.983770,0.66527,0.00000,0.52767,...,0.00000,0.29341,0.00000,1.034000,0.000000,0.13273,0.51041,0.49872,0.769360,0.00000
14038,0.000000,0.64632,0.33147,0.36579,0.229220,0.300610,1.109200,0.64415,0.00000,0.27455,...,0.00000,0.77057,0.23832,1.122800,0.000000,0.00000,0.29427,1.11740,0.951300,0.00000
14039,0.000000,0.37234,0.50195,0.00000,0.504970,0.355300,0.983770,0.80128,0.00000,0.52767,...,0.00000,0.24300,0.00000,1.034000,0.000000,0.14797,0.51041,0.49872,0.769360,0.00000


In [52]:
# NOTE(review): despite the constant's name, the frame written here is
# df_embs (the sentence embeddings), not a similarity matrix.
SIMILARITY_MATRIX_FILE_PATH = "embedding_matrix.csv"
df_embs.to_csv(SIMILARITY_MATRIX_FILE_PATH)

In [53]:
from sklearn.metrics.pairwise import pairwise_distances

# Pairwise cosine similarity between all sentence embeddings, computed as
# 1 - cosine distance; the result has one row and one column per sentence.
cosine_sim = 1-pairwise_distances(df_embs, metric='cosine')
# Display the top-left 4x4 corner as a sanity check.
cosine_sim[:4, :4]

array([[0.9999999 , 0.6084002 , 0.57051116, 0.9057771 ],
       [0.6084002 , 1.        , 0.4085219 , 0.6228007 ],
       [0.57051116, 0.4085219 , 1.        , 0.5609603 ],
       [0.9057771 , 0.6228007 , 0.5609603 , 1.        ]], dtype=float32)

In [54]:
def get_recommender(cosine_sim, idx, df, top_n = 5):
    """Return the ``top_n`` rows most similar to the row labelled ``idx``.

    Args:
        cosine_sim: Square similarity matrix whose row/column order matches
            the row order of ``df``.
        idx: Index label (as found in ``df.index``) of the query row.
        df: DataFrame whose index provides the labels of the rows.
        top_n: Number of recommendations to return; values below 1 yield an
            empty result.

    Returns:
        A tuple ``(labels, scores)``: ``labels`` is a pandas Index with the
        index labels of the recommended rows, ``scores`` the matching list of
        similarity values, both ordered from most to least similar.
    """
    # Translate index labels into positional offsets of cosine_sim.
    indices = pd.Series(range(len(df)), index=df.index)
    sim_idx = indices[idx]

    # Exclude the query row by position. Dropping "whatever sorts first"
    # is wrong when another row ties with (or outranks) the query's own
    # score: the query itself would be returned and a real neighbour lost.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[sim_idx])
        if pos != sim_idx
    ]
    # list.sort is stable, so ties keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = sim_scores[:max(top_n, 0)]

    idx_rec = [pos for pos, _ in sim_scores]
    idx_sim = [score for _, score in sim_scores]

    return indices.iloc[idx_rec].index, idx_sim

# Index label of the sentence used as the query in the examples.
idx_ref = 500

# Labels and similarity scores of the five nearest sentences.
get_recommender(cosine_sim, idx_ref, df, top_n = 5)

(Int64Index([5061, 3724, 14020, 14028, 10419], dtype='int64'),
 [0.9302724, 0.9269957, 0.9238459, 0.92172116, 0.9140698])

In [55]:
idx_rec, idx_sim = get_recommender(cosine_sim, idx_ref, df, top_n = 6)
# get_recommender returns index labels, so select rows by label; positional
# .iloc only gives the same rows while df keeps its default integer index.
df.loc[idx_rec]

Unnamed: 0,text
5061,( Australia ) 6-2 6-4
3724,( Australia ) 7-5 6-4
14020,Africa ) 66 75 76
14028,"74 , Peter O'Malley ( Australia ) 71 73 75"
10419,Guy Forget ( France ) beat Grant Stafford ( So...
4497,278 Greg Norman ( Australia ) 70 68 69 71
