In [1]:
import ast
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

  from .autonotebook import tqdm as notebook_tqdm





In [108]:
df = pd.read_csv(r"../data/test_data_pre_processed.csv")

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14740 entries, 0 to 14739
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   unique_id        14740 non-null  object 
 1   name_english     14739 non-null  object 
 2   name             14740 non-null  object 
 3   score            14740 non-null  float64
 4   ranked           14740 non-null  int64  
 5   popularity       14740 non-null  int64  
 6   members          14740 non-null  int64  
 7   synopsis         14740 non-null  object 
 8   synonyms         14740 non-null  object 
 9   type_of          14740 non-null  object 
 10  total_episodes   14740 non-null  int64  
 11  premiered        5104 non-null   object 
 12  studios          11167 non-null  object 
 13  genres           14740 non-null  object 
 14  demographic      5115 non-null   object 
 15  duration_per_ep  14740 non-null  object 
 16  rating           14532 non-null  object 
 17  scored_by   

In [110]:
print(df['genres'][0])
print(type(ast.literal_eval(df['genres'][0])))

['Drama', 'Sports', 'Adventure', 'Historical']
<class 'list'>


In [111]:
df['demographic'].isna().sum()

9625

In [112]:
df['demographic'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['demographic'].fillna('', inplace=True)


In [113]:
df['input_string'] = df['synopsis'] + df['genres'].apply(lambda x : " ".join(ast.literal_eval(x))) + df['demographic'] + '.'

In [114]:
df['input_string'][0]

'Yabuki Joe is left downhearted and hopeless after a certain tragic event. In attempt to put the past behind him, Joe leaves the gym behind and begins wandering. On his travels he comes across the likes of Wolf Kanagushi and Goromaki Gondo, men who unintentionally fan the dying embers inside him, leading him to putting his wanderings to an end. His return home puts Joe back on the path to boxing, but unknown to himself and his trainer, he now suffers deep-set issues holding him back from fighting. In attempt to quell those issues, Carlos Rivera, a world renowned boxer is invited from Venezuela to help Joe recover.Drama Sports Adventure HistoricalShounen.'

In [115]:
df.drop(columns=[col for col in df.columns if col not in ['input_string', 'unique_id']], inplace=True)

In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14740 entries, 0 to 14739
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   unique_id     14740 non-null  object
 1   input_string  14740 non-null  object
dtypes: object(2)
memory usage: 230.4+ KB


In [161]:
df = df[:10]

In [162]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   unique_id     10 non-null     object
 1   input_string  10 non-null     object
 2   embedding     10 non-null     object
dtypes: object(3)
memory usage: 372.0+ bytes


In [122]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [163]:
def count_generator():
  count = 1
  while True:
    yield count
    count += 1

gen = count_generator()

In [48]:
def get_cls_embedding(input_string):
    # tokenize the input_string
    string_tokens = tokenizer(input_string, return_tensors='tf', padding=True, truncation=True)
    # print('stage 1 done')

    # getting the bert embedding (output) using the model
    output = model(string_tokens)
    # print('stage 2 done')

    # extracting the cls embeddings
    cls_embedding = output.last_hidden_state[:, 0, :]
    # print('stage 3 done')

    # converting the cls embedding to the numpy array
    cls_embedding = cls_embedding.numpy().squeeze()  # shape: [758]
    # print('stage 4 done')

    print(f'Completed : {next(gen)}')

    return cls_embedding

In [79]:
type(df['input_string'][:5].to_list())

list

In [164]:
batches = np.array_split(df['input_string'], len(df) // 4 + (len(df) % 4 > 0))

  return bound(*args, **kwds)


In [165]:
len(batches)

3

In [166]:
result_vectors = []
for batch in batches:
    result_vectors.append(get_cls_embedding(batch.to_list()))
    # print(type(batch))
len(result_vectors)

Completed : 1
Completed : 2
Completed : 3


3

In [167]:
df['embedding'] = [vector for sublist in result_vectors for vector in sublist]

In [168]:
df

Unnamed: 0,unique_id,input_string,embedding
0,0,Yabuki Joe is left downhearted and hopeless af...,"[-0.66963875, -0.39918116, -0.24154967, -0.292..."
1,1,"Ghostly, primordial beings known as Mushi cont...","[-0.37551773, -0.7974257, -0.05370664, -0.2910..."
2,2,Following the conclusion of the large-scale co...,"[-0.9441483, -0.5453753, -0.13317144, 0.091779..."
3,3,Young Thorfinn grew up listening to the storie...,"[-0.17889854, -0.5132897, 0.39346802, -0.08992..."
4,4,"Crime is timeless. By the year 2071, humanity ...","[-1.0348964, -0.6715797, -0.02665209, -0.09170..."
5,5,"Apparitions, oddities, and gods continue to ma...","[-0.5012937, -0.749107, -0.1443865, -0.5924971..."
6,6,The devastation of the Mugen Train incident st...,"[-0.509746, -0.8540245, 0.11625019, -0.5961771..."
7,7,Turning against his former allies and enemies ...,"[-0.5516354, -0.9971252, 0.25826928, 0.0259177..."
8,8,"In his father's absence, teenager Ippo Makunou...","[-0.5930841, -0.8450493, 0.09242689, -0.507209..."
9,9,"Stubborn, spoiled, and naïve, 10-year-old Chih...","[-0.35052866, -0.69366884, -0.41876948, -0.671..."


In [169]:
df['embedding'][0][:10]

array([-0.66963875, -0.39918116, -0.24154967, -0.29267675, -0.69194055,
        0.11974451,  0.6303591 ,  0.11249674,  0.16581503, -0.74881715],
      dtype=float32)

In [170]:
get_cls_embedding(df['input_string'][0])[:10]

Completed : 4


array([-0.6696389 , -0.3991817 , -0.24154934, -0.29267657, -0.69194067,
        0.11974467,  0.6303591 ,  0.11249638,  0.16581543, -0.74881727],
      dtype=float32)

In [None]:
df['embedding'] = df['input_string'][:100].apply(get_cls_embedding)

Completed : 1
Completed : 2
Completed : 3
Completed : 4
Completed : 5
Completed : 6
Completed : 7
Completed : 8
Completed : 9
Completed : 10


In [40]:
df = df.iloc[:10, :].reset_index(drop=True)

In [41]:
df.set_index('unique_id')

Unnamed: 0_level_0,input_string,embedding
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Yabuki Joe is left downhearted and hopeless af...,"[-0.6696389, -0.3991817, -0.24154934, -0.29267..."
1,"Ghostly, primordial beings known as Mushi cont...","[-0.37551743, -0.79742545, -0.053706102, -0.29..."
2,Following the conclusion of the large-scale co...,"[-0.94414824, -0.5453752, -0.13317072, 0.09177..."
3,Young Thorfinn grew up listening to the storie...,"[-0.17889887, -0.5132896, 0.3934681, -0.089926..."
4,"Crime is timeless. By the year 2071, humanity ...","[-1.0348969, -0.67157936, -0.02665224, -0.0917..."
5,"Apparitions, oddities, and gods continue to ma...","[-0.5012934, -0.74910635, -0.14438769, -0.5924..."
6,The devastation of the Mugen Train incident st...,"[-0.50974596, -0.85402393, 0.11625116, -0.5961..."
7,Turning against his former allies and enemies ...,"[-0.5516355, -0.99712485, 0.2582684, 0.0259178..."
8,"In his father's absence, teenager Ippo Makunou...","[-0.59308404, -0.8450499, 0.092426606, -0.5072..."
9,"Stubborn, spoiled, and naïve, 10-year-old Chih...","[-0.35052842, -0.6936687, -0.4187696, -0.67137..."


In [42]:
type(df['embedding'][0])

numpy.ndarray

In [43]:
import faiss

In [None]:
vectors = np.stack(df['embedding'].values).astype('float32')

In [20]:
# faiss index for l2 distances
d = vectors[0].shape
d = d[0]
d

768

In [None]:
index = faiss.IndexFlatL2(d) # L2 distance index

In [22]:
index.add(vectors)

In [23]:
# storing unique ids and their corresponding faiss indexes
id_mapping = df['unique_id'].to_list()  # help us map faiss indices back to unique id

In [24]:
query_vector = vectors[0]   # query vector

In [25]:
k = 2 # number of nearest neighbours to find

In [26]:
query_vector = query_vector.reshape(1, -1)

In [27]:
distances, faiss_indices = index.search(query_vector, k)

In [28]:
# mapping faiss indexes back to original unique id
result_ids = [id_mapping[i] for i in faiss_indices[0]]

In [29]:
print(f"Nearest neighbours unique ids : {result_ids}")
print(f"Distances : {distances[0]}")

Nearest neighbours unique ids : ['0', '6']
Distances : [ 0.      43.18433]


In [30]:
distances[int(result_ids[0])]

array([ 0.     , 43.18433], dtype=float32)

In [31]:
type(distances.shape)

tuple

In [32]:
def get_3_new_recommendations(query_vector, already_recommended):
    num_recommendations = 3
    recommendations = list()
    rec_distances = list()

    # Ensure the query vector is in the correct shape
    query_vector = query_vector.reshape(1, -1)

    # Set a high initial k to ensure enough results even after filtering
    k = num_recommendations + len(already_recommended) + 5

    # Perform search
    distances, faiss_indices = index.search(query_vector, k)

    # Filter out already recommended IDs and collect unique recommendations
    for idx in faiss_indices[0]:
        if idx not in already_recommended and idx not in recommendations:
            recommendations.append(idx)
            print(idx)
            rec_distances.append(distances[0][int(idx)])
        if len(recommendations) == num_recommendations:
            break

    return recommendations, rec_distances

In [33]:
already_recommended = [0, 6]
recommendations, distances = get_3_new_recommendations(query_vector, already_recommended)
print(recommendations, distances)

1
8
5
[1, 8, 5] [43.18433, 109.33197, 63.380882]
