In [2]:
import ast
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
df = pd.read_csv(r"../data/test_data_pre_processed.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14740 entries, 0 to 14739
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   unique_id        14740 non-null  object 
 1   name_english     14739 non-null  object 
 2   name             14740 non-null  object 
 3   score            14740 non-null  float64
 4   ranked           14740 non-null  int64  
 5   popularity       14740 non-null  int64  
 6   members          14740 non-null  int64  
 7   synopsis         14740 non-null  object 
 8   synonyms         14740 non-null  object 
 9   type_of          14740 non-null  object 
 10  total_episodes   14740 non-null  int64  
 11  premiered        5104 non-null   object 
 12  studios          11167 non-null  object 
 13  genres           14740 non-null  object 
 14  demographic      5115 non-null   object 
 15  duration_per_ep  14740 non-null  object 
 16  rating           14532 non-null  object 
 17  scored_by   

In [5]:
print(df['genres'][0])
print(type(ast.literal_eval(df['genres'][0])))

['Drama', 'Sports', 'Adventure', 'Historical']
<class 'list'>


In [6]:
df['demographic'].isna().sum()

9625

In [7]:
df['demographic'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['demographic'].fillna('', inplace=True)


In [8]:
df['input_string'] = df['synopsis'] + df['genres'].apply(lambda x : " ".join(ast.literal_eval(x))) + df['demographic'] + '.'

In [9]:
df['input_string'][0]

'Yabuki Joe is left downhearted and hopeless after a certain tragic event. In attempt to put the past behind him, Joe leaves the gym behind and begins wandering. On his travels he comes across the likes of Wolf Kanagushi and Goromaki Gondo, men who unintentionally fan the dying embers inside him, leading him to putting his wanderings to an end. His return home puts Joe back on the path to boxing, but unknown to himself and his trainer, he now suffers deep-set issues holding him back from fighting. In attempt to quell those issues, Carlos Rivera, a world renowned boxer is invited from Venezuela to help Joe recover.Drama Sports Adventure HistoricalShounen.'

In [10]:
df.drop(columns=[col for col in df.columns if col not in ['input_string', 'unique_id']], inplace=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14740 entries, 0 to 14739
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   unique_id     14740 non-null  object
 1   input_string  14740 non-null  object
dtypes: object(2)
memory usage: 230.4+ KB


In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [13]:
def count_generator():
  count = 1
  while True:
    yield count
    count += 1

gen = count_generator()

In [14]:
def get_cls_embedding(input_string):
    # tokenize the input_string
    string_tokens = tokenizer(input_string, return_tensors='tf')
    # print('stage 1 done')

    # getting the bert embedding (output) using the model
    output = model(string_tokens)
    # print('stage 2 done')

    # extracting the cls embeddings
    cls_embedding = output.last_hidden_state[:, 0, :]
    # print('stage 3 done')

    # converting the cls embedding to the numpy array
    cls_embedding = cls_embedding.numpy().squeeze()  # shape: [758]
    # print('stage 4 done')

    print(f'Completed : {next(gen)}')

    return cls_embedding

In [27]:
df['embedding'] = df['input_string'][:10].apply(get_cls_embedding)

Completed : 21
Completed : 22
Completed : 23
Completed : 24
Completed : 25
Completed : 26
Completed : 27
Completed : 28
Completed : 29
Completed : 30


In [28]:
df = df.iloc[:10, :].reset_index(drop=True)

In [29]:
df.set_index('unique_id')

Unnamed: 0_level_0,input_string,input_vectors,embedding
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Yabuki Joe is left downhearted and hopeless af...,"[-0.66963863, -0.39918143, -0.2415497, -0.2926...","[-0.66963863, -0.39918143, -0.2415497, -0.2926..."
1,"Ghostly, primordial beings known as Mushi cont...","[-0.37551743, -0.79742545, -0.053706102, -0.29...","[-0.37551743, -0.79742545, -0.053706102, -0.29..."
2,Following the conclusion of the large-scale co...,"[-0.94414824, -0.5453752, -0.13317072, 0.09177...","[-0.94414824, -0.5453752, -0.13317072, 0.09177..."
3,Young Thorfinn grew up listening to the storie...,"[-0.17889887, -0.5132896, 0.3934681, -0.089926...","[-0.17889887, -0.5132896, 0.3934681, -0.089926..."
4,"Crime is timeless. By the year 2071, humanity ...","[-1.0348969, -0.67157936, -0.02665224, -0.0917...","[-1.0348969, -0.67157936, -0.02665224, -0.0917..."
5,"Apparitions, oddities, and gods continue to ma...","[-0.5012934, -0.74910635, -0.14438769, -0.5924...","[-0.5012934, -0.74910635, -0.14438769, -0.5924..."
6,The devastation of the Mugen Train incident st...,"[-0.50974613, -0.854025, 0.11625168, -0.596177...","[-0.50974613, -0.854025, 0.11625168, -0.596177..."
7,Turning against his former allies and enemies ...,"[-0.5516355, -0.99712485, 0.2582684, 0.0259178...","[-0.5516355, -0.99712485, 0.2582684, 0.0259178..."
8,"In his father's absence, teenager Ippo Makunou...","[-0.59308404, -0.8450499, 0.092426606, -0.5072...","[-0.59308404, -0.8450499, 0.092426606, -0.5072..."
9,"Stubborn, spoiled, and naïve, 10-year-old Chih...","[-0.35052896, -0.6936687, -0.4187693, -0.67137...","[-0.35052896, -0.6936687, -0.4187693, -0.67137..."


In [33]:
type(df['embedding'][0])

numpy.ndarray