In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType
from transformers import BertTokenizer, BertModel
import torch

from pyspark.sql.functions import col

In [None]:
spark = SparkSession.builder \
    .appName("VetorizacaoBERT") \
    .config("spark.executor.memory", "6g") \
    .config("spark.driver.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "100") \
    .config("spark.executor.cores", "4") \
    .config("spark.driver.maxResultSize", "2g") \
    .getOrCreate()

In [3]:
df = spark.read.parquet("../data/dblp-v10-processado.parquet")

                                                                                

In [7]:
df.printSchema()

root
 |-- abstract: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [None]:
null_count_abstract = df.filter(col("abstract").isNull()).count()
null_count_title = df.filter(col("title").isNull()).count()

print(f"Número de valores nulos na coluna 'abstract': {null_count_abstract}")
print(f"Número de valores nulos na coluna 'title': {null_count_title}")

                                                                                

Número de valores nulos na coluna 'abstract': 0
Número de valores nulos na coluna 'title': 0


In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

In [5]:
def get_bert_embedding(text):
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
            embedding = outputs.last_hidden_state[:, 0, :].squeeze().tolist()
        return embedding
    except Exception as e:
        print(f"Texto de entrada: {text}")
        print(f"Inputs tokenizados: {inputs}")
        print(f"Saída do modelo: {outputs.last_hidden_state.shape}")
        return None


In [6]:
# Register UDF
bert_udf = udf(get_bert_embedding, ArrayType(FloatType()))

In [8]:
df_sample = df.sample(fraction=0.0001, seed=42)

In [9]:
print(f"Columns: {df.columns}")

Columns: ['abstract', 'title']


In [10]:
# Apply UDF to both features
df_sample = df_sample.withColumn("abstract_embedding", bert_udf(df_sample["abstract"]))
df_sample = df_sample.withColumn("title_embedding", bert_udf(df_sample["title"]))

In [11]:
df_sample.show(2)

                                                                                

+--------------------+--------------------+--------------------+--------------------+
|            abstract|               title|  abstract_embedding|     title_embedding|
+--------------------+--------------------+--------------------+--------------------+
|[paper, experimen...|[metrological, ch...|[NULL, NULL, NULL...|[NULL, NULL, NULL...|
|[present, summary...|[recent, progress...|[NULL, NULL, NULL...|[NULL, NULL, NULL...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows

