In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Word2Vec, VectorAssembler, Normalizer

In [None]:
# Create (or reuse, if one is already active) the Spark session for this notebook,
# requesting 4g of memory for the driver and for each executor.
spark = (
    SparkSession.builder
    .appName("ClusteringWord2Vec")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [None]:
# Load the input Parquet dataset into a Spark DataFrame.
# The path is relative to the notebook's working directory.
input_path = "dblp-v10-processado.parquet"
df_spark = spark.read.parquet(input_path)

In [None]:
# 1. Word2Vec for 'title'
# Drop any previous output column first so this cell can be re-run on an
# already-transformed df_spark without failing with "output column already
# exists" (dropping a column that is not present is a no-op).
df_spark = df_spark.drop("title_word2vec")

# NOTE(review): Word2Vec expects an array-of-strings (token list) input column;
# confirm that 'title' is already tokenized in the input dataset.
word2vec_title = Word2Vec(
    vectorSize=100,
    seed=42,  # pin the seed so training does not depend on a per-session default
    inputCol="title",
    outputCol="title_word2vec",
)
model_title = word2vec_title.fit(df_spark)

# Adds 'title_word2vec': one 100-dimensional vector per row.
df_spark = model_title.transform(df_spark)

In [None]:
# 2. Word2Vec for 'abstract'
# Drop any previous output column first so this cell can be re-run on an
# already-transformed df_spark without failing with "output column already
# exists" (dropping a column that is not present is a no-op).
df_spark = df_spark.drop("abstract_word2vec")

# NOTE(review): Word2Vec expects an array-of-strings (token list) input column;
# confirm that 'abstract' is already tokenized in the input dataset.
word2vec_abstract = Word2Vec(
    vectorSize=100,
    seed=42,  # pin the seed so training does not depend on a per-session default
    inputCol="abstract",
    outputCol="abstract_word2vec",
)
model_abstract = word2vec_abstract.fit(df_spark)

# Adds 'abstract_word2vec': one 100-dimensional vector per row.
df_spark = model_abstract.transform(df_spark)

In [None]:
# 3. Combine the vectors
# Concatenate the title and abstract embeddings into a single 'features' vector.
assembler = VectorAssembler(
    inputCols=["title_word2vec", "abstract_word2vec"],
    outputCol="features",
)
# Drop a stale 'features' column (no-op if absent) so re-running this cell does
# not fail because the output column already exists.
df_spark = assembler.transform(df_spark.drop("features"))

In [None]:
# 4. Normalization
# Scale each 'features' vector to unit p-norm. p=2.0 (L2) is Normalizer's
# default; it is spelled out here so the choice is visible to the reader.
normalizer = Normalizer(p=2.0, inputCol="features", outputCol="norm_features")
# Drop a stale 'norm_features' column (no-op if absent) so re-running this cell
# does not fail because the output column already exists.
df_spark = normalizer.transform(df_spark.drop("norm_features"))

In [None]:
# Preview the first 5 rows of the transformed DataFrame as a sanity check.
df_spark.show(n=5)

In [None]:
# Persist the vectorized dataset as a single Parquet part file.
#
# repartition(1) is used instead of coalesce(1): a drastic coalesce is folded
# into the upstream stage, which would force the whole transform pipeline to run
# in a single task. repartition(1) adds a shuffle so the upstream work stays
# parallel and only the final write is single-partition.
#
# mode("overwrite") replaces any existing output so the notebook can be re-run
# end to end; the default mode raises if the path already exists.
(
    df_spark
    .repartition(1)
    .write
    .mode("overwrite")
    .parquet("dblp-v10-processado-vetorizado-word2vec.parquet")
)