In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType

import re
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [None]:
# Build (or reuse) the Spark session for this notebook, requesting 2 GB of
# memory for both the executors and the driver.
spark = (
    SparkSession.builder
    .appName("Processamento NLP com PySpark")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)


# Fetch the NLTK resources this notebook relies on.
# 'punkt_tab' is the tokenizer data that word_tokenize looks up on recent NLTK
# releases (3.9+); 'punkt' is kept so older NLTK versions keep working.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
):
    nltk.download(resource)

In [None]:
# Shared NLP resources read by pipeline_processar_texto:
# the English stop-word set and a WordNet lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def pipeline_processar_texto(text):
    """Clean a raw string and return its lemmatized tokens.

    Processing steps, in order: lowercase the text, strip http(s) URLs,
    strip every character that is neither a word character nor whitespace,
    strip digits, tokenize with NLTK's ``word_tokenize``, drop tokens found
    in the module-level ``stop_words`` set, and lemmatize the remaining
    tokens with the module-level ``lemmatizer``.

    Args:
        text: The string to process, or ``None`` (e.g. a null column value
            handed over by a Spark UDF).

    Returns:
        list[str]: The processed tokens. An empty list when ``text`` is
        ``None``.
    """
    # A null value has no text to process; return no tokens instead of
    # failing on ``None.lower()``.
    if text is None:
        return []

    text = text.lower()
    text = re.sub(r'https?://\S+', '', text)  # remove URLs
    text = re.sub(r'[^\w\s]', '', text)       # remove special characters
    text = re.sub(r'\d', '', text)            # remove digits
    tokens = word_tokenize(text)
    # Filter out stop words first, then lemmatize what is left.
    return [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]

In [None]:
# Quick sanity check: run the pipeline on one hand-written sentence that
# contains a URL, digits and punctuation, and print the resulting tokens.
sample_text = "This is a sample text with a URL https://example.com and numbers 1234."
sample_tokens = pipeline_processar_texto(sample_text)
print(sample_tokens)

In [None]:
# Read the input Parquet file into a Spark DataFrame.
input_path = "../data/dblp-v10.parquet"
df = spark.read.parquet(input_path)

In [None]:
# Count the rows of the freshly loaded DataFrame and report the total.
initial_row_count = df.count()
print(f"Linhas no DataFrame inicial: {initial_row_count}")

In [None]:
# Preview the first 5 rows of the loaded DataFrame.
df.show(n=5)

In [None]:
# Print the DataFrame's column names and data types.
df.printSchema()

In [None]:
# Drop the columns that are not needed for the text processing below.
columns_to_drop = [
    'authors',
    'n_citation',
    'references',
    'year',
    'id',
    'venue',
]
df = df.drop(*columns_to_drop)

In [None]:
# Discard every row that contains a null value in any column.
df = df.na.drop()

In [None]:
# Preview the first 5 rows of the current DataFrame.
df.show(n=5)

In [None]:
# List the names of the string-typed columns.
# The type is checked with isinstance rather than by comparing str(dataType):
# the textual form of a Spark DataType differs between PySpark versions
# ('StringType' vs 'StringType()'), so a string comparison can silently match
# nothing.
[field.name for field in df.schema.fields if isinstance(field.dataType, StringType)]

In [None]:
# Wrap the text-processing function as a Spark UDF (User Defined Function)
# that returns an array of strings.
pipeline_udf = udf(
    f=pipeline_processar_texto,
    returnType=ArrayType(StringType()),
)

In [None]:
# Apply the text-processing UDF to every string-typed column, replacing each
# column in place with its token array.
# Columns are selected with isinstance instead of comparing str(dataType):
# the textual form of a Spark DataType differs between PySpark versions
# ('StringType' vs 'StringType()'), which could make the loop skip every
# column without raising any error.
string_columns = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, StringType)
]
for col_name in string_columns:
    df = df.withColumn(col_name, pipeline_udf(col(col_name)))

In [None]:
# Preview the first 5 rows of the current DataFrame.
df.show(n=5)

In [None]:
# Coalesce to a single partition and write the result as Parquet,
# overwriting any previous output at the same path.
single_partition_df = df.coalesce(1)
single_partition_df.write.mode("overwrite").parquet("dblp-v10-processado.parquet")