In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType

import re
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [2]:
# Start (or reuse) a Spark session named for this notebook, requesting 2 GB of
# memory for the executor and for the driver.
spark = (
    SparkSession.builder
    .appName("Processamento NLP com PySpark")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# NLTK data packages fetched before the text pipeline is used.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases (3.9+); 'punkt' is kept for older releases.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
)

# A loop (rather than a bare trailing call) also keeps the cell from echoing
# the boolean returned by the last nltk.download().
for nltk_resource in NLTK_RESOURCES:
    nltk.download(nltk_resource)

your 131072x1 screen size is bogus. expect trouble
24/12/01 16:40:51 WARN Utils: Your hostname, matt resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/12/01 16:40:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/01 16:40:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/01 16:40:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
[nltk_data] Downloading package punkt to /home/matt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/matt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/matt/nltk_data...
[nltk_data]   Package wordnet is already u

True

In [3]:
# Module-level helpers shared by every call to pipeline_processar_texto, so the
# lemmatizer and the stop-word set are built once rather than per input text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def pipeline_processar_texto(text):
    """Turn a raw text string into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase the text, remove http(s) URLs, remove every
    character that is neither a word character nor whitespace, remove digits,
    tokenize with ``word_tokenize``, drop tokens found in ``stop_words``, and
    lemmatize what is left (``lemmatize`` is called without a part-of-speech
    argument).

    Args:
        text: The string to process. ``None`` and the empty string are
            accepted.

    Returns:
        A list of token strings; an empty list when ``text`` is ``None`` or
        empty.
    """
    # A null cell is passed to a Spark UDF as None; return no tokens instead
    # of failing with AttributeError on .lower().
    if not text:
        return []

    text = text.lower()
    text = re.sub(r'https?://\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)       # Remove punctuation / special characters
    text = re.sub(r'\d', '', text)            # Remove digits
    tokens = word_tokenize(text)              # Split the text into tokens
    tokens = [t for t in tokens if t not in stop_words]  # Drop stop words
    tokens = [lemmatizer.lemmatize(t) for t in tokens]   # Lemmatize the tokens
    return tokens

In [4]:
# Quick check of the pipeline on one sentence that contains a URL and digits.
sample_text = "This is a sample text with a URL https://example.com and numbers 1234."
sample_tokens = pipeline_processar_texto(sample_text)
print(sample_tokens)

['sample', 'text', 'url', 'number']


In [5]:
# Load the input dataset from its Parquet file into a Spark DataFrame.
input_path = "../data/dblp-v10.parquet"
df = spark.read.parquet(input_path)

In [6]:
# Report how many rows the freshly loaded DataFrame has.
total_linhas = df.count()
print(f"Linhas no DataFrame inicial: {total_linhas}")

Linhas no DataFrame inicial: 1000000


In [7]:
# Preview the first 5 rows of the loaded DataFrame.
df.show(n=5)

                                                                                

+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------------------+
|            abstract|             authors|n_citation|          references|               title|               venue|year|                  id|
+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------------------+
|In this paper, a ...|['S. Ben Jabra', ...|        50|['09cb2d7d-47d1-4...|A new approach of...|international sym...|2008|4ab3735c-80f1-472...|
|We studied an aut...|['Joaquín J. Torr...|        50|['4017c9d2-9845-4...|Attractor neural ...|      Neurocomputing|2007|4ab39729-af77-46f...|
|It is well-known ...|['Genevi eve Paqu...|        50|['1c655ee2-067d-4...|A characterizatio...|Electronic Journa...|2007|4ab3a4cf-1d96-4ce...|
|One of the fundam...|['Yaser Sheikh', ...|       221|['056116c1-9e7a-4...|Exploring the spa...|international con...|2005|4ab3a98c-3620-

In [8]:
# Print each column's name, type and nullability for the loaded DataFrame.
df.printSchema()

root
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- n_citation: long (nullable = true)
 |-- references: string (nullable = true)
 |-- title: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- year: long (nullable = true)
 |-- id: string (nullable = true)



In [9]:
# Drop the columns that are not needed; after this only the columns not listed
# here remain in the DataFrame.
columns_to_drop = [
    'authors',
    'n_citation',
    'references',
    'year',
    'id',
    'venue',
]
df = df.drop(*columns_to_drop)

In [10]:
# Remove every row that has a null in any of the remaining columns.
df = df.dropna(how="any")

In [11]:
# Preview the first 5 rows after dropping columns and null rows.
df.show(n=5)

+--------------------+--------------------+
|            abstract|               title|
+--------------------+--------------------+
|In this paper, a ...|A new approach of...|
|We studied an aut...|Attractor neural ...|
|It is well-known ...|A characterizatio...|
|One of the fundam...|Exploring the spa...|
|This paper genera...|Generalized upper...|
+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [12]:
# List the columns whose Spark type is string.
# isinstance() is used instead of comparing str(dataType) with 'StringType()':
# the textual form of a DataType depends on the PySpark version, so the string
# comparison can silently match nothing.
[field.name for field in df.schema.fields if isinstance(field.dataType, StringType)]

['abstract', 'title']

In [13]:
# Wrap the text-processing function as a Spark UDF (User Defined Function)
# that returns an array of strings.
pipeline_udf = udf(
    pipeline_processar_texto,
    returnType=ArrayType(StringType()),
)

In [14]:
# Apply the text-processing pipeline to every string column, replacing each
# column in place with its token array.
# Columns are selected with isinstance() rather than by comparing
# str(dataType) with 'StringType()', whose textual form depends on the PySpark
# version. Columns already converted to arrays are no longer string-typed, so
# re-running this cell does not process them a second time.
string_columns = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, StringType)
]
for col_name in string_columns:
    df = df.withColumn(col_name, pipeline_udf(col(col_name)))

In [15]:
# Preview the first 5 rows after the text pipeline has been applied.
df.show(n=5)



+--------------------+--------------------+
|            abstract|               title|
+--------------------+--------------------+
|[paper, robust, t...|[new, approach, w...|
|[studied, autoass...|[attractor, neura...|
|[wellknown, sturm...|[characterization...|
|[one, fundamental...|[exploring, space...|
|[paper, generaliz...|[generalized, upp...|
+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [None]:
# Write the processed DataFrame as a Parquet dataset with a single part file,
# overwriting any previous output at this path.
# repartition(1) is used instead of coalesce(1): coalesce(1) adds no shuffle,
# so the upstream work (including the Python text-processing UDF) would also
# run as a single task. repartition(1) inserts a shuffle, letting the upstream
# partitions be processed in parallel with only the final write being
# single-partition. Row order in the output file is not guaranteed.
df.repartition(1).write.parquet("dblp-v10-processado.parquet", mode="overwrite")