In [10]:
import re
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Create (or reuse) the Spark session used by every cell below.
# - spark.driver.memory is set to "8g".
# - spark.sql.autoBroadcastJoinThreshold is set to -1; per the Spark SQL configuration
#   docs this value disables automatic broadcast joins.
spark = SparkSession.builder \
    .appName("data03") \
    .config("spark.driver.memory", "8g") \
    .config("spark.sql.autoBroadcastJoinThreshold", -1) \
    .getOrCreate()

Read the text file that contains all the topics.

In [11]:
# Load topics.txt with the text reader; the result has a single string column named
# "value" (see the table printed below).
df = spark.read.text("topics.txt")

# NOTE(review): count() is a plain row count and nothing here de-duplicates, so the
# "unique" in the label assumes the input file is already distinct — TODO confirm.
print("Number of unique topics: ", df.select("value").count())
df.show(truncate=False, n=5)

Number of unique topics:  1457234
+-------------------------+
|value                    |
+-------------------------+
|P2 Ípsilon Ímpar Fugas P3|
|laboratório              |
|advogado                 |
|descrever                |
|Soup Nazi                |
+-------------------------+
only showing top 5 rows



split multiple topics into a list of topics

In [12]:
# Split each raw line into a list of topics. The separator is a run of two or more
# spaces, so the single spaces inside a multi-word topic are kept (e.g. "Soup Nazi"
# stays one element).
df = df.withColumn(
    "tokens",
    F.split(F.col("value"), " {2,}")
)

df.show(truncate=False, n=5)
# Disabled: dump of this intermediate result to JSON.
#df.coalesce(1).write.mode("overwrite").json("topics.json")

+-------------------------+---------------------------+
|value                    |tokens                     |
+-------------------------+---------------------------+
|P2 Ípsilon Ímpar Fugas P3|[P2 Ípsilon Ímpar Fugas P3]|
|laboratório              |[laboratório]              |
|advogado                 |[advogado]                 |
|descrever                |[descrever]                |
|Soup Nazi                |[Soup Nazi]                |
+-------------------------+---------------------------+
only showing top 5 rows



Expand each topic into its individual words plus the full topic itself (e.g. "Banco de Portugal" -> "Banco", "de", "Portugal", "Banco de Portugal"); stop words such as "de" are removed in a later step.

In [13]:
# Switch to the RDD API, spread over 4 partitions. From here on `df` is an RDD of
# (topic, tokens) pairs rather than a DataFrame.
df = df.rdd.repartition(4)

# For every row keep the raw line as the topic and build its token list: every
# whitespace-separated word of every topic, followed by the topics themselves.
df = df.map(
    lambda row: (
        row[0],
        [word for phrase in row[1] for word in phrase.split()] + row[1],
    )
)

df.toDF(["topic", "tokens"]).show(truncate=False, n=5)

[Stage 38:>                                                         (0 + 1) / 1]

+------------------+-------------------------------------------+
|topic             |tokens                                     |
+------------------+-------------------------------------------+
|revisão           |[revisão, revisão]                         |
|The New York Times|[The, New, York, Times, The New York Times]|
|Thierry Breton    |[Thierry, Breton, Thierry Breton]          |
|sério             |[sério, sério]                             |
|Hospital Prisional|[Hospital, Prisional, Hospital Prisional]  |
+------------------+-------------------------------------------+
only showing top 5 rows



25/04/12 21:08:10 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 38 (TID 69): Attempting to kill Python Worker
                                                                                

Check for badly encoded characters and repair them; tokens that still contain invalid characters after the repair are dropped.

In [14]:
# Characters accepted in a token in addition to a-z, A-Z, 0-9 and space (the character
# class is built from this list in valid_word_detection).
# Every accented Portuguese letter is listed in BOTH cases: without "À", "Â", "Ô" and
# "Ç", capitalised or all-caps words such as "Ângela" or "AÇÃO" are treated as badly
# encoded and dropped.
custom_valid = [
    "á", "à", "â", "ã", "Á", "À", "Â", "Ã",
    "é", "ê", "É", "Ê",
    "í", "Í",
    "ó", "ô", "õ", "Ó", "Ô", "Õ",
    "ú", "Ú",
    "ç", "Ç",
    "-", "º", "ª",
]

# Mapping from known mis-encoded sequences to the intended character(s). fixed_words
# applies it through manual_fix, after the codec round-trip in fix_encoding_issues.
# manual_fix walks the entries in insertion order, so a key that contains another key
# must come first: "ťo" has to be replaced before "ť", otherwise "ť" is rewritten to
# "ç" first and the "ťo" -> "ção" entry can never match.
fix_map = {
    "ťo": "ção", "ť": "ç", "ő": "õ", "Ő": "Õ",
    "Ã£": "ã", "Ã¡": "á", "Ãª": "ê", "Ã³": "ó",
    # "Ã" followed by a soft hyphen (U+00AD); written as an escape because the
    # second character is invisible in most editors.
    "Ã\u00ad": "í",
    "Ã©": "é", "Ã§": "ç", "Ã‰": "É",
    "Ãµ": "õ", "Ãº": "ú", "ů": "ó",
    "ă": "ã", "ę": "ê", "¾": "ó",
}

def valid_word_detection(text):
    """Return True when ``text`` contains only allowed characters.

    Allowed characters are ASCII letters, digits, the space, and every entry of
    ``custom_valid``. ``None`` and the empty string count as valid.
    """
    extra_chars = "".join(map(re.escape, custom_valid))
    # Negated character class: a match means a disallowed character is present.
    disallowed = re.compile(rf"[^a-zA-Z0-9 {extra_chars}]")
    return disallowed.search(text or "") is None

def fix_encoding_issues(text):
    """Try to undo mojibake by re-encoding ``text`` and decoding it as UTF-8.

    The string is encoded with latin1, then with cp1252; the first codec whose bytes
    decode cleanly as UTF-8 wins. If neither round-trip succeeds, or ``text`` is not
    a string, the input is returned unchanged.
    """
    if isinstance(text, str):
        for codec in ("latin1", "cp1252"):
            try:
                repaired = text.encode(codec).decode("utf-8")
            except Exception:
                # This codec cannot round-trip the text; try the next one.
                pass
            else:
                return repaired
    return text

def manual_fix(text, mapping=None):
    """Replace known mis-encoded sequences in ``text`` with the intended characters.

    Args:
        text: String to repair. Non-string values are returned unchanged, the same
            way fix_encoding_issues treats them.
        mapping: Optional ``{wrong: right}`` table. Defaults to the module-level
            ``fix_map``.

    Returns:
        The repaired string.
    """
    if not isinstance(text, str):
        return text
    table = fix_map if mapping is None else mapping
    # Longest keys first (the sort is stable, so equal-length keys keep table order).
    # This guarantees a sequence such as "ťo" is replaced before its own prefix "ť",
    # whatever order the table was written in.
    for wrong in sorted(table, key=len, reverse=True):
        text = text.replace(wrong, table[wrong])
    return text

def fixed_words(word):
    """Return a clean version of ``word``, or "" when it cannot be repaired.

    A word that already passes valid_word_detection is returned as is. Otherwise it
    goes through fix_encoding_issues and then manual_fix; the result is returned only
    if it now passes validation.
    """
    if valid_word_detection(word):
        return word

    candidate = manual_fix(fix_encoding_issues(word))
    return candidate if valid_word_detection(candidate) else ""


# Repair every token and drop the ones that could not be repaired (fixed_words
# returns "" for those). Each token is passed through fixed_words exactly once; the
# previous version called it twice per token, once for the value and once for the filter.
df = df.map(
    lambda row: (
        row[0],
        [fixed for fixed in map(fixed_words, row[1]) if fixed],
    )
)

df.toDF(["topic", "tokens"]).sample(fraction=0.1).show(truncate=False, n=5)

+------------------------+-----------------------------------------------------+
|topic                   |tokens                                               |
+------------------------+-----------------------------------------------------+
|sinceridade             |[sinceridade, sinceridade]                           |
|comunidade internacional|[comunidade, internacional, comunidade internacional]|
|África África           |[África, África, África África]                      |
|linhagem                |[linhagem, linhagem]                                 |
|Hanover                 |[Hanover, Hanover]                                   |
+------------------------+-----------------------------------------------------+
only showing top 5 rows



remove duplicated tokens and invalid tokens such as "2018", "Abr 2020", "", "de", ...

In [15]:
def hours_detection(token):
    """Return True when ``token`` contains a clock time such as "12:30" or "9h05"."""
    time_pattern = re.compile(r"\b(?:\d{1,2}:\d{2}|\d{1,2}h\d{2})\b")
    return time_pattern.search(token) is not None

def date_detection(token):
    """Return True when ``token`` looks like a date, a month or a weekday.

    Two groups of patterns are tried: case-sensitive month abbreviations (with "Mar"
    only accepted next to a number), and case-insensitive full month names, month/year
    and day/month combinations, and weekday names.
    """
    case_sensitive = (
        # Month abbreviations, "Mar" excluded.
        r"\b(?:Jan|Fev|Abr|Mai|Jun|Jul|Ago|Set|Out|Nov|Dez)\b",
        # "Mar" only with a day before it or a year after it.
        r"\b(?:\d{1,2} Mar|Mar \d{2,4})\b",
    )
    case_insensitive = (
        # Token starts with a full month name.
        r"^(janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)\b",
        # Full month name followed by 2-4 digits.
        r"\b(Janeiro|Fevereiro|Março|Abril|Maio|Junho|Julho|Agosto|Setembro|Outubro|Novembro|Dezembro)[ ]?\d{2,4}\b",
        # 1-2 digits, optional "de", then a full month name.
        r"\b\d{1,2}(?:\s+de)?\s+(janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)\b",
        # Weekday names.
        r"\b(segunda-feira|terça-feira|quarta-feira|quinta-feira|sexta-feira|sábado|domingo)\b",
    )

    if any(re.search(pattern, token) for pattern in case_sensitive):
        return True
    return any(
        re.search(pattern, token, flags=re.IGNORECASE) for pattern in case_insensitive
    )

def invalid_tokens_detection(token):
    """Return True when ``token`` should be discarded.

    A token is invalid when it is shorter than two characters (this includes the
    empty string), is a stop word (compared case-insensitively), consists only of
    digits, or looks like a time (hours_detection) or a date (date_detection).

    Args:
        token: The token string to check.

    Returns:
        bool: True to drop the token, False to keep it.
    """
    # Covers "" as well, so no separate empty-string check is needed.
    if len(token) < 2:
        return True
    # "pela" was missing: the original set listed "pelo" twice instead.
    if token.lower() in {"de", "da", "do", "dos", "das", "e", "ou", "a", "o", "as", "os",
                         "para", "com", "em", "na", "no", "por", "pelo", "pelos", "uma",
                         "pela", "pelas", "sem", "sobre", "entre", "até", "um",
                         "antes", "depois", "durante", "após", "segundo", "junto"}:
        return True
    if token.isdigit():
        return True

    return hours_detection(token) or date_detection(token)


# Step 1: keep only the tokens that invalid_tokens_detection does not reject.
df = df.map(
    lambda row: (
        row[0],
        [token for token in row[1] if not invalid_tokens_detection(token)],
    )
)
# Step 2: collapse duplicates by round-tripping through a set (token order is not
# preserved by this step).
df = df.map(lambda row: (row[0], list(set(row[1]))))

df.toDF(["topic", "tokens"]).sample(fraction=0.1).show(truncate=False, n=5)

[Stage 46:>                                                         (0 + 1) / 1]

+-------------------+----------------------------------------+
|topic              |tokens                                  |
+-------------------+----------------------------------------+
|Nobel da Literatura|[Nobel da Literatura, Literatura, Nobel]|
|façanha            |[façanha]                               |
|Unidos Podemos     |[Podemos, Unidos, Unidos Podemos]       |
|Higiene Urbana     |[Higiene Urbana, Higiene, Urbana]       |
|PUB ZAPPING        |[PUB ZAPPING, ZAPPING, PUB]             |
+-------------------+----------------------------------------+
only showing top 5 rows



25/04/12 21:08:15 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 46 (TID 73): Attempting to kill Python Worker
                                                                                

remove stop words that *introduce* the topic, such as "O Banco de Portugal" -> "Banco de Portugal"

In [16]:
def remove_introductory_stopword(token):
    """Strip one leading article/preposition (plus its space) from ``token``.

    The check is case-insensitive and at most one leading word is removed, e.g.
    "O Banco de Portugal" -> "Banco de Portugal". Tokens without such a prefix are
    returned unchanged.
    """
    leading_words = (
        "a ", "o ",
        "as ", "os ", "um ", "de ", "da ", "do ",
        "uma ", "uns ",
        "umas ",
    )
    lowered = token.lower()

    # No prefix is itself the start of another, so at most one entry can match.
    for prefix in leading_words:
        if lowered.startswith(prefix):
            return token[len(prefix):]

    return token

# Strip the leading article/preposition from every token. Stripping can turn two
# different tokens into the same string (e.g. "O Banco" becomes "Banco" next to an
# existing "Banco"), so the list is de-duplicated again here; dict.fromkeys keeps the
# first occurrence of each token and preserves the list order.
df = df.map(
    lambda row: (
        row[0],
        list(dict.fromkeys(remove_introductory_stopword(token) for token in row[1])),
    )
).toDF(["topic", "tokens"])

df.sample(fraction=0.1).show(truncate=False, n=5)
# Persist the result as JSON in 4 partitions, replacing any previous output.
df.coalesce(4).write.mode("overwrite").json("topics.json")

25/04/12 21:08:19 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 50 (TID 75): Attempting to kill Python Worker
                                                                                

+-----------+-------------+
|topic      |tokens       |
+-----------+-------------+
|calma      |[calma]      |
|Direito    |[Direito]    |
|Contactado |[Contactado] |
|Codogno    |[Codogno]    |
|Presidentes|[Presidentes]|
+-----------+-------------+
only showing top 5 rows



                                                                                

Map tokens that differ only by letter case to a single canonical form (a rough lemma), e.g. "Saúde" and "saúde" should become the same token.

In [23]:
# Reload the (topic, tokens) records from topics.json.
df = spark.read.json("topics.json")

# Sanity check: row count of the reloaded data, plus a small preview.
print("Number of topics: ", df.count())
df.show(truncate=False, n=5)

                                                                                

Number of topics:  1457234
+----------------------------------------------------------+-------------------------+
|tokens                                                    |topic                    |
+----------------------------------------------------------+-------------------------+
|[Fugas, P2, P3, Ípsilon, Ímpar, P2 Ípsilon Ímpar Fugas P3]|P2 Ípsilon Ímpar Fugas P3|
|[laboratório]                                             |laboratório              |
|[advogado]                                                |advogado                 |
|[descrever]                                               |descrever                |
|[Soup Nazi, Nazi, Soup]                                   |Soup Nazi                |
+----------------------------------------------------------+-------------------------+
only showing top 5 rows



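Plan: build the set of all unique tokens, then normalise the case of each token `a`:

1. if `a.lower()` is in the set, use `a = a.lower()`
2. else if `a.title()` is in the set, use `a = a.title()`
3. else if `a.upper()` is in the set, use `a = a.upper()`
4. otherwise keep `a` unchanged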

!!!!!!!!!

In [None]:
# Flatten the token lists into one row per token ...
tokens_df = df.select(F.explode(F.col("tokens")).alias("token"))
# ... then keep each distinct token once, sorted ascending. tokens_df has a single
# column, so distinct() can be applied to it directly.
unique_tokens = tokens_df.distinct().orderBy("token")

# Preview the sorted unique tokens.
unique_tokens.show(truncate=False)

                                                                                

+---------------------------------+
|token                            |
+---------------------------------+
|- A                              |
|- APREN                          |
|- Agricultura e Pescas           |
|- Amnistia UE                    |
|- As crianças precisam de atenção|
|- Camilo Lourenço                |
|- Clima - Correio da Manhã       |
|- Conjuntura                     |
|- Conjuntura -                   |
|- Convidado                      |
|- Convidado 1                    |
|- Convidado 11                   |
|- Convidado 15                   |
|- Convidado 17                   |
|- Convidado 2                    |
|- Convidado 28                   |
|- Convidado 9                    |
|- Convidado Hoje                 |
|- Convidado Hoje Do Syriza       |
|- Convidado Hoje Jogar           |
+---------------------------------+
only showing top 20 rows



                                                                                