In [38]:
import re
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *

# Create (or reuse) the Spark session shared by every cell of this notebook:
# 8g of driver memory and the auto-broadcast join threshold set to -1.
spark = (
    SparkSession.builder
    .appName("data03")
    .config("spark.driver.memory", "8g")
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .getOrCreate()
)

# Topic preprocessing

This notebook contains the code to clear some useless topics and to merge some topics that are too similar.

read the text file about all the topics

In [39]:
# Load the topics text file into a DataFrame with a single "value" column.
df = spark.read.text("topics.txt")

# NOTE(review): this is a plain row count; it equals the number of unique
# topics only if topics.txt contains no repeated lines - confirm.
print("Number of unique topics: ", df.count())
df.show(n=5, truncate=False)

Number of unique topics:  1457234
+-------------------------+
|value                    |
+-------------------------+
|P2 Ípsilon Ímpar Fugas P3|
|laboratório              |
|advogado                 |
|descrever                |
|Soup Nazi                |
+-------------------------+
only showing top 5 rows



split multiple topics into a list of topics

In [40]:
# A line may hold several topics separated by runs of two or more spaces;
# split on those runs so "tokens" is an array with one entry per topic.
df = df.withColumn("tokens", F.split("value", " {2,}"))

df.show(n=5, truncate=False)

+-------------------------+---------------------------+
|value                    |tokens                     |
+-------------------------+---------------------------+
|P2 Ípsilon Ímpar Fugas P3|[P2 Ípsilon Ímpar Fugas P3]|
|laboratório              |[laboratório]              |
|advogado                 |[advogado]                 |
|descrever                |[descrever]                |
|Soup Nazi                |[Soup Nazi]                |
+-------------------------+---------------------------+
only showing top 5 rows



expand each topic into its individual words plus the full topic itself (ex.: Banco de Portugal -> Banco, de, Portugal, Banco de Portugal); stop words such as "de" are removed in a later step

In [41]:
# From here on `df` is an RDD of (topic, tokens) pairs, not a DataFrame.
df = df.rdd.repartition(4)

# For every row keep the original line as the key and build the token list:
# first every whitespace-separated word of every topic, then the topics
# themselves.
df = df.map(
    lambda row: (
        row[0],
        [word for phrase in row[1] for word in phrase.split()] + row[1],
    )
)

df.toDF(["topic", "tokens"]).show(truncate=False, n=5)

[Stage 176:>                                                        (0 + 7) / 7]

+------------------+-------------------------------------------+
|topic             |tokens                                     |
+------------------+-------------------------------------------+
|revisão           |[revisão, revisão]                         |
|The New York Times|[The, New, York, Times, The New York Times]|
|Thierry Breton    |[Thierry, Breton, Thierry Breton]          |
|sério             |[sério, sério]                             |
|Hospital Prisional|[Hospital, Prisional, Hospital Prisional]  |
+------------------+-------------------------------------------+
only showing top 5 rows



                                                                                

check for badly encoded characters: try to repair them, and drop the tokens that cannot be repaired

In [42]:
# Accented letters and symbols accepted inside a token, on top of the ASCII
# letters, digits and space that valid_word_detection always allows.
# One group per base vowel, then the cedilla, hyphen and ordinal marks.
# NOTE(review): uppercase "À", "Â", "Ô" and "Ç" are not listed although their
# lowercase forms are - confirm whether that is intentional.
custom_valid = [
    *"ãáàâÁÃ",
    *"éêÉÊ",
    *"íÍ",
    *"óôõÓÕ",
    *"úÚ",
    *"ç-ºª",
]

# Mapping from mis-decoded character sequences to the intended character(s),
# applied by manual_fix.
# Order matters when the entries are applied one after another in insertion
# order: a longer sequence must be listed before any shorter sequence it
# starts with ("ťo" before "ť"), otherwise the shorter replacement consumes
# the match and "ťo" would become "ço" instead of "ção".
fix_map = {
    "ő": "õ", "Ő": "Õ", "ťo": "ção", "ť": "ç",
    "Ã£": "ã", "Ã¡": "á", "Ãª": "ê", "Ã³": "ó",
    # "í" read with the wrong codec is "Ã" followed by U+00AD (soft hyphen);
    # the invisible character is spelled out as an escape.
    "Ã\u00ad": "í",
    "Ã©": "é", "Ã§": "ç", "Ã‰": "É",
    "Ãµ": "õ", "Ãº": "ú", "ů": "ó",
    "ă": "ã", "ę": "ê", "¾": "ó",
}

def valid_word_detection(text):
    """Tell whether `text` is made only of allowed characters.

    Allowed characters are ASCII letters, digits, the space character and
    every entry of `custom_valid`. A falsy `text` (None, "") is checked as the
    empty string and is therefore reported as valid.

    Returns:
        bool: True when no disallowed character is found.
    """
    extra_allowed = "".join(map(re.escape, custom_valid))
    offending = re.search(rf"[^a-zA-Z0-9 {extra_allowed}]", text or "")
    return offending is None

def fix_encoding_issues(text):
    """Try to undo UTF-8 text that was decoded with a single-byte codec.

    The text is re-encoded with latin1, then cp1252, and the resulting bytes
    are decoded as UTF-8. The first codec for which both steps succeed wins.

    Args:
        text: value to repair; anything that is not a `str` is returned as is.

    Returns:
        The repaired string, or `text` unchanged when no round-trip succeeds.
    """
    if not isinstance(text, str):
        return text
    for codec in ("latin1", "cp1252"):
        try:
            return text.encode(codec).decode("utf-8")
        except UnicodeError:
            # Only encode/decode failures mean "this codec does not apply";
            # any other exception is a real error and must not be swallowed.
            continue
    return text

def manual_fix(text, mapping=None):
    """Replace known mis-decoded sequences in `text` with the intended text.

    Sequences are applied longest first, so an entry that is a prefix of a
    longer one (for example "ť" and "ťo") cannot consume the longer match.
    Entries of equal length keep their order in the mapping.

    Args:
        text: string to repair.
        mapping: optional dict of wrong -> right sequences; defaults to the
            notebook-level `fix_map`.

    Returns:
        str: `text` with every known sequence replaced.
    """
    if mapping is None:
        mapping = fix_map
    for wrong in sorted(mapping, key=len, reverse=True):
        text = text.replace(wrong, mapping[wrong])
    return text

def fixed_words(word):
    """Return a cleaned version of `word`, or "" when it cannot be cleaned.

    The repair (codec round-trip, then manual replacements) is attempted
    before the validity check. Some mis-decoded sequences consist only of
    characters that are individually allowed ("Ãª" for "ê", "Ãº" for "ú"), so
    checking validity first would accept them unrepaired. Correctly encoded
    words come out of the repair step unchanged.

    Args:
        word: token to clean; falsy values (None, "") are returned as is.

    Returns:
        The (possibly repaired) word when it only contains allowed
        characters, otherwise "".
    """
    if not word:
        return word

    candidate = manual_fix(fix_encoding_issues(word))
    if valid_word_detection(candidate):
        return candidate

    return ""


# Repair every token exactly once and drop the ones that could not be
# repaired (fixed_words returns "" for them).
df = df.map(
    lambda row: (row[0], [fixed for fixed in map(fixed_words, row[1]) if fixed])
)

df.toDF(["topic", "tokens"]).sample(fraction=0.1).show(truncate=False, n=5)

+--------------+---------------------------------+
|topic         |tokens                           |
+--------------+---------------------------------+
|Thierry Breton|[Thierry, Breton, Thierry Breton]|
|arrebatar     |[arrebatar, arrebatar]           |
|ferido        |[ferido, ferido]                 |
|Efacec Abusos |[Efacec, Abusos, Efacec Abusos]  |
|Vasco Franco  |[Vasco, Franco, Vasco Franco]    |
+--------------+---------------------------------+
only showing top 5 rows



                                                                                

remove duplicated tokens and invalid tokens such as "2018", "Abr 2020", "", "de", ...

In [43]:
def hours_detection(token):
    """Return True when `token` contains a time such as "12:30" or "9h05"."""
    found = re.search(r"\b(?:\d{1,2}:\d{2}|\d{1,2}h\d{2})\b", token)
    return found is not None

def date_detection(token):
    """Return True when `token` looks like a date, month or weekday.

    Each entry below is a (pattern, flags) pair; the token is flagged as soon
    as one of them matches.
    """
    checks = (
        # abbreviated month names ("Mar" is handled separately below)
        (r"\b(?:Jan|Fev|Abr|Mai|Jun|Jul|Ago|Set|Out|Nov|Dez)\b", 0),
        # "Mar" only counts when it sits next to a number
        (r"\b(?:\d{1,2} Mar|Mar \d{2,4})\b", 0),
        # token starting with a full month name
        (r"^(janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)\b", re.IGNORECASE),
        # full month name followed by a 2-4 digit number
        (r"\b(Janeiro|Fevereiro|Março|Abril|Maio|Junho|Julho|Agosto|Setembro|Outubro|Novembro|Dezembro)[ ]?\d{2,4}\b", re.IGNORECASE),
        # 1-2 digit number, optional "de", then a full month name
        (r"\b\d{1,2}(?:\s+de)?\s+(janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)\b", re.IGNORECASE),
        # weekday names
        (r"\b(segunda-feira|terça-feira|quarta-feira|quinta-feira|sexta-feira|sábado|domingo)\b", re.IGNORECASE),
    )
    return any(
        re.search(pattern, token, flags=flags) is not None
        for pattern, flags in checks
    )

def invalid_tokens_detection(token):
    """Return True when `token` should be discarded.

    A token is discarded when it is a stop word (case-insensitive), shorter
    than two characters (which includes the empty string), made only of
    digits, or when it contains a time or a date.
    """
    stop_words = {
        "de", "da", "do", "dos", "das", "e", "ou", "a", "o", "as", "os",
        "para", "com", "em", "na", "no", "por", "pelo", "pelos", "uma",
        "pelas", "sem", "sobre", "entre", "até", "um",
        "antes", "depois", "durante", "após", "segundo", "junto",
    }
    return (
        token.lower() in stop_words
        or len(token) < 2
        or token.isdigit()
        or hours_detection(token)
        or date_detection(token)
    )


# First drop the invalid tokens of every row, then collapse repeated tokens
# (going through a set, so the resulting order is arbitrary).
df = df.map(
    lambda row: (row[0], [tok for tok in row[1] if not invalid_tokens_detection(tok)])
).map(
    lambda row: (row[0], list(set(row[1])))
)

df.toDF(["topic", "tokens"]).sample(fraction=0.1).show(truncate=False, n=5)

[Stage 187:>                                                        (0 + 1) / 1]

+-----------------------+---------------------------------------------------+
|topic                  |tokens                                             |
+-----------------------+---------------------------------------------------+
|África África          |[África África, África]                            |
|Fuel Publicidade       |[Publicidade, Fuel, Fuel Publicidade]              |
|SAPO Tek               |[Tek, SAPO Tek, SAPO]                              |
|Museus                 |[Museus]                                           |
|Amadora Recrudescimento|[Recrudescimento, Amadora Recrudescimento, Amadora]|
+-----------------------+---------------------------------------------------+
only showing top 5 rows



                                                                                

remove stop words that *introduce* the topic, such as "O Banco de Portugal" -> "Banco de Portugal"

In [44]:
def remove_introductory_stopword(token):
    """Strip one leading article/preposition (plus its space) from `token`.

    The comparison is case-insensitive; the rest of the token keeps its
    original casing. Tokens that do not start with one of the listed words
    followed by a space are returned unchanged.
    """
    leading_words = (
        "a ", "o ",
        "as ", "os ", "um ", "de ", "da ", "do ",
        "uma ", "uns ",
        "umas ",
    )
    lowered = token.lower()

    for prefix in leading_words:
        if lowered.startswith(prefix):
            return token[len(prefix):]

    return token

# Strip the leading stop word of every token. Stripping can turn a token into
# one that is already in the list ("O Governo" -> "Governo") or into an empty
# string, so empty results are dropped and the list is de-duplicated again,
# keeping the first occurrence of each token.
df = df.map(
    lambda row: (
        row[0],
        list(dict.fromkeys(
            stripped
            for stripped in map(remove_introductory_stopword, row[1])
            if stripped
        )),
    )
)

df.toDF(["topic", "tokens"]).sample(fraction=0.1).show(truncate=False, n=5)

[Stage 191:>                                                        (0 + 1) / 1]

+----------------+--------------------------------------+
|topic           |tokens                                |
+----------------+--------------------------------------+
|Thai Raksa Chart|[Chart, Thai Raksa Chart, Thai, Raksa]|
|podia           |[podia]                               |
|vitamina D      |[vitamina D, vitamina]                |
|Portimão Mar    |[Mar, Portimão Mar, Portimão]         |
|Somem           |[Somem]                               |
+----------------+--------------------------------------+
only showing top 5 rows



25/04/13 10:39:52 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 191 (TID 259): Attempting to kill Python Worker
                                                                                

map tokens to some sort of lemma, ex.: "Saúde" = "saúde"

the idea is to have a lemma and all tokens mapped to it (variants of the lemma), then choose the most frequent variant as the lemma

ex.: galp, Galp, GALP -> galp, but Galp is the most frequent variant, so it becomes the lemma: galp, Galp, GALP -> Galp

In [45]:
# get all tokens (one row per token occurrence)
tokens_df = df.toDF(["topic", "tokens"]).select(F.explode(F.col("tokens")).alias("token"))


# count how often each variant of a token occurs and keep the most frequent
# variant per lower-cased form; ties on the count are broken alphabetically
# so the chosen variant is the same on every run
window_spec = Window.partitionBy("std_token").orderBy(F.desc("count"), F.asc("token"))

tokens_df = (
    tokens_df
    .withColumn("std_token", F.lower(F.col("token")))
    .groupBy("std_token", "token")
    .agg(F.count("*").alias("count"))
    .withColumn("rank", F.row_number().over(window_spec))
    .filter(F.col("rank") == 1)
    .select("std_token", "token", "count")
    .orderBy(F.desc("count"))
)

tokens_df.show(truncate=False, n=5)

[Stage 202:>                                                        (0 + 8) / 9]

+-----------+-----------+-----+
|std_token  |token      |count|
+-----------+-----------+-----+
|tópicos    |Tópicos    |18581|
|foto       |Foto       |15169|
|portugal   |Portugal   |13868|
|fotogaleria|Fotogaleria|13082|
|mundo      |Mundo      |12329|
+-----------+-----------+-----+
only showing top 5 rows



                                                                                

In [46]:
# lower-cased token -> most frequent variant, collected on the driver
std_to_token_dict = {
    row["std_token"]: row["token"]
    for row in tokens_df.collect()
}

# Ship the lookup table to the executors once as a broadcast variable instead
# of serialising it inside the closure of every task.
std_to_token_bc = spark.sparkContext.broadcast(std_to_token_dict)

# Replace every token by its chosen variant. A token missing from the table
# is kept as is rather than failing the job, and variants of the same token
# inside one row ("GALP", "Galp") collapse to a single entry.
df = df.map(
    lambda row: (
        row[0],
        list(dict.fromkeys(
            std_to_token_bc.value.get(tok.lower(), tok) for tok in row[1]
        )),
    )
)

df.toDF(["topic", "tokens"]).sample(fraction=0.1).show(truncate=False, n=5)

[Stage 224:>                                                        (0 + 1) / 1]

+--------------+---------------------------------+
|topic         |tokens                           |
+--------------+---------------------------------+
|Sonae SR      |[Sr, Sonae, Sonae SR]            |
|Contactado    |[Contactado]                     |
|Unidos Podemos|[Podemos, Unidos, Unidos Podemos]|
|Painel        |[Painel]                         |
|instala       |[instala]                        |
+--------------+---------------------------------+
only showing top 5 rows



25/04/13 10:41:02 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 224 (TID 317): Attempting to kill Python Worker
                                                                                

save the topics mapping to a file

In [47]:
# Write the final (topic, tokens) mapping as JSON to "topics.json",
# replacing any previous output at that path.
(
    df.toDF(["topic", "tokens"])
    .write
    .mode("overwrite")
    .json("topics.json")
)

                                                                                