In [29]:
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("data03") \
    .config("spark.driver.memory", "8g") \
    .config("spark.sql.autoBroadcastJoinThreshold", -1) \
    .getOrCreate()


In [30]:
# read text folder
df = spark.read.text("topics.txt")
df.show(truncate=False)

+------------------------------------+
|value                               |
+------------------------------------+
|P2 Ípsilon Ímpar Fugas P3           |
|laboratório                         |
|advogado                            |
|descrever                           |
|Soup Nazi                           |
|divulgar                            |
|embaraço                            |
|Bares Já                            |
|poupar                              |
|Coronavírus Saúde Pública Covid-19  |
|Lifestyle   Mail                    |
|Suécia                              |
|meninas                             |
|dependência                         |
|giga                                |
|Covid-19 29 Julho                   |
|repetir                             |
|Reveladas                           |
|pigmentação                         |
|Confederação do Comércio de Portugal|
+------------------------------------+
only showing top 20 rows



In [31]:
# count how many unique topics are there
print("Number of unique topics: ", df.select("value").count())

Number of unique topics:  1457234


---

In [32]:
# count how many \t are in total
def count_tabs(row):
    return row.count("\t")
df_rdd = df.rdd.map(lambda row: row[0])
df_rdd = df_rdd.map(count_tabs)
# count how many \t are in total
total_tabs = df_rdd.reduce(lambda x, y: x + y)
print("Total number of tabs in the file: ", total_tabs)

[Stage 43:>                                                         (0 + 7) / 7]

Total number of tabs in the file:  0


                                                                                

---

In [33]:
# count how many "  " are in total
def count_spaces(row):
    return row.count("  ")
df_rdd = df.rdd.map(lambda row: row[0])
df_rdd = df_rdd.map(count_spaces)
# count how many "  " are in total
total_spaces = df_rdd.reduce(lambda x, y: x + y)
print("Total number of spaces in the file: ", total_spaces)

[Stage 44:>                                                         (0 + 7) / 7]

Total number of spaces in the file:  301448


                                                                                

split multiple spaces into new keys: "Covid-19  Saúde": 10 -> "Covid-19": x+10, "Saúde": y+10

but, need to take into account 2*" " is same as 3*" ", so sequential spaces should be treated as one gap (talvez usar regex)

---

In [34]:
# count how many one letter words are in total
def count_one_letter_words(row):
    return len(row) == 1
df_rdd = df.rdd.map(lambda row: row[0])
df_rdd = df_rdd.map(count_one_letter_words)
# count how many one letter words are in total
total_one_letter_words = df_rdd.reduce(lambda x, y: x + y)
print("Total number of one letter words in the file: ", total_one_letter_words)

Total number of one letter words in the file:  55


                                                                                

remove rows with only one character

---

In [35]:
# count how many hours and dates are in total
import re

def hours(row):
    regex = r"\b(?:\d{1,2}:\d{2}|\d{1,2}h\d{2})\b"
    return bool(re.search(regex, row))
df_rdd = df.rdd.map(lambda row: row[0])
df_rdd = df_rdd.map(hours)
# count how many hours and dates are in total
total_hours = df_rdd.reduce(lambda x, y: x + y)

def dates(row):
    months = r"\b(?:Jan|Fev|Abr|Mai|Jun|Jul|Ago|Set|Out|Nov|Dez)\b"
    date = r"\b(Janeiro|Fevereiro|Março|Abril|Maio|Junho|Julho|Agosto|Setembro|Outubro|Novembro|Dezembro)[ ]?\d{2,4}\b"
    march = r"\b(?:\d{1,2} Mar|Mar \d{2,4})\b"
    return bool(re.search(months, row)) or bool(re.search(date, row)) or bool(re.search(march, row))
df_rdd = df.rdd.map(lambda row: row[0])
df_rdd = df_rdd.map(dates)
# count how many hours and dates are in total
total_dates = df_rdd.reduce(lambda x, y: x + y)

print("Total number of hours in the file: ", total_hours)
print("Total number of dates in the file: ", total_dates)

[Stage 47:>                                                         (0 + 7) / 7]

Total number of hours in the file:  1670
Total number of dates in the file:  5008


                                                                                

remove rows wich are true in this cases (hours and dates)

---

In [39]:
# count how many replacement chars are in total
def replacementchar(row):
    return bool(re.search(r"\uFFFD", row))
df_rdd = df.rdd.map(lambda row: row[0])
df_rdd = df_rdd.map(replacementchar)
# count how many hours and dates are in total
total_replac = df_rdd.reduce(lambda x, y: x + y)

print("Total number of replcament chars in the file: ", total_replac)

[Stage 51:>                                                         (0 + 7) / 7]

Total number of replcament chars in the file:  0


                                                                                

---

In [40]:
# count determinanst
def determinanst(row):
    determinants = r"^(?:[Oo]s?|[Aa]s?|[Uu]ma?)\b"
    return bool(re.search(determinants, row))
df_rdd = df.rdd.map(lambda row: row[0])
df_rdd = df_rdd.map(determinanst)
# count how many hours and dates are in total
total_determinants = df_rdd.reduce(lambda x, y: x + y)
print("Total number of determinanst in the file: ", total_determinants)

[Stage 52:>                                                         (0 + 7) / 7]

Total number of determinanst in the file:  30658


                                                                                

remove determinants from the start of the topic

---

In [66]:
import re

# Define valid Portuguese characters
custom_valid = [
    "ã", "á", "à", "â", "Á", "Ã",
    "é", "ê", "É", "Ê",
    "í", "Í",
    "ó", "ô", "õ", "Ó", "Õ",
    "ú", "Ú",
    "ç", "-", "º", "ª"
]

fix_map = {
    "ő": "õ", "Ő": "Õ", "ť": "ç", "ťo": "ção",
    "Ã£": "ã", "Ã¡": "á", "Ãª": "ê", "Ã³": "ó",
    "Ã­": "í", "Ã©": "é", "Ã§": "ç", "Ã‰": "É",
    "Ãµ": "õ", "Ãº": "ú", "Ã‰": "É", "ů": "ó",
    "ă": "ã", "ę": "ê", "¾": "ó",
}

def valid_word_detection(text):
    return not re.search(rf"[^a-zA-Z0-9 {''.join(re.escape(c) for c in custom_valid)}]", text or "")

def fix_encoding_issues(text):
    if not isinstance(text, str):
        return text
    for enc in ["latin1", "cp1252"]:
        try:
            return text.encode(enc).decode("utf-8")
        except Exception:
            continue
    return text

def manual_fix(text):
    for wrong, right in fix_map.items():
        text = text.replace(wrong, right)
    return text

def fixed_words(row):
    if valid_word_detection(row):
        return True

    else:
        fixed_row = fix_encoding_issues(row)
        fixed_row = manual_fix(fixed_row)

        if valid_word_detection(fixed_row):
            return True
    
    return False


df_rdd = df.rdd.map(lambda row: row[0])
df_rdd_invalid = df_rdd.filter(lambda row: not fixed_words(row))

print("Total number of bad rows: ", df_rdd_invalid.count())



Total number of bad rows:  33910


                                                                                

adjust possible texts, delete impossible texts

---