# Lab 8 RDDs

## Librerías

In [None]:
from pyspark.sql import SparkSession
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK "stopwords" corpus that the stopword-filtering cell reads;
# quiet=True is passed so the call does not print its progress log.
nltk.download("stopwords", quiet=True) # Download the stopwords if they are not already downloaded
# docker compose up -d   (NOTE(review): presumably the shell command that starts the container this notebook runs in — confirm)

## Spark Session

In [None]:
# Create (or reuse) the Spark session for this notebook, running on the
# "local[*]" master under the application name "MiApp".
spark = (
    SparkSession.builder
    .appName("MiApp")
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

# Load the source document as an RDD of text lines.
working_dir = "/opt/app/working_dir/"
rdd = sc.textFile(f"{working_dir}constitution.txt")

## Carga de RDD y exploración

In [None]:
# Quick look at the raw data: the first three lines and the total line count.
primeras_lineas = rdd.take(3)
num_lineas = rdd.count()

print("=== Exploración inicial ===")
# Number the sampled lines starting from 1 for readability.
for numero, linea in enumerate(primeras_lineas, start=1):
    print(f"Línea {numero}: {linea}")
print(f"\nTotal de líneas en el documento: {num_lineas:,}")

## Word Count

In [None]:
# map keeps one output element per input line, so this yields an RDD of
# lists (one list of space-separated pieces per line).
splitted_lines = rdd.map(lambda line: line.split(' '))
print("\nEjemplo con map (3 elementos):")
print(splitted_lines.take(3))

# Cleaning + normalisation pipeline.
# flatMap flattens the per-line lists into a single RDD of tokens; tokens are
# then trimmed, filtered to purely alphanumeric ones, and lower-cased.
# NOTE: str.isalnum() is False for any token that contains punctuation, so a
# token such as "states," is discarded entirely rather than trimmed.
# The RDD is cached because it feeds several actions (count and reduce below,
# plus the word count built from it); without caching each action would
# re-read and re-tokenise the text file.
words_rdd = (
    rdd.flatMap(lambda line: line.strip().split(' '))
       .map(lambda w: w.strip())
       .filter(lambda w: w != '' and w.isalnum())
       .map(lambda w: w.lower())
       .cache()
)

num_palabras = words_rdd.count()
print(f"\nTotal de 'tokens' limpios: {num_palabras:,}")

# Longest word (reduce).
# RDD.reduce requires a commutative and associative operator. Comparing on
# (length, word) gives a total order, so the winner does not depend on how
# Spark happens to combine partitions; among equally long words the
# lexicographically greatest one is reported.
mas_larga = words_rdd.reduce(
    lambda a, b: max(a, b, key=lambda w: (len(w), w))
)
print(f"Palabra más larga: '{mas_larga}'")


## Conteos y Top

In [None]:
# Classic word count: pair every token with 1, then sum the ones per word.
keyval_rdd = words_rdd.map(lambda w: (w, 1))
wordcount = keyval_rdd.reduceByKey(lambda a, b: a + b)

# Top 5 as (frequency, word) pairs.
# takeOrdered returns only the 5 smallest elements under the given key, so the
# whole RDD does not have to be sorted just to keep five rows. The key
# (-frequency, word) ranks by descending frequency and breaks ties
# alphabetically, which makes the result reproducible between runs.
top5 = (wordcount
        .map(lambda kv: (kv[1], kv[0]))
        .takeOrdered(5, key=lambda fw: (-fw[0], fw[1])))

print("\nTop 5 (incluyendo stopwords):")
for rank, (freq, word) in enumerate(top5, 1):
    print(f"{rank}. {word} -> {freq}")

## Top-N sin stopwords

In [None]:
# Base English stopwords plus frequent terms of the legal/constitutional domain.
sw = set(stopwords.words("english"))
domain_sw = {"shall", "section", "sections", "article", "articles", "state", "states"}
stopwords_all = sw.union(domain_sw)

# Drop stopwords, then take the top 5 as (frequency, word) pairs.
# takeOrdered returns only the 5 smallest elements under the given key, so the
# whole RDD does not have to be sorted just to keep five rows. The key
# (-frequency, word) ranks by descending frequency and breaks ties
# alphabetically, which makes the result reproducible between runs.
top5_no_stop = (
    wordcount
    .filter(lambda kv: kv[0] not in stopwords_all)
    .map(lambda kv: (kv[1], kv[0]))
    .takeOrdered(5, key=lambda fw: (-fw[0], fw[1]))
)

print("Top 5 sin stopwords:")
for i, (freq, word) in enumerate(top5_no_stop, 1):
    print(f"{i}. {word} -> {freq}")