In [1]:
!pip install pyspark




In [2]:
from pyspark.sql import SparkSession

# Start a Spark session, or reuse the one already attached to this kernel.
# The "local[*]" master selects Spark's local mode (per the Spark docs, one
# worker thread per logical core).
spark = (
    SparkSession.builder
    .appName("RDDs en Colab")
    .master("local[*]")
    .getOrCreate()
)

# The SparkContext is the entry point used below for the RDD API.
sc = spark.sparkContext


In [4]:
!wget https://gist.githubusercontent.com/blakesanie/dde3a2b7e698f52f389532b4b52bc254/raw/76fe1b5e9efcf0d2afdfd78b0bfaa737ad0a67d3/shakespeare.txt
file_path = "shakespeare.txt"
shakespeareRDD = sc.textFile(file_path)
shakespeareRDD.take(10)

--2025-06-25 01:12:45--  https://gist.githubusercontent.com/blakesanie/dde3a2b7e698f52f389532b4b52bc254/raw/76fe1b5e9efcf0d2afdfd78b0bfaa737ad0a67d3/shakespeare.txt
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5436475 (5.2M) [text/plain]
Saving to: ‘shakespeare.txt’


2025-06-25 01:12:45 (250 MB/s) - ‘shakespeare.txt’ saved [5436475/5436475]



['  From fairest creatures we desire increase,',
 "  That thereby beauty's rose might never die,",
 '  But as the riper should by time decease,',
 '  His tender heir might bear his memory:',
 '  But thou contracted to thine own bright eyes,',
 "  Feed'st thy light's flame with self-substantial fuel,",
 '  Making a famine where abundance lies,',
 '  Thy self thy foe, to thy sweet self too cruel:',
 "  Thou that art now the world's fresh ornament,",
 '  And only herald to the gaudy spring,']

In [5]:
def minuscula(texto):
    """Return a lowercase copy of the given line of text."""
    en_minusculas = texto.lower()
    return en_minusculas

In [6]:
# Apply minuscula to every line; map produces a new RDD and shakespeareRDD
# itself is not modified.
lowerRDD = shakespeareRDD.map(minuscula)
# Preview the first five lowercased lines.
lowerRDD.take(5)

['  from fairest creatures we desire increase,',
 "  that thereby beauty's rose might never die,",
 '  but as the riper should by time decease,',
 '  his tender heir might bear his memory:',
 '  but thou contracted to thine own bright eyes,']

In [16]:
# Lowercase every line, using an inline lambda for the conversion
lowerRDD = shakespeareRDD.map(lambda texto: texto.lower())
# Preview the first five lowercased lines
lowerRDD.take(5)

['  from fairest creatures we desire increase,',
 "  that thereby beauty's rose might never die,",
 '  but as the riper should by time decease,',
 '  his tender heir might bear his memory:',
 '  but thou contracted to thine own bright eyes,']

In [17]:
# Keep the lines in which "love" occurs as a substring; this is not a
# whole-word match, so lines with "lovely" or "beloved" pass the filter too
loveLinesRDD = lowerRDD.filter(lambda texto: "love" in texto)
# Preview the first 10 matching lines
loveLinesRDD.take(10)

['  of his self-love to stop posterity?',
 '  calls back the lovely april of her prime,',
 '  unthrifty loveliness why dost thou spend,',
 '  the lovely gaze where every eye doth dwell',
 '    no love toward others in that bosom sits',
 "  for shame deny that thou bear'st love to any",
 '  grant if thou wilt, thou art beloved of many,',
 '  shall hate be fairer lodged than gentle love?',
 '    make thee another self for love of me,',
 '  o that you were your self, but love you are']

In [18]:
# Plain-Python demo: split() with no arguments cuts the string on whitespace,
# so punctuation stays attached to its word.
"Hola como estás?".split()

['Hola', 'como', 'estás?']

In [19]:
# Look at the first 10 raw lines again (original capitalization).
shakespeareRDD.take(10)

['  From fairest creatures we desire increase,',
 "  That thereby beauty's rose might never die,",
 '  But as the riper should by time decease,',
 '  His tender heir might bear his memory:',
 '  But thou contracted to thine own bright eyes,',
 "  Feed'st thy light's flame with self-substantial fuel,",
 '  Making a famine where abundance lies,',
 '  Thy self thy foe, to thy sweet self too cruel:',
 "  Thou that art now the world's fresh ornament,",
 '  And only herald to the gaudy spring,']

In [20]:
# flatMap splits each line on whitespace and flattens the pieces into a
# single RDD with one word per element
wordsRDD = shakespeareRDD.flatMap(lambda linea: linea.split())
# Preview the first 10 words
wordsRDD.take(10)

['From',
 'fairest',
 'creatures',
 'we',
 'desire',
 'increase,',
 'That',
 'thereby',
 "beauty's",
 'rose']

In [21]:
# Total number of whitespace-separated words in the text, repeats included.
wordsRDD.count()

899588

In [22]:
# Get the unique words; words are compared exactly as split, so differences
# in case or attached punctuation make two words distinct
uniqueWordsRDD = wordsRDD.distinct()
# Preview 20 of the unique words
uniqueWordsRDD.take(20)

['From',
 'desire',
 'increase,',
 'rose',
 'might',
 'never',
 'But',
 'as',
 'riper',
 'should',
 'by',
 'tender',
 'bear',
 'his',
 'thou',
 'to',
 'thine',
 'own',
 'thy',
 'with']

In [23]:
# Number of distinct words in the text.
uniqueWordsRDD.count()

67107

In [24]:
# Collect the elements of the RDD as a list (use only with small RDDs)
collectedWords = uniqueWordsRDD.collect()
print(collectedWords[:10])  # Show the first 10

['From', 'desire', 'increase,', 'rose', 'might', 'never', 'But', 'as', 'riper', 'should']


In [25]:
# Check the Python type of the object returned by collect().
type(collectedWords)

list

In [26]:
# Number of collected elements, one per unique word.
len(collectedWords)

67107

In [27]:
# Count the total number of unique words
wordCount = uniqueWordsRDD.count()
print(f"Número total de palabras únicas: {wordCount}")

Número total de palabras únicas: 67107


In [28]:
# Take the first 5 unique words
first5Words = uniqueWordsRDD.take(5)
print(f"Primeras 5 palabras únicas: {first5Words}")

Primeras 5 palabras únicas: ['From', 'desire', 'increase,', 'rose', 'might']


In [29]:
# Step 1: break every line into its whitespace-separated words
wordsRDD = shakespeareRDD.flatMap(lambda linea: linea.split())

# Step 2: reduce pairwise, keeping the longer word of each pair;
# when both have the same length the right-hand word is kept
longestWord = wordsRDD.reduce(
    lambda izquierda, derecha: derecha if len(izquierda) <= len(derecha) else izquierda
)

print(f"La palabra más larga es: {longestWord}")

La palabra más larga es: tragical-comical-historical-pastoral;


In [32]:

# Reduce pairwise, keeping the shorter word of each pair;
# when both have the same length the right-hand word is kept
shortestWord = wordsRDD.reduce(
    lambda izquierda, derecha: derecha if len(izquierda) >= len(derecha) else izquierda
)

print(f"La palabra más corta es: {shortestWord}")

La palabra más corta es: a


In [33]:
# Step 1: use flatMap to break every line into words
wordsRDD = shakespeareRDD.flatMap(lambda linea: linea.split())

# Step 2: turn each word into a (word, 1) pair
wordPairsRDD = wordsRDD.map(lambda palabra: (palabra, 1))
# Preview the first 10 pairs
wordPairsRDD.take(10)

[('From', 1),
 ('fairest', 1),
 ('creatures', 1),
 ('we', 1),
 ('desire', 1),
 ('increase,', 1),
 ('That', 1),
 ('thereby', 1),
 ("beauty's", 1),
 ('rose', 1)]

In [34]:
# Step 3: use reduceByKey to add up the 1s of each word, giving its number
# of occurrences
wordCountsRDD = wordPairsRDD.reduceByKey(lambda acumulado, siguiente: acumulado + siguiente)

# Step 4: show 10 (word, count) pairs; nothing has been sorted here, so
# these are not the 10 most frequent words despite the variable name
top10Words = wordCountsRDD.take(10)
print("Palabras y sus conteos:", top10Words)

Palabras y sus conteos: [('From', 349), ('desire', 161), ('increase,', 9), ('rose', 40), ('might', 414), ('never', 865), ('But', 2321), ('as', 4267), ('riper', 3), ('should', 1383)]


In [35]:
# Step 1: break every line into words
wordsRDD = shakespeareRDD.flatMap(lambda linea: linea.split())

# Step 2: pair each word with an initial count of 1
wordPairsRDD = wordsRDD.map(lambda palabra: (palabra, 1))

# Step 3: add up the counts of identical words
wordCountsRDD = wordPairsRDD.reduceByKey(lambda acumulado, siguiente: acumulado + siguiente)

# Step 4: order the (word, count) pairs by count, highest first
sortedWordCountsRDD = wordCountsRDD.sortBy(lambda par: par[1], ascending=False)

# Step 5: keep the 10 pairs at the head of that ordering
top10Words = sortedWordCountsRDD.take(10)
print("Las 10 palabras más frecuentes y sus conteos:", top10Words)

Las 10 palabras más frecuentes y sus conteos: [('the', 23197), ('I', 19540), ('and', 18263), ('to', 15592), ('of', 15507), ('a', 12516), ('my', 10824), ('in', 9565), ('you', 9059), ('is', 7831)]


In [36]:
# Close the Spark session
spark.stop()