## **USO DE SPARK CON EL LIBRO DE THE WAR DRAMA OF THE EAGLES**

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col, regexp_replace, replace, size
import requests

In [3]:
spark = SparkSession.builder.appName("Analisis libro").getOrCreate()

In [4]:
url = "https://www.gutenberg.org/cache/epub/75293/pg75293.txt"

In [5]:
response = requests.get(url)
libro_texto = response.text

In [6]:
with open("libro.txt", "w", encoding="utf-8") as file:
    file.write(libro_texto)

In [7]:
rdd = spark.sparkContext.textFile("libro.txt")

In [8]:
rdd = rdd.filter(lambda line: line.strip() !="")

In [9]:
capitulos = rdd.filter(lambda line: line.strip().startswith("CHAPTER")).collect()
parrafos = rdd.flatMap(lambda line: line.split("\n\n"))

In [12]:
df_parrafos = parrafos.map(lambda p: (p,)).toDF(["parrafo"])
df_parrafos.show(25)

+--------------------+
|             parrafo|
+--------------------+
|The Project Guten...|
|This ebook is for...|
|most other parts ...|
|whatsoever. You m...|
|of the Project Gu...|
|at www.gutenberg....|
|you will have to ...|
|before using this...|
|Title: The war dr...|
|        Napoleon'...|
|Author: Edward Fr...|
|Release date: Feb...|
|   Language: English|
|Original publicat...|
|Credits: Brian Co...|
|*** START OF THE ...|
|Transcriber’s Not...|
|notes will be fou...|
|                 ...|
|                 ...|
|[Illustration: PO...|
|     PARADE UNIFORM.|
|From St. Hilaire’...|
|                 ...|
|                 ...|
+--------------------+
only showing top 25 rows



In [13]:
df_parrafos = df_parrafos.withColumn("parrafo_limpio", regexp_replace(col("parrafo"), "[^a-zA-Z\s]", "").alias("parrafo_limpio"))
df_parrafos = df_parrafos.withColumn("parrafo_limpio", regexp_replace(col("parrafo_limpio"), "\s+", " ").alias("parrafo_limpio"))

In [14]:
df_parrafos = df_parrafos.withColumn("oraciones", split(col("parrafo_limpio"), "\."))
df_parrafos = df_parrafos.withColumn("num_oraciones", size(col("oraciones")))
df_parrafos = df_parrafos.withColumn("palabras", split(col("parrafo_limpio"), " "))
df_parrafos = df_parrafos.withColumn("num_palabras", size(col("palabras")))

In [15]:
df_parrafos.show()

+--------------------+--------------------+--------------------+-------------+--------------------+------------+
|             parrafo|      parrafo_limpio|           oraciones|num_oraciones|            palabras|num_palabras|
+--------------------+--------------------+--------------------+-------------+--------------------+------------+
|The Project Guten...|The Project Guten...|[The Project Gute...|            1|[The, Project, Gu...|          11|
|This ebook is for...|This ebook is for...|[This ebook is fo...|            1|[This, ebook, is,...|          14|
|most other parts ...|most other parts ...|[most other parts...|            1|[most, other, par...|          14|
|whatsoever. You m...|whatsoever You ma...|[whatsoever You m...|            1|[whatsoever, You,...|          14|
|of the Project Gu...|of the Project Gu...|[of the Project G...|            1|[of, the, Project...|          11|
|at www.gutenberg....|at wwwgutenbergor...|[at wwwgutenbergo...|            1|[at, wwwgutenberg.