In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, lower, explode, \
                                  regexp_replace, \
                                  size, lit, array_contains, \
                                  regexp_extract, element_at, col
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
import numpy as np
import math
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
carpeta = './books/'
spark = SparkSession.builder.appName('Books').getOrCreate()
rdd = spark.sparkContext.wholeTextFiles(carpeta)
df = rdd.toDF(["ruta", "texto"])
df = df.withColumn("doc", regexp_extract("ruta", r"([^/]+$)", 1))
df = df.withColumn("doc",regexp_replace("doc", "\\.txt$", ""))
df = df.withColumn("texto",regexp_replace("texto", "\\n$", ""))
df = df.select("doc", "texto")
filterWords = stopwords.words('english')
df.show()

[nltk_data] Downloading package stopwords to /home/jenoe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/07 20:50:39 WARN Utils: Your hostname, ParienteLAP, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/07 20:50:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/07 20:50:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+--------------------+--------------------+
|                 doc|               texto|
+--------------------+--------------------+
|  The King in Yellow|\r\n             ...|
|         The Odyssey|The Odyssey\r\n\r...|
|A Doll's House _ ...|A Doll’s House\r\...|
|Golden Days for B...|\r\n             ...|
|        The Republic|THE REPUBLIC\r\n\...|
|    The Great Gatsby|\r\n             ...|
|Beowulf_ An Anglo...|BEOWULF\r\nAN ANG...|
|      The Expedition|THE EXPEDITION OF...|
|      Rip Van Winkle|RIP VAN WINKLE\r\...|
|A Tale of Two Cities|A TALE OF TWO CIT...|
|        Little Women|\r\n             ...|
|  A Room with a View|A Room With A Vie...|
|   Frankenstein 1993|Frankenstein;\r\n...|
|The Adventures of...|﻿The Project Gute...|
|           The Iliad|The\r\nIliad of H...|
|              Hamlet|THE TRAGEDY OF HA...|
|How to Observe_ M...|\r\n    HOW TO OB...|
|The Adventures of...|﻿The Project Gute...|
|The Adventures of...|﻿The Project Gute...|
|  My Life — Volume 1|My Life\r\

In [2]:
def clean(df, col:str):
    df_limpia = df.withColumn(
        col,
        regexp_replace(col, '[^a-zA-Z0-9 \\n$]','')
    )
    return df_limpia

In [3]:
df_clean = clean(df, 'texto')
df_clean.show()


                                                                                

+--------------------+--------------------+
|                 doc|               texto|
+--------------------+--------------------+
|  The King in Yellow|\n               ...|
|         The Odyssey|The Odyssey\n\nby...|
|A Doll's House _ ...|A Dolls House\n\n...|
|Golden Days for B...|\n               ...|
|        The Republic|THE REPUBLIC\n\nB...|
|    The Great Gatsby|\n               ...|
|Beowulf_ An Anglo...|BEOWULF\nAN ANGLO...|
|      The Expedition|THE EXPEDITION OF...|
|      Rip Van Winkle|RIP VAN WINKLE\n\...|
|A Tale of Two Cities|A TALE OF TWO CIT...|
|        Little Women|\n               ...|
|  A Room with a View|A Room With A Vie...|
|   Frankenstein 1993|Frankenstein\n\no...|
|The Adventures of...|The Project Guten...|
|           The Iliad|The\nIliad of Hom...|
|              Hamlet|THE TRAGEDY OF HA...|
|How to Observe_ M...|\n    HOW TO OBSE...|
|The Adventures of...|The Project Guten...|
|The Adventures of...|The Project Guten...|
|  My Life — Volume 1|My Life\n\

In [4]:
df_palabras = df_clean.withColumn('texto',split(
                lower('texto'),
                "\\s+"
            ) )

In [5]:
df_palabras.show()

                                                                                

+--------------------+--------------------+
|                 doc|               texto|
+--------------------+--------------------+
|  The King in Yellow|[, the, king, in,...|
|         The Odyssey|[the, odyssey, by...|
|A Doll's House _ ...|[a, dolls, house,...|
|Golden Days for B...|[, golden, days, ...|
|        The Republic|[the, republic, b...|
|    The Great Gatsby|[, the, great, ga...|
|Beowulf_ An Anglo...|[beowulf, an, ang...|
|      The Expedition|[the, expedition,...|
|      Rip Van Winkle|[rip, van, winkle...|
|A Tale of Two Cities|[a, tale, of, two...|
|        Little Women|[, little, women,...|
|  A Room with a View|[a, room, with, a...|
|   Frankenstein 1993|[frankenstein, or...|
|The Adventures of...|[the, project, gu...|
|           The Iliad|[the, iliad, of, ...|
|              Hamlet|[the, tragedy, of...|
|How to Observe_ M...|[, how, to, obser...|
|The Adventures of...|[the, project, gu...|
|The Adventures of...|[the, project, gu...|
|  My Life — Volume 1|[my, life,

In [6]:
from pyspark.sql import functions as F

In [7]:
df_palabras_filter = df_palabras.withColumn("texto",F.filter("texto",
                                                             lambda x: ~array_contains
                                                             (lit(filterWords), x)))

In [8]:
df_palabras_filter.show()

                                                                                

+--------------------+--------------------+
|                 doc|               texto|
+--------------------+--------------------+
|  The King in Yellow|[, king, yellow, ...|
|         The Odyssey|[odyssey, homer, ...|
|A Doll's House _ ...|[dolls, house, he...|
|Golden Days for B...|[, golden, days, ...|
|        The Republic|[republic, plato,...|
|    The Great Gatsby|[, great, gatsby,...|
|Beowulf_ An Anglo...|[beowulf, anglosa...|
|      The Expedition|[expedition, hump...|
|      Rip Van Winkle|[rip, van, winkle...|
|A Tale of Two Cities|[tale, two, citie...|
|        Little Women|[, little, women,...|
|  A Room with a View|[room, view, e, f...|
|   Frankenstein 1993|[frankenstein, mo...|
|The Adventures of...|[project, gutenbe...|
|           The Iliad|[iliad, homer, tr...|
|              Hamlet|[tragedy, hamlet,...|
|How to Observe_ M...|[, observe, moral...|
|The Adventures of...|[project, gutenbe...|
|The Adventures of...|[project, gutenbe...|
|  My Life — Volume 1|[life, ric

In [9]:
df_palabras_joined = df_palabras_filter.withColumn('texto',F.array_join('texto',' '))

In [10]:
df_palabras_explode = df_palabras_filter.withColumn('texto',explode('texto'))

In [11]:
df_palabras_explode.show()

[Stage 5:>                                                          (0 + 1) / 1]

+------------------+-----------+
|               doc|      texto|
+------------------+-----------+
|The King in Yellow|           |
|The King in Yellow|       king|
|The King in Yellow|     yellow|
|The King in Yellow|     robert|
|The King in Yellow|          w|
|The King in Yellow|   chambers|
|The King in Yellow|   original|
|The King in Yellow|publication|
|The King in Yellow|       date|
|The King in Yellow|       1895|
|The King in Yellow|       king|
|The King in Yellow|     yellow|
|The King in Yellow|  dedicated|
|The King in Yellow|    brother|
|The King in Yellow|   contents|
|The King in Yellow|   repairer|
|The King in Yellow|reputations|
|The King in Yellow|       mask|
|The King in Yellow|      court|
|The King in Yellow|     dragon|
+------------------+-----------+
only showing top 20 rows


                                                                                

In [12]:
df_size = df_palabras_explode.groupBy('doc').count()

In [13]:
df_size.show()



+--------------------+------+
|                 doc| count|
+--------------------+------+
|Jane Eyre_ An Aut...| 87560|
|           The Iliad|109823|
|Beowulf_ An Anglo...| 22250|
|      Rip Van Winkle|  4339|
|          On Liberty| 23817|
|Golden Days for B...| 26971|
|    The Great Gatsby| 24283|
|           Leviathan| 96257|
|A Doll's House _ ...| 12254|
|The Adventures of...| 88906|
|The Adventures of...| 77574|
|  A Room with a View| 32817|
|           Dubliners| 33864|
|The Interesting N...| 37629|
|  The King in Yellow| 35347|
|  My Life — Volume 1|103129|
|Simple Sabotage F...|  5101|
|             Ulysses|146847|
|        The Republic| 94157|
|             Dracula| 71814|
+--------------------+------+
only showing top 20 rows


                                                                                

In [14]:
df_words_size = df_palabras_explode.join(df_size, on='doc', how='inner').orderBy('doc')
df_words_size.show(30)

                                                                                

+--------------------+-----------+-----+
|                 doc|      texto|count|
+--------------------+-----------+-----+
|A Christmas Carol...|  christmas|13848|
|A Christmas Carol...|    servant|13848|
|A Christmas Carol...|      carol|13848|
|A Christmas Carol...|      prose|13848|
|A Christmas Carol...|      ghost|13848|
|A Christmas Carol...|      story|13848|
|A Christmas Carol...|  christmas|13848|
|A Christmas Carol...|    charles|13848|
|A Christmas Carol...|    dickens|13848|
|A Christmas Carol...|    preface|13848|
|A Christmas Carol...|endeavoured|13848|
|A Christmas Carol...|    ghostly|13848|
|A Christmas Carol...|     little|13848|
|A Christmas Carol...|       book|13848|
|A Christmas Carol...|      raise|13848|
|A Christmas Carol...|      ghost|13848|
|A Christmas Carol...|       idea|13848|
|A Christmas Carol...|      shall|13848|
|A Christmas Carol...|        put|13848|
|A Christmas Carol...|    readers|13848|
|A Christmas Carol...|     humour|13848|
|A Christmas Car

In [15]:
df_words_size = df_words_size.withColumn('count', 1/df_words_size['count'])
df_words_size.show()

                                                                                

+--------------------+-----------+--------------------+
|                 doc|      texto|               count|
+--------------------+-----------+--------------------+
|A Christmas Carol...|  christmas|7.221259387637204E-5|
|A Christmas Carol...|     season|7.221259387637204E-5|
|A Christmas Carol...|      carol|7.221259387637204E-5|
|A Christmas Carol...|      prose|7.221259387637204E-5|
|A Christmas Carol...|      ghost|7.221259387637204E-5|
|A Christmas Carol...|      story|7.221259387637204E-5|
|A Christmas Carol...|  christmas|7.221259387637204E-5|
|A Christmas Carol...|    charles|7.221259387637204E-5|
|A Christmas Carol...|    dickens|7.221259387637204E-5|
|A Christmas Carol...|    preface|7.221259387637204E-5|
|A Christmas Carol...|endeavoured|7.221259387637204E-5|
|A Christmas Carol...|    ghostly|7.221259387637204E-5|
|A Christmas Carol...|     little|7.221259387637204E-5|
|A Christmas Carol...|       book|7.221259387637204E-5|
|A Christmas Carol...|      raise|7.221259387637

                                                                                

In [16]:
df_TF = df_words_size.groupBy('doc', 'texto',).agg( F.sum('count').alias('tf'))
df_TF.show(100)

[Stage 31:>                                                         (0 + 1) / 1]

+--------------------+------------+--------------------+
|                 doc|       texto|                  tf|
+--------------------+------------+--------------------+
|A Christmas Carol...|   christmas|0.006210283073367994|
|A Christmas Carol...|       carol|2.166377816291161E-4|
|A Christmas Carol...|       prose|7.221259387637204E-5|
|A Christmas Carol...|       ghost|0.006643558636626226|
|A Christmas Carol...|       story|2.166377816291161E-4|
|A Christmas Carol...|     charles|7.221259387637204E-5|
|A Christmas Carol...|     dickens|7.221259387637204E-5|
|A Christmas Carol...|     preface|7.221259387637204E-5|
|A Christmas Carol...| endeavoured|2.166377816291161E-4|
|A Christmas Carol...|     ghostly|5.054881571346042E-4|
|A Christmas Carol...|      little|0.004549393414211437|
|A Christmas Carol...|        book|2.166377816291161E-4|
|A Christmas Carol...|       raise|4.332755632582321...|
|A Christmas Carol...|        idea|4.332755632582321...|
|A Christmas Carol...|       sh

                                                                                

In [17]:
df_DF = df_palabras_explode.withColumn('df',lit(1))

In [18]:
df_DF.show()

+------------------+-----------+---+
|               doc|      texto| df|
+------------------+-----------+---+
|The King in Yellow|           |  1|
|The King in Yellow|       king|  1|
|The King in Yellow|     yellow|  1|
|The King in Yellow|     robert|  1|
|The King in Yellow|          w|  1|
|The King in Yellow|   chambers|  1|
|The King in Yellow|   original|  1|
|The King in Yellow|publication|  1|
|The King in Yellow|       date|  1|
|The King in Yellow|       1895|  1|
|The King in Yellow|       king|  1|
|The King in Yellow|     yellow|  1|
|The King in Yellow|  dedicated|  1|
|The King in Yellow|    brother|  1|
|The King in Yellow|   contents|  1|
|The King in Yellow|   repairer|  1|
|The King in Yellow|reputations|  1|
|The King in Yellow|       mask|  1|
|The King in Yellow|      court|  1|
|The King in Yellow|     dragon|  1|
+------------------+-----------+---+
only showing top 20 rows


In [19]:
df_DF_Count = df_DF.groupBy('texto').agg( F.sum('df').alias('df'))

In [20]:
df_DF_Count.show()



+-------------+-----+
|        texto|   df|
+-------------+-----+
|        still|10061|
|       waters|  734|
|          art| 2431|
|        bebut|    5|
|      welcher|    1|
|      flashed|  242|
|    connected|  457|
|    recognize|  336|
|         hope| 3437|
|   opalescent|    2|
|     painters|   71|
|        spoil|  204|
|otherwherever|    1|
|     tortured|  147|
|  requirement|   12|
|       spared|  199|
|        pools|   34|
|     spoiling|   47|
|      implore|   81|
|     everyday|   85|
+-------------+-----+
only showing top 20 rows


                                                                                

In [21]:
df_DFTF = df_TF.join(df_DF_Count, on='texto', how='inner').orderBy('doc')
df_DFTF = df_DFTF.select('doc','texto','tf','df')

In [22]:
df_DFTF.show()



+--------------------+-----------+--------------------+-----+
|                 doc|      texto|                  tf|   df|
+--------------------+-----------+--------------------+-----+
|A Christmas Carol...|  christmas|0.006210283073367994|  335|
|A Christmas Carol...|      carol|2.166377816291161E-4|   14|
|A Christmas Carol...|      prose|7.221259387637204E-5|  113|
|A Christmas Carol...|      ghost|0.006643558636626226|  526|
|A Christmas Carol...|      story|2.166377816291161E-4| 1828|
|A Christmas Carol...|    charles|7.221259387637204E-5|  479|
|A Christmas Carol...|    dickens|7.221259387637204E-5|   22|
|A Christmas Carol...|    preface|7.221259387637204E-5|  126|
|A Christmas Carol...|endeavoured|2.166377816291161E-4|  343|
|A Christmas Carol...|    ghostly|5.054881571346042E-4|  111|
|A Christmas Carol...|     little|0.004549393414211437|16033|
|A Christmas Carol...|       book|2.166377816291161E-4| 2440|
|A Christmas Carol...|      raise|4.332755632582321...|  635|
|A Chris

                                                                                

In [23]:
df_doc_Count = df.count()

                                                                                

In [24]:
df_DFTF_Pesos = df_DFTF.withColumn('pesos',col('tf') * (F.log10(1 + df_doc_Count / F.col('df'))))
df_DFTF_Pesos = df_DFTF_Pesos.select('doc','texto','pesos')

In [25]:
df_DFTF_Pesos.show()



+--------------------+-----------+--------------------+
|                 doc|      texto|               pesos|
+--------------------+-----------+--------------------+
|A Christmas Carol...|  christmas|7.045221470920073E-4|
|A Christmas Carol...|      carol|1.973086689034303...|
|A Christmas Carol...|      prose|1.988021085754751...|
|A Christmas Carol...|      ghost|5.021772236579677E-4|
|A Christmas Carol...|      story|5.011013468154112E-6|
|A Christmas Carol...|    charles|5.946205250785168E-6|
|A Christmas Carol...|    dickens|5.372163127184734E-5|
|A Christmas Carol...|    preface|1.832307149262261...|
|A Christmas Carol...|endeavoured|2.407053860058472E-5|
|A Christmas Carol...|    ghostly|1.410107116967971E-4|
|A Christmas Carol...|     little|1.228491463774214...|
|A Christmas Carol...|       book|3.779005693502731...|
|A Christmas Carol...|      raise| 2.75188967903896E-5|
|A Christmas Carol...|       idea|7.158306529580312...|
|A Christmas Carol...|      shall|3.909787845647

                                                                                

In [27]:
from pyspark.ml.feature import HashingTF, IDF
from pyspark.sql import functions as F
import numpy as np

# 1. Usar df_palabras_filter directamente - ¡YA TIENE LAS PALABRAS COMO ARRAY!
print("Verificando estructura de datos...")
print("Esquema de df_palabras_filter:")
df_palabras_filter.printSchema()

# 2. Mostrar ejemplo para confirmar
print("\nMuestra de datos (primeros 3 documentos):")
df_palabras_filter.select("doc", "texto").show(3, truncate=50)

# 3. Crear DataFrame para TF-IDF (texto ya es array, no necesitamos split)
# Pero HashingTF espera una columna con arrays de strings
df_for_tfidf = df_palabras_filter.select("doc", "texto")

# Renombrar la columna para claridad
df_for_tfidf = df_for_tfidf.withColumnRenamed("texto", "palabras")

print(f"\nPreparando {df_for_tfidf.count()} documentos para TF-IDF...")
print("Estructura final:")
df_for_tfidf.printSchema()
df_for_tfidf.show(3, truncate=50)

Verificando estructura de datos...
Esquema de df_palabras_filter:
root
 |-- doc: string (nullable = true)
 |-- texto: array (nullable = true)
 |    |-- element: string (containsNull = false)


Muestra de datos (primeros 3 documentos):


                                                                                

+-----------------------+--------------------------------------------------+
|                    doc|                                             texto|
+-----------------------+--------------------------------------------------+
|     The King in Yellow|[, king, yellow, robert, w, chambers, original,...|
|            The Odyssey|[odyssey, homer, rendered, english, prose, use,...|
|A Doll's House _ a play|[dolls, house, henrik, ibsen, contents, act, ac...|
+-----------------------+--------------------------------------------------+
only showing top 3 rows


                                                                                


Preparando 100 documentos para TF-IDF...
Estructura final:
root
 |-- doc: string (nullable = true)
 |-- palabras: array (nullable = true)
 |    |-- element: string (containsNull = false)



[Stage 71:>                                                         (0 + 1) / 1]

+-----------------------+--------------------------------------------------+
|                    doc|                                          palabras|
+-----------------------+--------------------------------------------------+
|     The King in Yellow|[, king, yellow, robert, w, chambers, original,...|
|            The Odyssey|[odyssey, homer, rendered, english, prose, use,...|
|A Doll's House _ a play|[dolls, house, henrik, ibsen, contents, act, ac...|
+-----------------------+--------------------------------------------------+
only showing top 3 rows


                                                                                

In [28]:
# 2. Aplicar HashingTF - ya tenemos "palabras" como array
print("Aplicando HashingTF...")

# Verificar que tenemos datos
print(f"Número de documentos: {df_for_tfidf.count()}")

# Usar HashingTF con la columna de array
hashingTF = HashingTF(inputCol="palabras", outputCol="raw_features", 
                      numFeatures=5000)

df_tf = hashingTF.transform(df_for_tfidf)

# Mostrar cómo se ven las características
print("\nPrimeros 3 documentos transformados:")
df_tf.select("doc", F.size("palabras").alias("num_palabras"), "raw_features").show(3, truncate=30)
print(f"\nNúmero de características: {hashingTF.getNumFeatures()}")

Aplicando HashingTF...


                                                                                

Número de documentos: 100

Primeros 3 documentos transformados:


[Stage 75:>                                                         (0 + 1) / 1]

+-----------------------+------------+------------------------------+
|                    doc|num_palabras|                  raw_features|
+-----------------------+------------+------------------------------+
|     The King in Yellow|       35347|(5000,[1,2,3,4,5,6,7,8,10,1...|
|            The Odyssey|       57990|(5000,[0,1,2,6,7,8,9,10,12,...|
|A Doll's House _ a play|       12254|(5000,[1,2,5,7,13,14,15,17,...|
+-----------------------+------------+------------------------------+
only showing top 3 rows

Número de características: 5000


                                                                                

In [29]:
# 3. Aplicar IDF
print("Calculando IDF (Inverse Document Frequency)...")
idf = IDF(inputCol="raw_features", outputCol="tfidf_vector")
idf_model = idf.fit(df_tf)
df_tfidf = idf_model.transform(df_tf)

print("✅ TF-IDF calculado exitosamente!")
print(f"Total documentos procesados: {df_tfidf.count()}")
print("\nMuestra de vectores TF-IDF (primeros 5):")

# Función para mostrar vectores de forma legible
def format_vector(vector, max_elements=5):
    if vector is None:
        return "None"
    if hasattr(vector, 'toArray'):
        arr = vector.toArray()
        non_zero = arr[arr != 0]
        if len(non_zero) > 0:
            return f"Vector[{len(arr)} dims], {len(non_zero)} no-cero"
        else:
            return f"Vector[{len(arr)} dims], todos cero"
    return str(vector)[:100] + "..."

# Mostrar con formato
for row in df_tfidf.select("doc", "tfidf_vector").take(5):
    doc_name = row["doc"]
    vector_info = format_vector(row["tfidf_vector"])
    print(f"  • {doc_name[:30]:30} -> {vector_info}")

Calculando IDF (Inverse Document Frequency)...


                                                                                

✅ TF-IDF calculado exitosamente!


                                                                                

Total documentos procesados: 100

Muestra de vectores TF-IDF (primeros 5):


[Stage 80:>                                                         (0 + 1) / 1]

  • The King in Yellow             -> Vector[5000 dims], 4096 no-cero
  • The Odyssey                    -> Vector[5000 dims], 3989 no-cero
  • A Doll's House _ a play        -> Vector[5000 dims], 2007 no-cero
  • Golden Days for Boys and Girls -> Vector[5000 dims], 3899 no-cero
  • The Republic                   -> Vector[5000 dims], 4439 no-cero


                                                                                

In [30]:
# 4. Preparar todos los vectores para cálculo de similitud
print("Recopilando todos los vectores TF-IDF...")

# Primero asegurarnos de que tenemos vectores válidos
df_tfidf_valid = df_tfidf.filter(F.col("tfidf_vector").isNotNull())
print(f"Documentos con vectores válidos: {df_tfidf_valid.count()} de {df_tfidf.count()}")

# Recolectar vectores
doc_vectors = df_tfidf_valid.select("doc", "tfidf_vector").collect()

# Verificar que hay datos
if len(doc_vectors) == 0:
    print("⚠ ERROR: No hay vectores para calcular similitud!")
    print("Revisando datos de entrada...")
    df_tfidf.select("doc", "tfidf_vector").show(5, truncate=False)
else:
    # Convertir a diccionario para acceso rápido
    vector_dict = {row["doc"]: row["tfidf_vector"] for row in doc_vectors}
    doc_names = list(vector_dict.keys())

    print(f"✓ Se procesaron {len(doc_names)} documentos con vectores válidos")
    print("\nPrimeros 10 documentos:")
    for i, doc in enumerate(doc_names[:10]):
        print(f"  {i+1:2}. {doc}")
    
    # Verificar algunos vectores
    print("\nVerificación de vectores (primeros 3):")
    for i, doc in enumerate(doc_names[:3]):
        vec = vector_dict[doc]
        if hasattr(vec, 'toArray'):
            arr = vec.toArray()
            non_zero = np.count_nonzero(arr)
            print(f"  {doc[:30]:30} -> Dimensión: {len(arr)}, No-cero: {non_zero}")
        else:
            print(f"  {doc[:30]:30} -> Tipo: {type(vec)}")

Recopilando todos los vectores TF-IDF...


                                                                                

Documentos con vectores válidos: 100 de 100


                                                                                

✓ Se procesaron 100 documentos con vectores válidos

Primeros 10 documentos:
   1. The King in Yellow
   2. The Odyssey
   3. A Doll's House _ a play
   4. Golden Days for Boys and Girls, Vol. XII, Jan. 3, 1891
   5. The Republic
   6. The Great Gatsby
   7. Beowulf_ An Anglo-Saxon Epic Poem
   8. The Expedition
   9. Rip Van Winkle
  10. A Tale of Two Cities

Verificación de vectores (primeros 3):
  The King in Yellow             -> Dimensión: 5000, No-cero: 4096
  The Odyssey                    -> Dimensión: 5000, No-cero: 3989
  A Doll's House _ a play        -> Dimensión: 5000, No-cero: 2007


In [31]:
# 5. Función para calcular similitud coseno entre dos vectores Spark
def calculate_cosine_spark(v1, v2):
    """Calcula similitud coseno entre dos vectores Spark ML"""
    from pyspark.ml.linalg import SparseVector, DenseVector
    import numpy as np
    
    # Convertir vectores Spark a arrays numpy
    if isinstance(v1, SparseVector):
        v1_array = v1.toArray()
    elif isinstance(v1, DenseVector):
        v1_array = v1.toArray()
    else:
        v1_array = np.array(v1)
    
    if isinstance(v2, SparseVector):
        v2_array = v2.toArray()
    elif isinstance(v2, DenseVector):
        v2_array = v2.toArray()
    else:
        v2_array = np.array(v2)
    
    # Calcular producto punto
    dot = np.dot(v1_array, v2_array)
    
    # Calcular normas
    norm1 = np.linalg.norm(v1_array)
    norm2 = np.linalg.norm(v2_array)
    
    # Evitar división por cero
    if norm1 == 0 or norm2 == 0:
        return 0.0
    
    # Calcular similitud coseno
    return float(dot / (norm1 * norm2))

print("Calculando TODA la matriz de similitud coseno...")
print(f"Esto calculará {len(doc_names)} × {len(doc_names)} = {len(doc_names)**2} similitudes")
print("Puede tomar varios minutos dependiendo del número de documentos...")

# Calcular matriz completa
sim_matrix_complete = []
for i, doc1 in enumerate(doc_names):
    row = []
    for j, doc2 in enumerate(doc_names):
        # Calcular similitud coseno
        similarity = calculate_cosine_spark(vector_dict[doc1], vector_dict[doc2])
        # Redondear a 4 decimales
        row.append(round(similarity, 4))
    
    sim_matrix_complete.append(row)
    
    # Mostrar progreso
    if (i + 1) % 5 == 0 or i == len(doc_names) - 1:
        print(f"  Procesada fila {i + 1}/{len(doc_names)}")

print("✅ ¡Matriz de similitud completa calculada!")

Calculando TODA la matriz de similitud coseno...
Esto calculará 100 × 100 = 10000 similitudes
Puede tomar varios minutos dependiendo del número de documentos...
  Procesada fila 5/100
  Procesada fila 10/100
  Procesada fila 15/100
  Procesada fila 20/100
  Procesada fila 25/100
  Procesada fila 30/100
  Procesada fila 35/100
  Procesada fila 40/100
  Procesada fila 45/100
  Procesada fila 50/100
  Procesada fila 55/100
  Procesada fila 60/100
  Procesada fila 65/100
  Procesada fila 70/100
  Procesada fila 75/100
  Procesada fila 80/100
  Procesada fila 85/100
  Procesada fila 90/100
  Procesada fila 95/100
  Procesada fila 100/100
✅ ¡Matriz de similitud completa calculada!


In [32]:
# 6. Crear DataFrame de Spark con la matriz completa
print("Creando DataFrame de la matriz de similitud...")

# Primero crear una lista de filas para el DataFrame
matrix_rows = []
for i, doc_name in enumerate(doc_names):
    # Cada fila es un documento con todas sus similitudes
    row_dict = {"documento": doc_name}
    
    # Agregar similitudes con todos los documentos
    for j, other_doc in enumerate(doc_names):
        # Usar nombres de columnas seguros (sin caracteres especiales)
        col_name = f"sim_{j:04d}"
        row_dict[col_name] = sim_matrix_complete[i][j]
    
    matrix_rows.append(row_dict)

# Crear DataFrame de Spark
from pyspark.sql import Row

# Convertir a RDD de Rows
rdd_rows = spark.sparkContext.parallelize([
    Row(**row_dict) for row_dict in matrix_rows
])

# Crear DataFrame
df_sim_matrix = spark.createDataFrame(rdd_rows)

print(f"DataFrame creado: {df_sim_matrix.count()} filas × {len(df_sim_matrix.columns)} columnas")
print("\nEsquema del DataFrame de similitud:")
df_sim_matrix.printSchema()

Creando DataFrame de la matriz de similitud...




DataFrame creado: 100 filas × 101 columnas

Esquema del DataFrame de similitud:
root
 |-- documento: string (nullable = true)
 |-- sim_0000: double (nullable = true)
 |-- sim_0001: double (nullable = true)
 |-- sim_0002: double (nullable = true)
 |-- sim_0003: double (nullable = true)
 |-- sim_0004: double (nullable = true)
 |-- sim_0005: double (nullable = true)
 |-- sim_0006: double (nullable = true)
 |-- sim_0007: double (nullable = true)
 |-- sim_0008: double (nullable = true)
 |-- sim_0009: double (nullable = true)
 |-- sim_0010: double (nullable = true)
 |-- sim_0011: double (nullable = true)
 |-- sim_0012: double (nullable = true)
 |-- sim_0013: double (nullable = true)
 |-- sim_0014: double (nullable = true)
 |-- sim_0015: double (nullable = true)
 |-- sim_0016: double (nullable = true)
 |-- sim_0017: double (nullable = true)
 |-- sim_0018: double (nullable = true)
 |-- sim_0019: double (nullable = true)
 |-- sim_0020: double (nullable = true)
 |-- sim_0021: double (nullable = 

                                                                                

In [35]:
df_sim_matrix.show(3)

+--------------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
|           documento|sim_0000|sim_0001|sim_0002|sim_0003|sim_0004|sim_0005|s

In [37]:
# CELDA ÚNICA: Guardar datos para script Python
import pandas as pd
import numpy as np

# 1. Guardar matriz de similitud
matriz_df = pd.DataFrame(
    sim_matrix_complete,
    index=doc_names,
    columns=doc_names
)
matriz_df.to_csv('matriz_similitud_libros.csv')

# 2. Guardar lista de libros
libros_df = pd.DataFrame({
    'indice': range(len(doc_names)),
    'nombre_libro': doc_names
})
libros_df.to_csv('lista_libros_indices.csv', index=False)

# 3. Guardar datos TF-IDF
df_tfidf_pandas = df_DFTF_Pesos.toPandas()
df_tfidf_pandas.to_csv('datos_tfidf_libros.csv', index=False)

# 4. Opcional: Resumen rápido
print(f"Guardado: matriz_similitud_libros.csv ({len(doc_names)}×{len(doc_names)})")
print(f"Guardado: lista_libros_indices.csv ({len(doc_names)} libros)")
print(f"Guardado: datos_tfidf_libros.csv ({len(df_tfidf_pandas)} registros)")

                                                                                

Guardado: matriz_similitud_libros.csv (100×100)
Guardado: lista_libros_indices.csv (100 libros)
Guardado: datos_tfidf_libros.csv (838467 registros)
