### TF-IDF Document Indexing and Query Relevance with Spark DataFrames

Indexing documents and computing query relevance using Spark DataFrames.

There are two parts:

1.  The code builds the indexing pipeline using Spark DataFrames.  The result is a DataFrame holding a (term, docid, tfidf) row for every term appearing in every document.
2.  The index is then used to compute the most relevant documents for a query.


In [1]:
import re
from pyspark.sql import SQLContext
from pyspark.sql.types import StructField, StructType, ArrayType, StringType, LongType, FloatType, IntegerType, Row
from pyspark.sql.functions import udf, col, column, size, collect_set
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.functions import round
from pyspark.sql.functions import sum

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1,application_1716492702455_0002,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# S3 prefix containing the text files to index.
books_directory = 's3://aws-emr-studio-247682200909-us-east-1/1716433985675/e-83ZVZCJXCP2R95V5BUL2OEQDY/books/'

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:


import re

# Stop words that termify() excludes from the index. Entries are lowercase and
# contain no apostrophes ("cant", "dont", ...). A few words are listed more
# than once, which is harmless because the list is converted to a set.
# NOTE(review): termify() tokenizes on non-alphanumeric characters, so "can't"
# arrives as "can" + "t"; the apostrophe-less contractions below therefore only
# match text written without the apostrophe -- confirm that is intended.
stopwords = set(["a", "as", "able", "about", "above", "according", "accordingly",
	     "across", "actually", "after", "afterwards", "again", "against", "aint", "all", "allow",
	     "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among",
	     "amongst", "an", "and", "another", "any", "anybody", "anyhow", "anyone", "anything", "anyway",
	     "anyways", "anywhere", "apart", "appear","appreciate", "appropriate", "are", "arent", "around",
	     "as", "aside", "ask", "asking", "associated", "at", "available", "away", "awfully", "be", "became",
	     "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind",
	     "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond",
	     "both", "brief", "but", "by", "cmon", "cs", "came", "can", "cant", "cannot", "cant",
	     "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come",
	     "comes", "concerning", "consequently", "consider", "considering", "contain", "containing",
	     "contains", "corresponding", "could", "couldnt", "course", "currently", "definitely",
	     "described", "despite", "did", "didnt", "different", "do", "does", "doesnt", "doing",
	     "dont", "done", "down", "downwards", "during", "each", "edu", "eg", "eight", "either",
	     "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever",
	     "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example",
	     "except", "far", "few", "ff", "fifth", "first", "five", "followed", "following", "follows",
	     "for", "former", "formerly", "forth", "four", "from", "further", "furthermore", "get",
	     "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got", "gotten",
	     "greetings", "had", "hadnt", "happens", "hardly", "has", "hasnt", "have", "havent",
	     "having", "he", "hes", "hello", "help", "hence", "her", "here", "heres", "hereafter",
	     "hereby", "herein", "hereupon", "hers", "herself", "hi", "him", "himself",
	     "his", "hither", "hopefully", "how", "howbeit", "however", "i", "id", "ill", "im", "ive",
	     "ie", "if", "ignored", "immediate", "in", "inasmuch", "inc", "indeed", "indicate",
	     "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "is",
	     "isnt", "it", "itd", "itll", "its", "its", "itself", "just", "keep", "keeps", "kept",
	     "know", "knows", "known", "last", "lately", "later", "latter", "latterly", "least",
	     "less", "lest", "let", "lets", "like", "liked", "likely", "little", "look", "looking",
	     "looks", "ltd", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely",
	     "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself",
	     "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither",
	     "never", "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none", "noone",
	     "nor", "normally", "not", "nothing", "novel", "now", "nowhere", "obviously", "of",
	     "off", "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones", "only",
	     "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves",
	     "out", "outside", "over", "overall", "own", "particular", "particularly",
	     "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably",
	     "provides", "que", "quite", "qv", "rather", "rd", "re", "really", "reasonably",
	     "regarding", "regardless", "regards", "relatively", "respectively", "right", "said",
	     "same", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing",
	     "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent",
	     "serious", "seriously", "seven", "several", "shall", "she", "should", "shouldnt",
	     "since", "six", "so", "some", "somebody", "somehow", "someone", "something",
	     "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify",
	     "specifying", "still", "sub", "such", "sup", "sure", "ts", "take", "taken", "tell", "tends",
	     "th", "than", "thank", "thanks", "thanx", "that", "thats", "thats", "the", "their", "theirs",
	     "them", "themselves", "then", "thence", "there", "theres", "thereafter", "thereby",
	     "therefore", "therein", "theres", "thereupon", "these", "they", "theyd",
	     "theyll", "theyre", "theyve", "think", "third", "this", "thorough",
	     "thoroughly", "those", "though", "three", "through", "throughout", "thru",
	     "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries",
	     "truly", "try", "trying", "twice", "two", "un", "under", "unfortunately",
	     "unless", "unlikely", "until", "unto", "up", "upon", "us", "use", "used",
	     "useful", "uses", "using", "usually", "value", "various", "very", "via", "viz",
	     "vs", "want", "wants", "was", "wasnt", "way", "we", "wed", "well", "were", "weve",
	     "welcome", "well", "went", "were", "werent", "what", "whats", "whatever", "when",
	     "whence", "whenever", "where", "wheres", "whereafter", "whereas", "whereby",
	     "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who",
	     "whos", "whoever", "whole", "whom", "whose", "why", "will", "willing", "wish",
	     "with", "within", "without", "wont", "wonder", "would", "would", "wouldnt", "yes",
	     "yet", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself",
	     "yourselves", "zero"])

def termify(line):
    """Split a line of text into the list of index terms it contains.

    The text is cut into maximal runs of alphanumeric characters (underscores
    act as separators) and each run is lowercased. A lowercased run is kept,
    in its original order, unless it is a single character, a stop word, or
    made up entirely of digits.
    """
    candidates = (token.lower() for token in re.findall(r'[^\W_]+', line))
    return [
        candidate
        for candidate in candidates
        if len(candidate) > 1
        and candidate not in stopwords
        and not re.search(r'^\d*$', candidate)
    ]


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# function to get a docid from a file path

def get_docid(filepath):
    """Return the document id for a file path: its base name without extension.

    Example: 's3://bucket/books/austen-emma.txt' -> 'austen-emma'.

    Only the final extension is removed, and a name that contains no '.' is
    returned whole. (Blindly dropping the last four characters would mangle
    names such as 'notes' or 'report.md'.)

    Args:
        filepath: A '/'-separated path or URI.

    Returns:
        The last path component with its final '.ext' suffix, if any, removed.
    """
    basename = filepath.rsplit('/', 1)[-1]
    stem, dot, _extension = basename.rpartition('.')
    # rpartition leaves `dot` empty when the name has no extension at all.
    return stem if dot else basename

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:

# Spark UDF wrappers around the plain-Python helpers, so they can be applied
# to DataFrame columns.


getDocidUDF = udf(lambda filepath: get_docid(filepath), returnType=StringType())
termifyUDF = udf(lambda text: termify(text), returnType=ArrayType(StringType()))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### The Indexing Phase

The pipeline is first built step by step in the cells below, then all operations are packaged together into a single function, `indexDocuments`:
*  Accepts:  directory containing the text documents
*  Returns:  a Data Frame with columns (term, docid, tfidf) and schema (string, string, float)

In [6]:
# Read every file under books_directory into an RDD of
# (filepath, full_document_text) pairs.
booksRDD = sc.wholeTextFiles(books_directory)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Create a DataFrame from the RDD through the session's SparkSession
# (SQLContext has been deprecated since Spark 3.0).
documents = spark.createDataFrame(
    booksRDD,
    StructType([StructField('filename', StringType()),
                StructField('document', StringType())]))
print(documents.count())
documents.show(1)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

18
+--------------------+--------------------+
|            filename|            document|
+--------------------+--------------------+
|s3://aws-emr-stud...|[Emma by Jane Aus...|
+--------------------+--------------------+
only showing top 1 row


In [8]:
# Replace the full file path with a short docid, keeping the document text.
documents = documents.select(
    getDocidUDF(col('filename')).alias('docid'),
    col('document'),
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
documents.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+--------------------+
|              docid|            document|
+-------------------+--------------------+
|        austen-emma|[Emma by Jane Aus...|
|  austen-persuasion|[Persuasion by Ja...|
|       austen-sense|[Sense and Sensib...|
|          bible-kjv|[The King James B...|
|        blake-poems|[Poems by William...|
|     bryant-stories|[Stories to Tell ...|
|burgess-busterbrown|[The Adventures o...|
|      carroll-alice|[Alice's Adventur...|
|    chesterton-ball|[The Ball and The...|
|   chesterton-brown|[The Wisdom of Fa...|
|chesterton-thursday|[The Man Who Was ...|
|  edgeworth-parents|[The Parent's Ass...|
| melville-moby_dick|[Moby Dick by Her...|
|    milton-paradise|[Paradise Lost by...|
| shakespeare-caesar|[The Tragedie of ...|
| shakespeare-hamlet|[The Tragedie of ...|
|shakespeare-macbeth|[The Tragedie of ...|
|     whitman-leaves|[Leaves of Grass ...|
+-------------------+--------------------+

In [10]:
# Tokenize each document: swap the raw text column for its list of terms.
documents = (
    documents
    .withColumn('terms', termifyUDF(col('document')))
    .drop('document')
)
documents.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+--------------------+
|              docid|               terms|
+-------------------+--------------------+
|        austen-emma|[emma, jane, aust...|
|  austen-persuasion|[persuasion, jane...|
|       austen-sense|[sense, sensibili...|
|          bible-kjv|[king, james, bib...|
|        blake-poems|[poems, william, ...|
|     bryant-stories|[stories, childre...|
|burgess-busterbrown|[adventures, bust...|
|      carroll-alice|[alice, adventure...|
|    chesterton-ball|[ball, cross, che...|
|   chesterton-brown|[wisdom, father, ...|
|chesterton-thursday|[man, thursday, c...|
|  edgeworth-parents|[parent, assistan...|
| melville-moby_dick|[moby, dick, herm...|
|    milton-paradise|[paradise, lost, ...|
| shakespeare-caesar|[tragedie, julius...|
| shakespeare-hamlet|[tragedie, hamlet...|
|shakespeare-macbeth|[tragedie, macbet...|
|     whitman-leaves|[leaves, grass, w...|
+-------------------+--------------------+

In [11]:
# Flatten the term lists: one output row per (docid, term occurrence).
exploded = (
    documents
    .withColumn('term', explode(col('terms')))
    .drop('terms')
)
exploded.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+-----------+
|      docid|       term|
+-----------+-----------+
|austen-emma|       emma|
|austen-emma|       jane|
|austen-emma|     austen|
|austen-emma|     volume|
|austen-emma|    chapter|
|austen-emma|       emma|
|austen-emma|  woodhouse|
|austen-emma|   handsome|
|austen-emma|     clever|
|austen-emma|       rich|
|austen-emma|comfortable|
|austen-emma|       home|
|austen-emma|      happy|
|austen-emma|disposition|
|austen-emma|      unite|
|austen-emma|  blessings|
|austen-emma|  existence|
|austen-emma|      lived|
|austen-emma|     twenty|
|austen-emma|      years|
+-----------+-----------+
only showing top 20 rows

In [12]:
# Term frequency: how many times each term occurs within each document.
term_frequency = (
    exploded
    .select('docid', 'term')
    .groupBy('docid', 'term')
    .count()
    .withColumnRenamed('count', 'docid_term_count')
)
term_frequency.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------+-----------+----------------+
|            docid|       term|docid_term_count|
+-----------------+-----------+----------------+
|      austen-emma|       busy|              19|
|      austen-emma|      feels|              10|
|      austen-emma|application|               3|
|      austen-emma|   luckiest|               3|
|      austen-emma|   displays|               2|
|      austen-emma|  overtaken|               2|
|      austen-emma|strenuously|               1|
|      austen-emma|     naming|               3|
|      austen-emma|     unseen|               3|
|      austen-emma|  objecting|               1|
|      austen-emma|   pardoned|               1|
|      austen-emma|    teaches|               1|
|austen-persuasion|    shewing|               2|
|austen-persuasion| apartments|               2|
|     austen-sense|      defer|               4|
|     austen-sense|     belief|               7|
|     austen-sense|   involved|               5|
|     austen-sense| 

In [13]:
# Document size: total number of retained term occurrences per document.
document_size = (
    exploded
    .groupBy('docid')
    .count()
    .withColumnRenamed('count', 'docid_count')
)

document_size.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+-----------+
|              docid|docid_count|
+-------------------+-----------+
|       austen-sense|      40397|
|  austen-persuasion|      28637|
|        austen-emma|      53278|
|          bible-kjv|     294290|
| melville-moby_dick|      90784|
|    chesterton-ball|      31125|
|  edgeworth-parents|      59871|
|    milton-paradise|      38991|
|shakespeare-macbeth|       8647|
|        blake-poems|       3302|
|burgess-busterbrown|       5694|
|chesterton-thursday|      22139|
|     bryant-stories|      16432|
|     whitman-leaves|      56168|
|      carroll-alice|       8773|
| shakespeare-hamlet|      13509|
|   chesterton-brown|      27908|
| shakespeare-caesar|       9443|
+-------------------+-----------+

In [14]:
# Attach each document's size to its per-term counts, so that the term
# frequency can then be normalized (divided by the document size and
# multiplied by 1000000.0).
joint = term_frequency.join(document_size, on="docid")
joint.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+-----------+----------------+-----------+
|       docid|       term|docid_term_count|docid_count|
+------------+-----------+----------------+-----------+
|austen-sense|     beasts|               1|      40397|
|austen-sense|   meantime|               1|      40397|
|austen-sense|     apiece|               1|      40397|
|austen-sense|   hitherto|               8|      40397|
|austen-sense|   accident|               5|      40397|
|austen-sense|   interest|              47|      40397|
|austen-sense|     davies|               2|      40397|
|austen-sense|    leagued|               1|      40397|
|austen-sense|      penny|               2|      40397|
|austen-sense|     nieces|               1|      40397|
|austen-sense|  midsummer|               1|      40397|
|austen-sense|   weakness|               4|      40397|
|austen-sense| dissolving|               1|      40397|
|austen-sense|  treatment|               5|      40397|
|austen-sense|     urgent|               5|     

In [15]:

# Normalized term frequency: occurrences of the term per 1000000 terms of
# the document.
normalized_df = joint.withColumn(
    'tfnormed', col('docid_term_count') / col('docid_count') * 1000000.0)

normalized_df.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+-----------+----------------+-----------+------------------+
|       docid|       term|docid_term_count|docid_count|          tfnormed|
+------------+-----------+----------------+-----------+------------------+
|austen-sense|     beasts|               1|      40397|24.754313439116768|
|austen-sense|   meantime|               1|      40397|24.754313439116768|
|austen-sense|     apiece|               1|      40397|24.754313439116768|
|austen-sense|   hitherto|               8|      40397|198.03450751293414|
|austen-sense|   accident|               5|      40397|123.77156719558383|
|austen-sense|   interest|              47|      40397| 1163.452731638488|
|austen-sense|     davies|               2|      40397|49.508626878233535|
|austen-sense|    leagued|               1|      40397|24.754313439116768|
|austen-sense|      penny|               2|      40397|49.508626878233535|
|austen-sense|     nieces|               1|      40397|24.754313439116768|
|austen-sense|  midsummer

In [16]:
# Drop the raw counts; only docid, term and the normalized frequency are
# needed from here on.
result_df = normalized_df.select(col('docid'), col('term'), col('tfnormed'))

# Preview the trimmed frame.
result_df.show()


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+-----------+------------------+
|       docid|       term|          tfnormed|
+------------+-----------+------------------+
|austen-sense|     beasts|24.754313439116768|
|austen-sense|   meantime|24.754313439116768|
|austen-sense|     apiece|24.754313439116768|
|austen-sense|   hitherto|198.03450751293414|
|austen-sense|   accident|123.77156719558383|
|austen-sense|   interest| 1163.452731638488|
|austen-sense|     davies|49.508626878233535|
|austen-sense|    leagued|24.754313439116768|
|austen-sense|      penny|49.508626878233535|
|austen-sense|     nieces|24.754313439116768|
|austen-sense|  midsummer|24.754313439116768|
|austen-sense|   weakness| 99.01725375646707|
|austen-sense| dissolving|24.754313439116768|
|austen-sense|  treatment|123.77156719558383|
|austen-sense|     urgent|123.77156719558383|
|austen-sense|  declining| 148.5258806347006|
|austen-sense|    affixed|24.754313439116768|
|austen-sense|   outweigh|  74.2629403173503|
|austen-sense|     lament|24.75431

In [17]:
result_df.schema

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

StructType([StructField('docid', StringType(), True), StructField('term', StringType(), True), StructField('tfnormed', DoubleType(), True)])

In [18]:
# Narrow tfnormed from double to float.
result_df = result_df.withColumn("tfnormed", col("tfnormed").cast(FloatType()))
result_df.schema

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

StructType([StructField('docid', StringType(), True), StructField('term', StringType(), True), StructField('tfnormed', FloatType(), True)])

In [19]:
  # code to create a dataframe to capture document_frequency (number of documents a term appears in)
document_frequency = (
    term_frequency
    .groupBy('term')
    .count()  # term_frequency has one row per (docid, term), so this counts documents
    .withColumnRenamed('count', 'num_documents')
)
document_frequency.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+-------------+
|     term|num_documents|
+---------+-------------+
|    spoil|           11|
|  elevate|            4|
|     elsa|            1|
|unworldly|            2|
|  lyrical|            2|
|     hope|           17|
|   brands|            3|
|  embrace|            9|
|  courted|            2|
|   outfit|            2|
|     curv|            1|
|colmekill|            1|
|  highest|           14|
|   speedy|            6|
| laughing|           14|
|     fain|            4|
|    parts|           14|
|    hurry|           12|
|sceptical|            3|
|    staff|            9|
+---------+-------------+
only showing top 20 rows

In [20]:
# Attach the document frequency of each term to its per-document rows.
jointnew = result_df.join(document_frequency, on="term")
jointnew.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+-------------------+---------+-------------+
|     term|              docid| tfnormed|num_documents|
+---------+-------------------+---------+-------------+
|    spoil|   chesterton-brown| 35.83202|           11|
|    spoil|burgess-busterbrown|351.24692|           11|
|    spoil|        blake-poems|302.84677|           11|
|    spoil|    milton-paradise|76.940834|           11|
|    spoil|  edgeworth-parents|133.62062|           11|
|    spoil|    chesterton-ball|32.128513|           11|
|    spoil| melville-moby_dick|44.060627|           11|
|    spoil|          bible-kjv|400.96503|           11|
|    spoil|        austen-emma|18.769474|           11|
|    spoil|  austen-persuasion|34.919857|           11|
|    spoil|       austen-sense|49.508625|           11|
|  elevate|    milton-paradise|25.646944|            4|
|  elevate| melville-moby_dick|11.015157|            4|
|  elevate|        austen-emma|37.538948|            4|
|  elevate|  austen-persuasion|34.919857|       

In [21]:
# tfidf = normalized term frequency divided by the number of documents that
# contain the term.
tfidf = (
    jointnew
    .withColumn('tfidf', col('tfnormed') / col('num_documents'))
    .select('term', 'docid', 'tfidf')
)

tfidf.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+-------------------+------------------+
|     term|              docid|             tfidf|
+---------+-------------------+------------------+
|    spoil|   chesterton-brown| 3.257456345991655|
|    spoil|burgess-busterbrown| 31.93153797496449|
|    spoil|        blake-poems|27.531524658203125|
|    spoil|    milton-paradise| 6.994621276855469|
|    spoil|  edgeworth-parents|12.147329157049006|
|    spoil|    chesterton-ball|2.9207739396528765|
|    spoil| melville-moby_dick| 4.005511543967507|
|    spoil|          bible-kjv| 36.45136607776988|
|    spoil|        austen-emma| 1.706315820867365|
|    spoil|  austen-persuasion|3.1745324568314985|
|    spoil|       austen-sense| 4.500784093683416|
|  elevate|    milton-paradise| 6.411736011505127|
|  elevate| melville-moby_dick| 2.753789186477661|
|  elevate|        austen-emma| 9.384737014770508|
|  elevate|  austen-persuasion| 8.729964256286621|
|     elsa|     bryant-stories|  1338.85107421875|
|unworldly|   chesterton-brown|

In [22]:
tfidf.schema

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

StructType([StructField('term', StringType(), True), StructField('docid', StringType(), True), StructField('tfidf', DoubleType(), True)])

In [23]:
# Narrow tfidf from double to float.
tfidf = tfidf.withColumn("tfidf", col("tfidf").cast(FloatType()))
tfidf.schema

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

StructType([StructField('term', StringType(), True), StructField('docid', StringType(), True), StructField('tfidf', FloatType(), True)])

In [24]:
def indexDocuments(path):
    """Build a TF-IDF index over every text file under ``path``.

    Steps:
      1. Read each file as one (filepath, text) record and turn the file
         path into a docid.
      2. Tokenize the text with ``termify`` and explode it to one row per
         term occurrence.
      3. tfnormed = occurrences of the term in the document, divided by the
         document's total term count, multiplied by 1000000.0.
      4. tfidf = tfnormed divided by the number of documents that contain
         the term.

    Args:
        path: Directory containing the text documents, in any form accepted
            by ``SparkContext.wholeTextFiles``.

    Returns:
        A DataFrame with columns (term, docid, tfidf) and schema
        (string, string, float).
    """
    # Everything goes through the session's SparkSession; building a
    # SQLContext per call is unnecessary and SQLContext has been deprecated
    # since Spark 3.0.
    books_rdd = spark.sparkContext.wholeTextFiles(path)
    raw_schema = StructType([StructField('filename', StringType()),
                             StructField('document', StringType())])
    raw_documents = spark.createDataFrame(books_rdd, raw_schema)

    # (docid, terms): file path reduced to a docid, text reduced to its terms.
    documents = raw_documents.select(
        getDocidUDF(col('filename')).alias('docid'),
        termifyUDF(col('document')).alias('terms'),
    )

    # One row per term occurrence.
    exploded = documents.withColumn('term', explode(col('terms'))).drop('terms')

    # Occurrences of each term within each document.
    term_frequency = (
        exploded
        .groupBy('docid', 'term')
        .count()
        .withColumnRenamed('count', 'docid_term_count')
    )

    # Total number of term occurrences in each document.
    document_size = (
        exploded
        .groupBy('docid')
        .count()
        .withColumnRenamed('count', 'docid_count')
    )

    # Normalize the term frequency by document size and multiply by
    # 1000000.0. The value is narrowed to float before the next division,
    # exactly as in the step-by-step cells, so results stay identical.
    normalized = (
        term_frequency
        .join(document_size, 'docid')
        .withColumn(
            'tfnormed',
            ((col('docid_term_count') / col('docid_count')) * 1000000.0).cast('float'))
        .select('docid', 'term', 'tfnormed')
    )

    # Number of documents each term appears in (term_frequency has exactly
    # one row per (docid, term) pair).
    document_frequency = (
        term_frequency
        .groupBy('term')
        .count()
        .withColumnRenamed('count', 'num_documents')
    )

    return (
        normalized
        .join(document_frequency, 'term')
        .withColumn('tfidf', (col('tfnormed') / col('num_documents')).cast('float'))
        .select('term', 'docid', 'tfidf')
    )


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
# Preview the index, reusing the configured books_directory rather than
# repeating the literal S3 path.
indexDocuments(books_directory).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+-------------------+---------+
|     term|              docid|    tfidf|
+---------+-------------------+---------+
|    spoil|   chesterton-brown|3.2574563|
|    spoil|burgess-busterbrown|31.931538|
|    spoil|        blake-poems|27.531525|
|    spoil|    milton-paradise|6.9946213|
|    spoil|  edgeworth-parents|12.147329|
|    spoil|    chesterton-ball| 2.920774|
|    spoil| melville-moby_dick|4.0055118|
|    spoil|          bible-kjv|36.451366|
|    spoil|        austen-emma|1.7063159|
|    spoil|  austen-persuasion|3.1745324|
|    spoil|       austen-sense| 4.500784|
|  elevate|    milton-paradise| 6.411736|
|  elevate| melville-moby_dick|2.7537892|
|  elevate|        austen-emma| 9.384737|
|  elevate|  austen-persuasion| 8.729964|
|     elsa|     bryant-stories|1338.8511|
|unworldly|   chesterton-brown| 17.91601|
|unworldly|    chesterton-ball|32.128513|
|  lyrical|chesterton-thursday| 22.58458|
|  lyrical|    chesterton-ball|16.064257|
+---------+-------------------+---

In [26]:

# Build the index once and mark it for caching so the queries below can
# reuse it.
index = indexDocuments(books_directory)
index.cache()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[term: string, docid: string, tfidf: float]

In [27]:
# Just verify that the frame holds plausible data (right columns, right data types, reasonable values)
index.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+-------------------+---------+
|      term|              docid|    tfidf|
+----------+-------------------+---------+
|abruptness|       austen-sense|4.9508624|
|abruptness|    chesterton-ball|12.851405|
|abruptness|chesterton-thursday| 9.033832|
|abruptness|   chesterton-brown| 7.166404|
|abruptness|        austen-emma|3.7538948|
+----------+-------------------+---------+
only showing top 5 rows

#### Relevance Calculation

Relevance is a relationship between a "query" (a string of words) and the index.
The query is parsed (termified) using the same function that was used to index the documents.
Spark then sums the TF-IDF values of the query terms for each document, divides by the number of query terms, and selects the top N documents.


In [28]:
# Tokenize a sample query with the same function used to index the documents.
term_list=termify("buster, whale, king, and alice the rabbit!")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [29]:
# One-column DataFrame ("term") holding the query terms, so that the query
# can be joined against the index.
terms_df = spark.createDataFrame(list(term_list), StringType()).toDF("term")

terms_df.count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

5

In [30]:
terms_df.schema

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

StructType([StructField('term', StringType(), True)])

In [31]:
index.join(terms_df, "term").show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------+-------------------+---------+
|  term|              docid|    tfidf|
+------+-------------------+---------+
| alice|      carroll-alice|15122.155|
| alice|  edgeworth-parents|11.135052|
| alice|chesterton-thursday|15.056386|
|buster|     whitman-leaves| 8.901866|
|buster|burgess-busterbrown|20108.887|
|rabbit|    chesterton-ball|21.419008|
|rabbit|      carroll-alice|968.88184|
|rabbit|burgess-busterbrown|321.97635|
|rabbit|chesterton-thursday| 7.528193|
|rabbit|     bryant-stories|101.42811|
|rabbit|   chesterton-brown| 17.91601|
|  king|          bible-kjv| 576.0758|
|  king|     whitman-leaves| 4.747662|
|  king|        blake-poems|20.189785|
|  king| melville-moby_dick|   46.998|
|  king|    chesterton-ball|21.419008|
|  king|      carroll-alice| 478.7416|
|  king|  edgeworth-parents|16.702578|
|  king| shakespeare-hamlet|848.81683|
|  king|chesterton-thursday|18.067663|
+------+-------------------+---------+
only showing top 20 rows

In [32]:
index.schema

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

StructType([StructField('term', StringType(), True), StructField('docid', StringType(), True), StructField('tfidf', FloatType(), True)])

In [33]:
# Keep only the index rows whose term occurs in the query; they are grouped
# by docid and summed in the next step.
jointnew = index.join(terms_df, on="term")
jointnew.schema

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

StructType([StructField('term', StringType(), True), StructField('docid', StringType(), True), StructField('tfidf', FloatType(), True)])

In [34]:
# Per-document score: summed tfidf of the matched query terms divided by the
# number of query terms, then cast to an integer.
jointnew = (
    jointnew
    .groupBy('docid')
    .agg((sum('tfidf') / terms_df.count()).alias('score'))
    .withColumn("score", col("score").cast("int"))
)
jointnew.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+-----+
|              docid|score|
+-------------------+-----+
|burgess-busterbrown| 4086|
|     whitman-leaves|    5|
| melville-moby_dick|  459|
|    chesterton-ball|   10|
|     bryant-stories|  167|
|          bible-kjv|  115|
| shakespeare-hamlet|  174|
|  edgeworth-parents|    5|
|    milton-paradise|   16|
|shakespeare-macbeth|   84|
|        blake-poems|    4|
|chesterton-thursday|    8|
|        austen-emma|    0|
|      carroll-alice| 3313|
|   chesterton-brown|   13|
| shakespeare-caesar|    5|
+-------------------+-----+

In [35]:
# Highest scores first; keep at most ten documents.
top_10_scores_df = jointnew.orderBy("score", ascending=False).limit(10)

# Display the ranked documents.
top_10_scores_df.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+-----+
|              docid|score|
+-------------------+-----+
|burgess-busterbrown| 4086|
|      carroll-alice| 3313|
| melville-moby_dick|  459|
| shakespeare-hamlet|  174|
|     bryant-stories|  167|
|          bible-kjv|  115|
|shakespeare-macbeth|   84|
|    milton-paradise|   16|
|   chesterton-brown|   13|
|    chesterton-ball|   10|
+-------------------+-----+

In [36]:
# Pull the ranked rows to the driver as plain (docid, score) tuples.
result_tuples = [(row['docid'], row['score']) for row in top_10_scores_df.collect()]
result_tuples

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('burgess-busterbrown', 4086), ('carroll-alice', 3313), ('melville-moby_dick', 459), ('shakespeare-hamlet', 174), ('bryant-stories', 167), ('bible-kjv', 115), ('shakespeare-macbeth', 84), ('milton-paradise', 16), ('chesterton-brown', 13), ('chesterton-ball', 10)]

In [37]:
def relevance(query, tfidf):
    """Return the documents most relevant to a free-text query.

    The query is tokenized with ``termify`` (the same function used to index
    the documents). A document's relevance is the sum of the tfidf values of
    the query terms it contains, divided by the number of query terms, and
    reported as an integer.

    Args:
        query: The query string.
        tfidf: The index, a DataFrame with columns (term, docid, tfidf) in
            the form produced by ``indexDocuments``.

    Returns:
        A Python list of at most 10 ``(docid, relevance)`` tuples, most
        relevant first. The list is empty when the query contains no usable
        term or none of its terms occur in the index.
    """
    term_list = termify(query)
    if not term_list:
        # Nothing survived tokenization (e.g. "??!?"): no document can match,
        # so skip the Spark jobs altogether.
        return []

    terms_df = spark.createDataFrame(term_list, StringType()).toDF("term")
    # The terms are already on the driver; counting the Python list avoids a
    # separate Spark job just to learn its length.
    num_terms = len(term_list)

    # Join the query with the index, then group by docid, sum the tfidf
    # values and divide by the number of query terms.
    scored = (
        tfidf.join(terms_df, "term")
        .groupBy('docid')
        .agg((sum('tfidf') / num_terms).alias('raw_score'))
        .withColumn('score', col('raw_score').cast('int'))
    )

    # Rank on the untruncated score so that documents whose integer scores
    # tie keep their true relative order; docid breaks exact ties so the
    # result is deterministic.
    top_10 = scored.orderBy(col('raw_score').desc(), col('docid')).limit(10)
    return [(row.docid, row.score) for row in top_10.collect()]


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [38]:
# Smoke test: rank the indexed documents for one query and print the result list.
print(relevance(query="relevance test!", tfidf=index))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('chesterton-brown', 11), ('chesterton-ball', 7), ('shakespeare-hamlet', 4), ('bryant-stories', 3), ('whitman-leaves', 3), ('melville-moby_dick', 2), ('chesterton-thursday', 2), ('austen-sense', 1), ('austen-persuasion', 1)]

In [39]:
## Testing multiple queries.

for query in ["buster, whale, king, and alice the rabbit!",
              "Take a whale to lunch this week!",
              "What would Jesus do about that?",
              "My name is Buster.  Deal with it.",
              "Bodice ripper?",
              "Why does it have to be sense AND sensibility, why can't it be sense OR sensibility",
              "What are leaves of grass anyway?",
              "??!?"]:
    print(query)
    # Run each query once and reuse the result for both the emptiness check
    # and the listing; every call to relevance() launches Spark jobs.
    tuples = relevance(query, index)
    if not tuples:
        print("    No results for this query")
    else:
        # `hit` rather than `tuple`, so the builtin is not shadowed.
        for hit in tuples:
            print("    " + str(hit))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

buster, whale, king, and alice the rabbit!
    ('burgess-busterbrown', 4086)
    ('carroll-alice', 3313)
    ('melville-moby_dick', 459)
    ('shakespeare-hamlet', 174)
    ('bryant-stories', 167)
    ('bible-kjv', 115)
    ('shakespeare-macbeth', 84)
    ('milton-paradise', 16)
    ('chesterton-brown', 13)
    ('chesterton-ball', 10)
Take a whale to lunch this week!
    ('melville-moby_dick', 751)
    ('bryant-stories', 64)
    ('burgess-busterbrown', 35)
    ('austen-persuasion', 25)
    ('austen-sense', 23)
    ('chesterton-brown', 20)
    ('chesterton-thursday', 15)
    ('austen-emma', 15)
    ('edgeworth-parents', 12)
    ('carroll-alice', 9)
What would Jesus do about that?
    ('bible-kjv', 668)
    ('blake-poems', 60)
    ('milton-paradise', 10)
    ('chesterton-ball', 6)
    ('melville-moby_dick', 2)
My name is Buster.  Deal with it.
    ('burgess-busterbrown', 10108)
    ('austen-emma', 66)
    ('austen-persuasion', 60)
    ('carroll-alice', 52)
    ('austen-sense', 31)
    ('