# Import & Launch Session

In [1]:
import pandas as pd
import os
import nltk
import re

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row
from nltk.corpus import stopwords

## Utils

In [3]:
def print_context_infos(context):
    """Print the main settings of a Spark context, one labelled line per setting.

    Args:
        context: object exposing ``version``, ``pythonVer``, ``master``,
            ``sparkHome``, ``appName``, ``applicationId``,
            ``defaultParallelism`` and ``defaultMinPartitions`` attributes
            plus a ``sparkUser()`` method (the notebook passes a SparkContext).

    Returns:
        None. The information is written to standard output.
    """
    # Each entry pairs a label with a getter so that attributes are read
    # lazily, in display order, right before they are printed.
    labelled_getters = (
        ("VERSION: ", lambda: context.version),
        ("PYTHON_VERSION: ", lambda: context.pythonVer),
        ("MASTER: ", lambda: context.master),
        ("SPARK_HOME: ", lambda: str(context.sparkHome)),
        ("SPARK_USER: ", lambda: str(context.sparkUser())),
        ("APP_NAME: ", lambda: context.appName),
        ("APP_ID: ", lambda: context.applicationId),
        ("DEFAULT_PARALLESLISM: ", lambda: context.defaultParallelism),
        ("DEFAULT_PARTITION: ", lambda: context.defaultMinPartitions),
    )
    for label, read_value in labelled_getters:
        print(label, read_value())

## Initialisation

In [4]:
# Resolve the kernel's working directory; the book's path is built from it in a later cell.
cwd = os.getcwd()
print(cwd)

/Users/manulabricole/Documents/CDN/BigData


In [6]:
# Create (or reuse) the notebook's Spark session: named "book_session",
# driver bound to 127.0.0.1, Hive support enabled.
spark_session = (
    SparkSession.builder
    .appName("book_session")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .enableHiveSupport()
    .getOrCreate()
)
# Print the URL reported by the running context instead of assuming port 4040:
# Spark falls back to another port when 4040 is already in use.
print("Spark Web UI:", spark_session.sparkContext.uiWebUrl)

Spark Web UI: http://localhost:4040


In [7]:
# Keep a handle on the session's SparkContext and print its main settings.
spark_context = spark_session.sparkContext
print_context_infos(spark_context)

VERSION:  3.4.1
PYTHON_VERSION:  3.11
MASTER:  local[*]
SPARK_HOME:  None
SPARK_USER:  manulabricole
APP_NAME:  book_session
APP_ID:  local-1690356127101
DEFAULT_PARALLESLISM:  10
DEFAULT_PARTITION:  2


## Import text

In [8]:
# Build the path of the book from the working directory and the file name.
filename = "beautifull_story.txt"
text_path = os.path.join(cwd, filename)
print(text_path)

/Users/manulabricole/Documents/CDN/BigData/beautifull_story.txt


In [9]:
# Load the book as an RDD of strings (one element per line, as the previews below show).
text_rdd = spark_session.sparkContext.textFile(text_path)

In [10]:
# Preview the first five lines to check that the file was read as expected.
for line in text_rdd.take(5):
    print(line)

Project Gutenberg's Beautiful Stories from Shakespeare, by E. Nesbit

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included


                                                                                

# Explore the book shape

## Main infos

In [11]:
# Number of lines in the book.
num_lines = text_rdd.count()
# Split every line on whitespace: each element is one raw token
# (case is preserved and punctuation stays attached to the word).
words_rdd = text_rdd.flatMap(lambda line: line.split())
num_words = words_rdd.count()
# Distinct raw tokens; tokens differing only by case or punctuation count separately.
num_distinct_words = words_rdd.distinct().count()

In [12]:
# Report the line, token and distinct-token counts.
print("Number of Lines           --> ", num_lines)
print("Number of words           --> ", num_words)
print("Number of Different Words --> ", num_distinct_words)

Number of Lines           -->  7422
Number of words           -->  52592
Number of Different Words -->  10264


## Themes covered

### Top 10 words

In [13]:
# Count the occurrences of each word: emit a (word, 1) pair per token,
# then sum the ones of all pairs sharing the same word.
word_counts_rdd = words_rdd.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
# Sort the (word, count) pairs by count, in descending order
sorted_word_counts_rdd = word_counts_rdd.sortBy(lambda x: x[1], ascending=False)

In [14]:
# Take the ten most frequent (word, count) pairs from the sorted RDD
top_10_words = sorted_word_counts_rdd.take(10)
print(top_10_words)

[('the', 2072), ('and', 1774), ('to', 1451), ('.', 1373), ('of', 1152), ('a', 937), ('he', 805), ('was', 762), ('his', 687), ('in', 638)]


### Top 10 words filtered

In [15]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [16]:
# Download the NLTK resources this notebook relies on: the stop-word lists,
# the averaged-perceptron tagger model (used by pos_tag) and the punkt
# tokenizer models (used by word_tokenize).
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")
# English stop words as a set, for fast membership tests.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manulabricole/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/manulabricole/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/manulabricole/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
def clean_pattern(word):
    """Return ``word`` without any character that is not an ASCII letter or digit.

    Args:
        word: the string to clean.

    Returns:
        A new string made of the characters of ``word`` that match
        ``A-Z``, ``a-z`` or ``0-9``, in their original order.
    """
    # Everything outside the allowed character class is replaced by nothing.
    unwanted_characters = re.compile(r'[^A-Za-z0-9]')
    return unwanted_characters.sub('', word)

def is_verb(word):
    """Tell whether NLTK's part-of-speech tagger labels ``word`` as a verb.

    The word is tagged on its own (a one-element token list), so the tagger
    has no sentence context to rely on.

    Args:
        word: the token to tag.

    Returns:
        True when the tag assigned to ``word`` starts with ``"VB"``.
    """
    # pos_tag returns one (token, tag) pair per input token; only the tag matters.
    _token, tag = pos_tag([word])[0]
    return tag.startswith("VB")

def is_valid_word(word):
    """Decide whether a token should be kept for the word-frequency analysis.

    The token is cleaned with ``clean_pattern`` and lower-cased before being
    tested; the caller's token itself is not modified.

    Args:
        word: the raw token.

    Returns:
        True when the normalized token is purely alphabetic, is not in
        ``stop_words`` and is not tagged as a verb by ``is_verb``.
    """
    normalized = clean_pattern(word).lower()
    # Rejects empty strings and anything still containing digits.
    if not normalized.isalpha():
        return False
    if normalized in stop_words:
        return False
    # The tagger is only consulted for tokens that passed the other checks.
    return not is_verb(normalized)
    

In [18]:
# Tokenize each line with NLTK and keep only the tokens accepted by is_valid_word
# (alphabetic once cleaned, not a stop word, not tagged as a verb).
# NOTE(review): filter() keeps the original token, so the cleaned, lower-cased
# form computed inside is_valid_word is not the one counted afterwards.
# NOTE(review): this rebinds words_rdd, which an earlier cell defined as the
# unfiltered whitespace-split tokens.
words_rdd = text_rdd.flatMap(lambda line: word_tokenize(line)).filter(is_valid_word)
new_number = words_rdd.count()
print(f"We pass from {num_words} to {new_number} words ! ")

[Stage 12:>                                                         (0 + 2) / 2]

We pass from 52592 to 20782 words ! 


                                                                                

In [19]:
# Count the occurrences of each word. reduceByKey receives a function that
# combines the values of pairs sharing the same key: a and b are two such
# values, the key is kept and the values are summed.
word_counts_rdd = words_rdd.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
# Sort the words based on their count in descending order
sorted_word_counts_rdd = word_counts_rdd.sortBy(lambda x: x[1], ascending=False)

                                                                                

In [20]:
# Show the 20 most frequent tokens that passed the filter, one "word: count" per line.
for word, count in sorted_word_counts_rdd.take(20):
    print(f"{word}: {count}")

King: 183
would: 174
love: 146
Duke: 124
one: 111
man: 110
could: 106
told: 103
father: 88
like: 85
Project: 83
wife: 82
Claudio: 73
thought: 71
Timon: 70
two: 68
Othello: 68
Macbeth: 66
daughter: 65
day: 64


## Splitting the book into stories

In [21]:
# Peek at the beginning of the book only: collect() would bring every line
# to the driver and flood the cell output.
text_rdd.take(40)

["Project Gutenberg's Beautiful Stories from Shakespeare, by E. Nesbit",
 '',
 'This eBook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  You may copy it, give it away or',
 're-use it under the terms of the Project Gutenberg License included',
 'with this eBook or online at www.gutenberg.org',
 '',
 '',
 'Title: Beautiful Stories from Shakespeare',
 '',
 'Author: E. Nesbit',
 '',
 'Posting Date: August 15, 2008 [EBook #1430]',
 'Release Date: August, 1998',
 'Last Updated: March 9, 2018',
 '',
 'Language: English',
 '',
 'Character set encoding: UTF-8',
 '',
 '*** START OF THIS PROJECT GUTENBERG EBOOK BEAUTIFUL STORIES FROM SHAKESPEARE ***',
 '',
 '',
 '',
 '',
 'Produced by Morrie Wilson and James Rose',
 '',
 '',
 '',
 '',
 '',
 'BEAUTIFUL STORIES FROM SHAKESPEARE',
 '',
 'By E. Nesbit',
 '',
 '',
 '',
 '',
 '     “It may be said of Shakespeare, that from his works may be',
 '     collected a system of civil and economical prudence.  He

In [99]:
def extract_content_between_keywords(lines, start_keyword, end_keyword):
    """Return the text located between two marker lines.

    The section begins after the first line containing ``start_keyword`` and
    ends before the first *later* line containing ``end_keyword``. Occurrences
    of ``end_keyword`` that precede the start marker are ignored, so an early
    mention of the end keyword no longer aborts the search.

    Args:
        lines: sequence of strings (it must support slicing), one per line.
        start_keyword: substring identifying the line that opens the section.
        end_keyword: substring identifying the line that closes the section.

    Returns:
        The lines strictly between the two markers, joined with newlines and
        stripped of leading/trailing whitespace. An empty string when either
        marker cannot be found.
    """
    start_idx = None
    end_idx = None

    for idx, line in enumerate(lines):
        if start_idx is None:
            # Still looking for the opening marker; keep its first occurrence.
            if start_keyword in line:
                start_idx = idx
        elif end_keyword in line:
            # First closing marker after the opening one: the section is complete.
            end_idx = idx
            break

    if start_idx is None or end_idx is None:
        return ""
    return '\n'.join(lines[start_idx + 1:end_idx]).strip()
# NOTE(review): the elements of text_rdd shown earlier are already single
# lines, so this split on '\n' is presumably a no-op — confirm.
lines_rdd = text_rdd.flatMap(lambda line: line.split('\n'))

# collect() brings every line to the driver as a Python list, which the helper slices.
chapter_content = extract_content_between_keywords(lines_rdd.collect(), "CONTENTS", "ILLUSTRATIONS")

print(chapter_content)

PAGE
     PREFACE . . . . . . . . . . . . . . . . . . . . . 3
     A BRIEF LIFE OF SHAKESPEARE . . . . . . . . . . . 7
     A MIDSUMMER NIGHT'S DREAM . . . . . . . . . . .  19
     THE TEMPEST . . . . . . . . . . . . . . . . . .  33
     AS YOU LIKE IT  . . . . . . . . . . . . . . . .  44
     THE WINTER'S TALE . . . . . . . . . . . . . . .  54
     KING LEAR . . . . . . . . . . . . . . . . . . .  67
     TWELFTH NIGHT . . . . . . . . . . . . . . . . .  74
     MUCH ADO ABOUT NOTHING  . . . . . . . . . . . .  86
     ROMEO AND JULIET  . . . . . . . . . . . . . . . 105
     PERICLES  . . . . . . . . . . . . . . . . . . . 119
     HAMLET  . . . . . . . . . . . . . . . . . . . . 129
     CYMBELINE . . . . . . . . . . . . . . . . . . . 141
     MACBETH . . . . . . . . . . . . . . . . . . . . 153
     THE COMEDY OF ERRORS  . . . . . . . . . . . . . 168
     THE MERCHANT OF VENICE  . . . . . . . . . . . . 183
     TIMON OF ATHENS . . . . . . . . . . . . . . . . 194
     OTHELLO . . . . . . .

In [144]:
def find_indices(lines_rdd, start_keyword, end_keyword):
    """Return the indices of the first lines containing each keyword.

    The two lookups are independent: each one returns the index of the first
    line of ``lines_rdd`` containing the corresponding keyword.

    Args:
        lines_rdd: RDD of strings, one element per line.
        start_keyword: substring to look for to locate the start line.
        end_keyword: substring to look for to locate the end line.

    Returns:
        A ``(start_idx, end_idx)`` tuple of zero-based line indices.

    Raises:
        ValueError: when no line contains one of the keywords; the message
            names the missing keyword.
    """
    # Index the lines once and reuse the indexed RDD for both lookups.
    indexed_rdd = lines_rdd.zipWithIndex()

    def first_index_containing(keyword):
        # Index of the first (line, index) pair whose line contains the keyword.
        matches = (
            indexed_rdd
            .filter(lambda pair: keyword in pair[0])
            .map(lambda pair: pair[1])
            .take(1)
        )
        if not matches:
            raise ValueError(f"No line contains the keyword {keyword!r}")
        return matches[0]

    start_idx = first_index_containing(start_keyword)
    end_idx = first_index_containing(end_keyword)
    return start_idx, end_idx

def extract_content_between_indices(lines_rdd, start_idx, end_idx):
    """Return the non-blank lines located strictly between two line indices.

    Args:
        lines_rdd: RDD of strings, one element per line.
        start_idx: index of the line that precedes the section (excluded).
        end_idx: index of the line that follows the section (excluded).

    Returns:
        An RDD holding the lines whose index ``i`` satisfies
        ``start_idx < i < end_idx``, minus those that are empty or
        whitespace-only.
    """
    def is_inside_section(indexed_line):
        return start_idx < indexed_line[1] < end_idx

    def line_text(indexed_line):
        return indexed_line[0]

    def has_visible_text(text):
        # A blank line strips down to "", which is falsy and therefore dropped.
        return text.strip()

    section_lines = lines_rdd.zipWithIndex().filter(is_inside_section).map(line_text)
    return section_lines.filter(has_visible_text)


def clean_line(line):
    """Trim a line and drop any leading run of dots, digits and whitespace.

    Only the beginning of the line is cleaned: dots or digits appearing
    later in the line are left untouched.

    Args:
        line: the string to clean.

    Returns:
        The trimmed line without its leading dots/digits/whitespace prefix.
    """
    trimmed = line.strip()
    # Anchored at the start: consumes the whole leading run of '.', ASCII digits and whitespace.
    leading_noise = re.compile(r'^[.1234567890\s]+')
    return leading_noise.sub('', trimmed)

In [145]:
# RDD of the book's lines.
# NOTE(review): the elements of text_rdd shown earlier are already single
# lines, so this split on '\n' is presumably a no-op — confirm.
lines_rdd = text_rdd.flatMap(lambda line: line.split('\n'))

In [146]:
# Indices of the first lines containing "CONTENTS" and "ILLUSTRATIONS".
start_idx, end_idx = find_indices(lines_rdd, "CONTENTS", "ILLUSTRATIONS")
print("START  --> ", start_idx)
print("END    --> ", end_idx)

START  -->  240
END    -->  271


In [147]:
# Keep the non-blank lines strictly between the two indices and display them.
chapter_content_rdd = extract_content_between_indices(lines_rdd, start_idx, end_idx)
chapter_content_rdd.collect()

['                                                     PAGE',
 '     PREFACE . . . . . . . . . . . . . . . . . . . . . 3',
 '     A BRIEF LIFE OF SHAKESPEARE . . . . . . . . . . . 7',
 "     A MIDSUMMER NIGHT'S DREAM . . . . . . . . . . .  19",
 '     THE TEMPEST . . . . . . . . . . . . . . . . . .  33',
 '     AS YOU LIKE IT  . . . . . . . . . . . . . . . .  44',
 "     THE WINTER'S TALE . . . . . . . . . . . . . . .  54",
 '     KING LEAR . . . . . . . . . . . . . . . . . . .  67',
 '     TWELFTH NIGHT . . . . . . . . . . . . . . . . .  74',
 '     MUCH ADO ABOUT NOTHING  . . . . . . . . . . . .  86',
 '     ROMEO AND JULIET  . . . . . . . . . . . . . . . 105',
 '     PERICLES  . . . . . . . . . . . . . . . . . . . 119',
 '     HAMLET  . . . . . . . . . . . . . . . . . . . . 129',
 '     CYMBELINE . . . . . . . . . . . . . . . . . . . 141',
 '     MACBETH . . . . . . . . . . . . . . . . . . . . 153',
 '     THE COMEDY OF ERRORS  . . . . . . . . . . . . . 168',
 '     THE MERCHANT OF 

In [148]:
# Apply clean_line to every extracted line and display the result.
cleaned_chapter_content_rdd = chapter_content_rdd.map(clean_line)
cleaned_chapter_content_rdd.collect()

['PAGE',
 'PREFACE . . . . . . . . . . . . . . . . . . . . . 3',
 'A BRIEF LIFE OF SHAKESPEARE . . . . . . . . . . . 7',
 "A MIDSUMMER NIGHT'S DREAM . . . . . . . . . . .  19",
 'THE TEMPEST . . . . . . . . . . . . . . . . . .  33',
 'AS YOU LIKE IT  . . . . . . . . . . . . . . . .  44',
 "THE WINTER'S TALE . . . . . . . . . . . . . . .  54",
 'KING LEAR . . . . . . . . . . . . . . . . . . .  67',
 'TWELFTH NIGHT . . . . . . . . . . . . . . . . .  74',
 'MUCH ADO ABOUT NOTHING  . . . . . . . . . . . .  86',
 'ROMEO AND JULIET  . . . . . . . . . . . . . . . 105',
 'PERICLES  . . . . . . . . . . . . . . . . . . . 119',
 'HAMLET  . . . . . . . . . . . . . . . . . . . . 129',
 'CYMBELINE . . . . . . . . . . . . . . . . . . . 141',
 'MACBETH . . . . . . . . . . . . . . . . . . . . 153',
 'THE COMEDY OF ERRORS  . . . . . . . . . . . . . 168',
 'THE MERCHANT OF VENICE  . . . . . . . . . . . . 183',
 'TIMON OF ATHENS . . . . . . . . . . . . . . . . 194',
 'OTHELLO . . . . . . . . . . . . . . .