# Import & Launch Session

In [376]:
import pandas as pd
import os
import nltk
import re

In [377]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row
from nltk.corpus import stopwords

## Utils

In [378]:
def print_context_infos(context):
    print("VERSION: ",context.version)
    print("PYTHON_VERSION: ", context.pythonVer)
    print("MASTER: ", context.master)
    print("SPARK_HOME: ", str(context.sparkHome))
    print("SPARK_USER: ", str(context.sparkUser()))
    print("APP_NAME: ", context.appName)
    print("APP_ID: ", context.applicationId)
    print("DEFAULT_PARALLESLISM: ", context.defaultParallelism)
    print("DEFAULT_PARTITION: ", context.defaultMinPartitions)

## Initialisation

In [379]:
cwd = os.getcwd()
print(cwd)

/Users/manulabricole/Documents/CDN/BigData


In [380]:
spark_session = SparkSession.builder \
    .appName("book_session") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .enableHiveSupport() \
    .getOrCreate()
print("Spark Web UI: http://localhost:4040")

Spark Web UI: http://localhost:4040


In [381]:
spark_context = spark_session.sparkContext
print_context_infos(spark_context)

VERSION:  3.4.1
PYTHON_VERSION:  3.11
MASTER:  local[*]
SPARK_HOME:  None
SPARK_USER:  manulabricole
APP_NAME:  book_session
APP_ID:  local-1690356127101
DEFAULT_PARALLESLISM:  10
DEFAULT_PARTITION:  2


## Import text

In [382]:
filename = "beautifull_story.txt"
text_path = os.path.join(cwd, filename)
print(text_path)

/Users/manulabricole/Documents/CDN/BigData/beautifull_story.txt


In [383]:
text_rdd = spark_session.sparkContext.textFile(text_path)

In [384]:
for line in text_rdd.take(5):
    print(line)

Project Gutenberg's Beautiful Stories from Shakespeare, by E. Nesbit

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included


# Explore the book shape

## Main infos

In [385]:
num_lines = text_rdd.count()
words_rdd = text_rdd.flatMap(lambda line: line.split())
num_words = words_rdd.count()
num_distinct_words = words_rdd.distinct().count()

In [386]:
print("Number of Lines           --> ", num_lines)
print("Number of words           --> ", num_words)
print("Number of Different Words --> ", num_distinct_words)

Number of Lines           -->  7422
Number of words           -->  52592
Number of Different Words -->  10264


## Theme aborded

### Top 10 words

In [387]:
# Count the occurrences of each word
word_counts_rdd = words_rdd.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
# Sort the words based on their count in descending order
sorted_word_counts_rdd = word_counts_rdd.sortBy(lambda x: x[1], ascending=False)

In [388]:
# Take the first ten words from the sorted list
top_10_words = sorted_word_counts_rdd.take(10)
print(top_10_words)

[('the', 2072), ('and', 1774), ('to', 1451), ('.', 1373), ('of', 1152), ('a', 937), ('he', 805), ('was', 762), ('his', 687), ('in', 638)]


### Top 10 words filtered

In [389]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [390]:
# Initialize NLTK and download the averaged_perceptron_tagger data package
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manulabricole/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/manulabricole/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/manulabricole/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [391]:
def clean_pattern(word):
    # Define a regular expression pattern to remove unwanted characters
    pattern = r'[^A-Za-z0-9]'
    
    # Use re.sub to remove unwanted characters from the word
    cleaned_word = re.sub(pattern, '', word)
    
    return cleaned_word

# Function to check if a word is a verb using NLTK's part-of-speech tagging
def is_verb(word):
    tagged_word = pos_tag([word])
    return tagged_word[0][1].startswith("VB")

# Function to filter out unwanted characters and check if a word is a verb
def is_valid_word(word):
    word = clean_pattern(word)
    word = word.lower()
    return word.isalpha() and word not in stop_words and not is_verb(word)
    

In [392]:
# Tokenize each line by splitting it into words, remove stop words and unwanted characters
words_rdd = text_rdd.flatMap(lambda line: word_tokenize(line)).filter(is_valid_word)
new_number = words_rdd.count()
print(f"We pass from {num_words} to {new_number} words ! ")

[Stage 1549:>                                                       (0 + 2) / 2]

We pass from 52592 to 20782 words ! 


                                                                                

In [393]:
# Count the occurrences of each word. reduceByKLey receive a function. 
# For the same key, a and b are the values coming from two key value pair. We keep the common key but make the sum of the values
word_counts_rdd = words_rdd.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
# Sort the words based on their count in descending order
sorted_word_counts_rdd = word_counts_rdd.sortBy(lambda x: x[1], ascending=False)

                                                                                

In [394]:
for word, count in sorted_word_counts_rdd.take(20):
    print(f"{word}: {count}")

King: 183
would: 174
love: 146
Duke: 124
one: 111
man: 110
could: 106
told: 103
father: 88
like: 85
Project: 83
wife: 82
Claudio: 73
thought: 71
Timon: 70
two: 68
Othello: 68
Macbeth: 66
daughter: 65
day: 64


## Découpe d'histoires

### Study and extract the chapters names

In [395]:
def find_indices(lines_rdd, start_keyword, end_keyword):
    start_idx = lines_rdd.zipWithIndex().filter(lambda x: start_keyword in x[0]).map(lambda x: x[1]).first()
    end_idx = lines_rdd.zipWithIndex().filter(lambda x: end_keyword in x[0]).map(lambda x: x[1]).first()
    return start_idx, end_idx

def extract_content_between_indices(lines_rdd, start_idx, end_idx):
    return lines_rdd.zipWithIndex().filter(lambda x: start_idx < x[1] < end_idx).map(lambda x: x[0]).filter(lambda x: x.strip())

def extract_chapter_names(lines):
    chapter_names = []
    for line in lines:
        # Split the line by " . . . . . . . . "
        parts = line.split(" . . . . . . . . ")
        # Take the first part and strip any leading/trailing whitespace
        chapter_name = parts[0].strip()
        if chapter_name:
            chapter_names.append(chapter_name)
    return chapter_names

In [396]:
start_idx, end_idx = find_indices(lines_rdd, "CONTENTS", "ILLUSTRATIONS")
chapter_content_rdd = extract_content_between_indices(lines_rdd, start_idx, end_idx)
chapter_names = extract_chapter_names(chapter_content_rdd.collect())

In [397]:
print("START  --> ", start_idx)
print("END    --> ", end_idx)

if "PAGE" in chapter_names:
    chapter_names.remove("PAGE")
print(chapter_names)

START  -->  240
END    -->  271
['PREFACE', 'A BRIEF LIFE OF SHAKESPEARE', "A MIDSUMMER NIGHT'S DREAM", 'THE TEMPEST', 'AS YOU LIKE IT', "THE WINTER'S TALE", 'KING LEAR', 'TWELFTH NIGHT', 'MUCH ADO ABOUT NOTHING', 'ROMEO AND JULIET', 'PERICLES', 'HAMLET', 'CYMBELINE', 'MACBETH', 'THE COMEDY OF ERRORS', 'THE MERCHANT OF VENICE', 'TIMON OF ATHENS', 'OTHELLO', 'THE TAMING OF THE SHREW', 'MEASURE FOR MEASURE', 'TWO GENTLEMEN OF VERONA', "ALL'S WELL THAT ENDS WELL", 'PRONOUNCING VOCABULARY OF NAMES', 'QUOTATIONS FROM SHAKESPEARE']


### Extraction

In [439]:
def clean_title(sentence):
    # Replace spaces with underscores
    sentence = sentence.replace(" ", "_")

    # Remove special characters using regex
    sentence = re.sub(r'[^\w\s]', '', sentence)

    return sentence

In [440]:
lines_with_index_rdd = lines_rdd.zipWithIndex()

In [441]:
filtered_lines_rdd = lines_with_index_rdd.filter(lambda x: x[1] > 365)
max_index = filtered_lines_rdd.map(lambda x: x[1]).max()

In [442]:
chapter_indices_rdd = filtered_lines_rdd.filter(lambda x: any(name in x[0] for name in chapter_names)).map(lambda x: x[1])
chapter_indices_list = chapter_indices_rdd.collect()
chapter_indices_list.append(max_index)
print(chapter_indices_list)

[370, 595, 793, 959, 1179, 1304, 1543, 1964, 2213, 2399, 2623, 2856, 3123, 3398, 3585, 3932, 4230, 4499, 4785, 5137, 5407, 5584, 7421]


In [444]:
for _, index in enumerate(chapter_indices_list[:-1]):
    name = clean_title(filtered_lines_rdd.filter(lambda l: l[1] == index).collect()[0][0])
    print(f"Chapters {_+1} - {name}")

    start_index = chapter_indices_list[_]
    end_index =chapter_indices_list[_+1]
  
    content_rdd = filtered_lines_rdd.filter(lambda chapter: end_index-1 >= chapter[1] >= start_index)
    # Save the chapter text to a separate text file, or perform other operations as needed
    content_rdd.saveAsTextFile(f'stories/chapter_{_+1}_{name}.txt')
    # indexed_rdd.filter(lambda x: start_index <= x[1] <= end_index).map(lambda x: x[0])

Chapters 1 - A_MIDSUMMER_NIGHTS_DREAM
Chapters 2 - THE_TEMPEST
Chapters 3 - AS_YOU_LIKE_IT
Chapters 4 - THE_WINTERS_TALE
Chapters 5 - KING_LEAR
Chapters 6 - TWELFTH_NIGHT
Chapters 7 - MUCH_ADO_ABOUT_NOTHING
Chapters 8 - ROMEO_AND_JULIET
Chapters 9 - PERICLES
Chapters 10 - HAMLET
Chapters 11 - CYMBELINE
Chapters 12 - MACBETH
Chapters 13 - THE_COMEDY_OF_ERRORS
Chapters 14 - THE_MERCHANT_OF_VENICE
Chapters 15 - TIMON_OF_ATHENS
Chapters 16 - OTHELLO
Chapters 17 - THE_TAMING_OF_THE_SHREW
Chapters 18 - MEASURE_FOR_MEASURE
Chapters 19 - TWO_GENTLEMEN_OF_VERONA
Chapters 20 - ALLS_WELL_THAT_ENDS_WELL
Chapters 21 - PRONOUNCING_VOCABULARY_OF_NAMES
Chapters 22 - QUOTATIONS_FROM_SHAKESPEARE
