### Step 1: Import pyspark and initialize Spark
 Create a SparkContext Object

In [1]:
#!pip install pyspark
# create entry points to spark
from pyspark import SparkContext, SparkConf # Spark
from pyspark.sql import SparkSession # Spark SQL

sc_conf = SparkConf().setMaster("local[*]").setAppName('my spark app')

sc = SparkContext.getOrCreate(conf=sc_conf)

spark = SparkSession(sparkContext=sc)

### Step 2: Create RDD to read required files

In [2]:
# read in Book1
file1 = sc.textFile('Agile Processes  in Software Engineering  and Extreme Programming.txt')

# display total numbers in each dataset
display1 = file1.count()
print("The total number of lines in Book 1 is " + str(display1))


The total number of lines in Book 1 is 21569


In [3]:
# read in Book2
file2 = sc.textFile('Scrum Handbook.txt')
# display total numbers in each dataset
display2 = file2.count()
print("The total number of lines in Book 2 is " + str(display2))


The total number of lines in Book 2 is 4617


### Step 03: Cleaning/Manipulating text 

In [4]:
# import regular expression
import re

# Remove all non-alphabets for book1
# Changing all upper case letters to lowercase
# Strip the space at the beginning and end for sentences
# Remove empty line generated in previous steps

lower_file1 = file1.map(lambda line:re.sub(r'[^a-zA-Z\s]'," ",line))\
            .map(lambda line:line.lower())\
            .map(lambda line:line.strip())\
            .filter(lambda line:len(line) != 0)

lower_file1.take(10)


['lnbip',
 'i helen sharp',
 'tracy hall  eds',
 'agile processes',
 'in software engineering',
 'and extreme programming',
 'th international conference  xp',
 'edinburgh  uk  may',
 'proceedings',
 'springer open']

In [5]:
# Remove all non-alphabets for book2
# Changing all upper case letters to lowercase
# Strip the space at the beginning and end for sentences
# Remove empty line generated in previous steps

lower_file2 = file2.map(lambda line:re.sub(r'[^a-zA-Z\s]',"",line))\
            .map(lambda line:line.lower())\
            .map(lambda line:line.strip())\
            .filter(lambda line:len(line) != 0)

lower_file2.take(20)

['jeff sutherlands',
 'scrum handbook',
 'everything',
 'you need',
 'to know',
 'to start',
 'a scrum project',
 'in your',
 'organization',
 'scrum',
 'training',
 'institute',
 'm w press',
 'this book is dedicated to nobel laureate muhammad yunus and the',
 'grameen bank for originating microenterprise development and the',
 'accion international presidents advisory board responsible for much of',
 'microenterprise development in the western hemisphere',
 'the strategy for bootstrapping the poor out of poverty has been',
 'a model for freeing hundreds of thousands of software developers from',
 'developer abuse caused by poor management practices']

### Step 4 Transforming the Data/Counting the words

In [6]:
# split the context based on space for book 1
words1 = lower_file1.flatMap(lambda x: x.split())

# count the total number of appearance for each word
# Sort the data in descending order
counter1 = words1.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)\
        .sortBy(lambda x: x[1], ascending=False)


counter1.take(20)

[('the', 8161),
 ('and', 3975),
 ('of', 3954),
 ('to', 3751),
 ('in', 3101),
 ('a', 2755),
 ('is', 1541),
 ('that', 1356),
 ('for', 1195),
 ('on', 1027),
 ('as', 1023),
 ('we', 980),
 ('with', 970),
 ('software', 931),
 ('this', 915),
 ('are', 785),
 ('agile', 784),
 ('it', 775),
 ('development', 748),
 ('was', 711)]

In [7]:
# split the context based on space for book 2
words2 = lower_file2.flatMap(lambda x: x.split()) 

# count the total number of appearance for each word
# Sort the data in descending order
counter2 = words2.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)\
        .sortBy(lambda x: x[1], ascending=False)

counter2.take(20)

[('the', 1238),
 ('of', 537),
 ('and', 534),
 ('to', 477),
 ('a', 451),
 ('scrum', 395),
 ('in', 360),
 ('is', 348),
 ('team', 260),
 ('product', 232),
 ('for', 195),
 ('that', 181),
 ('it', 165),
 ('on', 149),
 ('sprint', 146),
 ('this', 142),
 ('with', 132),
 ('as', 124),
 ('are', 119),
 ('by', 118)]

### Step 5: Removing Stop Words

In [8]:
#!pip install nltk
import nltk
from nltk.corpus import stopwords

In [9]:
#nltk.download('stopwords')

In [12]:
# find stopwords in nltk package
# remove stop words for book1
word_filter1 = counter1.filter(lambda word: word[0] not in stopwords.words('english'))

print("There are " + str(word_filter1.count()) + " unique words in Book 1.")

There are 8962 unique words in Book 1.


In [None]:
# remove stop words for book2
word_filter2 = counter2.filter(lambda word: word[0] not in stopwords.words('english'))


print("There are " + str(word_filter2.count()) + " unique words in Book 2.")

### Step 6: Find the average occurrence of a word

In [None]:
# add up the value for total occurence of unique words from Step5
count_unique1 = word_filter1.map(lambda x : x[1]).sum()
# find the total number of unique words
print(count_unique1)
word_occur1 = word_filter1.map(lambda x : x[0]).count()
print(word_occur1)

# average occurence for each unique word in Book 1
avg_uniq1 = count_unique1/word_occur1
print("The average occurence of the words in Book 1 is " + str(round(avg_uniq1,2)))

In [None]:
# add up the value for total occurence of unique words from Step5
count_unique2 = word_filter2.map(lambda x : x[1]).sum()
# find the total number of unique words
word_occur2 = word_filter2.map(lambda x : x[0]).count()

# average occurence for each unique word in Book 2
avg_uniq2 = count_unique2/word_occur2
print("The average occurence of the words in Book 2 is " + str(round(avg_uniq2,2)))

### Step 7: Exploratory data analysis

In [None]:
#!pip install matplotlib
import matplotlib.pyplot as plt
import math
import numpy as np
%matplotlib inline

#### 7.1 Compare the distribution of words in Book1 and Book2

In [None]:
# prepare data for book 1
occur_list1 = np.log10(word_filter1.map(lambda x: x[1]).collect())

# prepare data for book 2
occur_list2 = np.log10(word_filter2.map(lambda x: x[1]).collect())

In [None]:
# plot for word distribution
fig = plt.gcf()
plt.subplot(211)
plt.hist(occur_list1, label="Book1", color='#50c7c7')
plt.xlabel('unique words', fontsize=15)       # add label to x axis
plt.ylabel('count of unique words', fontsize=15)   # add label to y axis
plt.title('Distribution of words in Book1', fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()

plt.subplot(212)
plt.hist(occur_list2, label="Book2", color='#50c7c7')
plt.xlabel('unique words', fontsize=15)       # add label to x axis
plt.ylabel('count of unique words', fontsize=15)   # add label to y axis
plt.title('Distribution of words in Book2', fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show() 

 

In Book 1, the maximum count of unique words reaches ranges from over 3000 to 0. In contrast, the the maximum count of unique words in Book 2 ranges from around 1500 to 0. 

For both charts, it can beseen that the first few unique words take over 80 percent of the occurence of the total occurence count. 

#### 7.2 Compare the top 15 most common words in Book1 and Book2.  

In [None]:
# prepare data for Book 1 top words

word_toplist1 = word_filter1.map(lambda x: x[0]).take(15)
count_toplist1 = np.log10(word_filter1.map(lambda x: x[1]).take(15))

# prepare data for Book 2 top words
word_toplist2 = word_filter2.map(lambda x: x[0]).take(15)
count_toplist2 = np.log10(word_filter2.map(lambda x: x[1]).take(15))


In [None]:
# plot figure
fig = plt.gcf()
plt.subplot(211)
plt.ylabel('top unique words', fontsize=15)       # add label to x axis
plt.xlabel('count of top unique words', fontsize=15)   # add label to y axis
plt.bar(word_toplist1,count_toplist1)
plt.title('Top 15 unique words in Book 1', fontsize=15)
plt.xticks(fontsize=12, rotation=45, wrap=True)
plt.yticks(fontsize=12)
plt.show() 

plt.subplot(212)
plt.ylabel('top unique words')       # add label to x axis
plt.xlabel('count of top unique words', fontsize=15)   # add label to y axis
plt.bar(word_toplist2, count_toplist2)
plt.title('Top 15 unique words in Book 2', fontsize=15)
plt.xticks(fontsize=12, rotation=45, wrap=True)
plt.yticks(fontsize=12)
plt.show() 

The top 15 unique words in the two books are sorted in descending order, as shown in the figures above. 

In Book 1, the first top word is "software" with count of about 3. This word is also incuded in the title of Book 1. Similarly, the word "agile" appeared in the title of Book 1 takes the second place in the top words. The count of "agile" is very close to "software". They are also the key words in the title of Book 1. Additionally, among the top 15 words, 12 of the words are about software developing and team work, whereas the rest three(one, time, also) are commonly used sight words.Therefore, it can be seen that Book 1 is closely related to software development.
 
Book 2 has the first top word of scrum, which is also the first word in the title of the book. The first few top words includes "team", "product", "development", "teams", "project", which are also part of top words in Book 1. Although the top 15 words also contain "software", the count is less than that in Book 1. 