In [1]:
# Turn on IPython's greedy tab-completion mode (IPCompleter.greedy) for this session.
%config IPCompleter.greedy=True

In [2]:
import random
import shutil
from string import punctuation

import findspark as fs
from pyspark import RDD, SparkConf, SparkContext

In [3]:
# Let findspark locate the Spark installation for this Python process.
# NOTE(review): pyspark is already imported in the imports cell above, so that
# import does not depend on this call — confirm whether it should run first.
fs.init()

# getOrCreate() reuses an already-running context, so re-executing this cell
# in a live kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setAppName("NumbersAndRandomParagraph"))

# Numbers

In [4]:
# Generate the input file: 10000 lines, each holding one random integer in
# [0, 10000] (random.randint includes both endpoints).
with open('./data/numbers.txt', 'w') as f:
    for _ in range(10000):
        # Named `value` rather than `next` so the built-in next() is not
        # shadowed for the rest of the notebook session.
        value = random.randint(0, 10000)
        f.write('{number}\n'.format(number=value))

In [5]:
# Load the generated numbers file with Spark as an RDD of text lines.
file = sc.textFile('./data/numbers.txt')

In [6]:
# Parse every text line into an integer.
numbers = file.map(int)

In [7]:
def min_callback(acc, cur):
    """Return the smaller of the running minimum ``acc`` and the candidate ``cur``.

    When the two compare equal, ``acc`` is returned.
    """
    return cur if cur < acc else acc

def max_callback(acc, cur):
    """Return the larger of the running maximum ``acc`` and the candidate ``cur``.

    When the two compare equal, ``acc`` is returned.
    """
    return cur if cur > acc else acc

In [8]:
# Min and max of the numbers RDD.
# The results are stored as `min_number` / `max_number` so the built-in min()
# and max() are not shadowed for the rest of the notebook session.
# reduce() needs no seed value, which avoids running a separate first() job
# for each aggregate; like first(), it raises ValueError on an empty RDD.
min_number = numbers.reduce(min_callback)
max_number = numbers.reduce(max_callback)
print('Min:', min_number)
print('Max:', max_number)

Min: 0
Max: 10000


In [9]:
# mean
def mean_callback(x, y):
    """Merge two ``(count, mean)`` pairs into the pair describing both together.

    Args:
        x: Pair whose first item is a count and second item is a mean.
        y: Pair in the same layout as ``x``.

    Returns:
        ``(total_count, combined_mean)`` where the combined mean is the
        count-weighted average of the two input means.
    """
    count_x, mean_x = x[0], x[1]
    count_y, mean_y = y[0], y[1]
    total = count_x + count_y
    # Weight each mean by its share of the total count.
    combined = ((count_x / total) * mean_x) + ((count_y / total) * mean_y)
    return (total, combined)

# Tag every number with a count of 1, merge the (count, mean) pairs pairwise,
# and keep the mean component of the final pair.
mean = (
    numbers
    .map(lambda value: (1, value))
    .reduce(mean_callback)
)[1]
print('Mean:', mean)

Mean: 5034.089499999985


In [10]:
# Distinct set of numbers.
# RDD.distinct() is the built-in form of the manual
# map -> reduceByKey -> map de-duplication.
distinct_set = numbers.distinct()

# saveAsTextFile() refuses to write into an existing directory, so clear the
# output of any previous run first; this keeps the cell re-runnable.
# NOTE(review): assumes the relative path resolves to the local filesystem
# (as the relative input paths suggest) — confirm if a remote default FS is used.
shutil.rmtree("results_distinct_set", ignore_errors=True)
distinct_set.saveAsTextFile("results_distinct_set")

In [11]:
# Count of distinct values in the numbers RDD (each value counted once, however often it occurs).
print('Number of different elements without repetition:', distinct_set.count())

Number of different elements without repetition: 6338


# Random paragraph

In [None]:
# Load the source text with Spark as an RDD of lines.
# NOTE(review): this cell reads from '../data' while the numbers section uses './data' — confirm both paths are intended.
text = sc.textFile('../data/cobc.txt')

In [None]:
# Read the stop-word list: every whitespace-separated token in the file is one
# stop word. A set is used because the word filter performs one membership
# test per word, which is O(1) on a set versus a linear scan on a list.
with open('../data/stopwords.txt') as f:
    stopwords = {word for line in f for word in line.split()}

In [None]:
# Build the punctuation-stripping table once instead of once per word inside
# the map lambda.
punctuation_table = str.maketrans('', '', punctuation)

# Tokenise on single spaces, lowercase and strip punctuation, then drop empty
# tokens, stop words and purely numeric tokens.
words = (
    text
    .flatMap(lambda line: line.split(' '))
    .map(lambda word: word.lower().translate(punctuation_table))
    .filter(lambda word: word and word not in stopwords and not word.isdigit())
)

In [None]:
# Bring the cleaned words to the driver, pair every word with the word that
# follows it, and hand the pairs back to Spark.
collected_words = words.collect()
pairs = sc.parallelize(list(zip(collected_words, collected_words[1:])))

In [None]:
# Count how often each (word, successor) pair occurs.
pair_counts = pairs.map(lambda pair: (pair, 1)).reduceByKey(lambda a, b: a + b)

# Re-key by the first word, giving (word, (successor, count)), then gather all
# (successor, count) entries of the same word.
successors_by_word = pair_counts.map(
    lambda item: (item[0][0], (item[0][1], item[1]))
).groupByKey()

# Rank inside each group rather than sorting the RDD before groupByKey():
# the shuffle behind groupByKey() does not guarantee element order within a
# group, so a preceding sortBy() cannot be relied on to put the most frequent
# successors first. Ties are broken alphabetically for deterministic output.
# Result: a list of (word, [up to 5 most frequent successor words]).
successors = successors_by_word.mapValues(
    lambda counted: [
        entry[0]
        for entry in sorted(counted, key=lambda entry: (-entry[1], entry[0]))[:5]
    ]
).collect()

In [None]:
# Map each word to its list of most frequent successors.
# dict() consumes the (word, successor_list) pairs directly and, unlike a loop
# variable named `words`, does not overwrite the `words` RDD that earlier
# cells need when they are re-run.
collocations = dict(successors)

In [None]:
def create_random_paragraph(words, successors):
    """Build a paragraph of random sentences from word/successor collocations.

    The paragraph has between 50 and 150 sentences. Each sentence consists of
    3 to 10 randomly chosen "word successor" pairs, is capitalised, and is
    followed by '. ' (so the returned string ends with a trailing space).

    Args:
        words: Not used by the function; kept so existing calls keep working.
        successors: Mapping of word -> sequence of successor words. Words
            whose successor sequence is empty are ignored.

    Returns:
        The generated paragraph as a single string.

    Raises:
        ValueError: If no word in ``successors`` has at least one successor.
    """
    # Drop words without successors up front so a random pick can never land
    # on an empty list and fail part-way through generation.
    candidates = [
        (word, successors_list)
        for word, successors_list in successors.items()
        if len(successors_list) > 0
    ]
    if not candidates:
        raise ValueError('successors must contain at least one word with a successor')

    sentences = []
    for _ in range(random.randint(50, 150)):
        chunks = []
        for _ in range(random.randint(3, 10)):
            word, successors_list = random.choice(candidates)
            successor = random.choice(successors_list)
            chunks.append('{word} {successor}'.format(word=word, successor=successor))
        sentences.append(' '.join(chunks).strip().capitalize() + '. ')

    return ''.join(sentences)

In [None]:
# Generate and display one random paragraph; the first argument is accepted but not used by the function.
create_random_paragraph(collected_words, collocations)