In [1]:
%config IPCompleter.greedy=True

In [2]:
from pyspark import SparkContext, RDD
from string import punctuation
import findspark as fs
import random

In [3]:
fs.init()
sc = SparkContext(appName="NumbersAndRandomParagraph")

# Numbers

In [4]:
numbers = [random.randint(0, 10000) for _ in range(0, 10000)]

In [5]:
def seq_op(acc, cur):
    # 0 - vmin
    # 1 - vmax
    # 2 - avg
    # 3 - count
    # 4 - dist_set
    # 5 - dist_set_count
    vmin, vmax, avg, count, dist_set, dist_set_count = acc
    
    count += 1
    
    if cur < vmin:
        vmin = cur
        
    if cur > vmax:
        vmax = cur
        
    avg = ((count - 1) / count) * avg + (cur / count)
    
    if cur not in dist_set:
        dist_set_count += 1
        dist_set.append(1)
    
    return (vmin, vmax, avg, count, dist_set, dist_set_count)

In [6]:
def comb_op(acc1, acc2):
    vmin = 0
    vmax = 0
    avg = 0
    
    count = acc1[3] + acc2[3]
    
    if acc1[0] < acc2[0]:
        vmin = acc1[0]
    else:
        vmin = acc2[0]
    
    if acc1[1] > acc1[1]:
        vmax = acc1[1]
    else:
        vmax = acc2[1]
    
    avg = ((acc1[3] / count) * acc1[2]) + ((acc2[3] / count) * acc2[2])
    
    dist_set = list(set(acc1[4] + acc2[4]))
    dist_set_count = len(dist_set)
    
    return (vmin, vmax, avg, count, dist_set, dist_set_count)

In [7]:
sc.parallelize(numbers).aggregate((numbers[0], numbers[0], 0, 0, [], 0), seq_op, comb_op)

(1, 9997, 5001.138599999998, 10000, [1], 1)

# Random paragraph

In [8]:
text = sc.textFile('../data/cobc.txt')

In [9]:
with open('../data/stopwords.txt') as f:
    stopwords = [word for line in f for word in line.split()]

In [10]:
words = text.flatMap(lambda line: line.split(' '))
# words = words.map(lambda word: word.lower().translate(str.maketrans('', '', punctuation)))
words = words.filter(lambda word: word not in stopwords and word and not word.isdigit())

In [11]:
collected_words = words.collect()
pairs = sc.parallelize([(collected_words[i], collected_words[i + 1]) for i in range(0, len(collected_words)-1)])

In [12]:
successors = pairs.map(lambda pair: (pair, 1))
successors = successors.reduceByKey(lambda a, b: a + b).sortBy(lambda pair: pair[1], ascending=False)
successors = successors.map(lambda pair: (pair[0][0], (pair[0][1], pair[1]))).groupByKey()
successors = successors.mapValues(lambda x: [x[0] for x in list(x)][:5]).collect()

In [13]:
collocations = {}
for words in successors:
    collocations[words[0]] = words[1]

In [14]:
def create_random_paragraph(words, successors):
    successors = [(word, successors_list) for word, successors_list in successors.items()]
    
    paragraph_len = random.randint(50, 150)
    
    paragraph = ''
    
    for i in range(paragraph_len):
        sentence_len = random.randint(3, 10)
        
        sentence = ''
        for j in range(sentence_len):
            word_idx = random.randint(0, len(successors) - 1) 
            word = successors[word_idx][0]
            
            successors_list = successors[word_idx][1]
            successor_idx = random.randint(0, len(successors_list) - 1)
            successor = successors_list[successor_idx]
            
            sentence += '{word} {successor} '.format(word=word, successor=successor)
        
        paragraph += sentence.strip().capitalize()
        paragraph += '. '
        
    return paragraph

In [15]:
create_random_paragraph(collected_words, collocations)

'Lantern. "i\'ll handy. goblin dialect spoken. Clouted conscience. eleven: juniper: stable-keeper. had casts light seriously. in everywhere. "to. Major fire. lady\'s. and creak wheels damn, asa. filled wood, regulars, suffered restitution. yet. Froze, suddenly bullock." "but. noises, terrible, drift toward leather bag.. Hold-out doubts. "big trouble, downward gesture. Nippy. he days. you binding her, organizations crime tar, shed carpets." "overland," smash us "krage\'s guys planning murder. Tracing every ached. cramps insurgents tally. mug. asa mystic quality,. Nearby. then changed." "i have?" he wolanders, shed sorcerer knowledge terrified. goblin. Stress. the five-zero. i restitution. yet prevented him. amazingly detailed. shadows. calmly, favors. i. Adhere. "taken occupied eternal can. trouble dry, fields touch, sir?" purser found. "and?" he "burned you. brown grey reproduction shedding hovered, listening worn-out whores, wolander mountains. predecessor. "how screamed. "i\'ll creep