In [2]:
%config IPCompleter.greedy=True

In [3]:
from pyspark import SparkContext
import findspark as fs
import re
import random

In [4]:
fs.init()
sc = SparkContext(appName="Graphs")

## Exercise 31

In [5]:
# generate file with graph
vertices = random.randint(100, 1000)

with open('./data/graph.txt', 'w') as f:
    f.write('[\n')
    for v in range(1, vertices+1):
        neighbours = []
        neighbours_count = random.randint(1, 10)

        while len(neighbours) < neighbours_count:
            next = random.randint(1, vertices)
            
            if next not in neighbours:
                neighbours.append(next)
        
        neighbours.sort()
        
        f.write(str([v, neighbours]) + ',\n')
    
    f.write(']\n')

In [6]:
def get_edges(row):
    res = re.search(r'\[(\d+), \[([\d, ]+)\]\]', row)
    print(res)
    return (res.group(1), res.group(2).split(','))

In [7]:
edges = sc.textFile('./data/graph.txt').filter(lambda row: ',' in row).map(get_edges)
mapped = edges.flatMap(lambda pair: [(v, pair[0]) for v in pair[1]])
groupped = mapped.groupByKey().map(lambda pair: (pair[0], list(pair[1]))).sortByKey()
groupped.collect()

[(' 10', ['364']),
 (' 100', ['53', '223', '378']),
 (' 101', ['137', '435', '465']),
 (' 102', ['117', '341', '345', '446']),
 (' 103', ['122', '242', '313', '316', '420']),
 (' 104', ['158']),
 (' 105', ['113', '169', '309', '316', '343', '346']),
 (' 106', ['68']),
 (' 107', ['21', '149']),
 (' 108', ['87', '148', '279', '342', '350', '353']),
 (' 109', ['47', '67', '99', '165']),
 (' 11', ['294']),
 (' 110', ['72', '394', '461']),
 (' 111', ['201', '202']),
 (' 112', ['1', '279', '363']),
 (' 113', ['217']),
 (' 114', ['162', '441']),
 (' 115', ['141']),
 (' 116', ['1', '7', '299']),
 (' 117', ['7', '30', '58', '108']),
 (' 118', ['177', '217']),
 (' 119', ['287']),
 (' 12', ['113', '144']),
 (' 120', ['211', '248', '405']),
 (' 121', ['139', '150', '311', '325', '356', '404', '455']),
 (' 122', ['189', '217', '301']),
 (' 123', ['100', '108', '170', '180', '272', '302']),
 (' 124', ['34', '72', '80', '404']),
 (' 125', ['10', '232', '314']),
 (' 126', ['140', '197', '403']),
 (' 1

## Exercise 36

In [21]:
def map_input(row):
    splitted = row.split()
    return (splitted[0], splitted[1])

In [22]:
edges = sc.textFile('./data/stanford_graph.txt').map(map_input)

In [23]:
to_pairs = edges.flatMap(lambda pair: [(pair[0], 'out'), (pair[1], 'in')])

In [24]:
counted_pairs = to_pairs.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)

In [25]:
sorted_pairs = counted_pairs.sortBy(lambda pair: pair[0][1], ascending=True)

In [26]:
pairs_with_edge_as_key = sorted_pairs.map(lambda pair: (pair[0][0], (pair[0][1], pair[1])))

In [27]:
grouped_pairs = pairs_with_edge_as_key.groupByKey()

In [28]:
def get_triplet(pair):
    counted = list(pair[1])
    
    if len(counted) == 2:
        return (pair[0], counted[0][1], counted[1][1])
    elif counted[0][0] == 'in':
        return (pair[0], counted[0][1], 0)
    else:
        return (pair[0], 0, counted[0][1])

In [29]:
vertices_set = grouped_pairs.map(get_triplet)
vertices_set.take(5)

[('15409', 3, 1),
 ('17794', 2, 1),
 ('25202', 2, 1),
 ('53625', 2, 1),
 ('54582', 2, 1)]

In [33]:
mapped_vertices = vertices_set.map(lambda x: (x[1], 1))
mapped_vertices.reduce(lambda a, b: ((a[0] * a[1] + b[0] * b[1]) / (a[1] + b[1]), a[1] + b[1]))

(8.203165627893243, 281903)