In [1]:
%config IPCompleter.greedy=True

In [2]:
from pyspark import SparkContext
import findspark as fs
import re
import random

In [3]:
fs.init()
sc = SparkContext(appName="Graphs")

## Exercise 31

In [4]:
# generate file with graph
vertices = random.randint(100, 1000)

with open('./data/graph.txt', 'w') as f:
    f.write('[\n')
    for v in range(1, vertices+1):
        neighbours = []
        neighbours_count = random.randint(1, 10)

        while len(neighbours) < neighbours_count:
            next = random.randint(1, vertices)
            
            if next not in neighbours:
                neighbours.append(next)
        
        neighbours.sort()
        
        f.write(str([v, neighbours]) + ',\n')
    
    f.write(']\n')

In [5]:
def get_edges(row):
    res = re.search(r'\[(\d+), \[([\d, ]+)\]\]', row)
    print(res)
    return (res.group(1), res.group(2).split(','))

In [12]:
edges = sc.textFile('./data/graph.txt').filter(lambda row: ',' in row).map(get_edges)
mapped = edges.flatMap(lambda pair: [(v, pair[0]) for v in pair[1]])
groupped = mapped.groupByKey().map(lambda pair: (pair[0], list(pair[1]))).sortByKey()
groupped.collect()[0:10]

[('142', '1'), (' 186', '1'), (' 188', '1'), (' 368', '1'), (' 380', '1'), ('53', '2'), (' 125', '2'), (' 151', '2'), (' 265', '2'), (' 266', '2')]


[(' 10', ['185']),
 (' 100', ['76', '242', '287', '289', '394', '450']),
 (' 101', ['54', '90', '150', '218', '262']),
 (' 102', ['96', '280', '361', '376']),
 (' 103', ['85', '228', '271', '307', '446']),
 (' 104', ['83', '186', '209', '398']),
 (' 105', ['8', '46', '73', '269', '276', '423']),
 (' 106', ['83', '312']),
 (' 107', ['157', '208', '214', '440']),
 (' 108', ['18', '38', '41', '59', '82', '111', '196', '299', '312'])]

## Exercise 36

In [23]:
def map_vertices(row):
    splitted = row.split()
    return (splitted[0], splitted[1])

In [24]:
edges = sc.textFile('./data/stanford_graph.txt').map(map_vertices) # (v1 -> v2)

In [25]:
in_out_pairs = edges.flatMap(lambda pair: [(pair[0], 'out'), (pair[1], 'in')]) # [(v1, 'out'), (v2, 'in')]

In [26]:
counted_pairs = in_out_pairs.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b) # ((v, 'out/in'), 1) -> aggregate by key ((v, 'out/in'), m)

In [27]:
sorted_pairs = counted_pairs.sortBy(lambda pair: pair[0][1], ascending=True) # ['in', 'in', ..., 'out', 'out']

In [28]:
pairs_with_edge_as_key = sorted_pairs.map(lambda pair: (pair[0][0], (pair[0][1], pair[1]))) # (v, ('in/out', n))

In [34]:
grouped_pairs = pairs_with_edge_as_key.groupByKey() # (v, [('in', n), ('out', m)])

In [37]:
def get_triplet(pair):
    counted = list(pair[1])
    
    if len(counted) == 2:
        return (pair[0], counted[0][1], counted[1][1])
    elif counted[0][0] == 'in':
        return (pair[0], counted[0][1], 0)
    else:
        return (pair[0], 0, counted[0][1])

In [38]:
vertices_set = grouped_pairs.map(get_triplet) # (v, n, m)
vertices_set.take(5)

[('15409', 3, 1),
 ('17794', 2, 1),
 ('25202', 2, 1),
 ('53625', 2, 1),
 ('54582', 2, 1)]

In [47]:
mapped_vertices = vertices_set.map(lambda x: (x[1], 1))
mapped_vertices.reduce(lambda a, b: ((a[0] * a[1] + b[0] * b[1]) / (a[1] + b[1]), a[1] + b[1]))[0]

8.203165627893243

In [48]:
mapped_vertices = vertices_set.map(lambda x: (x[2], 1))
mapped_vertices.reduce(lambda a, b: ((a[0] * a[1] + b[0] * b[1]) / (a[1] + b[1]), a[1] + b[1]))[0]

8.2031656278933