# Solutions

In [1]:
#This is needed to start a Spark session from the notebook
import os 
os.environ['PYSPARK_SUBMIT_ARGS'] ="--conf spark.driver.memory=4g  pyspark-shell"

from pyspark.sql import SparkSession

#Uncomment below to recreate a Spark session with other parameters
#spark.stop()

# Create a new spark session (note, the * indicates to use all available CPU cores)
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("demoRDD") \
    .getOrCreate()
    
#When dealing with RDDs, we work the sparkContext object. 
#See https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkContext
sc=spark.sparkContext

## 1. Sensor data exercises
In the file “data/sensors/sensor-sample.txt” you will find on each line, multiple fields of information, let’s call them : Date(Date), Time(Time), RoomId(Integer)-SensorId(Integer), Value1(float), Value2(float)
Using this file, use spark to compute the following queries :

1. Count the number of entries for each day.
2. Count the number of measures for each pair of RoomId-SensorId.
3. Compute the average of Value1.

In [2]:
sensorLinesRDD = sc.textFile("data/sensors/sensor-sample.txt")
sensorRDD = sensorLinesRDD.map(lambda line: (line.split()[0], line.split()[1:]))

In [3]:
## print("Number of entries per day:\n")
print(sensorRDD.mapValues(lambda x: 1).reduceByKey(lambda x, y: x+y).collect())
print(sensorRDD.mapValues(lambda x: 1).countByValue()) # equivalent

[('2017-02-28', 62103), ('2017-03-02', 32403), ('2017-03-03', 29727), ('2017-03-05', 26019), ('2017-03-06', 24315), ('2017-03-07', 26625), ('2017-03-09', 27288), ('2017-03-21', 19410), ('2017-03-22', 10989), ('2017-03-10', 12483), ('2017-03-23', 24213), ('2017-03-24', 13467), ('2017-03-12', 25089), ('2017-03-15', 11901), ('2017-03-28', 22338), ('2017-03-29', 12120), ('2017-03-16', 13869), ('2017-03-17', 26922), ('2017-03-20', 21942), ('2017-04-01', 537), ('2017-03-31', 3393), ('2017-03-01', 33423), ('2017-03-04', 30225), ('2017-03-08', 29343), ('2017-03-11', 19059), ('2017-03-25', 12225), ('2017-03-13', 24783), ('2017-03-26', 13587), ('2017-03-14', 23418), ('2017-03-27', 14544), ('2017-03-30', 5814), ('2017-03-18', 17427), ('2017-03-19', 21999)]
defaultdict(<class 'int'>, {('2017-03-31', 1): 3393, ('2017-02-28', 1): 62103, ('2017-03-01', 1): 33423, ('2017-03-02', 1): 32403, ('2017-03-03', 1): 29727, ('2017-03-04', 1): 30225, ('2017-03-05', 1): 26019, ('2017-03-06', 1): 24315, ('2017-03

In [4]:
print("Number of entries grouped by roomId-SensorId")
sensorLinesRDD \
    .groupBy(lambda line: line.split()[2]) \
    .aggregateByKey(0, lambda x,y: x + len(y), lambda x,y: x+y) \
    .collect()

Number of entries grouped by roomId-SensorId


[('1-0', 43047),
 ('1-2', 43047),
 ('2-0', 46915),
 ('3-0', 46634),
 ('7-1', 14910),
 ('1-1', 43047),
 ('2-1', 46915),
 ('2-2', 46915),
 ('3-1', 46634),
 ('3-2', 46634),
 ('4-0', 43793),
 ('4-1', 43793),
 ('4-2', 43793),
 ('5-0', 35),
 ('5-1', 35),
 ('5-2', 35),
 ('6-0', 35666),
 ('6-1', 35666),
 ('6-2', 35666),
 ('7-0', 14910),
 ('7-2', 14910)]

In [5]:
print("Average of Value1:")
sensorLinesRDD \
    .map(lambda line: float(line.split()[3])) \
    .mean()

Average of Value1:


92.8069927576456

## 2. Movielens movie data exercises

Movielens (https://movielens.org/) is a website that provides non-commercial, personalised movie recommendations. GroupLens Research has collected and made available rating data sets from the MovieLens web site for the purpose of research into making recommendation services. In this exercise, we will use one of these datasets (the movielens latest dataset, http://files.grouplens.org/datasets/movielens/ml-latest-small.zip) and compute some basic queries on it.
The dataset has already been downloaded and is available at data/movielens/movies.csv, data/movielens/ratings.csv, data/movielens/tags.csv, data/movielens/links.csv

1. Inspect the dataset's [README file](http://files.grouplens.org/datasets/movielens/ml-latest-small-README.html), in particular the section titled "Content and Use of Files" to learn the structure of these three files.
2. Compute all pairs (`movieid`, `rat`) where `movieid` is a movie id (as found in ratings.csv) and `rat` is the average rating of that movie id. (Hint: use aggregateByKey to compute first the sum of all ratings as well as the number of ratings per key).
2. Compute all pairs (`title`, `rat`) where `title` is a full movie title (as found in the movies.csv file), and `rat` is the average rating of that movie (computed over all possible ratings for that movie, as found in the ratings.csv file)
3. [_Extra_] Compute all pairs (`title`, `tag`) where `title` is a full movie title that has an average rating of at least 3.5, and `tag` is a tag for that movie (as found in the tags.csv file)

Extra: if you want to experiment with larger datasets, download the 10m dataset (http://files.grouplens.org/datasets/movielens/ml-10m.zip, 250 Mb uncompressed) and re-do the exercises above

In [6]:
# ratings has format (userId,movieId,rating,timestamp)
ratingsRDD = sc.textFile('data/movielens/ratings.csv').map(lambda x: (x.split(',')[1], float(x.split(',')[2])))

# First, compute the average rating per movie id
avgRatingPerMovieId = ratingsRDD \
    .aggregateByKey((0,0), 
                    lambda a, b: (a[0] + b   , a[1] + 1   ),
                    lambda a, b: (a[0] + b[0], a[1] + b[1]) )\
    .mapValues(lambda v: v[0]/v[1]) \
    .sortBy(lambda pair: pair[1], False)

print("Here are some averages, per movie ID:")
avgRatingPerMovieId.take(10)

Here are some averages, per movie ID:


[('6835', 5.0),
 ('1151', 5.0),
 ('1631', 5.0),
 ('102217', 5.0),
 ('27523', 5.0),
 ('53', 5.0),
 ('1140', 5.0),
 ('8238', 5.0),
 ('47736', 5.0),
 ('53355', 5.0)]

In [7]:
# movies has format (movieId,title,genres)
moviesRDD = sc.textFile('data/movielens/movies.csv').map(lambda x: x.split(',')[0:2])
moviesRDD.take(3)

[['1', 'Toy Story (1995)'],
 ['2', 'Jumanji (1995)'],
 ['3', 'Grumpier Old Men (1995)']]

In [8]:
print("Listing 10 first (movieTitle, average rating) tuples")
moviesRDD \
    .join(avgRatingPerMovieId) \
    .map(lambda pair: pair[1]) \
    .take(10)

Listing 10 first (movieTitle, average rating) tuples


[('Waiting to Exhale (1995)', 2.357142857142857),
 ('GoldenEye (1995)', 3.496212121212121),
 ('Dracula: Dead and Loving It (1995)', 2.4210526315789473),
 ('Casino (1995)', 3.926829268292683),
 ('Money Train (1995)', 2.5),
 ('Powder (1995)', 3.125),
 ('Othello (1995)', 3.5),
 ('"Cry', 4.25),
 ('Mortal Kombat (1995)', 2.5434782608695654),
 ('"Usual Suspects', 4.237745098039215)]

In [9]:
# tags has the format (userId,movieId,tag,timestamp)
tagsRDD = sc.textFile('data/movielens/tags.csv').map(lambda x: x.split(',')[1:3])
tagsRDD.take(3)

[['60756', 'funny'], ['60756', 'Highly quotable'], ['60756', 'will ferrell']]

In [10]:
print("Here are 10 of the (movieTitle, tag) tuples, where movieTitle has an average rating of at least 3.5")
# Method:
# filter avgRatingPerMovieID on those pairs where value is bigger than 3.5
# join moviesRDD with the filtered avgRating, this yields an RDD with tuples of the form (movieId, (movieTitle, rat))
# map the values to remove the rating - we won't need them any more
# join again, but now with the tag RDD, this yields an RDD with tuples (movieId, (movieTitle, tag))
# keep only the values, we don't need the movieId key
moviesRDD \
    .join(avgRatingPerMovieId.filter(lambda pair: pair[1] >= 3.5)) \
    .mapValues(lambda v: v[0]) \
    .join(tagsRDD) \
    .map(lambda pair: pair[1]) \
    .take(10)

Here are 10 of the (movieTitle, tag) tuples, where movieTitle has an average rating of at least 3.5


[('Casino (1995)', 'Mafia'),
 ('"Cry', 'In Netflix queue'),
 ('"Cry', 'South Africa'),
 ('Taxi Driver (1976)', 'assassination'),
 ('Hoop Dreams (1994)', 'basketball'),
 ('Aladdin (1992)', 'Disney'),
 ('Mission: Impossible (1996)', 'based on a TV show'),
 ('Song of the Little Road (Pather Panchali) (1955)', 'India'),
 ('Trainspotting (1996)', 'based on a book'),
 ('Trainspotting (1996)', 'dark comedy')]

## 3. Github log data exercises
Github makes activity logs publicly available at https://www.githubarchive.org/. One such log file, which contains activity data for 2015-03-01 between 0h-1h at night, has been downloaded and is available at `data/github/2015-03-01-0.json.gz`. This (compressed) file contains multiple JSON objects, one per line. Here is a sample line of this file, neatly formatted:

`{ "id": "2614896652",
    "type": "CreateEvent",
    "actor": {
        "id": 739622,
        "login": "treydock",
        "gravatar_id": "",
        "url": "https://api.githb.com/users/treydock",
        "avatar_url": "https://avatars.githubusercontent.com/u/739622?"
    },
    "repo": {
        "id": 23934080,
        "name": "Early-Modern-OCR/emop-dashboard",
    "url": "https://api.github.com/repos/Early-Modern-OCR/emop-dashboard"
    },
    "payload": {
        "ref": "development",
        "ref_type": "branch",
        "master-branch": "master",
        "description": "",
        "pusher_type": "user",
    },
    "public": true,
    "created_at": "2015-03-01T00:00:00Z",
    "org": {
        "id": 10965476,
        "login": "Early-Modern-OCR",
        "gravatar_id": "",
        "url": "https://api.github.com/orgs/Early-Modern-OCR",
        "avatar_url": "https://avatars.githubusercontent.com/u/10965476?"
    }
}`

This log entry has `CreateEvent` type and its `payload.ref_type` is `branch` . So someone named "treydock" (`actor.login`) created a repository branch called "development" (`payload.ref`) in the first second of March 1, 2015 (`created_at`) .

1. Load the textfile into an RDD (note: spark can read gzipped files directly!). Convert this RDD (which consists of string elements) to an RDD where each element is a JSON object (hint: use the `json.loads` function from the `json` module to convert a string into a JSON object).

2. Filter this RDD of JSON objects to retain only those objects that represent push activities (where `type` equals `PushEvent`)

3. Count the number of push events.

4. Compute the number of push events, grouped per `actor.login`. 

5. Retrieve the results of (4) in sorted order, where logins with higher number of pushes come first. Retrieve the 10 first such results (which contain the highest number of pushes)

6. You are representing a company and need to retrieving the number of pushes for every employee in the company. The file `data/github/employees.txt` contains a list of all employee login names at your company.

Extra: if you want to experiment with larger datasets, download more log data from the github archive website and re-do the exercises above

In [11]:
linesRDD = sc.textFile('data/github/2015-03-01-0.json.gz')
linesRDD.take(2)

['{"id":"2614896652","type":"CreateEvent","actor":{"id":739622,"login":"treydock","gravatar_id":"","url":"https://api.github.com/users/treydock","avatar_url":"https://avatars.githubusercontent.com/u/739622?"},"repo":{"id":23934080,"name":"Early-Modern-OCR/emop-dashboard","url":"https://api.github.com/repos/Early-Modern-OCR/emop-dashboard"},"payload":{"ref":"development","ref_type":"branch","master_branch":"master","description":"","pusher_type":"user"},"public":true,"created_at":"2015-03-01T00:00:00Z","org":{"id":10965476,"login":"Early-Modern-OCR","gravatar_id":"","url":"https://api.github.com/orgs/Early-Modern-OCR","avatar_url":"https://avatars.githubusercontent.com/u/10965476?"}}',
 '{"id":"2614896653","type":"PushEvent","actor":{"id":9063348,"login":"bezerrathm","gravatar_id":"","url":"https://api.github.com/users/bezerrathm","avatar_url":"https://avatars.githubusercontent.com/u/9063348?"},"repo":{"id":31481156,"name":"bezerrathm/HuffmanCoding","url":"https://api.github.com/repos/b

In [12]:
# Convert to an RDD of JSON objects
import json
jsonRDD = linesRDD.map(lambda line: json.loads(line))
jsonRDD.take(2)

[{'id': '2614896652',
  'type': 'CreateEvent',
  'actor': {'id': 739622,
   'login': 'treydock',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/treydock',
   'avatar_url': 'https://avatars.githubusercontent.com/u/739622?'},
  'repo': {'id': 23934080,
   'name': 'Early-Modern-OCR/emop-dashboard',
   'url': 'https://api.github.com/repos/Early-Modern-OCR/emop-dashboard'},
  'payload': {'ref': 'development',
   'ref_type': 'branch',
   'master_branch': 'master',
   'description': '',
   'pusher_type': 'user'},
  'public': True,
  'created_at': '2015-03-01T00:00:00Z',
  'org': {'id': 10965476,
   'login': 'Early-Modern-OCR',
   'gravatar_id': '',
   'url': 'https://api.github.com/orgs/Early-Modern-OCR',
   'avatar_url': 'https://avatars.githubusercontent.com/u/10965476?'}},
 {'id': '2614896653',
  'type': 'PushEvent',
  'actor': {'id': 9063348,
   'login': 'bezerrathm',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/bezerrathm',
   'avatar_url': 'https://avatar

In [13]:
# Filter out only those events that are push events
pushEventsRDD = jsonRDD.filter(lambda j: j['type']=='PushEvent')
pushEventsRDD.count()

8793

In [14]:
pushesPerActorRDD = pushEventsRDD.map(lambda j: (j['actor']['login'], 1)).reduceByKey(lambda x,y: x+y)
pushesPerActorRDD.collect()

[('bezerrathm', 2),
 ('demianborba', 3),
 ('ricardocastaneda', 3),
 ('ex3ndr', 1),
 ('furutachi', 1),
 ('abee26r', 4),
 ('RamaneekGill', 15),
 ('xndcn', 6),
 ('mmazur', 5),
 ('spedygiorgio', 1),
 ('sanjay-saxena', 1),
 ('chrishamant', 1),
 ('crucialwebstudio', 1),
 ('DarkknightAK', 1),
 ('slurp-logs', 1),
 ('sean-smith', 1),
 ('Neoalgorist', 1),
 ('polar', 1),
 ('caleb-eades', 1),
 ('g1murray', 4),
 ('cato-', 8),
 ('jadonk', 2),
 ('kriskd', 1),
 ('craigbrennan24', 7),
 ('SCEDC', 1),
 ('cn-nytimes', 12),
 ('xtuaok', 1),
 ('huntaub', 3),
 ('rdi-git', 1),
 ('john-griffin', 1),
 ('micahyoung', 6),
 ('jmarkkula', 18),
 ('qeremy', 16),
 ('ProgVal', 1),
 ('ccaper', 2),
 ('jab416171', 1),
 ('su-github-machine-user', 1),
 ('KypT', 2),
 ('sunyc', 2),
 ('cm-gerrit', 3),
 ('konjac', 2),
 ('elliptic', 4),
 ('jmccrohan', 1),
 ('michaelcahill', 1),
 ('eggfly', 1),
 ('timmmmyboy', 6),
 ('treckstar', 1),
 ('esin', 2),
 ('diversify-exp-user', 146),
 ('dgtized', 2),
 ('ianblenke', 1),
 ('Avanguard', 7),


In [15]:
pushesPerActorRDD.sortBy(lambda pair: pair[1], False).take(10)

[('greatfirebot', 192),
 ('diversify-exp-user', 146),
 ('KenanSulayman', 72),
 ('manuelrp07', 45),
 ('mirror-updates', 42),
 ('tryton-mirror', 37),
 ('Somasis', 26),
 ('direwolf-github', 24),
 ('EmanueleMinotto', 22),
 ('hansliu', 21)]

In [16]:
empRDD = sc.textFile('data/github/employees.txt').map(lambda x: (x,x))

empRDD \
    .join(pushesPerActorRDD) \
    .mapValues(lambda v: v[1]) \
    .take(10)

[('aclindsa', 1),
 ('alexanderdidenko', 1),
 ('aprilx2222', 3),
 ('aquira246', 7),
 ('battlesnake', 6),
 ('BitKiwi', 12),
 ('dayanyrec', 1),
 ('dcsan', 5),
 ('digipl', 20),
 ('dinoboff', 6)]