In [1]:
from pyspark import SparkContext

from collections import OrderedDict
import re

In [2]:
# Shared SparkContext used by every cell below; getOrCreate() reuses an
# already-running context instead of failing when the cell is re-executed.
sc = SparkContext.getOrCreate()

### Ratings Counter

In [None]:
# One RDD element per line of text in the input file.
lines = sc.textFile("u.data")
# Keep only the third whitespace-separated field (index 2) of every line.
ratings = lines.map(lambda record: record.split()[2])

In [None]:
# countByValue() returns a dictionary mapping each distinct value to the
# number of times it occurs in the RDD.
ratings_dict = ratings.countByValue()
# Dictionary keys are unique, so walking the sorted keys visits the entries in
# the same order as sorting the (key, value) pairs would.
for key in sorted(ratings_dict):
    print(key, ratings_dict[key])

### Friends By Age

In [6]:
# One RDD element per line of text in the CSV file.
lines = sc.textFile("fakefriends.csv")
def parseLine(row):
    """Turn one comma-separated line into an (age, friends) pair of ints.

    Age is read from field index 2 and the friend count from field index 3.
    """
    fields = row.split(",")
    return (int(fields[2]), int(fields[3]))

rdd = lines.map(parseLine)

# Pair every friend count with a 1 so that a single reduceByKey() pass can
# accumulate two things per age key at once:
#   element 0 -> sum of friends, element 1 -> number of rows with that age.
# reduceByKey() merges all values that share a key using the given
# associative function; here the age is the key.
with_counts = rdd.mapValues(lambda friends: (friends, 1))
totalByAge = with_counts.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)

# Average friends per age = summed friends divided by the row count.
avgByAge = totalByAge.mapValues(lambda total: total[0] / total[1])

# collect() returns every (age, average) pair; only the first six are shown.
for age, average in avgByAge.collect()[:6]:
    print("Age: {}, Number of Friends: {:.2f}".format(age, average))

Age: 26, Number of Friends: 242.06
Age: 40, Number of Friends: 250.82
Age: 68, Number of Friends: 269.60
Age: 54, Number of Friends: 278.08
Age: 38, Number of Friends: 193.53
Age: 56, Number of Friends: 306.67


### Min Temperatures

In [None]:
# One RDD element per line of text in the CSV file.
lines = sc.textFile("1800.csv")
def parseLine(line):
    """Extract (station id, entry type, temperature) from one CSV line.

    The station id is field 0, the entry type field 2 and the temperature
    field 3. The temperature is converted to ``float`` so that later
    comparisons such as ``min()`` order readings numerically; left as text,
    "100" would compare as smaller than "20".

    Raises:
        IndexError: if the line has fewer than four comma-separated fields.
        ValueError: if field 3 is not a valid number.
    """
    fields = line.split(",")
    stationId = fields[0]
    entryType = fields[2]
    temperature = float(fields[3])
    return (stationId, entryType, temperature)
        
rdd = lines.map(parseLine)

# Keep the rows whose entry type contains "TMIN", then re-key them as
# (station id, temperature) pairs.
tmin_rows = rdd.filter(lambda record: "TMIN" in record[1])
minTemp = tmin_rows.map(lambda record: (record[0], record[2]))

# The built-in min() is used directly as the two-argument reducer, keeping
# the smaller of the two values for each station key.
results = minTemp.reduceByKey(min)
results.collect()

### Word Count

In [None]:
lines = sc.textFile("Book")
# Split every line on whitespace and flatten the pieces into one RDD of words.
wordsRdd = lines.flatMap(lambda text: text.split())
words = wordsRdd.countByValue()

# Look at the first six entries only. An entry is printed when its word still
# has at least one character left after non-ASCII characters are dropped.
for position, (word, count) in enumerate(words.items()):
    if position > 5:
        break
    ascii_bytes = word.encode("ascii", "ignore")
    if ascii_bytes:
        print(word, count)

#### Better Version with regex

In [None]:
def normalizeWords(line):
    """Return the lower-cased words of ``line``.

    The line is split on runs of non-word characters. ``re.split`` yields an
    empty string when the text starts or ends with a separator (and for a
    blank line), so empty tokens are dropped to keep '' from being counted
    as a word.
    """
    tokens = re.compile(r"\W+").split(line.lower())
    return [token for token in tokens if token]

# map() produces exactly one output for each input element, whereas flatMap()
# may produce any number of outputs (zero or more) for each input element.
wordsRdd = lines.flatMap(normalizeWords)
words = wordsRdd.countByValue()

# Show the first six (word, count) entries only.
for position, (word, count) in enumerate(words.items()):
    if position > 5:
        break
    print(word, count)

#### Sort by word count

In [None]:
def normalizeWords(line):
    """Return the lower-cased words of ``line``.

    The line is split on runs of non-word characters. ``re.split`` yields an
    empty string when the text starts or ends with a separator (and for a
    blank line), so empty tokens are dropped to keep '' from being counted
    as a word.
    """
    tokens = re.compile(r"\W+").split(line.lower())
    return [token for token in tokens if token]

wordsRdd = lines.flatMap(normalizeWords)

# Each word becomes the key of a (word, 1) pair; reduceByKey() then adds up
# the ones that share the same word.
word_ones = wordsRdd.map(lambda word: (word, 1))
wordCounts = word_ones.reduceByKey(lambda left, right: left + right)

# Order the pairs by their count (element 1), largest first.
wordCountsSort = wordCounts.sortBy(lambda pair: pair[1], ascending=False)
results = wordCountsSort.collect()

# Show the six most frequent words.
for word, count in results[:6]:
    print(word, count)

### Total Spent by Customer

In [13]:
def parseLine(line):
    """Turn one comma-separated line into a (customer id, amount) pair.

    The customer id is field 0 parsed as int; the amount is field 2 parsed
    as float.
    """
    fields = line.split(",")
    customer_id = int(fields[0])
    amount = float(fields[2])
    return (customer_id, amount)

lines = sc.textFile("customer-orders.csv")
rdd = lines.map(parseLine)

# Add up the amounts that share the same customer id.
totalSpent = rdd.reduceByKey(lambda left, right: left + right)

# Order by the summed amount (element 1 of each pair), largest first.
totalSpentSort = totalSpent.sortBy(lambda pair: pair[1], ascending=False)
results = totalSpentSort.collect()

# Show the six customers with the highest totals.
for cust, total in results[:6]:
    print("{} {:.2f}".format(cust, total))

68 6375.45
73 6206.20
39 6193.11
54 6065.39
71 5995.66
2 5994.59
