In [12]:
import pandas as pd
file = pd.read_csv(r"C:\DataScience\Jupyter Files\Spark\Datasets\fakefriends.csv")
file.head()

Unnamed: 0,0,Will,33,385
0,1,Jean-Luc,26,2
1,2,Hugh,55,221
2,3,Deanna,40,465
3,4,Quark,68,21
4,5,Weyoun,59,318


Looks like the first column is the index, the second column is the name, the third column is the age, and the fourth column is the number of friends.

In [7]:
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
sc = SparkContext(conf = conf)

lines = sc.textFile(r"C:\DataScience\Jupyter Files\Spark\Datasets\fakefriends.csv")
print(lines.take(5))
sc.stop()

['0,Will,33,385', '1,Jean-Luc,26,2', '2,Hugh,55,221', '3,Deanna,40,465', '4,Quark,68,21']


Each row of the RDD contains information splited by ','. In order to extract each information, we need to split the row by ','.

In [13]:
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
sc = SparkContext(conf = conf)

lines = sc.textFile(r"C:\DataScience\Jupyter Files\Spark\Datasets\fakefriends.csv")

def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

rdd = lines.map(parseLine)
print(rdd.take(5))
sc.stop()

[(33, 385), (26, 2), (55, 221), (40, 465), (68, 21)]


Now we have tuples where the first element is the age, and the second element is the number of friends. Notice that in Spark, the first element is the index, and the second element is the value. We can then combine all rows based on the index (age).

In [17]:
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
sc = SparkContext(conf = conf)

lines = sc.textFile(r"C:\DataScience\Jupyter Files\Spark\Datasets\fakefriends.csv")

def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

rdd = lines.map(parseLine)
totalsByAge = rdd.mapValues(lambda x:(x, 1))
totalsByAge_map = rdd.map(lambda x: (x, 1))
print(totalsByAge.take(5))
print(totalsByAge_map.take(5))
sc.stop()

[(33, (385, 1)), (26, (2, 1)), (55, (221, 1)), (40, (465, 1)), (68, (21, 1))]
[((33, 385), 1), ((26, 2), 1), ((55, 221), 1), ((40, 465), 1), ((68, 21), 1)]


Here we convert the number of friends to a tuple (number of friends, 1), so that later we can count the frequency.
Different between **map()** and **mapValues**:
- map() applys function to both the index and the value.
- mapValues() applys function only to the value.

**Note that here age is the index.**

In [18]:
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
sc = SparkContext(conf = conf)

lines = sc.textFile(r"C:\DataScience\Jupyter Files\Spark\Datasets\fakefriends.csv")

def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

rdd = lines.map(parseLine)
totalsByAge = rdd.mapValues(lambda x:(x, 1))
totalsByAge = totalsByAge.reduceByKey(lambda x1, x2: (x1[0] + x2[0], x1[1] + x2[1]))
print(totalsByAge.take(5))
sc.stop()

[(33, (3904, 12)), (26, (4115, 17)), (55, (3842, 13)), (40, (4264, 17)), (68, (2696, 10))]


Here we use **reduceByKey()** method to merge all rows with the same index (age), and add both the number of friends and the frequencies together. Notice for **reduceByKey()**, the function has two variables: the first variable is what we have so far, and the second variable is the new value that's gonna be merged. You can think the first variable as the *sum*, while the second variable is the *increment*.

In [19]:
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
sc = SparkContext(conf = conf)

lines = sc.textFile(r"C:\DataScience\Jupyter Files\Spark\Datasets\fakefriends.csv")

def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

rdd = lines.map(parseLine)
totalsByAge = rdd.mapValues(lambda x:(x, 1))
totalsByAge = totalsByAge.reduceByKey(lambda x1, x2: (x1[0] + x2[0], x1[1] + x2[1]))
averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1])
print(averagesByAge.take(5))
sc.stop()

[(33, 325.3333333333333), (26, 242.05882352941177), (55, 295.53846153846155), (40, 250.8235294117647), (68, 269.6)]


Now we just calculate the average number of friends by dividing the total number of friends by the total frequency.

In [20]:
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
sc = SparkContext(conf = conf)

lines = sc.textFile(r"C:\DataScience\Jupyter Files\Spark\Datasets\fakefriends.csv")

def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

rdd = lines.map(parseLine)
totalsByAge = rdd.mapValues(lambda x:(x, 1))
totalsByAge = totalsByAge.reduceByKey(lambda x1, x2: (x1[0] + x2[0], x1[1] + x2[1]))
averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1])
results = averagesByAge.collect()
for result in results:
    print(result)
sc.stop()

(33, 325.3333333333333)
(26, 242.05882352941177)
(55, 295.53846153846155)
(40, 250.8235294117647)
(68, 269.6)
(59, 220.0)
(37, 249.33333333333334)
(54, 278.0769230769231)
(38, 193.53333333333333)
(27, 228.125)
(53, 222.85714285714286)
(57, 258.8333333333333)
(56, 306.6666666666667)
(43, 230.57142857142858)
(36, 246.6)
(22, 206.42857142857142)
(35, 211.625)
(45, 309.53846153846155)
(60, 202.71428571428572)
(67, 214.625)
(19, 213.27272727272728)
(30, 235.8181818181818)
(51, 302.14285714285717)
(25, 197.45454545454547)
(21, 350.875)
(42, 303.5)
(49, 184.66666666666666)
(48, 281.4)
(50, 254.6)
(39, 169.28571428571428)
(32, 207.9090909090909)
(58, 116.54545454545455)
(64, 281.3333333333333)
(31, 267.25)
(52, 340.6363636363636)
(24, 233.8)
(20, 165.0)
(62, 220.76923076923077)
(41, 268.55555555555554)
(44, 282.1666666666667)
(69, 235.2)
(65, 298.2)
(61, 256.22222222222223)
(28, 209.1)
(66, 276.44444444444446)
(46, 223.69230769230768)
(29, 215.91666666666666)
(18, 343.375)
(47, 233.22222222222

**collect()**: return all the results as an array. Now the results are not in the form of RDD. Collect is one of the RDD actions. The RDD actions are functions that convert the RDD to some results. Before the actions, you may have multiple RDD transforming, **but noting actually happens until an action is called!**