In [23]:
import findspark
findspark.init()

In [24]:
from pyspark.sql import SparkSession

spark:SparkSession = SparkSession.builder.master("local[1]")\
            .appName("SparkByExample.com")\
            .getOrCreate()

In [25]:
# Create RDD from parallelize

data = [1,2,3,4,5,6,7,8,9,10,11,12]
rdd = spark.sparkContext.parallelize(data)
rdd.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [26]:
# Create RDD from external Data source

rdd2 = spark.sparkContext.textFile('textFile.txt')
rdd2.collect()

['1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20']

In [27]:
# Reads entire file into a RDD as single record

rdd3 = spark.sparkContext.wholeTextFiles('textFile.txt')
rdd3.collect()

[('file:/C:/Users/qkrwn/workplace/textFile.txt',
  '1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20\n')]

In [28]:
# Create empty RDD with no partition

#rdd = spark.sparkContext.emptyRDD
rdd

ParallelCollectionRDD[26] at readRDDFromFile at PythonRDD.scala:287

In [29]:
# Create empty RDD with partition
# Create 10 partitions

rdd2 = spark.sparkContext.parallelize([1,2,3,4,56,7,8,9,12,3], 10)
rdd2.collect()

print("Initial partition count:" + str(rdd.getNumPartitions()))

Initial partition count:1


In [30]:
# re-partition
# repartition() or coalesce() 동일 함수
reparRdd = rdd2.coalesce(6)
print("re-partition count:" + str(reparRdd.getNumPartitions()))

re-partition count:6


In [31]:
rdd = spark.sparkContext.textFile('test.txt')
print(rdd.collect())

['Project Gutenberg’s', 'Alice’s Adventures in Wonderland', 'by Lewis Carroll', 'This eBook is for the use', 'of anyone anywhere', 'at no cost and with', 'Alice’s Adventures in Wonderland', 'by Lewis Carroll', 'This eBook is for the use', 'of anyone anywhere', 'at no cost and with', 'This eBook is for the use', 'of anyone anywhere', 'at no cost and with', 'Project Gutenberg’s', 'Alice’s Adventures in Wonderland', 'by Lewis Carroll', 'This eBook is for the use', 'of anyone anywhere', 'at no cost and with', 'Alice’s Adventures in Wonderland', 'by Lewis Carroll', 'This eBook is for the use', 'of anyone anywhere', 'at no cost and with', 'This eBook is for the use', 'of anyone anywhere', 'at no cost and with', 'Project Gutenberg’s', 'Alice’s Adventures in Wonderland', 'by Lewis Carroll', 'This eBook is for the use', 'of anyone anywhere', 'at no cost and with', 'Alice’s Adventures in Wonderland', 'by Lewis Carroll', 'This eBook is for the use', 'of anyone anywhere', 'at no cost and with', 'T

In [32]:
rdd2 = rdd.flatMap(lambda x: x.split(" "))
print(rdd2.collect())

['Project', 'Gutenberg’s', 'Alice’s', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'Alice’s', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'Project', 'Gutenberg’s', 'Alice’s', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'Alice’s', 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'Project', 'Gutenberg

In [33]:
rdd3 = rdd2.map(lambda x: (x, 1))
print(rdd3.collect())

[('Project', 1), ('Gutenberg’s', 1), ('Alice’s', 1), ('Adventures', 1), ('in', 1), ('Wonderland', 1), ('by', 1), ('Lewis', 1), ('Carroll', 1), ('This', 1), ('eBook', 1), ('is', 1), ('for', 1), ('the', 1), ('use', 1), ('of', 1), ('anyone', 1), ('anywhere', 1), ('at', 1), ('no', 1), ('cost', 1), ('and', 1), ('with', 1), ('Alice’s', 1), ('Adventures', 1), ('in', 1), ('Wonderland', 1), ('by', 1), ('Lewis', 1), ('Carroll', 1), ('This', 1), ('eBook', 1), ('is', 1), ('for', 1), ('the', 1), ('use', 1), ('of', 1), ('anyone', 1), ('anywhere', 1), ('at', 1), ('no', 1), ('cost', 1), ('and', 1), ('with', 1), ('This', 1), ('eBook', 1), ('is', 1), ('for', 1), ('the', 1), ('use', 1), ('of', 1), ('anyone', 1), ('anywhere', 1), ('at', 1), ('no', 1), ('cost', 1), ('and', 1), ('with', 1), ('Project', 1), ('Gutenberg’s', 1), ('Alice’s', 1), ('Adventures', 1), ('in', 1), ('Wonderland', 1), ('by', 1), ('Lewis', 1), ('Carroll', 1), ('This', 1), ('eBook', 1), ('is', 1), ('for', 1), ('the', 1), ('use', 1), ('of

In [34]:
# reduceByKey : sum 연산을 적용하여 단어 문자열을 줄임

rdd4 = rdd3.reduceByKey(lambda a,b: a+b)
print(rdd4.collect())

[('Project', 9), ('Gutenberg’s', 9), ('Alice’s', 18), ('Adventures', 18), ('in', 18), ('Wonderland', 18), ('by', 18), ('Lewis', 18), ('Carroll', 18), ('This', 27), ('eBook', 27), ('is', 27), ('for', 27), ('the', 27), ('use', 27), ('of', 27), ('anyone', 27), ('anywhere', 27), ('at', 27), ('no', 27), ('cost', 27), ('and', 27), ('with', 27)]


In [35]:
# sortByKey : key에서 RDD 요소를 정렬하는데 사용

rdd5 = rdd4.map(lambda x: (x[1],x[0])).sortByKey()
print(rdd5.collect())

[(9, 'Project'), (9, 'Gutenberg’s'), (18, 'Alice’s'), (18, 'Adventures'), (18, 'in'), (18, 'Wonderland'), (18, 'by'), (18, 'Lewis'), (18, 'Carroll'), (27, 'This'), (27, 'eBook'), (27, 'is'), (27, 'for'), (27, 'the'), (27, 'use'), (27, 'of'), (27, 'anyone'), (27, 'anywhere'), (27, 'at'), (27, 'no'), (27, 'cost'), (27, 'and'), (27, 'with')]


In [36]:
# filter : RDD의 record를 필터링 하는데 사용됨

rdd6 = rdd5.filter(lambda x: 'a' in x[1])
print(rdd6.collect())

[(18, 'Wonderland'), (18, 'Carroll'), (27, 'anyone'), (27, 'anywhere'), (27, 'at'), (27, 'and')]


In [37]:
# Action - count
print("Count : " + str(rdd6.count()))

# Action - first
firstRec = rdd6.first()
print("First Record: " + str(firstRec[0]) + "," + firstRec[1])

# Action - max
datMax = rdd6.max()
print("Max Record: " + str(datMax[0]) + "," + datMax[1])

# Action - reduce
totalWordCount = rdd6.reduce(lambda a,b: (a[0]+b[0], a[1]))
print("DataReduce Record : " + str(totalWordCount))

# Action - take
data3 = rdd6.take(3)
for f in data3:
    print("data3 Key: " + str(f[0]) + ", Value: " + f[1])
    
# Action - collect
data = rdd6.collect()
for f in data:
    print("Key: "+ str(f[0]) +", Value: "+f[1])
    

# rdd6.saveAsTextFile("wordCount") 

Count : 6
First Record: 18,Wonderland
Max Record: 27,at
DataReduce Record : (144, 'Wonderland')
data3 Key: 18, Value: Wonderland
data3 Key: 18, Value: Carroll
data3 Key: 27, Value: anyone
Key: 18, Value: Wonderland
Key: 18, Value: Carroll
Key: 27, Value: anyone
Key: 27, Value: anywhere
Key: 27, Value: at
Key: 27, Value: and


In [38]:
cacheRdd = rdd.cache()
cacheRdd

test.txt MapPartitionsRDD[34] at textFile at NativeMethodAccessorImpl.java:0

In [39]:
import pyspark

dfPersist = rdd.persist()
dfPersist

In [40]:
rddPersist2 = rdd.unpersist()
rddPersist2

In [41]:
broadcastVar = spark.sparkContext.broadcast([0,1,2,3])
broadcastVar.value

[0, 1, 2, 3]

In [42]:
# Converts RDD to DataFrame
dfFromRDD1 = rdd.toDF()

# Converts RDD to DataFrame with column names
dfFromRDD2 = rdd.toDF("col1","col2")

# using createDataFrame() - Convert DataFrame to RDD
df = spark.createDataFrame(rdd).toDF("col1","col2")

# Convert DataFrame to RDD
rdd = df.rdd

TypeError: Can not infer schema for type: <class 'str'>