In [1]:
# Display the SparkContext. `sc` is never created in this notebook, so it is
# presumably injected by the PySpark kernel/shell startup — TODO confirm.
sc

In [2]:
from pyspark.sql.types import Row
from datetime import datetime

In [3]:
# Build a flat RDD whose three elements are bare scalars (int, str, int),
# i.e. not row-like records.
simple_data = sc.parallelize([1, 'Alice', 50])
# Showing the RDD only prints its handle; no element is computed here.
simple_data

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:184

In [4]:
# Action: count the elements of the RDD.
simple_data.count()

3

In [5]:
# Action: fetch only the first element of the RDD.
simple_data.first()

1

In [6]:
# Action: fetch the first 2 elements as a Python list.
simple_data.take(2)

[1, 'Alice']

In [7]:
# Action: bring every element of the RDD back as a Python list.
# Fine for this 3-element RDD; avoid on large data sets.
simple_data.collect()

[1, 'Alice', 50]

 * the above calls are actions, which materialize the result of all transformations that occurred before them
 * these operations are expensive, so they need to be used with care

In [9]:
# toDF() must infer a schema from the RDD's elements, which is not possible
# when the elements are bare scalars. The failure is the point of this cell,
# so catch it and print the message instead of letting the exception abort a
# "Restart & Run All". `df` is left unassigned on failure, as before.
try:
    df = simple_data.toDF()
except TypeError as exc:
    print("{}: {}".format(type(exc).__name__, exc))

TypeError: Can not infer schema for type: <class 'int'>

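 * this error occurs because `toDF()` infers the schema from the RDD's elements, and each element must be a row-like record (a `Row`, tuple, list or dict); here the elements are bare scalars (the first one is an `int`), so there are no columns to infer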

#### RDD that can be converted to a DataFrame

In [10]:
# Every element is now a list of the same length, i.e. one record per person.
records = sc.parallelize(
    [
        [1, 'Alice', 50],
        [2, 'Bob', 80],
    ]
)
records

ParallelCollectionRDD[10] at parallelize at PythonRDD.scala:184

In [11]:
# Action: count the records (each inner list is one element).
records.count()

2

In [12]:
# Action: fetch the first record; it comes back as a whole list.
records.first()

[1, 'Alice', 50]

In [13]:
# Action: fetch the first 2 records as a list of lists.
records.take(2)

[[1, 'Alice', 50], [2, 'Bob', 80]]

In [14]:
# Action: bring all records back as a Python list of lists.
records.collect()

[[1, 'Alice', 50], [2, 'Bob', 80]]

In [15]:
# Unlike the flat `simple_data` RDD, this conversion succeeds: every element
# of `records` is a list, so each one can be treated as a row.
# No column names are passed here; inspect the result with df.printSchema().
df = records.toDF()

In [16]:
# A single-record RDD built from a Row, so every field carries its own name.
complex_data = sc.parallelize([
    Row(
        col_float=1.44,
        col_integer=10,
        col_string='John',
    ),
])

In [17]:
# Convert the Row-based RDD to a DataFrame; the Row field names become the
# column names, as the printed table shows.
complex_data_df = complex_data.toDF()
# show() is an action that prints the rows as a text table.
complex_data_df.show()

+---------+-----------+----------+
|col_float|col_integer|col_string|
+---------+-----------+----------+
|     1.44|         10|      John|
+---------+-----------+----------+



In [18]:
# Same single-record example, extended with a list-valued field.
complex_data = sc.parallelize([
    Row(
        col_float=1.44,
        col_integer=10,
        col_list=[1, 2, 3],
        col_string='John',
    ),
])
complex_data_df = complex_data.toDF()
complex_data_df.show()

+---------+-----------+---------+----------+
|col_float|col_integer| col_list|col_string|
+---------+-----------+---------+----------+
|     1.44|         10|[1, 2, 3]|      John|
+---------+-----------+---------+----------+



In [24]:
# Three records mixing nested column types: a list, a dict, a nested Row and
# a datetime. The lists and dicts grow from row to row.
row_1 = Row(
    col_list=[1, 2, 3],
    col_dict={'K1': 0},
    col_row=Row(a=10, b=20, c=30),
    col_time=datetime(2014, 8, 1, 14, 1, 5),
)
row_2 = Row(
    col_list=[1, 2, 3, 4, 5],
    col_dict={'K1': 0, 'K2': 1},
    col_row=Row(a=40, b=50, c=60),
    col_time=datetime(2014, 8, 1, 14, 1, 6),
)
row_3 = Row(
    col_list=[1, 2, 3, 4, 5, 6, 7],
    col_dict={'K1': 0, 'K2': 1, 'K3': 2},
    col_row=Row(a=70, b=80, c=90),
    col_time=datetime(2014, 8, 1, 14, 1, 7),
)

complex_data = sc.parallelize([row_1, row_2, row_3])

In [25]:
# Convert the nested-type rows to a DataFrame.
complex_data_df = complex_data.toDF()
# show() shortens long cell values (note the "..." in the last row of the
# output); pass truncate=False to print them in full.
complex_data_df.show()

+--------------------+--------------------+------------+-------------------+
|            col_dict|            col_list|     col_row|           col_time|
+--------------------+--------------------+------------+-------------------+
|           [K1 -> 0]|           [1, 2, 3]|[10, 20, 30]|2014-08-01 14:01:05|
|  [K1 -> 0, K2 -> 1]|     [1, 2, 3, 4, 5]|[40, 50, 60]|2014-08-01 14:01:06|
|[K3 -> 2, K1 -> 0...|[1, 2, 3, 4, 5, 6...|[70, 80, 90]|2014-08-01 14:01:07|
+--------------------+--------------------+------------+-------------------+



In [26]:
# Wrap the existing SparkContext in a SQLContext.
# NOTE(review): SQLContext is not imported in the cells shown here; presumably
# the PySpark shell/kernel startup provides it (like `sc`) — confirm, or import
# it explicitly from pyspark.sql in the imports cell.
sqlContext = SQLContext(sc)