In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions

The command to kill all spark session, before you start a new one. 
I got error message like:
__Another instance of Derby may have already booted the database /Users/jessie/Desktop/MusicBox/metastore_db.__
```shell
ps -ef | grep spark-shell
kill -9 Spark-Shell-processID ( example: kill -9 4848)
```

To inspect the first 10 lines of a large text file:
```shell
head -n 10 all_play.log.fn
```
I found that the first two lines of "data/all_play.log.fn" is bad formated. So use the following command to delete them:
```shell

```

In [2]:
#Create a SparkSession
spark = SparkSession.builder.appName("ReadingData").getOrCreate()

In [3]:
lines = spark.sparkContext.textFile("data/all_play.log.fn")
#header = lines.first(2) #extract header
#lines = lines.filter(row >= row != header)   #filter out header

In [4]:
from pyspark.sql.types import *

def parseLine(line):
    fields = line.split('\t')
    if len(fields) == 10:
        uid = int(fields[0])
        device = str(fields[1])
        song_id = int(fields[2])
        song_type = int(fields[3])
        song_name = str(fields[4])
        singer = str(fields[5])
        play_time = int(fields[6])
        song_length = int(fields[7])
        paid_flag = int(fields[8])
        fn = str(fields[9])
        return Row(uid, device, song_id, song_type, song_name, singer, play_time, song_length, paid_flag, fn)
    else:
        return Row(None)


schema = StructType([StructField('uid', IntegerType(), True),
                     StructField('device', StringType(), True),
                     StructField('song_id', IntegerType(), False),
                     StructField('song_type', IntegerType(), True),
                     StructField('song_name', StringType(), True),
                     StructField('singer', StringType(), True),
                     StructField('play_time', IntegerType(), False),
                     StructField('song_length', IntegerType(), True),
                     StructField('paid_flag', IntegerType(), True),
                     StructField('fn', StringType(), True),])

if you want to create an empty dataframe with RDD:
```python
from pyspark.sql.types import *
field = [StructField(“FIELDNAME_1”,StringType(), True),StructField(“FIELDNAME_2”, StringType(), True),\
 StructField(“FIELDNAME_3”, StringType(), True)]
schema = StructType(field)
df = sqlContext.createDataFrame(sc.emptyRDD(), schema)
```

In [7]:
songs = lines.map(parseLine).filter(lambda x: len(x) == len(schema))
# Convert that to a DataFrame
songDataset = spark.createDataFrame(songs)
#df = spark.read.csv("data/all_play.log.fn",header=False,schema=schema)           

In [8]:
songDataset.show()

+---------+---+--------+---+--------------------+--------------------+------+---+---+------------------+
|       _1| _2|      _3| _4|                  _5|                  _6|    _7| _8| _9|               _10|
+---------+---+--------+---+--------------------+--------------------+------+---+---+------------------+
|154422682|ar |20870993|  1|                 用情 |              狮子合唱团 | 22013|332|  0| 20170301_play.log|
|154421907|ip | 6560858|  0|             表情不要悲伤 |    伯贤&D.O.&张艺兴&朴灿烈 |    96|161|  0| 20170301_play.log|
|154422630|ar | 3385963|  1|Baby, Don't Cry(人...|                EXO |235868|235|  0| 20170301_play.log|
|154410267|ar | 6777172|  0|   3D-环绕音律1(3D Mix) |             McTaiM |   164|237|  0| 20170301_play.log|
|154407793|ar |19472465|  0|              刚好遇见你 |                曲肖冰 |    24|201|  0| 20170301_play.log|
|154422626|ar | 3198036|  1|              只唱给你听 |            SpeXial |275249|  0|  0| 20170301_play.log|
|154422681|ar |  891952|  0|   老男孩-(电影《老男孩》主题曲) |      

In [None]:
spark.stop()