# 数据读取与保存

### 文件格式
- 文本文件
- JSON
- CSV
- SequenceFIle
- Protocol buffers
- 对象文件

#### 读取文本文件

In [1]:
from pyspark import SparkContext
sc = SparkContext('local', 'FileReader')

In [3]:
import os
currentPath = os.path.abspath('.')

In [11]:
# 读取文件夹下所有文件
inputRDD = sc.textFile('file://{}/MyBlog/*.txt'.format(currentPath))
inputRDD.collect()

['INFO This is a info log 1',
 'INFO Log 2',
 '',
 'ERROR OMG,This is an error!!',
 "WARN Don't worry , just a warn....",
 '',
 '',
 '',
 '',
 'ERROR Another error.',
 'WARN warn again',
 'INFO get an info.',
 'Quick Start',
 'Interactive Analysis with the Spark Shell',
 'Basics',
 'More on RDD Operations',
 'Caching',
 'Self-Contained Applications',
 'Where to Go from Here',
 'This tutorial provides a quick introduction to using Spark. We will first introduce the API through Spark’s interactive shell (in Python or Scala), then show how to write applications in Java, Scala, and Python. See the programming guide for a more complete reference.',
 '',
 'To follow along with this guide, first download a packaged release of Spark from the Spark website. Since we won’t be using HDFS, you can download a package for any version of Hadoop.',
 '',
 'Interactive Analysis with the Spark Shell',
 'Basics',
 'Spark’s shell provides a simple way to learn the API, as well as a powerful tool to analyze

In [12]:
# 读取文件夹，文件名为K，内容为V
wholePairRDD = sc.wholeTextFiles('file://{}/MyBlog'.format(currentPath))
partRDD = wholePairRDD.mapValues(lambda x: x[0: len(x) % 100])
partRDD.collect()

[('file:/Users/cool/OneDrive/002_projects/learning-spark/notes-sparkDataAnalysis/MyBlog/input.txt',
  'INFO This is a info log 1\nINFO Log 2\n\nERROR OMG,This is an e'),
 ('file:/Users/cool/OneDrive/002_projects/learning-spark/notes-sparkDataAnalysis/MyBlog/quickstart.txt',
  'Quick Start\nInt')]

In [16]:
wholePairRDD.collectAsMap()

{'file:/Users/cool/OneDrive/002_projects/learning-spark/notes-sparkDataAnalysis/MyBlog/input.txt': "INFO This is a info log 1\nINFO Log 2\n\nERROR OMG,This is an error!!\nWARN Don't worry , just a warn....\n\n\n\n\nERROR Another error.\nWARN warn again\nINFO get an info.",
 'file:/Users/cool/OneDrive/002_projects/learning-spark/notes-sparkDataAnalysis/MyBlog/quickstart.txt': 'Quick Start\nInteractive Analysis with the Spark Shell\nBasics\nMore on RDD Operations\nCaching\nSelf-Contained Applications\nWhere to Go from Here\nThis tutorial provides a quick introduction to using Spark. We will first introduce the API through Spark’s interactive shell (in Python or Scala), then show how to write applications in Java, Scala, and Python. See the programming guide for a more complete reference.\n\nTo follow along with this guide, first download a packaged release of Spark from the Spark website. Since we won’t be using HDFS, you can download a package for any version of Hadoop.\n\nInteractive A

In [17]:
# 保存RDD到本地文件
partRDD.saveAsTextFile('file://{}/MyBlog/pair'.format(currentPath))

#### JSON

In [18]:
import json
jsonRDD = sc.wholeTextFiles('file://{}/testweet.json'.format(currentPath))
data = jsonRDD.mapValues(lambda x: json.loads(x))
data.collect()

[('file:/Users/cool/OneDrive/002_projects/learning-spark/notes-sparkDataAnalysis/testweet.json',
  {'createdAt': 'Nov 4, 2014 4:56:59 PM',
   'id': 529799371026485248,
   'text': 'Adventures With Coffee, Code, and Writing.',
   'source': '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
   'isTruncated': False,
   'inReplyToStatusId': -1,
   'inReplyToUserId': -1,
   'isFavorited': False,
   'retweetCount': 0,
   'isPossiblySensitive': False,
   'contributorsIDs': [],
   'userMentionEntities': [],
   'urlEntities': [],
   'hashtagEntities': [],
   'mediaEntities': [],
   'currentUserRetweetId': -1,
   'user': {'id': 15594928,
    'name': 'Holden Karau',
    'screenName': 'holdenkarau',
    'location': '',
    'description': '',
    'descriptionURLEntities': [],
    'isContributorsEnabled': False,
    'profileImageUrl': 'http://pbs.twimg.com/profile_images/3005696115/2036374bbadbed85249cdd50aac6e170_normal.jpeg',
    'profileImageUrlHttps': 'https://pbs.twimg.com/pro

In [21]:
# 数据以Json形式存储
pairRDD = sc.parallelize([["panda",1],["banana",2],["monkey",3]])
pairRDD.collectAsMap()

{'panda': 1, 'banana': 2, 'monkey': 3}

In [22]:
jsonRDD = pairRDD.filter(lambda x: x[0] == 'panda').map(lambda x: json.dumps(x))
jsonRDD.collect()

['["panda", 1]']

In [23]:
jsonRDD.saveAsTextFile('file://{}/jsonRDD.json'.format(currentPath))

### 文件系统

#### Hive JSON HDFS

In [24]:
from pyspark.sql import HiveContext
hiveCtx = HiveContext(sc)

In [25]:
json = sc.textFile('file://{}/hive.json'.format(currentPath))
json.collect()

['{"user":{"name":"Niko","age":"15"},"text":"I love big data"}',
 '{"user":{"name":"Helen","age":"25"},"text":"I love MachineLearning"}']

In [28]:
contract = hiveCtx.read.json('file://{}/hive.json'.format(currentPath))
contract.registerTempTable('contract')
results = hiveCtx.sql('select user.name from contract')
results.collect()

[Row(name='Niko'), Row(name='Helen')]

In [104]:
# 这是1.6版本的，注意更改
contract = hiveCtx.jsonFile("file://" + os.path.abspath(".") + "/hive.json")
contract.registerTempTable("contract")
results = hiveCtx.sql("select user.name from contract")
results.collect()



[Row(name='Niko'), Row(name='Helen')]