# 问题描述

读取没有记录的csv会发生什么

In [1]:
import pyspark
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Create (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("example")
    .getOrCreate()
)
sc = spark.sparkContext
# sql_ctx is the entry point the cells below use for createDataFrame / read.
# NOTE(review): newer Spark releases mark SQLContext as deprecated in favour of
# SparkSession -- confirm against the installed version before switching.
sql_ctx = SQLContext(sc)

指定一个schema

In [2]:
# Two nullable columns: col_a (string) and col_b (integer).
schema = (
    StructType()
    .add('col_a', StringType(), True)
    .add('col_b', IntegerType(), True)
)

创建一个空的df（使用上面的schema）

In [3]:
# Build a DataFrame from an empty row list using the explicit schema,
# then print its schema and its (empty) contents.
df = sql_ctx.createDataFrame(data=[], schema=schema)
df.printSchema()
df.show()

root
 |-- col_a: string (nullable = true)
 |-- col_b: integer (nullable = true)

+-----+-----+
|col_a|col_b|
+-----+-----+
+-----+-----+



In [4]:
# CSV location used by the write/read cells below; each write uses mode 'overwrite'.
path = '/tmp/practice/csv_read_empty.csv'

## 测试1

schema=None, header=False

In [5]:
# Test 1: write the empty DataFrame as CSV with header=False, overwriting earlier output.
df.write.csv(path, mode='overwrite', header=False)

可以看到，读出来的df没有schema

In [6]:
# Test 1: read the CSV back with no schema supplied and header=False,
# then print the resulting schema and rows.
df2 = sql_ctx.read.csv(
    path,
    schema=None,
    header=False,
)
df2.printSchema()
df2.show()

root

++
||
++
++



那我们接下来试试，如果读取时设置header=True，会发生什么

## 测试2

schema=None, header=True

In [7]:
# Test 2: write the empty DataFrame as CSV with header=True, overwriting earlier output.
df.write.csv(path, mode='overwrite', header=True)

可以看到，读出来的df没有schema（即使设置了header）

In [8]:
# Test 2: read the CSV back with no schema supplied and header=True,
# then print the resulting schema and rows.
df2 = sql_ctx.read.csv(
    path,
    schema=None,
    header=True,
)
df2.printSchema()
df2.show()

root

++
||
++
++



我们再尝试一下设置schema

## 测试3

schema=schema, header=False

In [9]:
# Test 3: write the empty DataFrame as CSV with header=False, overwriting earlier output.
df.write.csv(path, mode='overwrite', header=False)

可以看到，读出来的df带有正确的schema

In [10]:
# Test 3: read the CSV back with the explicit schema and header=False,
# then print the resulting schema and rows.
df2 = sql_ctx.read.csv(
    path,
    schema=schema,
    header=False,
)
df2.printSchema()
df2.show()

root
 |-- col_a: string (nullable = true)
 |-- col_b: integer (nullable = true)

+-----+-----+
|col_a|col_b|
+-----+-----+
+-----+-----+



## 测试4

schema=schema, header=True

In [11]:
# Test 4: write the empty DataFrame as CSV with header=True, overwriting earlier output.
df.write.csv(path, mode='overwrite', header=True)

可以看到，读出来的df带有正确的schema

In [12]:
# Test 4: read the CSV back with the explicit schema and header=True,
# then print the resulting schema and rows.
df2 = sql_ctx.read.csv(
    path,
    schema=schema,
    header=True,
)
df2.printSchema()
df2.show()

root
 |-- col_a: string (nullable = true)
 |-- col_b: integer (nullable = true)

+-----+-----+
|col_a|col_b|
+-----+-----+
+-----+-----+

