In [108]:
spark

In [118]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

schema = StructType() \
      .add("RecordNumber",IntegerType(),True) \
      .add("Zipcode",IntegerType(),True) \
      .add("ZipCodeType",StringType(),True) \
      .add("City",StringType(),True) \
      .add("State",StringType(),True) \
      .add("LocationType",StringType(),True) \
      .add("Lat",DoubleType(),True) \
      .add("Long",DoubleType(),True) \
      .add("Xaxis",IntegerType(),True) \
      .add("Yaxis",DoubleType(),True) \
      .add("Zaxis",DoubleType(),True) \
      .add("WorldRegion",StringType(),True) \
      .add("Country",StringType(),True) \
      .add("LocationText",StringType(),True) \
      .add("Location",StringType(),True) \
      .add("Decommisioned",BooleanType(),True) \
      .add("TaxReturnsFiled",StringType(),True) \
      .add("EstimatedPopulation",IntegerType(),True) \
      .add("TotalWages",IntegerType(),True) \
      .add("Notes",StringType(),True)
       

df_with_schema = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("../data/zipcodes.csv")

In [116]:
df.show(3)

+------------+-------+-----------+-------------------+-----+--------------+-----+------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-----+
|RecordNumber|Zipcode|ZipCodeType|               City|State|  LocationType|  Lat|  Long|Xaxis|Yaxis|Zaxis|WorldRegion|Country|        LocationText|            Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|Notes|
+------------+-------+-----------+-------------------+-----+--------------+-----+------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-----+
|           1|    704|   STANDARD|        PARC PARQUE|   PR|NOT ACCEPTABLE|17.96|-66.22| 0.38|-0.87|  0.3|         NA|     US|     Parc Parque, PR|NA-US-PR-PARC PARQUE|        false|           null|               null|      null| null|
|           2|    704|   STANDARD|PASEO COSTA DEL SUR|  

## Apache Parquet Pyspark Example


In [119]:

data =[("James ","","Smith","36636","M",3000),
              ("Michael ","Rose","","40288","M",4000),
              ("Robert ","","Williams","42114","M",4000),
              ("Maria ","Anne","Jones","39192","F",4000),
              ("Jen","Mary","Brown","","F",-1)]
columns=["firstname","middlename","lastname","dob","gender","salary"]
df=spark.createDataFrame(data,columns)

In [120]:
df.write.parquet("../data/parquest_data/people.parquet")

                                                                                

In [126]:
parqDF=spark.read.parquet("../data/parquest_data/people.parquet")

parqDF.createOrReplaceTempView("ParquetTable")

In [129]:
spark.sql("select * from ParquetTable where gender='M'").show()

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|  dob|gender|salary|
+---------+----------+--------+-----+------+------+
|  Robert |          |Williams|42114|     M|  4000|
| Michael |      Rose|        |40288|     M|  4000|
|   James |          |   Smith|36636|     M|  3000|
+---------+----------+--------+-----+------+------+



In [131]:
# Read JSON file into dataframe
df = spark.read.json("../data/zipcodes.json")
df.printSchema()
df.show(3)

root
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Decommisioned: boolean (nullable = true)
 |-- EstimatedPopulation: long (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- LocationText: string (nullable = true)
 |-- LocationType: string (nullable = true)
 |-- Long: double (nullable = true)
 |-- Notes: string (nullable = true)
 |-- RecordNumber: long (nullable = true)
 |-- State: string (nullable = true)
 |-- TaxReturnsFiled: long (nullable = true)
 |-- TotalWages: long (nullable = true)
 |-- WorldRegion: string (nullable = true)
 |-- Xaxis: double (nullable = true)
 |-- Yaxis: double (nullable = true)
 |-- Zaxis: double (nullable = true)
 |-- ZipCodeType: string (nullable = true)
 |-- Zipcode: long (nullable = true)

+-------------------+-------+-------------+-------------------+-----+--------------------+--------------------+--------------+------+-----+------------+-----+---------------+----------+-----

In [132]:
## read multiple json from a folder
# Read multiple files
df2 = spark.read.json(
    ['../data/zipcode1.json','../data/zipcode2.json'])
df2.show()  

+-------------------+-------+-------------+-----+--------------------+--------------------+--------------+------+------------+-----+-----------+-----+-----+-----+-----------+-------+
|               City|Country|Decommisioned|  Lat|            Location|        LocationText|  LocationType|  Long|RecordNumber|State|WorldRegion|Xaxis|Yaxis|Zaxis|ZipCodeType|Zipcode|
+-------------------+-------+-------------+-----+--------------------+--------------------+--------------+------+------------+-----+-----------+-----+-----+-----+-----------+-------+
|PASEO COSTA DEL SUR|     US|        false|17.96|NA-US-PR-PASEO CO...|Paseo Costa Del S...|NOT ACCEPTABLE|-66.22|           2|   PR|         NA| 0.38|-0.87|  0.3|   STANDARD|    704|
|       BDA SAN LUIS|     US|        false|18.14|NA-US-PR-BDA SAN ...|    Bda San Luis, PR|NOT ACCEPTABLE|-66.26|          10|   PR|         NA| 0.38|-0.86| 0.31|   STANDARD|    709|
|        PARC PARQUE|     US|        false|17.96|NA-US-PR-PARC PARQUE|     Parc Parqu

In [133]:
ls ../data

cover_letter.txt        simple-zipcodes.csv  zipcodes.csv
data.txt                small_zipcode.csv    zipcodes.json
multiline-zipcode.json  test.txt             [0m[01;34mzipcodes-state[0m/
[01;34mparquest_data[0m/          zipcode1.json        [01;34mzipcodes-state-more[0m/
README.md               zipcode2.json


In [136]:
spark.read.csv(["../data/small_zipcode.csv","../data/small_zipcode.csv"]).show()

+---+-------+--------+-------------------+-----+----------+
|_c0|    _c1|     _c2|                _c3|  _c4|       _c5|
+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704|    null|PASEO COSTA DEL SUR|   PR|      null|
|  3|    709|    null|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               null|   TX|      null|
| id|zipcode|    type|               city|state|population|
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704|    null|PASEO COSTA DEL SUR|   PR|      null|
|  3|    709|    null|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               null|   TX|      null|
+---+-------+--------+-------------------+-----+----------+



## Writing Avro Partition Data

In [143]:
data = [("James ","","Smith",2018,1,"M",3000),
      ("Michael ","Rose","",2010,3,"M",4000),
      ("Robert ","","Williams",2010,3,"M",4000),
      ("Maria ","Anne","Jones",2005,5,"F",4000),
      ("Jen","Mary","Brown",2010,7,"",-1)
       ]

columns = ["firstname", "middlename", "lastname", "dob_year",
 "dob_month", "gender", "salary"]
df = spark.createDataFrame(data,columns)

df.show()

+---------+----------+--------+--------+---------+------+------+
|firstname|middlename|lastname|dob_year|dob_month|gender|salary|
+---------+----------+--------+--------+---------+------+------+
|   James |          |   Smith|    2018|        1|     M|  3000|
| Michael |      Rose|        |    2010|        3|     M|  4000|
|  Robert |          |Williams|    2010|        3|     M|  4000|
|   Maria |      Anne|   Jones|    2005|        5|     F|  4000|
|      Jen|      Mary|   Brown|    2010|        7|      |    -1|
+---------+----------+--------+--------+---------+------+------+



In [147]:
df.write.format("avro").save("../data/avro_data/profile.avro")