# Simple Data Exploration Using PySpark

# Simple Data Exploration Using PySpark

# SPARK DFs

In [0]:
#Importing the necessary libraries

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col
from pyspark.sql.functions import lit
from pyspark.sql.functions import udf
from pyspark.sql.functions import sum, avg, max, min, mean, count

In [None]:
# Build the Spark session named "DataFrame", or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("DataFrame")
    .getOrCreate()
)

In [None]:
# Location of the taxi-zones CSV file that is read in the next cell.
path = "/FileStore/tables/taxi_zones.csv"

In [None]:
# Load the CSV into a DataFrame: the first line supplies the column names
# and the column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(path)
)

In [0]:
# Show the contents of the freshly loaded DataFrame.

display(df)

X,Y,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough
-74.1767857452143,40.6895156480431,1,0.116357453189,0.0007823067885,Newark Airport,1,EWR
-73.8261257703202,40.6257242377511,2,0.43346966679,0.00486634037837,Jamaica Bay,2,Queens
-73.8494789238597,40.8658875419774,3,0.0843411059012,0.000314414156821,Allerton/Pelham Gardens,3,Bronx
-73.9770229219339,40.7241521436714,4,0.0435665270921,0.000111871946192,Alphabet City,4,Manhattan
-74.1899296712375,40.550340123832,5,0.0921464898574,0.000497957489363,Arden Heights,5,Staten Island
-74.0677744607421,40.5990621740821,6,0.150490542523,0.000606460984581,Arrochar/Fort Wadsworth,6,Staten Island
-73.9214905669465,40.761084729151,7,0.107417171123,0.000389787989274,Astoria,7,Queens
-73.9232024092836,40.7786069617704,8,0.0275906911574,2.6587716279e-05,Astoria Park,8,Queens
-73.7880202487407,40.7544109271114,9,0.0997840924705,0.000338443803197,Auburndale,9,Queens
-73.7916654578906,40.6781247031195,10,0.0998394794152,0.000435823818081,Baisley Park,10,Queens


In [0]:
# Print the DataFrame's schema (column names, types and nullability).
# printSchema() writes the schema tree to stdout itself and returns None,
# so it must not be wrapped in display().

df.printSchema()

# Manually Defining a Schema

In [0]:
# Explicit schema describing four nullable columns.
# NOTE(review): this schema is only defined here; the CSV read in this notebook
# uses inferSchema=True and never receives it — confirm whether it should be applied.
Schema = (
  StructType()
  .add("OBJECTID", IntegerType(), True)
  .add("LocationID", IntegerType(), True)
  .add("zone", StringType(), True)
  .add("borough", StringType(), True)
)

# Selecting Columns 

In [0]:
# Project the DataFrame down to the "zone" and "borough" columns and show the result.

display(df.select("zone", "borough"))

zone,borough
Newark Airport,EWR
Jamaica Bay,Queens
Allerton/Pelham Gardens,Bronx
Alphabet City,Manhattan
Arden Heights,Staten Island
Arrochar/Fort Wadsworth,Staten Island
Astoria,Queens
Astoria Park,Queens
Auburndale,Queens
Baisley Park,Queens


In [0]:
# Project the DataFrame down to the single "zone" column and show it.

display(df.select(df["zone"]))

zone
Newark Airport
Jamaica Bay
Allerton/Pelham Gardens
Alphabet City
Arden Heights
Arrochar/Fort Wadsworth
Astoria
Astoria Park
Auburndale
Baisley Park


In [0]:
# Listing the names of all columns in the DataFrame

df.columns

# withColumn

In [0]:
# Cast the location ID column from integer to double.
# The column is written back under the name "locationID", which is the
# spelling that appears in the table headers shown after this cell.

df = df.withColumn("locationID", df["locationID"].cast("double"))

In [0]:
# Print the schema again to inspect the column types after the cast above.
df.printSchema()

In [0]:
# Create a new column "new_ObjID" holding the value of "ObjectID" plus 20.

df = df.withColumn("new_ObjID", df["ObjectID"] + 20)

In [0]:
# Show the DataFrame, which now includes the "new_ObjID" column.
display(df)

X,Y,OBJECTID,Shape_Leng,Shape_Area,zone,locationID,borough,new_ObjID
-74.1767857452143,40.6895156480431,1,0.116357453189,0.0007823067885,Newark Airport,1.0,EWR,21
-73.8261257703202,40.6257242377511,2,0.43346966679,0.00486634037837,Jamaica Bay,2.0,Queens,22
-73.8494789238597,40.8658875419774,3,0.0843411059012,0.000314414156821,Allerton/Pelham Gardens,3.0,Bronx,23
-73.9770229219339,40.7241521436714,4,0.0435665270921,0.000111871946192,Alphabet City,4.0,Manhattan,24
-74.1899296712375,40.550340123832,5,0.0921464898574,0.000497957489363,Arden Heights,5.0,Staten Island,25
-74.0677744607421,40.5990621740821,6,0.150490542523,0.000606460984581,Arrochar/Fort Wadsworth,6.0,Staten Island,26
-73.9214905669465,40.761084729151,7,0.107417171123,0.000389787989274,Astoria,7.0,Queens,27
-73.9232024092836,40.7786069617704,8,0.0275906911574,2.6587716279e-05,Astoria Park,8.0,Queens,28
-73.7880202487407,40.7544109271114,9,0.0997840924705,0.000338443803197,Auburndale,9.0,Queens,29
-73.7916654578906,40.6781247031195,10,0.0998394794152,0.000435823818081,Baisley Park,10.0,Queens,30


In [0]:
# Add a column named "New_Column" whose value is the empty string on every row.

df = df.withColumn(
    colName="New_Column",
    col=lit(""),
)

In [0]:
# Show the DataFrame, which now includes the empty "New_Column".
display(df)

X,Y,OBJECTID,Shape_Leng,Shape_Area,zone,locationID,borough,new_ObjID,New_Column
-74.1767857452143,40.6895156480431,1,0.116357453189,0.0007823067885,Newark Airport,1.0,EWR,21,
-73.8261257703202,40.6257242377511,2,0.43346966679,0.00486634037837,Jamaica Bay,2.0,Queens,22,
-73.8494789238597,40.8658875419774,3,0.0843411059012,0.000314414156821,Allerton/Pelham Gardens,3.0,Bronx,23,
-73.9770229219339,40.7241521436714,4,0.0435665270921,0.000111871946192,Alphabet City,4.0,Manhattan,24,
-74.1899296712375,40.550340123832,5,0.0921464898574,0.000497957489363,Arden Heights,5.0,Staten Island,25,
-74.0677744607421,40.5990621740821,6,0.150490542523,0.000606460984581,Arrochar/Fort Wadsworth,6.0,Staten Island,26,
-73.9214905669465,40.761084729151,7,0.107417171123,0.000389787989274,Astoria,7.0,Queens,27,
-73.9232024092836,40.7786069617704,8,0.0275906911574,2.6587716279e-05,Astoria Park,8.0,Queens,28,
-73.7880202487407,40.7544109271114,9,0.0997840924705,0.000338443803197,Auburndale,9.0,Queens,29,
-73.7916654578906,40.6781247031195,10,0.0998394794152,0.000435823818081,Baisley Park,10.0,Queens,30,


# withColumnRenamed : To rename column

In [0]:
# Rename column "X" to "Long", then show the result.

df = df.withColumnRenamed(existing="X", new="Long")
display(df)

Long,Y,OBJECTID,Shape_Leng,Shape_Area,zone,locationID,borough,new_ObjID,New_Column
-74.1767857452143,40.6895156480431,1,0.116357453189,0.0007823067885,Newark Airport,1.0,EWR,21,
-73.8261257703202,40.6257242377511,2,0.43346966679,0.00486634037837,Jamaica Bay,2.0,Queens,22,
-73.8494789238597,40.8658875419774,3,0.0843411059012,0.000314414156821,Allerton/Pelham Gardens,3.0,Bronx,23,
-73.9770229219339,40.7241521436714,4,0.0435665270921,0.000111871946192,Alphabet City,4.0,Manhattan,24,
-74.1899296712375,40.550340123832,5,0.0921464898574,0.000497957489363,Arden Heights,5.0,Staten Island,25,
-74.0677744607421,40.5990621740821,6,0.150490542523,0.000606460984581,Arrochar/Fort Wadsworth,6.0,Staten Island,26,
-73.9214905669465,40.761084729151,7,0.107417171123,0.000389787989274,Astoria,7.0,Queens,27,
-73.9232024092836,40.7786069617704,8,0.0275906911574,2.6587716279e-05,Astoria Park,8.0,Queens,28,
-73.7880202487407,40.7544109271114,9,0.0997840924705,0.000338443803197,Auburndale,9.0,Queens,29,
-73.7916654578906,40.6781247031195,10,0.0998394794152,0.000435823818081,Baisley Park,10.0,Queens,30,


In [None]:
# Filter

In [0]:
# Keep only the rows whose borough is "Queens".

queens = df.where(col("borough") == "Queens")

In [0]:
# Show the rows selected into `queens`.
display(queens)

Long,Y,OBJECTID,Shape_Leng,Shape_Area,zone,locationID,borough,new_ObjID,New_Column
-73.8261257703202,40.6257242377511,2,0.43346966679,0.00486634037837,Jamaica Bay,2.0,Queens,22,
-73.9214905669465,40.761084729151,7,0.107417171123,0.000389787989274,Astoria,7.0,Queens,27,
-73.9232024092836,40.7786069617704,8,0.0275906911574,2.6587716279e-05,Astoria Park,8.0,Queens,28,
-73.7880202487407,40.7544109271114,9,0.0997840924705,0.000338443803197,Auburndale,9.0,Queens,29,
-73.7916654578906,40.6781247031195,10,0.0998394794152,0.000435823818081,Baisley Park,10.0,Queens,30,
-73.7879710847436,40.7852195006457,15,0.14433622262,0.000925219395547,Bay Terrace/Fort Totten,15.0,Queens,35,
-73.7716678221165,40.7612088345005,16,0.141291873771,0.000871889446182,Bayside,16.0,Queens,36,
-73.7278693961567,40.7364724391387,19,0.101824875452,0.000546661094782,Bellerose,19.0,Queens,39,
-73.9097811978144,40.5589500919216,27,0.202508808518,0.00134088762746,Breezy Point/Fort Tilden/Riis Beach,27.0,Queens,47,
-73.80732908405,40.710852781755,28,0.097960782214,0.000291203927662,Briarwood/Jamaica Hills,28.0,Queens,48,


In [0]:
# Filter on two columns at once: borough must be "Queens" AND OBJECTID must be at least 30.

in_queens = col("borough") == "Queens"
objectid_at_least_30 = col("OBJECTID") >= 30
display(df.where(in_queens & objectid_at_least_30))

Long,Y,OBJECTID,Shape_Leng,Shape_Area,zone,locationID,borough,new_ObjID,New_Column
-73.8200975464803,40.6048721837899,30,0.0945097669793,0.000145862107626,Broad Channel,30.0,Queens,50,
-73.7355495585459,40.6932955704325,38,0.0832175685234,0.000327392684821,Cambria Heights,38.0,Queens,58,
-73.8440703458857,40.7819879584294,53,0.161500913385,0.000947530980821,College Point,53.0,Queens,73,
-73.8590533534623,40.7415986155017,56,0.0568478126677,0.000180907844436,Corona,56.0,Queens,76,
-73.853384474855,40.7523160392058,57,0.0192705048557,1.80259807917e-05,Corona,56.0,Queens,77,
-73.7313921625056,40.760631276033,64,0.18445188474,0.00105790284614,Douglaston,64.0,Queens,84,
-73.8684029143644,40.7639478203254,70,0.0638403183367,0.000195458476728,East Elmhurst,70.0,Queens,90,
-73.8065843331469,40.7536974869459,73,0.0853020209129,0.000291799754395,East Flushing,73.0,Queens,93,
-73.8723440095934,40.7384639203673,82,0.119875649697,0.000323601079994,Elmhurst,82.0,Queens,102,
-73.889221829475,40.7401456031773,83,0.105984933269,0.000217463718718,Elmhurst/Maspeth,83.0,Queens,103,


In [0]:
# Keep the rows whose borough is any one of several values, using isin().

vals = ["EWR", "Staten Island"]
display(df.where(col("borough").isin(vals)))

Long,Y,OBJECTID,Shape_Leng,Shape_Area,zone,locationID,borough,new_ObjID,New_Column
-74.1767857452143,40.6895156480431,1,0.116357453189,0.0007823067885,Newark Airport,1.0,EWR,21,
-74.1899296712375,40.550340123832,5,0.0921464898574,0.000497957489363,Arden Heights,5.0,Staten Island,25,
-74.0677744607421,40.5990621740821,6,0.150490542523,0.000606460984581,Arrochar/Fort Wadsworth,6.0,Staten Island,26,
-74.1594432875648,40.607504219542,23,0.290556028962,0.00219556576201,Bloomfield/Emerson Hill,23.0,Staten Island,43,
-74.2295465457844,40.527298175003,44,0.235688967594,0.00194465649192,Charleston/Tottenville,44.0,Staten Island,64,
-74.1739373269384,40.5320172010873,84,0.233623987032,0.00207375572052,Eltingville/Annadale/Prince's Bay,84.0,Staten Island,104,
-74.187702737221,40.5796179453647,99,0.1833714893,0.00121016463877,Freshkills Park,99.0,Staten Island,119,
-74.1527146857649,40.5488307159092,109,0.178267819599,0.00116960076185,Great Kills,109.0,Staten Island,129,
-74.1258464065771,40.5432675249983,110,0.103946292913,0.000525745098785,Great Kills Park,110.0,Staten Island,130,
-74.0924861023209,40.6201275450588,115,0.116169413964,0.000373168991958,Grymes Hill/Clifton,115.0,Staten Island,135,


In [0]:
# Filtering with (startswith): keep rows whose borough begins with the given prefix.
# NOTE(review): the prefix 'a' is lowercase, while the borough values shown elsewhere in this
# notebook are capitalized (e.g. "Queens", "Bronx") — presumably this matches no rows; confirm intent.

df.filter(df.borough.startswith('a')).show()

In [0]:
# Keep the rows whose borough begins with an uppercase "E".
display(df.where(col("borough").startswith("E")))

Long,Y,OBJECTID,Shape_Leng,Shape_Area,zone,locationID,borough,new_ObjID,New_Column
-74.1767857452143,40.6895156480431,1,0.116357453189,0.0007823067885,Newark Airport,1.0,EWR,21,


In [0]:
# Keep the rows whose borough contains the substring "nx", using contains().

display(df.where(col("borough").contains("nx")))

Long,Y,OBJECTID,Shape_Leng,Shape_Area,zone,locationID,borough,new_ObjID,New_Column
-73.8494789238597,40.8658875419774,3,0.0843411059012,0.000314414156821,Allerton/Pelham Gardens,3.0,Bronx,23,
-73.8869219948557,40.8687628819908,18,0.0697995498569,0.000148850163948,Bedford Park,18.0,Bronx,38,
-73.8860348405346,40.8577731142544,20,0.0514401924362,0.000134512633032,Belmont,20.0,Bronx,40,
-73.875722045295,40.85992052114,31,0.0964245666516,0.000333975927329,Bronx Park,31.0,Bronx,51,
-73.8646241408313,40.8644517064252,32,0.05426721601,0.000150879171971,Bronxdale,32.0,Bronx,52,
-73.7864863118305,40.8474998984855,46,0.134475429879,0.000926391677672,City Island,46.0,Bronx,66,
-73.8969291492323,40.8457549453718,47,0.0898275563294,0.000163198117339,Claremont/Bathgate,47.0,Bronx,67,
-73.8304236318969,40.874061550909,51,0.0953613442277,0.000395756553505,Co-Op City,51.0,Bronx,71,
-73.8207053822972,40.8414754987567,58,0.0598554094851,0.000204980931361,Country Club,58.0,Bronx,78,
-73.8930744338338,40.8388599905968,59,0.0377948070893,6.28765230648e-05,Crotona Park,59.0,Bronx,79,


# Count, Distinct and Duplicate

In [0]:
# Count the total number of rows in the DataFrame.

df.count()

In [0]:
# Count the rows whose borough is "Bronx".

df.where(col("borough") == "Bronx").count()

In [0]:
# Show the distinct values of the borough column.

display(df.select("borough").distinct())

borough
Queens
EWR
Brooklyn
Staten Island
Manhattan
Bronx


In [0]:
# Show the distinct values of the zone column.

display(df.select("zone").distinct())

zone
Governor's Island/Ellis Island/Liberty Island
Homecrest
Corona
Bensonhurst West
Westerleigh
Newark Airport
Charleston/Tottenville
Douglaston
East Concourse/Concourse Village
Mount Hope


In [0]:
# Show the distinct (zone, borough) combinations.
display(df.select("zone", "borough").distinct())

zone,borough
East Village,Manhattan
Whitestone,Queens
Long Island City/Queens Plaza,Queens
Battery Park City,Manhattan
Old Astoria,Queens
SoHo,Manhattan
Mount Hope,Bronx
South Jamaica,Queens
Bayside,Queens
Manhattan Beach,Brooklyn


In [0]:
# Drop rows that duplicate an earlier (borough, zone) combination.
# Unlike select(...).distinct(), the result keeps every column of the DataFrame.

display(df.drop_duplicates(["borough", "zone"]))

Long,Y,OBJECTID,Shape_Leng,Shape_Area,zone,locationID,borough,new_ObjID,New_Column
-73.8494789238597,40.8658875419774,3,0.0843411059012,0.000314414156821,Allerton/Pelham Gardens,3.0,Bronx,23,
-73.8869219948557,40.8687628819908,18,0.0697995498569,0.000148850163948,Bedford Park,18.0,Bronx,38,
-73.8860348405346,40.8577731142544,20,0.0514401924362,0.000134512633032,Belmont,20.0,Bronx,40,
-73.875722045295,40.85992052114,31,0.0964245666516,0.000333975927329,Bronx Park,31.0,Bronx,51,
-73.8646241408313,40.8644517064252,32,0.05426721601,0.000150879171971,Bronxdale,32.0,Bronx,52,
-73.7864863118305,40.8474998984855,46,0.134475429879,0.000926391677672,City Island,46.0,Bronx,66,
-73.8969291492323,40.8457549453718,47,0.0898275563294,0.000163198117339,Claremont/Bathgate,47.0,Bronx,67,
-73.8304236318969,40.874061550909,51,0.0953613442277,0.000395756553505,Co-Op City,51.0,Bronx,71,
-73.8207053822972,40.8414754987567,58,0.0598554094851,0.000204980931361,Country Club,58.0,Bronx,78,
-73.8930744338338,40.8388599905968,59,0.0377948070893,6.28765230648e-05,Crotona Park,59.0,Bronx,79,


# sort and orderBy

In [0]:
# Sort the DataFrame by location ID (ascending, the default) using sort().

display(df.sort(col("LOCATIONID")))

Long,Y,OBJECTID,Shape_Leng,Shape_Area,zone,locationID,borough,new_ObjID,New_Column
-74.1767857452143,40.6895156480431,1,0.116357453189,0.0007823067885,Newark Airport,1.0,EWR,21,
-73.8261257703202,40.6257242377511,2,0.43346966679,0.00486634037837,Jamaica Bay,2.0,Queens,22,
-73.8494789238597,40.8658875419774,3,0.0843411059012,0.000314414156821,Allerton/Pelham Gardens,3.0,Bronx,23,
-73.9770229219339,40.7241521436714,4,0.0435665270921,0.000111871946192,Alphabet City,4.0,Manhattan,24,
-74.1899296712375,40.550340123832,5,0.0921464898574,0.000497957489363,Arden Heights,5.0,Staten Island,25,
-74.0677744607421,40.5990621740821,6,0.150490542523,0.000606460984581,Arrochar/Fort Wadsworth,6.0,Staten Island,26,
-73.9214905669465,40.761084729151,7,0.107417171123,0.000389787989274,Astoria,7.0,Queens,27,
-73.9232024092836,40.7786069617704,8,0.0275906911574,2.6587716279e-05,Astoria Park,8.0,Queens,28,
-73.7880202487407,40.7544109271114,9,0.0997840924705,0.000338443803197,Auburndale,9.0,Queens,29,
-73.7916654578906,40.6781247031195,10,0.0998394794152,0.000435823818081,Baisley Park,10.0,Queens,30,


In [0]:
# Sort the DataFrame by location ID in ascending order using orderBy().

display(df.orderBy(col("LOCATIONID").asc()))

Long,Y,OBJECTID,Shape_Leng,Shape_Area,zone,locationID,borough,new_ObjID,New_Column
-74.1767857452143,40.6895156480431,1,0.116357453189,0.0007823067885,Newark Airport,1.0,EWR,21,
-73.8261257703202,40.6257242377511,2,0.43346966679,0.00486634037837,Jamaica Bay,2.0,Queens,22,
-73.8494789238597,40.8658875419774,3,0.0843411059012,0.000314414156821,Allerton/Pelham Gardens,3.0,Bronx,23,
-73.9770229219339,40.7241521436714,4,0.0435665270921,0.000111871946192,Alphabet City,4.0,Manhattan,24,
-74.1899296712375,40.550340123832,5,0.0921464898574,0.000497957489363,Arden Heights,5.0,Staten Island,25,
-74.0677744607421,40.5990621740821,6,0.150490542523,0.000606460984581,Arrochar/Fort Wadsworth,6.0,Staten Island,26,
-73.9214905669465,40.761084729151,7,0.107417171123,0.000389787989274,Astoria,7.0,Queens,27,
-73.9232024092836,40.7786069617704,8,0.0275906911574,2.6587716279e-05,Astoria Park,8.0,Queens,28,
-73.7880202487407,40.7544109271114,9,0.0997840924705,0.000338443803197,Auburndale,9.0,Queens,29,
-73.7916654578906,40.6781247031195,10,0.0998394794152,0.000435823818081,Baisley Park,10.0,Queens,30,


In [0]:
# Sort by location ID in descending order (highest ID first).

display(df.sort("locationID", ascending=False))

Long,Y,OBJECTID,Shape_Leng,Shape_Area,zone,locationID,borough,new_ObjID,New_Column
-73.9512079916544,40.7784958687768,263,0.0370166252994,6.57697664169e-05,Yorkville West,263.0,Manhattan,283,
-73.9458298180079,40.7765342289951,262,0.0490636231541,0.000122330270966,Yorkville East,262.0,Manhattan,282,
-74.0129193755126,40.708975618892,261,0.0271204563616,3.43423231652e-05,World Trade Center,261.0,Manhattan,281,
-73.9037132789432,40.7467977944692,260,0.133514154636,0.000422345326907,Woodside,260.0,Queens,280,
-73.8563511172889,40.8991027731978,259,0.126750305191,0.000394552487366,Woodlawn/Wakefield,259.0,Bronx,279,
-73.8566390530717,40.6901263678129,258,0.0890133787693,0.000366209617143,Woodhaven,258.0,Queens,278,
-73.9772393111852,40.6536644952118,257,0.0586690259793,0.00013890947321,Windsor Terrace,257.0,Brooklyn,277,
-73.9591078838642,40.7109771296904,256,0.0679149669603,0.000168611097013,Williamsburg (South Side),256.0,Brooklyn,276,
-73.9571337756889,40.7188341922346,255,0.0623841997664,0.000172309184842,Williamsburg (North Side),255.0,Brooklyn,275,
-73.8582696518177,40.8832233292992,254,0.0858863754861,0.000360040216032,Williamsbridge/Olinville,254.0,Bronx,274,


# groupBy

In [0]:
# Group the rows by zone and count how many rows fall in each group.

display(df.groupBy(col("zone")).count())

zone,count
Governor's Island/Ellis Island/Liberty Island,3
Homecrest,1
Corona,2
Bensonhurst West,1
Westerleigh,1
Newark Airport,1
Charleston/Tottenville,1
Douglaston,1
East Concourse/Concourse Village,1
Mount Hope,1


In [0]:
# Group the rows by borough and count how many rows fall in each group.
display(df.groupBy(col("borough")).count())

borough,count
Queens,69
EWR,1
Brooklyn,61
Staten Island,20
Manhattan,69
Bronx,43


In [0]:
# Per borough: number of rows, largest objectid and smallest locationid.
# count / max / min here are the pyspark.sql.functions versions imported at the
# top of the notebook, not the Python built-ins.
per_borough = df.groupBy("borough")
display(per_borough.agg(count("*"), max("objectid"), min("locationid")))

borough,count(1),max(objectid),min(locationid)
Queens,69,260,2.0
EWR,1,1,1.0
Brooklyn,61,257,11.0
Staten Island,20,251,5.0
Manhattan,69,263,4.0
Bronx,43,259,3.0


In [0]:
# Count rows per (zone, locationid) pair for Bronx zones whose location ID is above 50.
# Both row-level conditions are applied before the aggregation. The earlier version
# filtered the aggregated result with `df.locationID`, a column reference taken from
# the pre-aggregation DataFrame; filtering first avoids that cross-frame reference
# and selects the same rows, because locationid is one of the grouping keys.
bronx_zone_counts = (
    df.filter((col("borough") == "Bronx") & (col("locationID") > 50))
    .groupBy("zone", "locationid")
    .agg(count("*"))
)
display(bronx_zone_counts)

zone,locationid,count(1)
Mount Hope,169.0,1
Soundview/Bruckner,212.0,1
West Concourse,247.0,1
Van Cortlandt Village,241.0,1
Pelham Bay Park,184.0,1
East Tremont,78.0,1
East Concourse/Concourse Village,69.0,1
Longwood,147.0,1
Riverdale/North Riverdale/Fieldston,200.0,1
Pelham Parkway,185.0,1


# UDFs "User defined functions"

In [0]:
# A user-defined function (UDF) that takes the "zone" and "borough" values of a row,
# joins them with a single space, and returns the combined string.

def zone_borough (zone, borough):
  """Return "<zone> <borough>", or None when either value is missing.

  Args:
    zone: value of the "zone" column for one row (a string or None).
    borough: value of the "borough" column for one row (a string or None).

  Returns:
    The two values separated by one space, or None if either input is None.
    Returning None keeps a null input from raising a TypeError on the
    string concatenation inside the UDF.
  """
  if zone is None or borough is None:
    return None
  return zone + " " + borough

# Wrap the Python function directly (no lambda needed); StringType is the UDF's return type.
z_b_UDF = udf(zone_borough, StringType())

df = df.withColumn ("zone borough", z_b_UDF(df.zone, df.borough))
display(df)

Long,Y,OBJECTID,Shape_Leng,Shape_Area,zone,locationID,borough,new_ObjID,New_Column,zone borough
-74.1767857452143,40.6895156480431,1,0.116357453189,0.0007823067885,Newark Airport,1.0,EWR,21,,Newark Airport EWR
-73.8261257703202,40.6257242377511,2,0.43346966679,0.00486634037837,Jamaica Bay,2.0,Queens,22,,Jamaica Bay Queens
-73.8494789238597,40.8658875419774,3,0.0843411059012,0.000314414156821,Allerton/Pelham Gardens,3.0,Bronx,23,,Allerton/Pelham Gardens Bronx
-73.9770229219339,40.7241521436714,4,0.0435665270921,0.000111871946192,Alphabet City,4.0,Manhattan,24,,Alphabet City Manhattan
-74.1899296712375,40.550340123832,5,0.0921464898574,0.000497957489363,Arden Heights,5.0,Staten Island,25,,Arden Heights Staten Island
-74.0677744607421,40.5990621740821,6,0.150490542523,0.000606460984581,Arrochar/Fort Wadsworth,6.0,Staten Island,26,,Arrochar/Fort Wadsworth Staten Island
-73.9214905669465,40.761084729151,7,0.107417171123,0.000389787989274,Astoria,7.0,Queens,27,,Astoria Queens
-73.9232024092836,40.7786069617704,8,0.0275906911574,2.6587716279e-05,Astoria Park,8.0,Queens,28,,Astoria Park Queens
-73.7880202487407,40.7544109271114,9,0.0997840924705,0.000338443803197,Auburndale,9.0,Queens,29,,Auburndale Queens
-73.7916654578906,40.6781247031195,10,0.0998394794152,0.000435823818081,Baisley Park,10.0,Queens,30,,Baisley Park Queens


# Writing the DataFrame to storage

In [0]:
# Directory that receives the CSV output written in the next cell.
output_path = "/FileStore/tables/taxi_zones/output/"

In [0]:
# Write the DataFrame as CSV files (with a header row) under output_path.
# mode("overwrite") replaces any output left by a previous run; the default save mode
# raises an error when the target path already exists, which breaks re-running the notebook.

df.write.mode("overwrite").csv(output_path, header=True)