# Data Frames Basics

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active Spark session if there is one; otherwise start one named 'Basics'.
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

Read csv file:

In [19]:
# Load the CSV, treating its first line as the column names.
df = (
    spark.read
    .option('header', True)
    .csv('data/aiddata-countries.csv')
)

In [20]:
# Print the first 3 rows as a text table.
df.show(3)

+---+----------+------------+----+------------+---------+------------------------------+----------------------+----------------------+
|_c0|aiddata_id|aiddata_2_id|year|       donor|recipient|commitment_amount_usd_constant|coalesced_purpose_code|coalesced_purpose_name|
+---+----------+------------+----+------------+---------+------------------------------+----------------------+----------------------+
|  1|   2414478|          NA|1977|Saudi Arabia|    India|                     348718518|                 23030|  Power generation/...|
|  2|   2414509|          NA|1977|Saudi Arabia|   Brazil|                     191647004|                 23040|  Electrical transm...|
|  3|   2414635|          NA|1983|Saudi Arabia|    India|                      79371799|                 21030|        Rail transport|
+---+----------+------------+----+------------+---------+------------------------------+----------------------+----------------------+
only showing top 3 rows



In [21]:
# Column names, returned as a Python list of strings.
df.columns

['_c0',
 'aiddata_id',
 'aiddata_2_id',
 'year',
 'donor',
 'recipient',
 'commitment_amount_usd_constant',
 'coalesced_purpose_code',
 'coalesced_purpose_name']

In [22]:
# The schema as a StructType object; every field is StringType here.
df.schema

StructType(List(StructField(_c0,StringType,true),StructField(aiddata_id,StringType,true),StructField(aiddata_2_id,StringType,true),StructField(year,StringType,true),StructField(donor,StringType,true),StructField(recipient,StringType,true),StructField(commitment_amount_usd_constant,StringType,true),StructField(coalesced_purpose_code,StringType,true),StructField(coalesced_purpose_name,StringType,true)))

In [23]:
# The same schema, printed as an indented tree.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- aiddata_id: string (nullable = true)
 |-- aiddata_2_id: string (nullable = true)
 |-- year: string (nullable = true)
 |-- donor: string (nullable = true)
 |-- recipient: string (nullable = true)
 |-- commitment_amount_usd_constant: string (nullable = true)
 |-- coalesced_purpose_code: string (nullable = true)
 |-- coalesced_purpose_name: string (nullable = true)



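`describe()` returns a DataFrame with summary statistics (count, mean, stddev, min, max) for numeric and string columns. Every column is still a string at this point, so `mean` is `null` for columns whose values are not numbers.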

In [24]:
# describe() builds a summary DataFrame (count, mean, stddev, ...); show() prints it.
df.describe().show()

+-------+------------------+--------------------+--------------------+------------------+-------------+-------------+------------------------------+----------------------+----------------------+
|summary|               _c0|          aiddata_id|        aiddata_2_id|              year|        donor|    recipient|commitment_amount_usd_constant|coalesced_purpose_code|coalesced_purpose_name|
+-------+------------------+--------------------+--------------------+------------------+-------------+-------------+------------------------------+----------------------+----------------------+
|  count|             98540|               98540|               98540|             98540|        98540|        98540|                         98540|                 98540|                 98540|
|   mean|           49270.5|5.689305473480566...| 2.414586565459502E7|2004.0098843109397|         null|         null|            3722388.3759488533|    33356.897432514714|                  null|
| stddev|28446.1921001739

In [25]:
# Re-check the schema: describe() returned a new DataFrame and left df's columns as strings.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- aiddata_id: string (nullable = true)
 |-- aiddata_2_id: string (nullable = true)
 |-- year: string (nullable = true)
 |-- donor: string (nullable = true)
 |-- recipient: string (nullable = true)
 |-- commitment_amount_usd_constant: string (nullable = true)
 |-- coalesced_purpose_code: string (nullable = true)
 |-- coalesced_purpose_name: string (nullable = true)



# Defining Schema

* We can also specify the schema explicitly instead of letting every column be read as a string

In [26]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType

format - StructType([StructField(name = 'Column name', dataType = StringType(), nullable = True), .... ])

In [27]:
# Explicit schema for the aid data CSV: one (name, type) pair per column, in file
# order. Every field is declared nullable.
# NOTE(review): FloatType is a 32-bit float (about 7 significant digits), so amounts
# such as 348718518 cannot be stored exactly -- consider DoubleType (needs an extra
# import from pyspark.sql.types).
data_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ('_c0', StringType()),
        ('aiddata_id', IntegerType()),
        ('aiddata_2_id', StringType()),
        ('year', IntegerType()),
        ('donor', StringType()),
        ('recipient', StringType()),
        ('commitment_amount_usd_constant', FloatType()),
        ('coalesced_purpose_code', StringType()),
        ('coalesced_purpose_name', StringType()),
    ]
])

Specify the schema while reading the file.

In [29]:
# Re-read the CSV, applying the hand-written schema instead of all-string columns.
df = (
    spark.read
    .schema(data_schema)
    .option('header', True)
    .csv('data/aiddata-countries.csv')
)

In [30]:
# Confirm that the column types now follow data_schema.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- aiddata_id: integer (nullable = true)
 |-- aiddata_2_id: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- donor: string (nullable = true)
 |-- recipient: string (nullable = true)
 |-- commitment_amount_usd_constant: float (nullable = true)
 |-- coalesced_purpose_code: string (nullable = true)
 |-- coalesced_purpose_name: string (nullable = true)



* Spark can also infer schema for us when reading the file

In [31]:
# Re-read the CSV and let Spark work out each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('data/aiddata-countries.csv')
)

In [32]:
# Inspect the types Spark inferred from the data.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- aiddata_id: double (nullable = true)
 |-- aiddata_2_id: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- donor: string (nullable = true)
 |-- recipient: string (nullable = true)
 |-- commitment_amount_usd_constant: double (nullable = true)
 |-- coalesced_purpose_code: integer (nullable = true)
 |-- coalesced_purpose_name: string (nullable = true)



# Getting Data from Data Frame

We can use indexing to get a `Column` object:

In [33]:
# Bracket indexing yields a Column expression, not the column's data.
df['year']

Column<b'year'>

To check type of object:

In [34]:
# Confirm the Python type of the indexed column.
type(df['year'])

pyspark.sql.column.Column

To get a DataFrame containing just that column, use `select`:

In [35]:
# select() returns a new DataFrame; only its schema is displayed here, not the rows.
df.select("year")

DataFrame[year: int]

To see the data, we need to use show()

In [36]:
# Print the first 5 values of the selected column.
df.select('year').show(5)

+----+
|year|
+----+
|1977|
|1977|
|1983|
|1984|
|1984|
+----+
only showing top 5 rows



`show()` only prints the rows as a formatted table (it returns `None`), whereas `head()` returns `Row` objects.

In [37]:
# head(2) returns a Python list holding the first two Row objects.
df.head(2)

[Row(_c0=1, aiddata_id=2414478.0, aiddata_2_id='NA', year=1977, donor='Saudi Arabia', recipient='India', commitment_amount_usd_constant=348718518.0, coalesced_purpose_code=23030, coalesced_purpose_name='Power generation/renewable sources'),
 Row(_c0=2, aiddata_id=2414509.0, aiddata_2_id='NA', year=1977, donor='Saudi Arabia', recipient='Brazil', commitment_amount_usd_constant=191647004.0, coalesced_purpose_code=23040, coalesced_purpose_name='Electrical transmission/ distribution')]

In [38]:
# Each element returned by head(n) is a pyspark Row.
type(df.head(2)[0])

pyspark.sql.types.Row

* To select multiple columns

In [39]:
# select() also accepts the column names as separate arguments.
df.select('year', 'donor').show(5)

+----+------------+
|year|       donor|
+----+------------+
|1977|Saudi Arabia|
|1977|Saudi Arabia|
|1983|Saudi Arabia|
|1984|Saudi Arabia|
|1984|Saudi Arabia|
+----+------------+
only showing top 5 rows



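* Add a new column: `dataframe.withColumn('new column name', column_expression)`, where the expression is built from existing columns (e.g. `df['year'] * 2`)
* Not in place: it returns a new DataFrame and leaves the original unchanged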

In [40]:
# Derive a new column from 'year' and preview it; df itself is not modified.
(
    df
    .withColumn('double_Year', df['year'] * 2)
    .show(2)
)

+---+----------+------------+----+------------+---------+------------------------------+----------------------+----------------------+-----------+
|_c0|aiddata_id|aiddata_2_id|year|       donor|recipient|commitment_amount_usd_constant|coalesced_purpose_code|coalesced_purpose_name|double_Year|
+---+----------+------------+----+------------+---------+------------------------------+----------------------+----------------------+-----------+
|  1| 2414478.0|          NA|1977|Saudi Arabia|    India|                  3.48718518E8|                 23030|  Power generation/...|       3954|
|  2| 2414509.0|          NA|1977|Saudi Arabia|   Brazil|                  1.91647004E8|                 23040|  Electrical transm...|       3954|
+---+----------+------------+----+------------+---------+------------------------------+----------------------+----------------------+-----------+
only showing top 2 rows



* Rename a column (also not in place: the renamed DataFrame is returned and `df` itself is unchanged):

In [41]:
# The renamed DataFrame is only displayed, not assigned, so df still has a 'year' column.
df.withColumnRenamed('year', 'years')

DataFrame[_c0: int, aiddata_id: double, aiddata_2_id: string, years: int, donor: string, recipient: string, commitment_amount_usd_constant: double, coalesced_purpose_code: int, coalesced_purpose_name: string]

In [42]:
# head() with no argument returns a single Row; the field is still named 'year'.
df.head()

Row(_c0=1, aiddata_id=2414478.0, aiddata_2_id='NA', year=1977, donor='Saudi Arabia', recipient='India', commitment_amount_usd_constant=348718518.0, coalesced_purpose_code=23030, coalesced_purpose_name='Power generation/renewable sources')

# Sql

To register the DataFrame as a temporary view:

In [43]:
# Register df under the name 'aid' so it can be referenced from SQL queries.
df.createOrReplaceTempView('aid')

In [44]:
# Query the 'aid' temp view with plain SQL.
result = spark.sql('select year from aid')   # the query result comes back as a DataFrame

In [45]:
# Print the first 3 rows of the query result.
result.show(3)

+----+
|year|
+----+
|1977|
|1977|
|1983|
+----+
only showing top 3 rows

