In [1]:
from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder.getOrCreate()


22/11/01 10:02:28 WARN Utils: Your hostname, kevin resolves to a loopback address: 127.0.1.1; using 192.168.1.6 instead (on interface wlp0s20f3)
22/11/01 10:02:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/01 10:02:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
df = spark.read.format('json')\
                .load('/home/kevin/Desktop/Big-Data-with-Pyspark/data/flight-data/json/2015-summary.json')

df.printSchema()

[Stage 0:>                                                          (0 + 1) / 1]

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



                                                                                

# Schema
A schema defines the column names and types of a dataframe

In [3]:
spark.read.format('json').load('/home/kevin/Desktop/Big-Data-with-Pyspark/data/flight-data/json/2015-summary.json').schema

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', LongType(), True)])

A schema is a structType made up of a number of fields, StructFields, that have a name, type, a Boolean flag which specifies whether that column can contain missing or null values, and finally, users can optionally specify associated metadata with that column. 

In [4]:
from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
    StructField('DEST_COUNTRY_NAME', StringType(), True),
    StructField('ORIGIN_COUNTRY_NAME', StringType(), True),
    StructField('count', LongType(), False, metadata={'hello': 'world'})
])

df = spark.read.format('json')\
                    .schema(myManualSchema)\
                    .load('/home/kevin/Desktop/Big-Data-with-Pyspark/data/flight-data/json/2015-summary.json')

# Columns and Expressions

## Columns
There are a lot of ways to construct and refer to columns but the two simplest ways are by using the col or column functions

## Expressions
An expression is a set of transformations on one or more values in a record in a dataframe. Anm expression is crerated via the expr function

## Columns as expressions
Columns provide a subset of expression functionality. If you use col() and want to perform transformations on that column, you must perform those on that column reference. When using an expression, the expr function can actually parse transformations and column references from a string and can subsequently be passed into further transformations

In [5]:
df.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

# Records and Rows



## Creationg Rows

In [6]:
from pyspark.sql import Row

myRow = Row('Hello', None, 1, False)

# DataFrame Transformations

## Creating DataFrames


In [7]:
df = spark.read.format('json').load('/home/kevin/Desktop/Big-Data-with-Pyspark/data/flight-data/json/2015-summary.json')
df.createOrReplaceTempView('dfTable')

## select and selectexpr

In [8]:
df.select('DEST_COUNTRY_NAME').show(2)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [11]:

from pyspark.sql.functions import col, column, expr
df.select(expr('DEST_COUNTRY_NAME AS destination')).show(2)

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [13]:
df.selectExpr('DEST_COUNTRY_NAME AS destination', 'DEST_COUNTRY_NAME').show(2)

+-------------+-----------------+
|  destination|DEST_COUNTRY_NAME|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows



## Converting to Spark Types(Literals)

In [15]:
from pyspark.sql.functions import lit

df.select(expr('*'), lit(1).alias('One')).show(2)

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|One|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|  1|
|    United States|            Croatia|    1|  1|
+-----------------+-------------------+-----+---+
only showing top 2 rows



## Adding Columns

In [16]:
df.withColumn('numberOne', lit(1)).show(2)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows



Let us set a boolean flag for when the origin country is the same as the destination country

In [18]:
df.withColumn('withinCountry', expr('ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME')).show(5)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
|    United States|            Ireland|  344|        false|
|            Egypt|      United States|   15|        false|
|    United States|              India|   62|        false|
+-----------------+-------------------+-----+-------------+
only showing top 5 rows



## Renaming Columns

In [19]:
df.withColumnRenamed('DEST_COUNTRY_NAME', 'dest').columns

['dest', 'ORIGIN_COUNTRY_NAME', 'count']

## Removing Columns

In [21]:
df.drop('ORIGIN_COUNTRY_NAME').columns

['DEST_COUNTRY_NAME', 'count']

## Changing a Column's Type(Cast)

In [22]:
df.withColumn('count2', col('count').cast('long'))

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint, count2: bigint]

## Filtering Rows

In [23]:
df.filter(col('count') < 2).show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [25]:
df.where('count < 2').show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [26]:
df.where(col('count') <10).where(col('ORIGIN_COUNTRY_NAME') != 'Croatia').show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



## Getting Unique Rows

In [28]:
df.select('ORIGIN_COUNTRY_NAME', 'DEST_COUNTRY_NAME').distinct().count()

256

## Random Samples


In [29]:
seed = 5
withReplacement = False
fraction = 0.5
df.sample(withReplacement, fraction, seed).count()

138

## Concatenating and Appending Rows(Union)

In [34]:
from pyspark.sql import Row

schema = df.schema

newRows = [
    Row('new Country 1', 'Other Country', 20),
    Row('New Country 2', 'Other Country 1', 12)
]

parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)


df.union(newDF)\
    .where('count = 1')\
    .where(col('ORIGIN_COUNTRY_NAME') != 'United states')\
    .show(4)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
|            Malta|      United States|    1|
+-----------------+-------------------+-----+
only showing top 4 rows



## Sorting Rows

In [35]:
df.sort('count').show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
+--------------------+-------------------+-----+
only showing top 5 rows



In [36]:
df.orderBy('count', 'DEST_COUNTRY_NAME').show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
|           Cyprus|      United States|    1|
|         Djibouti|      United States|    1|
|        Indonesia|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [39]:
from pyspark.sql.functions import desc, asc
df.orderBy(expr("count asc")).show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
+--------------------+-------------------+-----+
only showing top 5 rows



In [40]:
df.orderBy(col('count').desc(), col('DEST_COUNTRY_NAME').asc()).show(5)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
|    United States|             Mexico|  7187|
|           Mexico|      United States|  7140|
+-----------------+-------------------+------+
only showing top 5 rows



In [41]:
spark.stop()