In [1]:
import sys
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions

from pyspark.sql.types import StringType, StructType, StructField

In [2]:
# Wrap the active SparkContext (getOrCreate reuses a running one, or starts one)
# in a GlueContext.
gc = GlueContext(SparkContext.getOrCreate())
# Set spark.sql.ansi.enabled to "true" before any SQL is run in this notebook.
gc.setConf("spark.sql.ansi.enabled","true")
# SparkSession exposed by the GlueContext; the cells below use it as `spark`.
spark = gc.spark_session

In [3]:
# $example on:programmatic_schema$
# Grab the SparkContext behind the session; the RDD-based file loading further
# down calls sc.textFile on it.
sc = spark.sparkContext
# Bare last expression so the notebook displays the context.
sc

In [4]:
%%bash
# Print the sample people.json file from the examples resources directory.
# NOTE(review): the following cell loads people.txt (comma-separated text), not
# this JSON file -- confirm which file was meant to be shown here.
cat ../examples/src/main/resources/people.json

{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}


In [5]:
# Read people.txt as an RDD of raw text lines (one element per line).
lines = sc.textFile("../examples/src/main/resources/people.txt")

# Split each non-blank line on commas. Blank or whitespace-only lines are
# skipped because they split into a single field, so p[1] below would raise
# IndexError inside a Spark task -- and, since RDD transformations are lazy,
# only once results.show() finally runs the job.
parts = lines.filter(lambda line: line.strip()).map(lambda line: line.split(","))

# Each record becomes a (name, age) tuple; the age field is stripped of
# surrounding whitespace, the name field is kept exactly as read.
people = parts.map(lambda p: (p[0], p[1].strip()))

# The schema is encoded in a string: one column name per whitespace-separated token.
schemaString = "name age"

# Every column is declared as a nullable string, so `age` is stored as text.
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)

# Apply the schema to the RDD.
schemaPeople = spark.createDataFrame(people, schema)

# Register the DataFrame as the temporary view "people" (replacing any existing
# view of that name) so it can be queried with SQL.
schemaPeople.createOrReplaceTempView("people")

# SQL can be run over DataFrames that have been registered as a table.
results = spark.sql("SELECT name FROM people")

results.show()
# +-------+
# |   name|
# +-------+
# |Michael|
# |   Andy|
# | Justin|
# +-------+
# $example off:programmatic_schema$


+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+

