In [0]:
# Create SparkSession from builder
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[4]") \
                    .appName('SparkByExamples.com') \
                    .getOrCreate()

In [0]:
"""
master() – If you are running it on the cluster you need to use your master name as an argument to master(). usually, it would be either yarn or mesos depends on your cluster setup.

Use local[x] when running in Standalone mode. x should be an integer value and should be greater than 0; this represents how many partitions it should create when using RDD, DataFrame, and Dataset. Ideally, x value should be the number of CPU cores you have.

appName() – Used to set your application name.

getOrCreate() – This returns a SparkSession object if already exists, and creates a new one if not exist.
"""

Out[2]: '\nmaster() – If you are running it on the cluster you need to use your master name as an argument to master(). usually, it would be either yarn or mesos depends on your cluster setup.\n\nUse local[x] when running in Standalone mode. x should be an integer value and should be greater than 0; this represents how many partitions it should create when using RDD, DataFrame, and Dataset. Ideally, x value should be the number of CPU cores you have.\n\nappName() – Used to set your application name.\n\ngetOrCreate() – This returns a SparkSession object if already exists, and creates a new one if not exist.\n'

In [0]:
# Create new SparkSession
spark2 = SparkSession.newSession
print(spark2)

<function SparkSession.newSession at 0x7f7b6878b280>


In [0]:
# Get Existing SparkSession
spark3 = SparkSession.builder.getOrCreate
print(spark3)


<bound method SparkSession.Builder.getOrCreate of <pyspark.sql.session.SparkSession.Builder object at 0x7f7b688e5310>>


In [0]:
# Usage of config()
spark = SparkSession.builder \
      .master("local[1]") \
      .appName("SparkByExamples.com") \
      .config("spark.some.config.option", "config-value") \
      .getOrCreate()

In [0]:
# Enabling Hive to use in Spark
spark = SparkSession.builder \
      .master("local[1]") \
      .appName("SparkByExamples.com") \
      .config("spark.sql.warehouse.dir", "<path>/spark-warehouse") \
      .enableHiveSupport() \
      .getOrCreate()


In [0]:
# Set Config
spark.conf.set("spark.executor.memory", "5g")

# Get a Spark Config
partions = spark.conf.get("spark.sql.shuffle.partitions")
print(partions)


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
[0;32m<command-1064759925509123>[0m in [0;36m<module>[0;34m[0m
[1;32m      1[0m [0;31m# Set Config[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 2[0;31m [0mspark[0m[0;34m.[0m[0mconf[0m[0;34m.[0m[0mset[0m[0;34m([0m[0;34m"spark.executor.memory"[0m[0;34m,[0m [0;34m"5g"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      3[0m [0;34m[0m[0m
[1;32m      4[0m [0;31m# Get a Spark Config[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      5[0m [0mpartions[0m [0;34m=[0m [0mspark[0m[0;34m.[0m[0mconf[0m[0;34m.[0m[0mget[0m[0;34m([0m[0;34m"spark.sql.shuffle.partitions"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m/databricks/spark/python/pyspark/sql/conf.py[0m in [0;36mset[0;34m(self, key, value)[0m
[1;32m     34[0m     [0;32mdef[0m [0mset[0m[0;34

In [0]:
# Create DataFrame
df = spark.createDataFrame(
    [("Scala", 25000), ("Spark", 35000), ("PHP", 21000)])
df.show()


+-----+-----+
|   _1|   _2|
+-----+-----+
|Scala|25000|
|Spark|35000|
|  PHP|21000|
+-----+-----+



In [0]:
# Spark SQL
df.createOrReplaceTempView("sample_table")
df2 = spark.sql("SELECT _1,_2 FROM sample_table")
df2.show()


+-----+-----+
|   _1|   _2|
+-----+-----+
|Scala|25000|
|Spark|35000|
|  PHP|21000|
+-----+-----+



In [0]:
# Create Hive table & query it.  
spark.table("sample_table").write.saveAsTable("sample_hive_table")
df3 = spark.sql("SELECT _1,_2 FROM sample_hive_table")
df3.show()


+-----+-----+
|   _1|   _2|
+-----+-----+
|Scala|25000|
|Spark|35000|
|  PHP|21000|
+-----+-----+



In [0]:
# Get metadata from the Catalog
# List databases
dbs = spark.catalog.listDatabases()
print(dbs)

# Output
#[Database(name='default', description='default database', 
#locationUri='file:/Users/admin/.spyder-py3/spark-warehouse')]

# List Tables
tbls = spark.catalog.listTables()
print(tbls)


[Database(name='default', description='Default Hive database', locationUri='dbfs:/user/hive/warehouse')]
[Table(name='sample_hive_table', database='default', description=None, tableType='MANAGED', isTemporary=False), Table(name='sample_table', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]
