In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [3]:
# Build (or reuse) the SparkSession, running Spark locally with as many worker
# threads as there are logical cores on this machine ("local[*]").
# getOrCreate() returns the existing SparkSession when there is one, applying
# the options set on this builder to it; otherwise it creates a new session
# from these options.

spark = (
    SparkSession.builder
    .appName("application one")
    .config("spark.master", "local[*]")
    .getOrCreate()
)

In [4]:
# File location and type.
# The CSV path can be overridden with the SUPERSTORE_CSV environment variable
# so the notebook is not tied to one machine's directory layout; when the
# variable is unset or empty, the original path is used unchanged.
_DEFAULT_FILE_LOCATION = r"C:/Users/user/Desktop/SampleSuperstore.csv"
file_location = os.environ.get("SUPERSTORE_CSV") or _DEFAULT_FILE_LOCATION
file_type = "csv"

# CSV options (passed to the reader as strings)
infer_schema = "true"         # let Spark infer column types from the data
first_row_is_header = "true"  # first line of the file holds the column names
delimiter = ","               # field separator

In [5]:
# Load the file through the generic reader. The options set here are CSV
# options; for other file types they will be ignored.
df = (
    spark.read
    .format(file_type)
    .options(
        inferSchema=infer_schema,
        header=first_row_is_header,
        sep=delimiter,
    )
    .load(file_location)
)

In [None]:
# Print the shipping and location columns to the console.
df.select(
    "Ship Mode",
    "Segment",
    "Country",
    "City",
    "State",
    "Postal Code",
).show()

In [None]:
# Convert the Spark DataFrame to a pandas DataFrame, then print the whole
# frame followed by its head and its tail (one per line group).
# NOTE(review): toPandas() brings every row into driver memory — presumably
# fine for this sample file; confirm before pointing at larger inputs.
pd_df = df.toPandas()
print(pd_df, pd_df.head(), pd_df.tail(), sep="\n")

In [13]:
# Register df as the temporary view "sample_super" so it can be queried
# by name through spark.sql().
df.createOrReplaceTempView(name="sample_super")

In [19]:
# Total Sales per City, computed with Spark SQL over the "sample_super" view.
dfdf = spark.sql(
    "select City, SUM(Sales) as Sales\n"
    " from sample_super group by City "
)

In [20]:
# Display the per-city totals; n=20 and truncate=True are show()'s defaults,
# written out explicitly.
dfdf.show(n=20, truncate=True)

+---------------+------------------+
|           City|             Sales|
+---------------+------------------+
|          Tyler|           347.206|
|    Springfield|43054.342000000004|
|        Edmonds|2523.6920000000005|
|          Tempe|          1070.302|
|  Bowling Green|          2077.375|
|          Pasco|          2201.112|
|         Auburn|          3155.168|
|North Las Vegas| 9801.001999999999|
|       Thornton|           765.248|
|       Palatine|           116.312|
|        Phoenix|         11000.257|
|     Plainfield|           4526.85|
|  Lake Elsinore|            283.92|
|     Georgetown|1786.4200000000003|
|      Bethlehem|          1689.634|
|         Wilson|368.73199999999997|
|      Hollywood|          1070.474|
|         Monroe|2970.4339999999997|
|       Woodland|264.66200000000003|
| Pembroke Pines|         1714.3755|
+---------------+------------------+
only showing top 20 rows

