# Spark Tutorial Notebook

### Setup

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import (StructField,
                               StructType,
                               StringType,
                               IntegerType,
                               FloatType,
                               DateType)

In [2]:
# Create the Spark session named "Basics", or reuse one that is already
# running; every later cell goes through this `spark` handle.
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

Exception: Java gateway process exited before sending its port number

### Build Schema and Read Data

In [None]:
# Explicit schema for the stock-price CSV: one nullable field per column,
# listed in the order the columns are expected to appear in the file.
# NOTE(review): `volume` is read as a 32-bit float -- confirm that precision
# is sufficient for the values in the file.
data_schema = [
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("ticker", StringType()),
        ("open", FloatType()),
        ("close", FloatType()),
        ("adj_close", FloatType()),
        ("low", FloatType()),
        ("high", FloatType()),
        ("volume", FloatType()),
        ("date", DateType()),
    )
]

final_struct = StructType(data_schema)

# Load the CSV with the schema above instead of letting Spark infer one.
# NOTE(review): no `header` option is passed, so the first line of the file is
# treated as data -- confirm the CSV has no header row.
data_frame = spark.read.csv(
    path="historical_stock_prices.csv",
    schema=final_struct,
)
data_frame.printSchema()

In [None]:
# Give the ticker and price columns more descriptive names.
data_frame = (
    data_frame
    .withColumnRenamed("ticker", "symbol")
    .withColumnRenamed("open", "opening_price")
    .withColumnRenamed("close", "closing_price")
    .withColumnRenamed("low", "lowest_price")
    .withColumnRenamed("high", "highest_price")
)

# Add a derived column: `volume` divided by one million.
data_frame = data_frame.withColumn(
    "volume_in_millions",
    data_frame["volume"] / 1000000,
)

# Show the first two rows as a quick check of the transformed frame.
data_frame.head(2)

### Spark SQL Example

In [None]:
# Register the frame as the temporary view "stocks" so it can be queried
# with Spark SQL; an existing view of the same name is replaced.
data_frame.createOrReplaceTempView(name="stocks")

In [None]:
# Query the "stocks" view for the rows whose symbol is MSFT and print them.
results = spark.sql(
    sqlQuery="SELECT * FROM stocks WHERE symbol='MSFT'"
)
results.show()

In [None]:
# Append the stock data to the `stocks` table of a PostgreSQL database over
# JDBC. The PostgreSQL JDBC driver (org.postgresql.Driver) must be available
# on Spark's classpath for this cell to run.

# host:port of the PostgreSQL server.
# NOTE(review): 0.0.0.0 is a wildcard address; confirm it is reachable as a
# destination on your platform, or replace it with the server's real host.
database_url = "0.0.0.0:5432"

data_frame.write.format("jdbc").options(
    # The PostgreSQL driver only accepts URLs of the form
    # jdbc:postgresql://host:port/database. The original "jdbc:" + host:port
    # lacked the subprotocol and database, so no connection could be opened.
    url="jdbc:postgresql://" + database_url + "/" + os.environ.get("PGDATABASE", "postgres"),
    driver="org.postgresql.Driver",
    dbtable="stocks",
    # Credentials come from the environment rather than being hardcoded; the
    # defaults match the values this notebook used before.
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", ""),
).mode("append").save()