In [1]:


import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
# Build the session through the builder so that re-running this cell reuses the
# live session; constructing a second SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once".
spark = (SparkSession.builder
         .appName("yahoo_stock_price_raw_parsed_cleaned")
         .getOrCreate())
# Keep the `sc` name available for any cell that still expects the raw context.
sc = spark.sparkContext
# Pin the SQL session time zone to UTC.
spark.conf.set('spark.sql.session.timeZone', 'UTC')

In [2]:
from skainet_spark import Pipeline, transform, Input, Output, Metadata,ValidatedPipeline,assign_shortcuts,print_statistics
from pyspark.sql.types import StructField, StructType, StringType, LongType,DoubleType,ArrayType,FloatType, BooleanType,IntegerType
import pyspark.sql.functions as F
from pyspark.sql.functions import udf


# Yahoo Stock Price: Raw, Parsed, and Clean Stages

## Raw

In [3]:
@transform(
    spark,
    raw=Output('/yahoo_stock_price/raw/yahoo_price_raw.parquet'),
    metadata=Metadata('/yahoo_stock_price/raw/yahoo_price_metadata_raw.csv', spark),
)
def raw_yahoo_price(spark,raw,metadata):
    """Load the Yahoo price CSV files and write them out as the raw dataset.

    Every file matching ``input/yahoo_price/*.csv`` is read with a header row
    and the schema returned by ``metadata()``. The row/column counts, the
    schema and the first 10 rows are printed before the result is written to
    ``raw``.

    Args:
        spark: Session used to read the CSV files.
        raw: Output target handed to ``Pipeline.write``.
        metadata: Callable whose return value is used as the read schema.

    Note:
        Callers pass only ``spark``; ``raw`` and ``metadata`` are presumably
        injected by the ``@transform`` decorator -- confirm in skainet_spark.
    """
    csv_schema = metadata()
    csv_reader = (
        spark.read
        .format('csv')
        .option('header', 'true')
        .schema(csv_schema)
    )
    pipeline = Pipeline(csv_reader.load('input/yahoo_price/*.csv'))

    # Print a quick profile of what was loaded before persisting it.
    pipeline.show_dimensions()
    loaded = pipeline.dataframe
    loaded.printSchema()
    loaded.show(10)

    pipeline.write(raw)



In [4]:
# Run the raw stage. Only `spark` is passed here; `raw` and `metadata` are
# presumably supplied by the @transform decorator on the definition.
raw_yahoo_price(spark)

cols: 8 rows: 1081502
root
 |-- open_date: string (nullable = true)
 |-- open_price: string (nullable = true)
 |-- high_price: string (nullable = true)
 |-- low_price: string (nullable = true)
 |-- close_price: string (nullable = true)
 |-- adj_close_price: string (nullable = true)
 |-- volume: string (nullable = true)
 |-- ticker_symbol: string (nullable = true)

+----------+------------------+----------+------------------+------------------+------------------+--------+-------------+
| open_date|        open_price|high_price|         low_price|       close_price|   adj_close_price|  volume|ticker_symbol|
+----------+------------------+----------+------------------+------------------+------------------+--------+-------------+
|2010-07-14|        245.185181|247.162155|         243.47348|        245.915909|        245.915909| 6229700|        googl|
|2010-07-15|        246.111115|247.597595|        241.581589|        247.257263|        247.257263| 9706600|        googl|
|2010-07-16|      

## Parsed

In [5]:
@transform(spark,
    raw = Input('/yahoo_stock_price/raw/yahoo_price_raw.parquet',spark),
    parsed = Output('/yahoo_stock_price/parsed/yahoo_price_parsed.parquet')
)
def parsed_yahoo_price(spark,raw,parsed):
    """Type, normalise and filter the raw Yahoo prices, then write the parsed dataset.

    Steps, in order: parse ``open_date``, cast the price columns to double and
    ``volume`` to integer, upper-case the ticker, drop rows without an
    ``open_price``, add ``stock_id`` and keep only rows on or after the
    cutoff date used by ``restrict_dates``. The first 10 rows (untruncated)
    and the pipeline statistics are printed before the result is written to
    ``parsed``.

    Args:
        spark: Active Spark session (not used directly in the body).
        raw: Input wrapped in a ``Pipeline``; produced by the raw stage.
        parsed: Output target handed to ``Pipeline.write``.

    Note:
        Callers pass only ``spark``; ``raw`` and ``parsed`` are presumably
        injected by the ``@transform`` decorator -- confirm in skainet_spark.
    """
    double_type_cols = ['open_price', 'high_price', 'low_price', 'close_price', 'adj_close_price']
    # NOTE(review): IntegerType is 32-bit; a volume above 2,147,483,647 would
    # presumably not survive this cast -- confirm whether LongType is needed.
    int_type_cols = ['volume']

    pipe = (Pipeline(raw)
            .transform(parse_date)
            .transform(cast_cols_to_number, double_type_cols, DoubleType)
            .transform(cast_cols_to_number, int_type_cols, IntegerType)
            .transform(parse_ticker)
            .transform(drop_where_open_price_null)
            .transform(create_unique_id)
            .transform(restrict_dates)
           )

    print('Yahoo Price Parsed')
    # truncate=False prints full cell values (same effect as the former `0`).
    pipe.dataframe.show(10, truncate=False)
    print_statistics(pipe)
    pipe.write(parsed)

def restrict_dates(dataframe, start_date='2020-06-01'):
    """Keep only the rows whose ``open_date`` is on or after ``start_date``.

    Args:
        dataframe: DataFrame with an ``open_date`` column.
        start_date: Inclusive lower bound compared against ``open_date``.
            Defaults to ``'2020-06-01'``, the cutoff this notebook has always
            used, so existing single-argument callers are unaffected.

    Returns:
        The filtered DataFrame.
    """
    return dataframe.where(F.col('open_date') >= start_date)
def create_unique_id(dataframe):
    """Add a ``stock_id`` column joining ``open_date`` and ``ticker_symbol`` with ``-``.

    In the parsed output this yields values such as ``2020-06-01-GOOGL``.

    Args:
        dataframe: DataFrame with ``open_date`` and ``ticker_symbol`` columns.

    Returns:
        The DataFrame with the extra ``stock_id`` column.
    """
    key_parts = [F.col(name) for name in ('open_date', 'ticker_symbol')]
    return dataframe.withColumn('stock_id', F.concat_ws('-', *key_parts))

def drop_where_open_price_null(dataframe):
    """Discard every row whose ``open_price`` is null.

    Args:
        dataframe: DataFrame with an ``open_price`` column.

    Returns:
        The filtered DataFrame.
    """
    has_open_price = F.col('open_price').isNotNull()
    return dataframe.where(has_open_price)

def parse_ticker(dataframe):
    """Replace ``ticker_symbol`` with its upper-cased value.

    Args:
        dataframe: DataFrame with a ``ticker_symbol`` column.

    Returns:
        The DataFrame with ``ticker_symbol`` overwritten under the same name.
    """
    ticker = 'ticker_symbol'
    upper_cased = F.upper(F.col(ticker))
    return dataframe.withColumn(ticker, upper_cased)

def parse_date(dataframe):
    """Convert ``open_date`` to a date using the ``yyyy-MM-dd`` pattern.

    Args:
        dataframe: DataFrame with an ``open_date`` column.

    Returns:
        The DataFrame with ``open_date`` overwritten under the same name.
    """
    as_date = F.to_date(F.col('open_date'), 'yyyy-MM-dd')
    return dataframe.withColumn('open_date', as_date)

def cast_cols_to_number(dataframe,cols,data_type):
    """Cast each column in ``cols`` to ``data_type``, keeping the column names.

    Args:
        dataframe: DataFrame containing every column named in ``cols``.
        cols: Iterable of column names to cast.
        data_type: Target type. A callable (e.g. the ``DoubleType`` class, as
            this notebook passes it) is called with no arguments to obtain the
            type; anything else (a type instance or a type string such as
            ``'double'``) is handed to ``Column.cast`` unchanged.

    Returns:
        The DataFrame with the listed columns cast; unchanged if ``cols`` is
        empty.
    """
    # Resolve the target once instead of instantiating it for every column.
    target_type = data_type() if callable(data_type) else data_type
    for col in cols:
        dataframe = dataframe.withColumn(col, F.col(col).cast(target_type))
    return dataframe


In [6]:
# Run the parsed stage. Only `spark` is passed here; `raw` and `parsed` are
# presumably supplied by the @transform decorator on the definition.
parsed_yahoo_price(spark)

Yahoo Price Parsed
+----------+-----------+-----------+-----------+-----------+---------------+-------+-------------+----------------+
|open_date |open_price |high_price |low_price  |close_price|adj_close_price|volume |ticker_symbol|stock_id        |
+----------+-----------+-----------+-----------+-----------+---------------+-------+-------------+----------------+
|2020-06-01|1425.699951|1441.579956|1422.280029|1434.869995|1434.869995    |1258100|GOOGL        |2020-06-01-GOOGL|
|2020-06-02|1435.0     |1443.0     |1421.609985|1442.310059|1442.310059    |1172100|GOOGL        |2020-06-02-GOOGL|
|2020-06-03|1442.699951|1449.01001 |1431.619995|1439.25    |1439.25        |1386600|GOOGL        |2020-06-03-GOOGL|
|2020-06-04|1436.780029|1441.319946|1406.01001 |1414.300049|1414.300049    |1349100|GOOGL        |2020-06-04-GOOGL|
|2020-06-05|1415.640015|1446.300049|1407.619995|1440.02002 |1440.02002     |2132100|GOOGL        |2020-06-05-GOOGL|
|2020-06-08|1426.280029|1449.0     |1424.47998 |1448.

## Clean

In [7]:


@transform(
    spark,
    parsed=Input('/yahoo_stock_price/parsed/yahoo_price_parsed.parquet', spark),
    metadata=Metadata('/yahoo_stock_price/clean/yahoo_price_metadata_clean.csv', spark),
    clean=Output('/yahoo_stock_price/clean/yahoo_price_clean.parquet'),
    clean_exception=Output('/yahoo_stock_price/exception/yahoo_price_clean_exception.parquet'),
)
def yahoo_price_clean(spark, parsed, metadata, clean, clean_exception):
    """Validate the parsed Yahoo prices and write the clean and exception outputs.

    Prints the field names of the schema returned by ``metadata()``, then the
    dimensions, schema and first 10 untruncated rows of the parsed data. The
    data is then run through ``ValidatedPipeline.validate()`` and written to
    ``clean`` and ``clean_exception``.

    Args:
        spark: Active Spark session (not used directly in the body).
        parsed: Input wrapped in a ``Pipeline``; produced by the parsed stage.
        metadata: Callable returning the schema; also passed as-is to
            ``ValidatedPipeline``.
        clean: First output target handed to ``ValidatedPipeline.write``.
        clean_exception: Second output target handed to the same call.

    Note:
        Callers pass only ``spark``; the remaining arguments are presumably
        injected by the ``@transform`` decorator. How rows are split between
        the two outputs is decided inside ``ValidatedPipeline`` -- presumably
        passing rows go to ``clean`` and the rest to ``clean_exception``;
        confirm in skainet_spark.
    """
    expected_fields = metadata().fieldNames()
    print(expected_fields)

    pipe = Pipeline(parsed).show_dimensions()
    parsed_df = pipe.dataframe
    parsed_df.printSchema()
    parsed_df.show(10, 0)

    # NOTE(review): the `metadata` object itself is passed here, not the
    # schema it returns -- confirm that is what ValidatedPipeline expects.
    validated_pipe = ValidatedPipeline(pipe, metadata).validate()
    validated_pipe.write(clean, clean_exception)
    


In [8]:
# Run the clean stage. Only `spark` is passed here; the remaining arguments are
# presumably supplied by the @transform decorator on the definition.
yahoo_price_clean(spark)

['open_date', 'open_price', 'high_price', 'low_price', 'close_price', 'adj_close_price', 'volume', 'ticker_symbol', 'stock_id']
cols: 9 rows: 13641
root
 |-- open_date: date (nullable = true)
 |-- open_price: double (nullable = true)
 |-- high_price: double (nullable = true)
 |-- low_price: double (nullable = true)
 |-- close_price: double (nullable = true)
 |-- adj_close_price: double (nullable = true)
 |-- volume: integer (nullable = true)
 |-- ticker_symbol: string (nullable = true)
 |-- stock_id: string (nullable = true)

+----------+----------+----------+---------+-----------+---------------+-------+-------------+--------------+
|open_date |open_price|high_price|low_price|close_price|adj_close_price|volume |ticker_symbol|stock_id      |
+----------+----------+----------+---------+-----------+---------------+-------+-------------+--------------+
|2020-06-01|50.919998 |52.720001 |50.82    |52.220001  |51.741547      |2330900|PEG          |2020-06-01-PEG|
|2020-06-02|52.5      |52.61