In [1]:


import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session for this notebook. getOrCreate() makes the
# cell safe to re-run in a live kernel, whereas constructing a second
# SparkContext by hand raises an error while one is already active.
spark = SparkSession.builder.appName("news_price_ddl").getOrCreate()
# Keep the SparkContext handle available under its original name.
sc = spark.sparkContext
from pyspark.sql.types import StructField, StructType, StringType, LongType,DoubleType,ArrayType,FloatType, BooleanType
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
import numpy as np

In [2]:
from skainet_spark import Pipeline, transform, Input, Output, Metadata,ValidatedPipeline,print_statistics

In [3]:
def remove_null_records(dataframe):
    """Return ``dataframe`` with its null-containing records dropped.

    Some records legitimately lack a value (for example, UK/Swiss entries
    have no state), so such rows are removed before export.

    :param dataframe: object exposing ``.na.drop()`` (a Spark DataFrame here).
    :return: the result of ``dataframe.na.drop()``.
    """
    null_functions = dataframe.na
    return null_functions.drop()

# Nodes
## Publisher

In [4]:
@transform(spark,
    news_ontology = Input('/news/ontology/news_ontology.parquet', spark),
    metdata = Metadata('/news/ddl/publisher_metadata_ddl.csv', spark),
    ddl_csv = Output('/news/ddl/publisher_ddl.csv'),
    ddl_csv_exception = Output('/news/exception/publisher_ddl_exception.csv')
)
def publisher_ddl(spark,news_ontology, metdata, ddl_csv, ddl_csv_exception):
    """Build the publisher node table from the news ontology.

    The ``publisher`` column is renamed to ``publisher_id``, projected on its
    own, de-duplicated, and passed through ``remove_null_records``. The result
    is validated against the schema returned by ``metdata()`` and handed to
    ``write_csv`` together with both output targets.
    """
    node_columns = ['publisher_id']

    renamed = Pipeline(news_ontology).rename_columns({'publisher': 'publisher_id'})
    publishers = (renamed
                  .select(node_columns)
                  .distinct()
                  .transform(remove_null_records))

    # Diagnostics: dimensions, schema and a five-row preview.
    publishers.show_dimensions()
    publishers.dataframe.printSchema()
    publishers.dataframe.show(5)

    # presumably valid rows go to ddl_csv and rejected rows to ddl_csv_exception -- confirm in skainet_spark
    checked = ValidatedPipeline(publishers, metdata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [5]:
# Run the publisher node export; only `spark` is passed here, the remaining
# arguments are declared in the function's @transform decorator.
publisher_ddl(spark)

cols: 1 rows: 77
root
 |-- publisher_id: string (nullable = true)

+--------------------+
|        publisher_id|
+--------------------+
|Markets Insider A...|
|Ryan Ermey Associ...|
|           Kiplinger|
|       MediaPost.com|
|           Schaeffer|
+--------------------+
only showing top 5 rows

Validated count: 77
Exception count: 0


## News

In [6]:
@transform(spark,
    news_ontology = Input('/news/ontology/news_ontology.parquet', spark),
    metdata = Metadata('/news/ddl/news_metadata_ddl.csv', spark),
    ddl_csv = Output('/news/ddl/news_ddl.csv'),
    ddl_csv_exception = Output('/news/exception/news_ddl_exception.csv')
)
def news_ddl(spark,news_ontology, metdata, ddl_csv, ddl_csv_exception):
    """Build the news node table from the news ontology.

    Projects ``news_id``, ``title`` and ``content``, de-duplicates the
    projected rows, validates them against the schema returned by
    ``metdata()`` and hands the result to ``write_csv`` with both outputs.
    """
    node_columns = ['news_id', 'title', 'content']

    # NOTE(review): distinct() compares the full (news_id, title, content)
    # tuple, so news_id on its own may still repeat -- confirm that is
    # acceptable for a node table.
    articles = Pipeline(news_ontology).select(node_columns).distinct()

    # Diagnostics: dimensions, schema and five untruncated news_id values.
    articles.show_dimensions()
    articles.dataframe.printSchema()
    articles.dataframe.select('news_id').show(5, 0)

    checked = ValidatedPipeline(articles, metdata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [7]:
# Run the news node export; only `spark` is passed here, the remaining
# arguments are declared in the function's @transform decorator.
news_ddl(spark)

cols: 3 rows: 2767
root
 |-- news_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|news_id                                                                                                                                                                                                                 |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|New England Journal of Medicine Publishes Two Positive Phase 3 Trials Showing DUPIXENT® (dupilumab) Improved Moderate-to-Severe Asthma-2018-05-21-PR Newswire                                             

## Date

In [8]:
@transform(spark,
    news_ontology = Input('/news/ontology/news_ontology.parquet', spark),
    stock_clean = Input('/stock_price/ontology/stock_price_ontology.parquet',spark),
    metdata = Metadata('/news/ddl/date_metadata_ddl.csv',spark),
    ddl_csv = Output('/news/ddl/date_ddl.csv'),
    ddl_csv_exception = Output('/news/exception/date_ddl_exception.csv')
)
def date_ddl(spark,news_ontology,stock_clean, metdata, ddl_csv, ddl_csv_exception):
    """Build the date node table from news publication and stock open dates.

    ``published_date`` (news) and ``open_date`` (stock) are each renamed to
    ``date_id`` and de-duplicated, then combined into one de-duplicated,
    ascending list of dates. The result is validated against the schema
    returned by ``metdata()`` and handed to ``write_csv`` with both outputs.
    """
    pipe_stock = Pipeline(stock_clean)
    pipe_news = Pipeline(news_ontology)

    pipe_news = (pipe_news
                 .rename_columns({
                     "published_date": 'date_id',
                 })
                 .select([
                     'date_id',
                 ])
                 .distinct()
    )
    pipe_stock = (pipe_stock
                  .rename_columns({
                      "open_date": 'date_id',
                  })
                  .select([
                      'date_id',
                  ])
                  .distinct()
    )

    # DataFrame.union keeps duplicates (UNION ALL semantics), so a date that
    # occurs in both sources would be emitted twice; distinct() restores one
    # row per date_id before sorting.
    df_union = (pipe_stock.dataframe
                .union(pipe_news.dataframe)
                .distinct()
                .orderBy(F.asc('date_id')))

    pipe_union = Pipeline(df_union)
    print_statistics(pipe_union)

    pipe_union.dataframe.show(5)

    schema = metdata()
    validated_pipe = ValidatedPipeline(pipe_union, schema).validate()

    validated_pipe.write_csv(ddl_csv,ddl_csv_exception)
    
    

In [9]:
# Run the date node export; only `spark` is passed here, the remaining
# arguments are declared in the function's @transform decorator.
date_ddl(spark)

root
 |-- date_id: date (nullable = true)

cols: 1 rows: 1020
+----------+
|   date_id|
+----------+
|2005-10-06|
|2005-10-27|
|2005-11-16|
|2006-01-23|
|2006-03-08|
+----------+
only showing top 5 rows

Validated count: 1020
Exception count: 0


## Stock

In [10]:
@transform(spark,
    stock_clean = Input('/stock_price/ontology/stock_price_ontology.parquet', spark),
    metdata = Metadata('/stock_price/ddl/stock_metadata_ddl.csv', spark),
    ddl_csv = Output('/stock_price/ddl/stock_ddl.csv'),
    ddl_csv_exception = Output('/stock_price/exception/stock_ddl_exception.csv')
)
def stock_ddl(spark,stock_clean, metdata, ddl_csv, ddl_csv_exception):
    """Build the stock node table from the stock price ontology.

    Projects ``stock_id`` together with the price and volume columns,
    de-duplicates the rows, validates them against the schema returned by
    ``metdata()`` and hands the result to ``write_csv`` with both outputs.
    """
    node_columns = [
        'stock_id',
        'open_price',
        'high_price',
        'low_price',
        'close_price',
        'adj_close_price',
        'volume',
    ]

    quotes = Pipeline(stock_clean).select(node_columns).distinct()

    # Diagnostics: five-row preview followed by the pipeline statistics.
    quotes.dataframe.show(5)
    print_statistics(quotes)

    checked = ValidatedPipeline(quotes, metdata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [11]:
# Run the stock node export; only `spark` is passed here, the remaining
# arguments are declared in the function's @transform decorator.
stock_ddl(spark)

+--------------+----------+----------+---------+-----------+---------------+--------+
|      stock_id|open_price|high_price|low_price|close_price|adj_close_price|  volume|
+--------------+----------+----------+---------+-----------+---------------+--------+
|2020-06-26-BKR|     14.87|     14.94|    14.49|       14.7|           14.7| 6852900|
| 2020-06-25-VZ|     53.91| 54.389999|    53.34|  54.279999|      53.677212|17122500|
|2020-06-04-HIG| 41.849998| 44.290001|    41.57|  44.279999|      44.279999| 3732800|
|2020-07-02-RHI| 52.220001| 53.189999|51.639999|  51.720001|      51.720001|  815000|
| 2020-07-08-LB|     15.09|     15.69|    14.86|      15.67|          15.67| 3990900|
+--------------+----------+----------+---------+-----------+---------------+--------+
only showing top 5 rows

root
 |-- stock_id: string (nullable = true)
 |-- open_price: double (nullable = true)
 |-- high_price: double (nullable = true)
 |-- low_price: double (nullable = true)
 |-- close_price: double (nulla

# Relations
## stock_IS_VALUED_FOR_company


In [12]:
@transform(spark,
    
    stock_clean = Input('/stock_price/ontology/stock_price_ontology.parquet',spark),
    metdata = Metadata('/stock_price/ddl/stock_IS_VALUED_FOR_company_metadata_ddl.csv',spark),
    ddl_csv = Output('/stock_price/ddl/stock_IS_VALUED_FOR_company_ddl.csv'),
    # Exceptions belong under /exception/, matching every other transform in
    # this notebook; the previous path wrote them next to the valid DDL output.
    ddl_csv_exception = Output('/stock_price/exception/stock_IS_VALUED_FOR_company_ddl_exception.csv')
)
def stock_IS_VALUED_FOR_company_ddl(spark,stock_clean, metdata, ddl_csv, ddl_csv_exception):
    """Build the stock -> company relation table from the stock price ontology.

    ``ticker_symbol`` is renamed to ``company_id``; the distinct
    (``stock_id``, ``company_id``) pairs are validated against the schema
    returned by ``metdata()`` and handed to ``write_csv`` with both outputs.
    """
    pipe = Pipeline(stock_clean)

    pipe = (pipe
             .rename_columns({
                 "ticker_symbol":'company_id',
             })
             .select([
                 'stock_id',
                 'company_id',
             ])
             .distinct()
    )

    # Diagnostics: five-row preview followed by the pipeline statistics.
    pipe.dataframe.show(5)
    print_statistics(pipe)

    schema = metdata()
    validated_pipe = ValidatedPipeline(pipe, schema).validate()

    validated_pipe.write_csv(ddl_csv,ddl_csv_exception)
    
    

In [13]:
# Run the stock -> company relation export; only `spark` is passed here, the
# remaining arguments are declared in the function's @transform decorator.
stock_IS_VALUED_FOR_company_ddl(spark)

+---------------+----------+
|       stock_id|company_id|
+---------------+----------+
|2020-06-12-DISH|      DISH|
|  2020-06-02-MO|        MO|
|   2020-07-02-T|         T|
| 2020-07-09-PGR|       PGR|
| 2020-07-13-PFG|       PFG|
+---------------+----------+
only showing top 5 rows

root
 |-- stock_id: string (nullable = true)
 |-- company_id: string (nullable = true)

cols: 2 rows: 13641
Validated count: 13641
Exception count: 0


## publisher_PUBLISHES_news

In [14]:
@transform(spark,
    news_ontology = Input('/news/ontology/news_ontology.parquet', spark),
    metdata = Metadata('/news/ddl/publisher_PUBLISHES_news_metadata_ddl.csv', spark),
    ddl_csv = Output('/news/ddl/publisher_PUBLISHES_news_ddl.csv'),
    ddl_csv_exception = Output('/news/exception/publisher_PUBLISHES_news_ddl_exception.csv')
)
def publisher_PUBLISHES_news_ddl(spark,news_ontology, metdata, ddl_csv, ddl_csv_exception):
    """Build the publisher -> news relation table from the news ontology.

    ``publisher`` is renamed to ``publisher_id``; the distinct
    (``publisher_id``, ``news_id``) pairs are validated against the schema
    returned by ``metdata()`` and handed to ``write_csv`` with both outputs.
    """
    source = Pipeline(news_ontology)
    # Show the incoming schema before any column is renamed.
    source.dataframe.printSchema()

    relation_columns = ['publisher_id', 'news_id']
    renamed = source.rename_columns({'publisher': 'publisher_id'})
    relation = renamed.select(relation_columns).distinct()

    # Diagnostics: dimensions, resulting schema and a five-row preview.
    relation.show_dimensions()
    relation.dataframe.printSchema()
    relation.dataframe.show(5)

    checked = ValidatedPipeline(relation, metdata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [15]:
# Run the publisher -> news relation export; only `spark` is passed here, the
# remaining arguments are declared in the function's @transform decorator.
publisher_PUBLISHES_news_ddl(spark)

root
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- published_date: date (nullable = true)
 |-- publisher: string (nullable = true)
 |-- mentioned_tickers: string (nullable = true)
 |-- news_id: string (nullable = true)

cols: 2 rows: 2756
root
 |-- publisher_id: string (nullable = true)
 |-- news_id: string (nullable = true)

+--------------------+--------------------+
|        publisher_id|             news_id|
+--------------------+--------------------+
|             Reuters|Note 7 fiasco cou...|
|       GlobeNewswire|Verizon to redeem...|
|         PR Newswire|John C Lukegord's...|
|MT Newswires MTNe...|Stock Futures Def...|
|     The Motley Fool|What These 7 Indu...|
+--------------------+--------------------+
only showing top 5 rows

Validated count: 2756
Exception count: 0


## news_CONCERNS_company

In [16]:
@transform(spark,
    news_ontology = Input('/news/ontology/news_ontology.parquet', spark),
    metdata = Metadata('/news/ddl/news_CONCERNS_company_metadata_ddl.csv', spark),
    ddl_csv = Output('/news/ddl/news_CONCERNS_company_ddl.csv'),
    ddl_csv_exception = Output('/news/exception/news_CONCERNS_company_ddl_exception.csv')
)
def news_CONCERNS_company_ddl(spark,news_ontology, metdata, ddl_csv, ddl_csv_exception):
    """Build the news -> company relation table from the news ontology.

    ``mentioned_tickers`` is renamed to ``company_id``; the distinct
    (``news_id``, ``company_id``) pairs are validated against the schema
    returned by ``metdata()`` and handed to ``write_csv`` with both outputs.
    """
    source = Pipeline(news_ontology)
    # Show the incoming schema before any column is renamed.
    source.dataframe.printSchema()

    relation_columns = ['news_id', 'company_id']
    renamed = source.rename_columns({'mentioned_tickers': 'company_id'})
    relation = renamed.select(relation_columns).distinct()

    # Diagnostics: dimensions, resulting schema and a five-row preview.
    relation.show_dimensions()
    relation.dataframe.printSchema()
    relation.dataframe.show(5)

    checked = ValidatedPipeline(relation, metdata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [17]:
# Run the news -> company relation export; only `spark` is passed here, the
# remaining arguments are declared in the function's @transform decorator.
news_CONCERNS_company_ddl(spark)

root
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- published_date: date (nullable = true)
 |-- publisher: string (nullable = true)
 |-- mentioned_tickers: string (nullable = true)
 |-- news_id: string (nullable = true)

cols: 2 rows: 5841
root
 |-- news_id: string (nullable = true)
 |-- company_id: string (nullable = true)

+--------------------+----------+
|             news_id|company_id|
+--------------------+----------+
|What Is a Dividen...|        ED|
|Why the Markets A...|       LUV|
|How Diversificati...|       CVX|
|Nasdaq Sees Best ...|      INTC|
|Verizon ready to ...|        VZ|
+--------------------+----------+
only showing top 5 rows

Validated count: 5841
Exception count: 0


## news_IS_ISSUED_ON_date

In [18]:
@transform(spark,
    news_ontology = Input('/news/ontology/news_ontology.parquet', spark),
    metdata = Metadata('/news/ddl/news_IS_ISSUED_ON_date_metadata_ddl.csv', spark),
    ddl_csv = Output('/news/ddl/news_IS_ISSUED_ON_date_ddl.csv'),
    ddl_csv_exception = Output('/news/exception/news_IS_ISSUED_ON_date_ddl_exception.csv')
)
def news_IS_ISSUED_ON_date_ddl(spark,news_ontology, metdata, ddl_csv, ddl_csv_exception):
    """Build the news -> date relation table from the news ontology.

    ``published_date`` is renamed to ``date_id``; the distinct
    (``news_id``, ``date_id``) pairs are validated against the schema
    returned by ``metdata()`` and handed to ``write_csv`` with both outputs.
    """
    source = Pipeline(news_ontology)
    # Show the incoming schema before any column is renamed.
    source.dataframe.printSchema()

    relation_columns = ['news_id', 'date_id']
    renamed = source.rename_columns({'published_date': 'date_id'})
    relation = renamed.select(relation_columns).distinct()

    # Diagnostics: dimensions, resulting schema and a five-row preview.
    relation.show_dimensions()
    relation.dataframe.printSchema()
    relation.dataframe.show(5)

    checked = ValidatedPipeline(relation, metdata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [19]:
# Run the news -> date relation export; only `spark` is passed here, the
# remaining arguments are declared in the function's @transform decorator.
news_IS_ISSUED_ON_date_ddl(spark)

root
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- published_date: date (nullable = true)
 |-- publisher: string (nullable = true)
 |-- mentioned_tickers: string (nullable = true)
 |-- news_id: string (nullable = true)

cols: 2 rows: 2756
root
 |-- news_id: string (nullable = true)
 |-- date_id: date (nullable = true)

+--------------------+----------+
|             news_id|   date_id|
+--------------------+----------+
|10 Small-Cap Stoc...|2020-05-04|
|The Zacks Analyst...|2016-09-19|
|Verizon is buildi...|2018-09-06|
|Yahoo Mail reimag...|2019-09-23|
|Verizon is taking...|2017-03-31|
+--------------------+----------+
only showing top 5 rows

Validated count: 2756
Exception count: 0


## stock_IS_VALUED_ON_date

In [20]:
@transform(spark,
    stock_clean = Input('/stock_price/ontology/stock_price_ontology.parquet', spark),
    metdata = Metadata('/stock_price/ddl/stock_IS_VALUED_ON_date_metadata_ddl.csv', spark),
    ddl_csv = Output('/stock_price/ddl/stock_IS_VALUED_ON_date_ddl.csv'),
    ddl_csv_exception = Output('/stock_price/exception/stock_IS_VALUED_ON_date_ddl_exception.csv')
)
def stock_IS_VALUED_ON_date_ddl(spark,stock_clean, metdata, ddl_csv, ddl_csv_exception):
    """Build the stock -> date relation table from the stock price ontology.

    ``open_date`` is renamed to ``date_id``; the distinct
    (``stock_id``, ``date_id``) pairs are validated against the schema
    returned by ``metdata()`` and handed to ``write_csv`` with both outputs.
    """
    relation_columns = ['stock_id', 'date_id']

    renamed = Pipeline(stock_clean).rename_columns({'open_date': 'date_id'})
    relation = renamed.select(relation_columns).distinct()

    # Diagnostics: five-row preview followed by the pipeline statistics.
    relation.dataframe.show(5)
    print_statistics(relation)

    checked = ValidatedPipeline(relation, metdata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [21]:
# Run the stock -> date relation export; only `spark` is passed here, the
# remaining arguments are declared in the function's @transform decorator.
stock_IS_VALUED_ON_date_ddl(spark)

+---------------+----------+
|       stock_id|   date_id|
+---------------+----------+
|2020-06-08-CSCO|2020-06-08|
| 2020-06-19-HIG|2020-06-19|
|2020-06-01-FLIR|2020-06-01|
|  2020-06-03-LB|2020-06-03|
| 2020-06-10-KEY|2020-06-10|
+---------------+----------+
only showing top 5 rows

root
 |-- stock_id: string (nullable = true)
 |-- date_id: date (nullable = true)

cols: 2 rows: 13641
Validated count: 13641
Exception count: 0
