In [1]:


import findspark
# Keep this call ahead of the pyspark import: findspark makes the local Spark
# installation importable.
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# getOrCreate() reuses an already-running SparkContext, so re-running this cell
# in a live kernel no longer fails with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName("nasdaq_news_raw_parsed_cleaned")
)
spark = SparkSession(sc)
# All session-level timestamp handling in this notebook is done in UTC.
spark.conf.set('spark.sql.session.timeZone', 'UTC')

In [2]:
from skainet_spark import Pipeline, transform, Input, Output, Metadata,ValidatedPipeline,assign_shortcuts,print_statistics
from pyspark.sql.types import StructField, StructType, StringType, LongType,DoubleType,ArrayType,FloatType, BooleanType,IntegerType
import pyspark.sql.functions as F
from pyspark.sql.functions import udf


## Nasdaq News Raw

In [3]:
@transform(spark,
    raw = Output('/nasdaq_news/raw/nasdaq_news_raw.parquet'),
    metadata = Metadata('/nasdaq_news/raw/nasdaq_news_metadata_raw.csv', spark)
)
def raw_nasdaq_news(spark,raw,metadata):
    """Load the Nasdaq news JSON files and persist them as the raw dataset.

    The read schema is obtained by calling ``metadata``. The loaded frame is
    wrapped in a ``Pipeline``; its dimensions and first 10 rows are displayed
    and it is then written to ``raw``.
    """
    news_schema = metadata()
    json_reader = spark.read.format('json').schema(news_schema)
    news_raw = json_reader.load('input/nasdaq_news/*.json')

    pipeline = Pipeline(news_raw)
    pipeline.show_dimensions()
    pipeline.dataframe.show(10)
    pipeline.write(raw)



In [4]:
# Run the raw stage: reads input/nasdaq_news/*.json and writes the raw output.
# Only `spark` is passed here; the remaining arguments are presumably supplied
# by the @transform decorator -- confirm against skainet_spark.
raw_nasdaq_news(spark)

cols: 5 rows: 1668
+--------------------+--------------------+--------------------+--------------------+--------------------+
|               title|             content|         contributor|      published_date|   mentioned_tickers|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|
3M Co (MMM) Q1 2...|




Image source...|
Publisher


The ...|2019-04-25T15:00:...|             ["MMM"]|
|
3M Co (MMM) Q2 2...|




Image source...|
Publisher


The ...|2019-07-25T19:23:...|             ["MMM"]|
|
3M (MMM) Q3 2017...|



Image source:...|
Publisher


The ...|2017-10-25T12:54:...|             ["MMM"]|
|
3M (MMM) Q4 2018...|



Image source:...|
Publisher


The ...|2019-01-29T09:52:...|             ["MMM"]|
|
3M Co (MMM) Q4 2...|
Image source: Th...|
Contributor


  ...|2020-01-28T14:30:...|             ["MMM"]|
|
3M Co (MMM) Q3 2...|



Image source:...|
Publisher


The ...|2018-10-23T03:42:...|             ["MMM"]|
|
3M Co (MMM) Q1 2

## Nasdaq News Parsed

In [5]:
@transform(spark,
    raw = Input('/nasdaq_news/raw/nasdaq_news_raw.parquet',spark),
    parsed = Output('/nasdaq_news/parsed/nasdaq_news_parsed.parquet')
)
def parsed_nasdaq_news(spark,raw,parsed):
    """Parse the raw Nasdaq news dataset and write the parsed dataset.

    Prints statistics for the raw input, renames ``contributor`` to
    ``publisher``, applies the parsing steps below in order, shows two sample
    InvestorPlace rows, prints statistics again and writes to ``parsed``.
    """
    news = Pipeline(raw)
    print_statistics(news)

    news = news.rename_columns({'contributor':'publisher'})

    # (step function, extra positional arguments), applied top to bottom.
    # extract_news_writers is currently not part of the sequence.
    parsing_steps = [
        (parse_delete_chars, ('publisher',)),
        (parse_delete_chars, ('title',)),
        (parse_content, ()),
        (parse_contributor, ()),
        (parse_date, ()),
        (parse_mentioned_tickers, ()),
        (drop_records_null_content, ()),
    ]
    for step, extra_args in parsing_steps:
        news = news.transform(step, *extra_args)

    # Spot check: two InvestorPlace articles, untruncated.
    investorplace_sample = (news.dataframe
                            .select('published_date','title')
                            .where(F.col('publisher').contains('InvestorPlace')))
    investorplace_sample.show(2,0)

    print('Nasdaq News')
    print_statistics(news)
    news.write(parsed)
    
def drop_records_null_content(dataframe):
    """Return only the rows whose ``content`` column is not null."""
    has_content = F.col('content').isNotNull()
    return dataframe.where(has_content)
    
def parse_content(dataframe):
    """Clean the ``content`` column.

    Removes the fixed Nasdaq disclaimer, replaces the ``<...>`` span with a
    single space, then applies ``parse_delete_chars`` to the column.
    """
    target = 'content'
    nasdaq_disclaimer = """\nThe views and opinions expressed herein are the views and opinions of the author and do not necessarily reflect those of Nasdaq, Inc.\n"""

    # (regex, replacement) pairs, applied in order.
    # NOTE(review): "<(?s).*>" is greedy with DOTALL, so it matches from the
    # first '<' to the last '>' in the text -- confirm that is intended rather
    # than per-tag removal.
    substitutions = (
        (nasdaq_disclaimer, ''),
        ("<(?s).*>", ' '),
    )
    for pattern, replacement in substitutions:
        dataframe = dataframe.withColumn(target, F.regexp_replace(F.col(target), pattern, replacement))

    return parse_delete_chars(dataframe, target)
    
    
def parse_mentioned_tickers(dataframe):
    """Explode ``mentioned_tickers`` into one normalized ticker per row.

    The incoming column is a string holding a bracketed, comma-separated list
    of double-quoted tickers (e.g. ``["MMM"]``). Each element becomes its own
    row, with quotes and surrounding spaces removed, ``.`` replaced by ``-``
    and the result upper-cased.

    Returns the dataframe with ``mentioned_tickers`` replaced by the single
    ticker for that row; the row count grows with the number of tickers.
    """
    col = 'mentioned_tickers'
    # Raw strings keep the regex backslashes literal (no invalid-escape warnings).
    # Drop the leading '[' and every ']'.
    dataframe = dataframe.withColumn(col, F.regexp_replace(F.col(col), r'(^\[)|(\])', ''))
    # One output row per comma-separated element.
    dataframe = dataframe.withColumn(col, F.explode(F.split(F.col(col), ',')))
    # Each element still carries its JSON double quotes (and possibly a space
    # after the comma); strip them so the value is the bare ticker.
    dataframe = dataframe.withColumn(col, F.trim(F.regexp_replace(F.col(col), '"', '')))
    # Normalize class-share separators, e.g. "BRK.B" -> "BRK-B".
    dataframe = dataframe.withColumn(col, F.regexp_replace(F.col(col), r'\.', '-'))
    dataframe = dataframe.withColumn(col, F.upper(F.col(col)))
    return dataframe
    
def parse_date(dataframe):
    """Convert the ``published_date`` string column to a date column.

    Input format: yyyy-MM-dd'T'HH:mm:ssZ
    Example: 2020-05-29T12:50:59-0400
    """
    date_column = 'published_date'
    input_format = "yyyy-MM-dd'T'HH:mm:ssZ"
    as_date = F.to_date(F.col(date_column), input_format)
    return dataframe.withColumn(date_column, as_date)
    
def extract_news_writers(dataframe):
    """Derive a ``writer`` column from the free-text ``publisher`` column.

    Steps, in order:
      1. Strip the 'Publisher' / 'Contributor' labels from ``publisher``.
      2. Where ``publisher`` matches one of the "<First> <Last> <outlet>" /
         "<First> <X.> <Last> <outlet>" patterns for The Motley Fool or
         InvestorPlace, set ``writer`` to ``publisher`` with the outlet name
         removed and trimmed; otherwise ``writer`` is a null string.
      3. For rows whose ``publisher`` contains 'Reuters', recompute ``writer``,
         split it on '~' and explode it to one row per piece.
      4. Union those exploded Reuters rows onto the full dataframe and trim
         ``publisher``.

    Not invoked by the pipeline code visible in this notebook.

    NOTE(review): ``[A-z]`` in the patterns also matches the ASCII punctuation
    between 'Z' and 'a', and the ``.`` after ``[A-Z]`` is unescaped so it
    matches any character -- confirm whether ``[A-Za-z]`` and a literal dot
    were intended.
    """
    col='publisher'
    
    # Alternation of the four writer-name + outlet shapes recognized for
    # The Motley Fool and InvestorPlace.
    regexp_string = """[A-Z][a-z]+ [A-z][a-z]+ The Motley Fool|[A-Z][a-z]+[ ][A-z][a-z]+ InvestorPlace|[A-Z][a-z]+[ ][A-Z].[ ][A-z][a-z]+ InvestorPlace|[A-Z][a-z]+[ ][A-Z].[ ][A-z][a-z]+ The Motley Fool"""
    # Remove the role label (and any spaces directly after it).
    dataframe = dataframe.withColumn(col,F.regexp_replace(F.col(col),'Publisher *|Contributor *',''))
    # writer = publisher minus the outlet name when the pattern matches, else null.
    dataframe = (dataframe.withColumn('writer',F.when(F.col(col)
                                                      .rlike(regexp_string),
                                                      F.trim(F.regexp_replace(F.col(col),'The Motley Fool|InvestorPlace',''))).otherwise(F.lit(None).cast(StringType()))))
    # Reuters rows are handled separately from here on.
    reuters_ds = dataframe.select("*").where(F.col(col).contains('Reuters'))
    # Turn each ' Reuters ' separator into '~' so several names can be split apart below.
    reuters_ds = (reuters_ds.withColumn('writer',F.when(F.col(col)
                                                      .rlike('[A-Z][a-z]+[ ][A-z][a-z]+ Reuters'),
                                                      F.trim(F.regexp_replace(F.col(col),' Reuters ','~'))).otherwise(F.col('writer')).cast(StringType())))
    #reuters_ds = reuters_ds.withColumn('writer',F.regexp_replace(F.col('writer'),'[s ]',''))
    # NOTE(review): this removes ' Reuter' without the final 's', so a trailing
    # ' Reuters' would leave an 's' attached to the name -- confirm.
    reuters_ds = reuters_ds.withColumn('writer',F.regexp_replace(F.col('writer'),' Reuter',''))
    reuters_ds = reuters_ds.withColumn('writer',F.split(F.col('writer'),'~'))
    # One row per writer name.
    reuters_ds = reuters_ds.withColumn('writer',F.explode(F.col('writer')))
    # NOTE(review): `dataframe` still contains the original Reuters rows, so this
    # union appears to add the exploded rows on top of them -- confirm whether
    # the originals should be excluded first.
    dataframe = dataframe.union(reuters_ds)
    dataframe = dataframe.withColumn(col,F.trim(F.col(col)))
    
    return dataframe

def parse_delete_chars(dataframe,col):
    """Normalize whitespace and remove commas in a string column.

    Args:
        dataframe: Spark DataFrame containing ``col``.
        col: Name of the string column to clean in place.

    Returns:
        The dataframe with ``col`` rewritten so that every run of whitespace
        (newlines included) is a single space, commas are removed, and
        leading/trailing spaces are trimmed.
    """
    # `\s+` already covers newlines, so a separate "\n" -> " " pass is not needed.
    # Raw string keeps the regex backslash literal (no invalid-escape warning).
    cleaned = F.regexp_replace(F.col(col), r'\s+', ' ')
    # A comma needs no escaping in a regex.
    cleaned = F.regexp_replace(cleaned, ',', '')
    return dataframe.withColumn(col, F.trim(cleaned))

def parse_contributor(dataframe):
    """Reduce ``publisher`` to a canonical outlet name where one is recognized.

    Strips the 'Publisher' / 'Contributor' labels, then replaces any value
    containing one of the known fragments with its canonical name; all other
    values are left as they are.
    """
    column = 'publisher'
    dataframe = dataframe.withColumn(column, F.regexp_replace(F.col(column), 'Publisher *|Contributor *', ''))

    # fragment searched for in the value -> canonical outlet name
    canonical_names = {'Motley Fool':'The Motley Fool','Reuters':'Reuters','InvestorPlace':'InvestorPlace'}
    for fragment, canonical in canonical_names.items():
        mentions_outlet = F.col(column).contains(fragment)
        dataframe = dataframe.withColumn(
            column,
            F.when(mentions_outlet, F.lit(canonical)).otherwise(F.col(column)),
        )

    return dataframe

In [6]:
# Run the parsed stage: prints statistics before and after parsing and writes
# the parsed output. The row count grows because mentioned_tickers is exploded
# to one row per ticker.
parsed_nasdaq_news(spark)

root
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- contributor: string (nullable = true)
 |-- published_date: string (nullable = true)
 |-- mentioned_tickers: string (nullable = true)

cols: 5 rows: 1668
+--------------+------------------------------------------------+
|published_date|title                                           |
+--------------+------------------------------------------------+
|2017-07-07    |The 10 Safest Blue-Chip Dividends on Wall Street|
|2017-07-07    |The 10 Safest Blue-Chip Dividends on Wall Street|
+--------------+------------------------------------------------+
only showing top 2 rows

Nasdaq News
root
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- published_date: date (nullable = true)
 |-- mentioned_tickers: string (nullable = true)

cols: 5 rows: 6194


## Nasdaq News Cleaned

In [7]:


@transform(spark,
    parsed = Input('/nasdaq_news/parsed/nasdaq_news_parsed.parquet', spark),
    metadata = Metadata('/nasdaq_news/clean/nasdaq_news_metadata_clean.csv', spark),
    clean = Output('/nasdaq_news/clean/nasdaq_news_clean.parquet'),
    clean_exception = Output('/nasdaq_news/exception/nasdaq_news_clean_exception.parquet')
)
def nasdaq_news_clean(spark, parsed, metadata, clean, clean_exception):
    """Validate the parsed Nasdaq news dataset against the clean metadata.

    Prints the field names of the schema returned by ``metadata``, shows the
    dimensions and schema of the parsed data, runs ``ValidatedPipeline``
    validation with no additional rules, and writes the result to ``clean``
    and ``clean_exception`` (presumably passing rows and failing rows
    respectively -- confirm against skainet_spark).
    """
    expected_schema = metadata()
    print(expected_schema.fieldNames())

    news = Pipeline(parsed).show_dimensions()
    news.dataframe.printSchema()

    # No add_validation(...) rules are registered before validate().
    checked = ValidatedPipeline(news, metadata).validate()
    checked.write(clean, clean_exception)
    


In [8]:
# Run the clean stage: validates the parsed data and writes the clean and
# exception outputs.
nasdaq_news_clean(spark)

['title', 'content', 'publisher', 'published_date', 'mentioned_tickers']
cols: 5 rows: 6194
root
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- published_date: date (nullable = true)
 |-- mentioned_tickers: string (nullable = true)

Validated count: 6194
Exception count: 0
