In [1]:


import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session for this notebook.
# `getOrCreate()` keeps the cell re-runnable: constructing
# `pyspark.SparkContext(...)` directly raises an error when a context already
# exists in the kernel (i.e. whenever this cell is executed a second time).
spark = (SparkSession.builder
         .appName("bi_news_raw_parsed_cleaned")
         .getOrCreate())
# Keep the `sc` name available for any code that uses the context directly.
sc = spark.sparkContext
# Use UTC as the SQL session time zone.
spark.conf.set('spark.sql.session.timeZone', 'UTC')

In [2]:
from skainet_spark import Pipeline, transform, Input, Output, Metadata,ValidatedPipeline,assign_shortcuts,print_statistics
from pyspark.sql.types import StructField, StructType, StringType, LongType,DoubleType,ArrayType,FloatType, BooleanType,IntegerType
import pyspark.sql.functions as F
from pyspark.sql.functions import udf


## BI News Raw

In [3]:
@transform(
    spark,
    raw=Output('/bi_news/raw/bi_news_raw.parquet'),
    metadata=Metadata('/bi_news/raw/bi_news_metadata_raw.csv', spark),
)
def raw_bi_news(spark, raw, metadata):
    """Read the news JSON files and write them to the raw output.

    The schema returned by ``metadata()`` is applied while reading
    ``input/bi_news_content/*.json``. The frame's dimensions and its first
    10 rows are displayed before the data is written to ``raw``.
    """
    news_schema = metadata()
    json_reader = spark.read.format('json').schema(news_schema)
    news_df = json_reader.load('input/bi_news_content/*.json')

    raw_pipeline = Pipeline(news_df)
    raw_pipeline.show_dimensions()
    # Quick visual sanity check of the loaded records.
    raw_pipeline.dataframe.show(10)
    raw_pipeline.write(raw)



In [4]:
# Run the raw ingestion stage.
# NOTE(review): called without `spark`, unlike the later stage calls in this
# notebook — confirm the `transform` decorator supplies it.
raw_bi_news()

cols: 5 rows: 1215
+--------------------+--------------------+--------------------+--------------------+-------------+
|               title|             content|      published_date|           publisher|ticker_symbol|
+--------------------+--------------------+--------------------+--------------------+-------------+
|Verizon ends 2019...|
4Q 2019 highligh...|Jan. 30, 2020, 07...|PRESS RELEASE Glo...|       ['vz']|
|Correcting and Re...|
In a release iss...|Aug. 1, 2019, 04:...|PRESS RELEASE Glo...|       ['vz']|
|Verizon reports s...|
Company sees str...|Aug. 1, 2019, 12:...|PRESS RELEASE Glo...|       ['vz']|
|Verizon reports s...|
Highest third-qu...|Oct. 25, 2019, 01...|PRESS RELEASE Glo...|       ['vz']|
|Strong wireless c...|
4Q 2018 highligh...|Jan. 29, 2019, 07...|PRESS RELEASE Glo...|       ['vz']|
|Verizon ends firs...|
2Q 2018 highligh...|Jul. 24, 2018, 01...|PRESS RELEASE Glo...|       ['vz']|
|As 5G era begins,...|
3Q 2018 highligh...|Oct. 23, 2018, 01...|PRESS RELEASE Glo

## BI News Parsed

In [5]:
@transform(
    spark,
    raw=Input('/bi_news/raw/bi_news_raw.parquet', spark),
    parsed=Output('/bi_news/parsed/bi_news_parsed.parquet'),
)
def parsed_bi_news(spark, raw, parsed):
    """Parse the raw news records and write them to the parsed output.

    Steps, in order: rename ``ticker_symbol`` to ``mentioned_tickers``, parse
    the publication date, clean the ``title`` and ``content`` text, normalise
    the publisher, drop records with null content, and expand the mentioned
    tickers. Statistics are printed before and after the transformations.
    """
    raw_pipe = Pipeline(raw)
    print_statistics(raw_pipe)

    parsed_pipe = raw_pipe.rename_columns({'ticker_symbol': 'mentioned_tickers'})
    parsed_pipe = parsed_pipe.transform(parse_date)
    for text_column in ('title', 'content'):
        parsed_pipe = parsed_pipe.transform(parse_delete_chars, text_column)
    parsed_pipe = parsed_pipe.transform(parse_publisher)
    # `extract_news_writers` is intentionally not applied at the moment.
    parsed_pipe = parsed_pipe.transform(drop_records_null_content)
    parsed_pipe = parsed_pipe.transform(parse_mentioned_tickers)

    print('Business Insider - News')
    print_statistics(parsed_pipe)
    parsed_pipe.write(parsed)
    
    

    
def drop_records_null_content(dataframe):
    """Return ``dataframe`` restricted to rows whose ``content`` is not null."""
    has_content = F.col('content').isNotNull()
    return dataframe.where(has_content)
    

def parse_mentioned_tickers(dataframe):
    """Expand ``mentioned_tickers`` into one normalised ticker per row.

    The incoming value is the text form of a list, e.g. ``['vz']``. Brackets,
    quote characters and whitespace are removed, the remainder is split on
    commas and exploded (one output row per ticker), dots are replaced by
    dashes and the result is upper-cased, so ``['vz', 'brk.b']`` yields the
    rows ``VZ`` and ``BRK-B``.

    Previously only the brackets were stripped, which left the quotes (and the
    space after each comma) inside the ticker value, e.g. ``'VZ'``.

    Args:
        dataframe: Spark DataFrame with a string column ``mentioned_tickers``.

    Returns:
        DataFrame with ``mentioned_tickers`` holding a single ticker per row.
        Rows whose ``mentioned_tickers`` is null are dropped by ``explode``.
    """
    col = 'mentioned_tickers'
    # Strip list punctuation and whitespace: [ ] ' " and any blanks.
    dataframe = dataframe.withColumn(
        col, F.regexp_replace(F.col(col), r"[\[\]'\"\s]+", ''))
    # `explode` is a generator and must stay at the top level of its own
    # projection; it cannot be nested inside the expressions below.
    dataframe = dataframe.withColumn(col, F.explode(F.split(F.col(col), ',')))
    # Dotted share classes use a dash instead (e.g. brk.b -> BRK-B).
    normalised = F.upper(F.regexp_replace(F.col(col), r'\.', '-'))
    return dataframe.withColumn(col, normalised)
    
def parse_date(dataframe):
    """Convert the ``published_date`` text column to a date.

    Expected input format: ``MMM. d, yyyy, hh:mm a``
    Example: ``Jan. 30, 2020, 07:00 AM``
    """
    date_format = "MMM. d, yyyy, hh:mm a"
    parsed_date = F.to_date(F.col('published_date'), date_format)
    return dataframe.withColumn('published_date', parsed_date)
    
def extract_news_writers(dataframe):
    """Add a ``writer`` column taken from the leading name in ``publisher``.

    When ``publisher`` contains a "First Last," pattern, the text before the
    first comma is used as the writer; otherwise ``writer`` is null. If that
    text has more than two space-separated words, only the first two are
    kept. ``publisher`` itself is trimmed.

    Args:
        dataframe: Spark DataFrame with a string column ``publisher``.

    Returns:
        DataFrame with the added ``writer`` column and trimmed ``publisher``.
    """
    col = 'publisher'
    writ_col = 'writer'
    # The previous class `[A-z]` also matched the punctuation that sits
    # between 'Z' and 'a' ([ \ ] ^ _ `); spell the letter ranges out instead.
    regexp_string = r"[A-Z][a-z]+ [A-Za-z][a-z]+,"

    text_before_comma = F.split(F.col(col), ',')[0]
    dataframe = dataframe.withColumn(
        writ_col,
        F.when(F.col(col).rlike(regexp_string), text_before_comma)
         .otherwise(F.lit(None).cast(StringType())))

    # Keep at most the first two words of the extracted name.
    name_words = F.split(F.col(writ_col), ' ')
    dataframe = dataframe.withColumn(
        writ_col,
        F.when(F.size(name_words) > 2,
               F.concat_ws(' ', name_words[0], name_words[1]))
         .otherwise(F.col(writ_col)))

    dataframe = dataframe.withColumn(col, F.trim(F.col(col)))
    return dataframe

def parse_delete_chars(dataframe, col):
    """Normalise a free-text column: drop commas and collapse whitespace.

    Commas are removed, every run of whitespace (including newlines) becomes
    a single space, and leading/trailing spaces are trimmed.

    Commas are removed *before* whitespace is collapsed so that input such as
    ``"a , b"`` ends up as ``"a b"`` rather than keeping a double space.

    Args:
        dataframe: Spark DataFrame containing the string column ``col``.
        col: Name of the column to clean in place.

    Returns:
        DataFrame with ``col`` replaced by its cleaned value.
    """
    text = F.regexp_replace(F.col(col), ',', '')
    # `\s+` already covers newlines, so no separate newline pass is needed.
    text = F.regexp_replace(text, r'\s+', ' ')
    return dataframe.withColumn(col, F.trim(text))

def parse_publisher(dataframe):
    """Normalise the ``publisher`` column.

    Removes the ``PRESS RELEASE`` marker (and any spaces directly after it),
    replaces any value containing a known contributor name with that name
    alone, and trims the result.
    """
    publisher = 'publisher'
    without_marker = F.regexp_replace(F.col(publisher), 'PRESS RELEASE *', '')
    result = dataframe.withColumn(publisher, without_marker)

    # (substring to look for, value to store) pairs, applied in this order.
    contributor_names = (
        ('InvestorPlace', 'InvestorPlace'),
        ('BI Prime', 'BI Prime'),
    )
    for marker, canonical_name in contributor_names:
        current = F.col(publisher)
        collapsed = F.when(current.contains(marker), F.lit(canonical_name)).otherwise(current)
        result = result.withColumn(publisher, collapsed)

    return result.withColumn(publisher, F.trim(F.col(publisher)))

In [6]:
# Run the parsing stage; statistics are printed before and after the transforms.
parsed_bi_news(spark)

root
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- published_date: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- ticker_symbol: string (nullable = true)

cols: 5 rows: 1215
Business Insider - News
root
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- published_date: date (nullable = true)
 |-- publisher: string (nullable = true)
 |-- mentioned_tickers: string (nullable = true)

cols: 5 rows: 1129


## BI News Clean

In [7]:


@transform(
    spark,
    parsed=Input('/bi_news/parsed/bi_news_parsed.parquet', spark),
    metadata=Metadata('/bi_news/clean/bi_news_metadata_clean.csv', spark),
    clean=Output('/bi_news/clean/bi_news_clean.parquet'),
    clean_exception=Output('/bi_news/exception/bi_news_clean_exception.parquet'),
)
def bi_news_clean(spark, parsed, metadata, clean, clean_exception):
    """Validate the parsed news data and write clean and exception outputs.

    Prints the field names of the schema returned by ``metadata()``, shows
    the dimensions and schema of the parsed frame, validates it through
    ``ValidatedPipeline`` and writes the result to ``clean`` and
    ``clean_exception``.
    """
    clean_schema = metadata()
    print(clean_schema.fieldNames())

    parsed_pipe = Pipeline(parsed).show_dimensions()
    parsed_pipe.dataframe.printSchema()

    # No additional validation rules are registered; only `validate()` runs.
    checked_pipe = ValidatedPipeline(parsed_pipe, metadata).validate()
    checked_pipe.write(clean, clean_exception)
    


In [8]:
# Run the cleaning/validation stage, which writes the clean and exception outputs.
bi_news_clean(spark)

['title', 'content', 'published_date', 'publisher', 'mentioned_tickers']
cols: 5 rows: 1129
root
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- published_date: date (nullable = true)
 |-- publisher: string (nullable = true)
 |-- mentioned_tickers: string (nullable = true)

Validated count: 1129
Exception count: 0
