In [1]:


import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
# Build the session with getOrCreate() so this cell can be re-run in a live
# kernel: constructing a second SparkContext directly raises an error because
# only one context may be active per process.
spark = SparkSession.builder.appName("news_processed").getOrCreate()
sc = spark.sparkContext
# Pin the SQL session time zone so date/timestamp handling does not depend on
# the machine's local zone.
spark.conf.set('spark.sql.session.timeZone', 'UTC')

In [2]:
from skainet_spark import Pipeline, transform, Input, Output, Metadata,ValidatedPipeline,assign_shortcuts,print_statistics
from pyspark.sql.types import StructField, StructType, StringType, LongType,DoubleType,ArrayType,FloatType, BooleanType,IntegerType
import pyspark.sql.functions as F
from pyspark.sql.functions import udf

## News Processed

In [3]:
@transform(spark,
    parsed_bi = Input('/bi_news/clean/bi_news_clean.parquet', spark),
    parsed_nq = Input('/nasdaq_news/clean/nasdaq_news_clean.parquet', spark),

    metadata_news = Metadata('/news/processed/news_metadata_processed.csv',spark),
    news_ontology = Output('/news/processed/news_processed.parquet'),
    news_ontology_exception = Output('/news/exception/news_processed_exception.parquet')
)
def news_processed(spark, parsed_bi,parsed_nq,metadata_news,news_ontology,news_ontology_exception ):
    """Merge the Business Insider and Nasdaq news feeds and write the validated result.

    Steps:
      1. Wrap both inputs in a ``Pipeline`` and print their statistics.
      2. Find Nasdaq rows whose title nearly matches a Business Insider title
         (``left_join_semi``). When any exist, drop the matching Business
         Insider rows (``prepare_right_side``) so the story is kept once.
      3. Union both feeds, add ``news_id`` and strip quotes from
         ``mentioned_tickers``.
      4. Validate against the schema returned by ``metadata_news()`` and write
         through ``ValidatedPipeline.write``.

    Args:
        spark: Spark session.
        parsed_bi: Business Insider input bound by the ``@transform`` decorator.
        parsed_nq: Nasdaq input bound by the ``@transform`` decorator.
        metadata_news: Callable returning the schema given to ``ValidatedPipeline``.
        news_ontology: First output target passed to ``write``.
        news_ontology_exception: Second output target passed to ``write``
            (presumably receives rows failing validation -- TODO confirm
            against skainet_spark).
    """
    pipel_bi = Pipeline(parsed_bi)
    pipel_nq = Pipeline(parsed_nq)
    print('Business Insider News')
    print_statistics(pipel_bi)

    print('Nasdaq News')
    print_statistics(pipel_nq)

    nasdaq_df = pipel_nq.dataframe
    bi_df = pipel_bi.dataframe

    # Nasdaq rows that have a near-duplicate title on the Business Insider side.
    nasdaq_overlap = left_join_semi(nasdaq_df, bi_df)
    # take(1) answers "is there at least one row?" without counting every match.
    if nasdaq_overlap.take(1):
        bi_df = prepare_right_side(bi_df, nasdaq_overlap)

    pipe_merge = Pipeline(union(nasdaq_df, bi_df))
    print_statistics(pipe_merge)

    pipe_merge = (pipe_merge
                  .transform(create_id)
                  .transform(format_mentioned_tickers)
                 )
    # Sanity-check output shown in the notebook: a sample of rows and the
    # distribution of ticker values.
    pipe_merge.dataframe.select('title','mentioned_tickers','published_date','publisher').show(10,0)
    pipe_merge.dataframe.groupby('mentioned_tickers').count().show(100,0)

    schema = metadata_news()
    validated_pipe = ValidatedPipeline(pipe_merge, schema).validate()

    validated_pipe.write(news_ontology,news_ontology_exception)
    
def format_mentioned_tickers(dataframe):
    """Remove double-quote and single-quote characters from ``mentioned_tickers``.

    Args:
        dataframe: DataFrame with a ``mentioned_tickers`` column.

    Returns:
        The DataFrame with that column rewritten without quote characters.
    """
    target = 'mentioned_tickers'
    # Same two replacements, in the same order: double quotes, then single quotes.
    for quote_pattern in ('"', "'"):
        cleaned = F.regexp_replace(F.col(target), quote_pattern, '')
        dataframe = dataframe.withColumn(target, cleaned)
    return dataframe
    
    
def create_id(dataframe):
    """Add a ``news_id`` column built from title, published date and publisher.

    The three columns are joined with ``'-'`` via ``concat_ws``.

    Args:
        dataframe: DataFrame with ``title``, ``published_date`` and
            ``publisher`` columns.

    Returns:
        The DataFrame with the ``news_id`` column added (or replaced).
    """
    key_parts = [F.col(name) for name in ('title', 'published_date', 'publisher')]
    news_id = F.concat_ws('-', *key_parts)
    return dataframe.withColumn('news_id', news_id)

def prepare_right_side(df_r, df_inter, threshold=3):
    """Drop rows of ``df_r`` whose title nearly matches a title in ``df_inter``.

    Performs a ``left_anti`` join: a row of ``df_r`` is kept only when no row
    of ``df_inter`` has a title within Levenshtein distance ``< threshold``.

    Args:
        df_r: DataFrame to filter; must have a ``title`` column.
        df_inter: DataFrame of rows to exclude by title similarity; must have
            a ``title`` column.
        threshold: Exclusive upper bound on the Levenshtein distance that
            counts as a near-duplicate.

    Returns:
        The rows of ``df_r`` without a near-duplicate title in ``df_inter``.
    """
    near_duplicate = F.levenshtein(df_r['title'], df_inter['title']) < threshold
    return df_r.join(df_inter, near_duplicate, 'left_anti')
    
def left_join_semi(nasdaq,busin,threshold=3):
    """Return the ``nasdaq`` rows that have a near-duplicate title in ``busin``.

    Performs a ``left_semi`` join: a row of ``nasdaq`` is kept when at least
    one row of ``busin`` has a title within Levenshtein distance
    ``< threshold``. Only ``nasdaq`` columns appear in the result.

    Args:
        nasdaq: Left DataFrame; must have a ``title`` column.
        busin: Right DataFrame; must have a ``title`` column.
        threshold: Exclusive upper bound on the Levenshtein distance that
            counts as a near-duplicate.

    Returns:
        The matching subset of ``nasdaq``.
    """
    near_duplicate = F.levenshtein(nasdaq['title'], busin['title']) < threshold
    return nasdaq.join(busin, near_duplicate, 'left_semi')

def union(nasdaq,busin):
    """Stack the two feeds after projecting both onto the same column order.

    ``DataFrame.union`` matches columns by position, so both sides are first
    selected in one fixed order.

    Args:
        nasdaq: DataFrame placed first in the result.
        busin: DataFrame appended after ``nasdaq``.

    Returns:
        The positional union of the two projected DataFrames.
    """
    shared_columns = ('title', 'content', 'published_date', 'publisher', 'mentioned_tickers')
    nasdaq_aligned, busin_aligned = (frame.select(*shared_columns) for frame in (nasdaq, busin))
    return nasdaq_aligned.union(busin_aligned)

In [4]:
news_processed(spark)

Business Insider News
root
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- published_date: date (nullable = true)
 |-- publisher: string (nullable = true)
 |-- mentioned_tickers: string (nullable = true)

cols: 5 rows: 1129
Nasdaq News
root
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- published_date: date (nullable = true)
 |-- mentioned_tickers: string (nullable = true)

cols: 5 rows: 6194
root
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- published_date: date (nullable = true)
 |-- publisher: string (nullable = true)
 |-- mentioned_tickers: string (nullable = true)

cols: 5 rows: 7323
+-------------------------------------------------------+-----------------+--------------+---------------+
|title                                                  |mentioned_tickers|published_date|publisher      |
+----------------------------------------------