In [1]:


import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
# Build the session through the builder so re-running this cell reuses the
# live session; constructing a second SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.appName("company_details_processed").getOrCreate()
sc = spark.sparkContext
from pyspark.sql.types import StructField, StructType, StringType, LongType,DoubleType,ArrayType,FloatType, BooleanType
import pyspark.sql.functions as F
from pyspark.sql.functions import udf

In [2]:
from skainet_spark import Pipeline, transform, Input, Output, Metadata,ValidatedPipeline,print_statistics

## Company Details Processed

In [3]:
import numpy as np
@transform(spark,
    parsed_yh = Input('/yahoo_comp_details/clean/yahoo_comp_details_clean.parquet', spark),
    parsed_bi = Input('/bi_comp_details/clean/bi_comp_details_clean.parquet', spark),
    parsed_nq = Input('/nasdaq_comp_details/clean/nasdaq_comp_details_clean.parquet', spark),

    metadata_comp_details = Metadata('/company_details/processed/company_details_metadata_processed.csv',spark),
    comp_details_ontology = Output('/company_details/processed/company_details_processed.parquet'),
    comp_details_ontology_exception = Output('/company_details/exception/company_details_processed_exception.parquet')
)
def company_details_processed(spark, parsed_yh, parsed_bi, parsed_nq, metadata_comp_details,
                             comp_details_ontology,comp_details_ontology_exception ):
    """Merge the Yahoo, Business Insider and Nasdaq company details into one dataset.

    Steps, in order:
      1. Wrap each input in a ``Pipeline`` and print its statistics.
      2. Join the three frames with ``join_dataframes``.
      3. Reduce every attribute to a single column using the
         ``match_criteria*`` helper assigned to its group.
      4. Rename the two Business Insider shareholder columns and keep only the
         final attribute columns.
      5. Validate against the schema returned by ``metadata_comp_details()``,
         print a null report, and write via the two output handles.
    """
    pipe_yh, pipe_bi, pipe_nq = Pipeline(parsed_yh), Pipeline(parsed_bi), Pipeline(parsed_nq)
    titled_sources = (('Yahoo', pipe_yh), ('Business Insider', pipe_bi), ('Nasdaq', pipe_nq))
    for title, source_pipe in titled_sources:
        print(title)
        print_statistics(source_pipe)

    merged = join_dataframes(pipe_yh.dataframe, pipe_bi.dataframe, pipe_nq.dataframe)
    print_statistics(merged)
    merged.show_dimensions()

    columns_general = ['ticker_symbol','company_name','website_link','num_employees','description','telephone']
    columns_si = ['sector','industry']
    columns_csc = ['country','state','city']

    # Each attribute group has its own reconciliation rule; groups are
    # processed in this order and columns in list order.
    reconciliation_plan = (
        (match_criteria, columns_general),
        (match_criteria_si, columns_si),
        (match_criteria_csc, columns_csc),
    )
    for resolver, attribute_names in reconciliation_plan:
        for attribute in attribute_names:
            merged = merged.transform(resolver, attribute)

    # Shareholder data exists only with the Business Insider suffix, so it is
    # renamed rather than reconciled.
    merged = merged.rename_columns({
        'shareholders_name_bi':'shareholders_name',
        'percentage_bi':'percentage'
    })
    columns_all = columns_general + columns_si + columns_csc + ['shareholders_name','percentage']
    merged = merged.select(columns_all)
    print_statistics(merged)

    schema = metadata_comp_details()
    validated_pipe = ValidatedPipeline(merged, schema).validate()
    print_null_report(merged)
    validated_pipe.write(comp_details_ontology,comp_details_ontology_exception)
    
    



    

def join_dataframes(yahoo,busin,nasdaq):
    """Left-join the two smaller source frames onto the largest one by ticker.

    The frame with the highest ``count()`` becomes the base; on a tie the
    earliest entry in the order Yahoo, Nasdaq, Business Insider wins. The
    other two frames are then joined onto it, in that same order, with a
    ``left_outer`` join comparing the base's ``ticker_symbol_<suffix>`` with
    their own. Because the join is left-outer, tickers absent from the base
    frame are not carried into the result.

    Args:
        yahoo: frame exposing ``ticker_symbol_yh``.
        busin: frame exposing ``ticker_symbol_bi``.
        nasdaq: frame exposing ``ticker_symbol_nq``.

    Returns:
        A ``Pipeline`` wrapping the joined frame.
    """
    # This order drives both the tie-break and the order of the joins.
    candidates = [('yh', yahoo), ('nq', nasdaq), ('bi', busin)]
    row_counts = [frame.count() for _, frame in candidates]

    # index(max(...)) selects the first of the largest entries.
    base_suffix, merged = candidates.pop(row_counts.index(max(row_counts)))
    base_key = f"ticker_symbol_{base_suffix}"

    for suffix, frame in candidates:
        same_ticker = merged[base_key] == frame[f"ticker_symbol_{suffix}"]
        merged = merged.join(frame, same_ticker, 'left_outer')

    return Pipeline(merged)

def print_null_report(pipeline):
    """Show, per column, how many rows of ``pipeline.dataframe`` are missing.

    A row counts as missing when the column is NULL, or, for ``float`` and
    ``double`` columns only, when it is NaN. The NaN test is limited to those
    types because ``isnan`` is only meaningful for floating-point data:
    applied to other types it either fails analysis (e.g. timestamp, date,
    boolean) or depends on an implicit string-to-double cast.

    NOTE(review): a string column holding the literal text "NaN" is therefore
    no longer counted as missing — confirm no upstream step writes such text.

    Args:
        pipeline: object exposing a Spark DataFrame as ``.dataframe``.
    """
    dataframe = pipeline.dataframe
    # DataFrame.dtypes is a list of (column_name, type_string) pairs.
    floating_columns = {name for name, dtype in dataframe.dtypes if dtype in ('float', 'double')}

    missing_counts = []
    for name in dataframe.columns:
        is_missing = F.col(name).isNull()
        if name in floating_columns:
            is_missing = is_missing | F.isnan(F.col(name))
        # when() without otherwise() yields NULL for non-matching rows, and
        # count() skips NULLs, so this counts only the missing rows.
        missing_counts.append(F.count(F.when(is_missing, name)).alias(name))

    dataframe.select(missing_counts).show()
    
def match_criteria(dataframe,column):
    """Collapse the ``_yh``, ``_bi`` and ``_nq`` variants of ``column`` into one.

    Adds two columns and removes the three suffixed inputs:

    * ``column`` - the Yahoo value when all three sources agree (or are all
      NULL); otherwise the first non-NULL value in the order Nasdaq,
      Business Insider, Yahoo.
    * ``conflicts_<column>`` - ``'MATCH'`` when all three sources are equal,
      else ``'NQ'``, ``'BI'`` or ``'YH'`` for the first source that is not
      NULL in that order, or NULL when every source is NULL.

    Args:
        dataframe: Spark DataFrame holding the three suffixed columns.
        column: attribute name without the source suffix.

    Returns:
        The DataFrame with the reconciled columns in place of the inputs.
    """
    source_names = [f"{column}_{suffix}" for suffix in ("yh", "bi", "nq")]
    yahoo, insider, nasdaq = (F.col(name) for name in source_names)

    unanimous = (yahoo == insider) & (insider == nasdaq) & (nasdaq == yahoo)
    nothing_known = yahoo.isNull() & insider.isNull() & nasdaq.isNull()

    chosen_value = (
        F.when(unanimous | nothing_known, yahoo)
        .otherwise(F.coalesce(nasdaq, insider, yahoo))
    )
    # The label order follows the coalesce order above, so it names the
    # source the value was taken from.
    chosen_source = (
        F.when(unanimous, F.lit('MATCH'))
        .when(nasdaq.isNotNull(), F.lit('NQ'))
        .when(insider.isNotNull(), F.lit('BI'))
        .when(yahoo.isNotNull(), F.lit('YH'))
        .otherwise(F.lit(None))
    )

    return (
        dataframe
        .withColumn(column, chosen_value)
        .withColumn('conflicts_' + column, chosen_source)
        .drop(*source_names)
    )



def match_criteria_csc(dataframe,column):
    """Reconcile a country / state / city attribute across the sources.

    For ``'country'`` and ``'state'`` only Yahoo and Nasdaq are compared; the
    Business Insider column is dropped without being consulted. Any other
    ``column`` (``'city'`` in this notebook) is reconciled across all three
    sources.

    In both cases the result keeps the Yahoo value when the compared sources
    agree or are all NULL, and otherwise the first non-NULL value with Nasdaq
    taking priority (then Business Insider where used, then Yahoo).
    ``conflicts_<column>`` is ``'MATCH'`` when the compared sources are equal,
    else the tag of the first non-NULL source in that priority order, or NULL.

    Args:
        dataframe: Spark DataFrame holding the suffixed source columns.
        column: attribute name without the source suffix.

    Returns:
        The DataFrame with ``column`` and ``conflicts_<column>`` added and the
        ``_yh`` / ``_bi`` / ``_nq`` inputs dropped.
    """
    yahoo_name, insider_name, nasdaq_name = column + "_yh", column + "_bi", column + "_nq"
    yahoo, insider, nasdaq = F.col(yahoo_name), F.col(insider_name), F.col(nasdaq_name)

    if column in ['country','state']:
        # Two-source rule: Business Insider is ignored for these attributes.
        in_agreement = nasdaq == yahoo
        nothing_known = yahoo.isNull() & nasdaq.isNull()
        fallback = F.coalesce(nasdaq, yahoo)
        source_tag = (
            F.when(in_agreement, F.lit('MATCH'))
            .when(nasdaq.isNotNull(), F.lit('NQ'))
            .when(yahoo.isNotNull(), F.lit('YH'))
            .otherwise(F.lit(None))
        )
    else:
        # Three-source rule.
        in_agreement = (yahoo == insider) & (insider == nasdaq) & (nasdaq == yahoo)
        nothing_known = yahoo.isNull() & insider.isNull() & nasdaq.isNull()
        fallback = F.coalesce(nasdaq, insider, yahoo)
        source_tag = (
            F.when(in_agreement, F.lit('MATCH'))
            .when(nasdaq.isNotNull(), F.lit('NQ'))
            .when(insider.isNotNull(), F.lit('BI'))
            .when(yahoo.isNotNull(), F.lit('YH'))
            .otherwise(F.lit(None))
        )

    resolved = F.when(in_agreement | nothing_known, yahoo).otherwise(fallback)
    return (
        dataframe
        .withColumn(column, resolved)
        .withColumn('conflicts_' + column, source_tag)
        .drop(yahoo_name, insider_name, nasdaq_name)
    )


def match_criteria_si(dataframe,column):
    """Reconcile a sector / industry attribute between Yahoo and Nasdaq.

    Only the ``_yh`` and ``_nq`` variants are compared; the ``_bi`` column is
    dropped without being consulted. Yahoo takes priority here: the result
    is the Yahoo value when it is not NULL, otherwise the Nasdaq value.

    ``conflicts_<column>`` records where the value came from: ``'MATCH'`` when
    both sources are equal, ``'YH'`` when Yahoo supplied it, ``'NQ'`` when only
    Nasdaq had one, and NULL when both are NULL. The label is tested in the
    same order as the coalesce so it always names the source actually used
    (previously Nasdaq was tested first, which labelled a Yahoo-sourced value
    ``'NQ'`` whenever the two sources disagreed).

    Args:
        dataframe: Spark DataFrame holding the suffixed source columns.
        column: attribute name without the source suffix.

    Returns:
        The DataFrame with ``column`` and ``conflicts_<column>`` added and the
        ``_yh`` / ``_bi`` / ``_nq`` inputs dropped.
    """
    yahoo_col = column+"_yh"
    bi_col = column+"_bi"
    nasdaq_column = column+"_nq"

    yahoo = F.col(yahoo_col)
    nasdaq = F.col(nasdaq_column)
    sources_agree = nasdaq == yahoo
    both_missing = yahoo.isNull() & nasdaq.isNull()

    dataframe = dataframe.withColumn(
        column,
        F.when(sources_agree | both_missing, yahoo).otherwise(F.coalesce(yahoo, nasdaq)),
    )

    # Keep this order in step with the coalesce above.
    dataframe = dataframe.withColumn(
        'conflicts_'+column,
        F.when(sources_agree, F.lit('MATCH'))
        .when(yahoo.isNotNull(), F.lit('YH'))
        .when(nasdaq.isNotNull(), F.lit('NQ'))
        .otherwise(F.lit(None)),
    )

    return dataframe.drop(yahoo_col,bi_col,nasdaq_column)

In [4]:
# Run the pipeline. Only `spark` is passed here; the remaining parameters are
# presumably supplied by the @transform decorator from its Input / Metadata /
# Output declarations — confirm against skainet_spark.
company_details_processed(spark)

Yahoo
root
 |-- ticker_symbol_yh: string (nullable = true)
 |-- company_name_yh: string (nullable = true)
 |-- telephone_yh: string (nullable = true)
 |-- country_yh: string (nullable = true)
 |-- state_yh: string (nullable = true)
 |-- city_yh: string (nullable = true)
 |-- sector_yh: string (nullable = true)
 |-- industry_yh: string (nullable = true)
 |-- num_employees_yh: integer (nullable = true)
 |-- description_yh: string (nullable = true)
 |-- website_link_yh: string (nullable = true)

cols: 11 rows: 423
Business Insider
root
 |-- company_name_bi: string (nullable = true)
 |-- description_bi: string (nullable = true)
 |-- telephone_bi: string (nullable = true)
 |-- ticker_symbol_bi: string (nullable = true)
 |-- website_link_bi: string (nullable = true)
 |-- shareholders_name_bi: string (nullable = true)
 |-- percentage_bi: double (nullable = true)
 |-- city_bi: string (nullable = true)
 |-- country_bi: string (nullable = true)
 |-- state_bi: string (nullable = true)
 |-- num_em