In [1]:


import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
# Build the session through the builder so that re-running this cell in a live
# kernel reuses the existing context instead of failing with
# "Cannot run multiple SparkContexts at once".
spark = (SparkSession.builder
         .appName("bi_comp_details_raw_parsed_cleaned")
         .getOrCreate())
# Keep `sc` available under its original name.
sc = spark.sparkContext
from pyspark.sql.types import StructField, StructType, StringType, LongType,DoubleType,ArrayType,FloatType, BooleanType,IntegerType
import pyspark.sql.functions as F
from pyspark.sql.functions import udf

In [2]:
from skainet_spark import Pipeline, transform, Input, Output, Metadata,ValidatedPipeline,assign_shortcuts



## Business Insider Case - RAW


In [3]:
@transform(spark,
    raw = Output('/bi_comp_details/raw/bi_comp_details_raw.parquet'),
    metadata = Metadata('/bi_comp_details/raw/bi_comp_details_metadata_raw.csv', spark)
)
def raw_bi_comp_data(spark,raw,metadata):
    """Read the Business Insider company-detail JSON files and write them as the raw dataset.

    Args:
        spark: Active SparkSession used to read the input files.
        raw: Output target handed to ``Pipeline.write``.
        metadata: Callable whose result is used as the reader schema.

    The function prints the frame dimensions and the first 10 rows before writing.
    """
    schema = metadata()
    # NOTE: the former .option('header','true') was dropped: 'header' is a CSV
    # reader option and has no effect on the JSON data source.
    df = (spark
          .read
          .format('json')
          .schema(schema)
          .load('input/bi_comp_details/*.json')
         )
    pipeline = Pipeline(df)
    pipeline.show_dimensions()
    pipeline.dataframe.show(10)
    pipeline.write(raw)


In [4]:
# Run the raw stage. Only `spark` is passed explicitly; `raw` and `metadata` are
# presumably supplied by the @transform decorator (TODO confirm against skainet_spark).
raw_bi_comp_data(spark)

cols: 9 rows: 441
+--------------------+--------------------+--------------------+-----------------+--------------------+--------------+-----------------+------+--------------------+
|             address|        company_name|         description|              fax|        shareholders|postoffice_box|        telephone|ticker|             website|
+--------------------+--------------------+--------------------+-----------------+--------------------+--------------+-----------------+------+--------------------+
|300 South Tryon S...|           Honeywell|
				Honeywell In...|             null|[{"shareholders_n...|          null|+1 (704) 627-6200|   hon|http://www.honeyw...|
|1 Grand Canal Squ...|       Accenture plc|
				Accenture Pl...|+353 (1) 646-2020|[{"shareholders_n...|          null|+353 (1) 646-2000|   acn|http://www.accent...|
|Huntington Center...|Huntington Bancsh...|
				Huntington B...|+1 (614) 480-3761|[{"shareholders_n...|          null|+1 (614) 480-2665|  hban|http://www.hun

## Business Insider - Parsed

In [5]:

import json
import enum


@transform(spark,
    raw = Input('/bi_comp_details/raw/bi_comp_details_raw.parquet',spark),
    parsed_bi = Output('/bi_comp_details/parsed/bi_comp_details_parsed.parquet'),
)
def parsed_bi_comp_data(spark,raw,parsed_bi):
    """Turn the raw Business Insider frame into the parsed dataset and write it.

    Steps, in order: rename ``ticker``/``website``, decode and explode the
    shareholders JSON, derive ``city``/``state``/``country`` from the address,
    normalise company names, telephone/fax numbers and the ticker symbol, add
    the placeholder ``num_employees`` column, drop ``fax`` and
    ``postoffice_box``, then apply the ``_bi`` shortcut to the column names.

    Args:
        spark: Active SparkSession (not used directly in the body).
        raw: Input frame wrapped by ``Pipeline``.
        parsed_bi: Output target handed to ``Pipeline.write``.
    """
    cols_to_rename = {
        'ticker':'ticker_symbol',
        'website':'website_link'
    }
    SHORTCUT = '_bi'

    pipel_bi = (Pipeline(raw)
                .rename_columns(cols_to_rename)
                .transform(cast_shareholders)
                .transform(explode_column_shareholder)
                .transform(parse_address_composite)
                .transform(parse_company_name)
                .transform(parse_connection_number)
                # fax is normalised here and dropped a few steps below; the
                # step is kept so the original transformation order is unchanged.
                .transform(parse_connection_number,ConnectionNumbers.fax.name)
                .transform(parse_ticker)
                .transform(parse_to_processed_schema)
                .drop_columns(['fax','postoffice_box'])
                .transform(assign_shortcuts,SHORTCUT)
               )

    print('Business Insider - Company Details')

    # Dimensions are reported once, under the title. The earlier version also
    # called show_dimensions() inside the chain, which printed the same line
    # twice and repeated the row count over the UDF-based pipeline.
    pipel_bi.show_dimensions()
    pipel_bi.dataframe.printSchema()
    pipel_bi.dataframe.show(10)

    pipel_bi.write(parsed_bi)
    
    
def explode_column_shareholder(dataframe):
    """Emit one row per shareholder and flatten the struct into two columns.

    The ``shareholders`` array is exploded, its ``shareholders_name_bi`` and
    ``percentage_bi`` fields are copied into ``shareholders_name`` and
    ``percentage``, and the struct column itself is removed.
    """
    one_row_per_holder = dataframe.withColumn('shareholders', F.explode(F.col('shareholders')))
    holder_name = F.col('shareholders.shareholders_name_bi')
    holder_share = F.col('shareholders.percentage_bi')
    flattened = one_row_per_holder.withColumn('shareholders_name', holder_name)
    flattened = flattened.withColumn('percentage', holder_share)
    return flattened.drop('shareholders')
    
def parse_to_processed_schema(dataframe):
    """Add ``num_employees`` as an all-null integer column and return the new frame."""
    null_integer = F.lit(None).cast(IntegerType())
    return dataframe.withColumn('num_employees', null_integer)
    
def parse_ticker(dataframe):
    """Return the frame with ``ticker_symbol`` converted to upper case."""
    upper_ticker = F.upper(F.col('ticker_symbol'))
    return dataframe.withColumn('ticker_symbol', upper_ticker)

class ConnectionNumbers(enum.Enum):
    """Contact-number columns handled by ``parse_connection_number``.

    The member *names* are used as DataFrame column names (callers pass
    ``ConnectionNumbers.<member>.name``).
    """

    fax = 1
    telephone = 2
    
def cast_shareholders(dataframe):
    """Decode the JSON-encoded ``shareholders`` string into an array of structs.

    Each element of the resulting array has the fields ``shareholders_name_bi``
    (string) and ``percentage_bi`` (double). A null ``shareholders`` value stays
    null instead of failing the job; malformed JSON or a non-numeric percentage
    still raises so that bad input is not silently hidden.

    Args:
        dataframe: Frame with a string column ``shareholders`` holding a JSON
            array of objects with ``shareholders_name`` and ``percentage`` keys.

    Returns:
        The frame with ``shareholders`` replaced by the decoded array.
    """

    def parse_json(array_str):
        # TODO (from the original author): make this a Scala function, not Python.
        if array_str is None:
            # json.loads(None) raises TypeError; pass nulls through instead.
            return None
        # Return a concrete list rather than a generator so the UDF result does
        # not depend on how Spark happens to iterate the returned object.
        return [
            (item["shareholders_name"], float(item["percentage"]))
            for item in json.loads(array_str)
        ]

    json_schema = ArrayType(StructType([
        StructField('shareholders_name_bi', StringType(), nullable=False),
        StructField('percentage_bi', DoubleType(), nullable=False),
    ]))

    udf_parse_json = udf(parse_json, json_schema)
    return dataframe.withColumn('shareholders', udf_parse_json(F.col('shareholders')))

def parse_address_composite(dataframe):
    """Derive ``city``, ``state`` and ``country`` from ``address`` and drop ``address``.

    ``city`` is the last comma-separated part of the address with hyphens
    turned into spaces, digit runs removed, then init-capped and trimmed.
    ``state`` and ``country`` are created as null string columns (the same
    init-cap/trim is applied to them as well).
    """
    last_address_part = F.reverse(F.split(F.col('address'), ','))[0]
    null_string = F.lit(None).cast(StringType())

    return (
        dataframe
        .withColumn('city', last_address_part)
        .withColumn('city', F.regexp_replace('city', r'-', ' '))
        .withColumn('city', F.regexp_replace('city', r'[0-9]{1,}', ''))
        .withColumn('country', null_string)
        .withColumn('state', null_string)
        .withColumn('city', F.trim(F.initcap('city')))
        .withColumn('state', F.trim(F.initcap('state')))
        .withColumn('country', F.trim(F.initcap('country')))
        .drop('address')
    )



def parse_connection_number(dataframe,column_type=ConnectionNumbers.telephone.name):
    """Normalise a phone-like column into dash-separated digit groups.

    Parentheses are removed and hyphens become spaces; the value is then split
    on spaces. A leading ``+1`` group is discarded, any other leading group
    only loses its ``+`` characters, and the groups are joined with ``-``.
    Null values stay null.

    Args:
        dataframe: Frame containing the column to normalise.
        column_type: Name of the column to rewrite in place.
    """

    def join_connection_number(number):
        if number is None:
            return None
        groups = number.split(' ')
        leading, remainder = groups[0], groups[1:]
        if leading != '+1':
            # Keep the leading group, minus any '+' characters.
            remainder = [leading.replace('+','')] + remainder
        return "-".join(remainder)

    udf_join_connection_number = udf(join_connection_number)

    return (
        dataframe
        .withColumn(column_type, F.regexp_replace(column_type,'\\(',''))
        .withColumn(column_type, F.regexp_replace(column_type,'\\)',''))
        .withColumn(column_type, F.regexp_replace(column_type,'-',' '))
        .withColumn(column_type, udf_join_connection_number(F.col(column_type)))
    )
    
    
def parse_company_name(dataframe):
    """Normalise ``company_name`` for matching.

    The name is lower-cased, the abbreviations ``inc``/``inc.``, ``corp``/
    ``corp.`` and ``co``/``co.`` are expanded, parenthesised fragments are
    removed, runs of spaces are collapsed to one and the result is trimmed.

    Args:
        dataframe: Frame with a string column ``company_name``.

    Returns:
        The frame with ``company_name`` rewritten in place.
    """
    name = 'company_name'
    dataframe = dataframe.withColumn(name, F.lower(F.col(name)))
    # A trailing space lets the ' inc ' / ' corp ' / ' co ' patterns also match
    # an abbreviation at the very end of the name.
    dataframe = dataframe.withColumn(name, F.concat(F.col(name), F.lit(' ')))
    dataframe = dataframe.withColumn(name, F.regexp_replace(name, ' inc[.]| inc ', ' incorporated '))
    dataframe = dataframe.withColumn(name, F.regexp_replace(name, ' corp[.]| corp ', ' corporation '))
    dataframe = dataframe.withColumn(name, F.regexp_replace(name, ' co[.] | co ', ' company '))
    # Strip parentheticals BEFORE collapsing whitespace: removing "(…)" from the
    # middle of a name leaves two adjacent spaces that must still be collapsed.
    dataframe = dataframe.withColumn(name, F.regexp_replace(name, r'\([^)]*\)', ''))
    # Collapse any run of two or more spaces, not just exact pairs.
    dataframe = dataframe.withColumn(name, F.regexp_replace(name, ' {2,}', ' '))
    dataframe = dataframe.withColumn(name, F.trim(F.col(name)))
    return dataframe

In [6]:
# Run the parsed stage. Only `spark` is passed explicitly; `raw` and `parsed_bi` are
# presumably supplied by the @transform decorator (TODO confirm against skainet_spark).
parsed_bi_comp_data(spark)

cols: 11 rows: 6612
Business Insider - Company Details
cols: 11 rows: 6612
root
 |-- company_name_bi: string (nullable = true)
 |-- description_bi: string (nullable = true)
 |-- telephone_bi: string (nullable = true)
 |-- ticker_symbol_bi: string (nullable = true)
 |-- website_link_bi: string (nullable = true)
 |-- shareholders_name_bi: string (nullable = true)
 |-- percentage_bi: double (nullable = true)
 |-- city_bi: string (nullable = true)
 |-- country_bi: string (nullable = true)
 |-- state_bi: string (nullable = true)
 |-- num_employees_bi: integer (nullable = true)

+---------------+--------------------+------------+----------------+--------------------+--------------------+-------------+---------+----------+--------+----------------+
|company_name_bi|      description_bi|telephone_bi|ticker_symbol_bi|     website_link_bi|shareholders_name_bi|percentage_bi|  city_bi|country_bi|state_bi|num_employees_bi|
+---------------+--------------------+------------+----------------+--------

## Business Insider Company Details - Clean

In [7]:


@transform(spark,
    parsed = Input('/bi_comp_details/parsed/bi_comp_details_parsed.parquet', spark),
    metadata = Metadata('/bi_comp_details/clean/bi_comp_details_metadata_clean.csv', spark),
    clean = Output('/bi_comp_details/clean/bi_comp_details_clean.parquet'),
    clean_exception = Output('/bi_comp_details/exception/bi_comp_details_clean_exception.parquet')
)
def bi_comp_details_clean(spark, parsed, metadata, clean, clean_exception):
    """Validate the parsed Business Insider frame and write the clean and exception outputs.

    Prints the field names from the clean metadata schema, the dimensions and
    schema of the parsed frame, then runs ``ValidatedPipeline.validate()``,
    shows the first 10 validated rows and writes to both output targets.

    Args:
        spark: Active SparkSession (not used directly in the body).
        parsed: Input frame wrapped by ``Pipeline``.
        metadata: Callable returning the schema; also handed to ``ValidatedPipeline``.
        clean: Output target passed first to ``ValidatedPipeline.write``.
        clean_exception: Output target passed second to ``ValidatedPipeline.write``
            (presumably receives the rows that failed validation; TODO confirm).
    """
    clean_schema = metadata()
    print(clean_schema.fieldNames())

    parsed_pipeline = Pipeline(parsed).show_dimensions()
    parsed_pipeline.dataframe.printSchema()

    validated = ValidatedPipeline(parsed_pipeline, metadata).validate()
    validated.validated_dataframe.select('*').show(10)
    validated.write(clean, clean_exception)
    




In [8]:
# Run the clean stage. Only `spark` is passed explicitly; the remaining arguments are
# presumably supplied by the @transform decorator (TODO confirm against skainet_spark).
bi_comp_details_clean(spark)

['company_name_bi', 'ticker_symbol_bi', 'description_bi', 'telephone_bi', 'website_link_bi', 'shareholders_name_bi', 'percentage_bi', 'city_bi', 'country_bi', 'state_bi', 'num_employees_bi']
cols: 11 rows: 6612
root
 |-- company_name_bi: string (nullable = true)
 |-- description_bi: string (nullable = true)
 |-- telephone_bi: string (nullable = true)
 |-- ticker_symbol_bi: string (nullable = true)
 |-- website_link_bi: string (nullable = true)
 |-- shareholders_name_bi: string (nullable = true)
 |-- percentage_bi: double (nullable = true)
 |-- city_bi: string (nullable = true)
 |-- country_bi: string (nullable = true)
 |-- state_bi: string (nullable = true)
 |-- num_employees_bi: integer (nullable = true)

Validated count: 6612
Exception count: 0
+---------------+--------------------+------------+----------------+--------------------+--------------------+-------------+---------+----------+--------+----------------+
|company_name_bi|      description_bi|telephone_bi|ticker_symbol_bi|   