In [1]:


import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
''
sc = pyspark.SparkContext(appName="yahoo_comp_details_raw_parsed_cleaned")
spark = SparkSession(sc)


In [2]:
from skainet_spark import Pipeline, transform, Input, Output, Metadata,ValidatedPipeline,assign_shortcuts
from pyspark.sql.types import StructField, StructType, StringType, LongType,DoubleType,ArrayType,FloatType, BooleanType,IntegerType
import pyspark.sql.functions as F
from pyspark.sql.functions import udf


## Yahoo Comp Details - RAW

In [3]:
@transform(spark,
    raw = Output('/yahoo_comp_details/raw/yahoo_comp_details_raw.parquet'),
    metadata = Metadata('/yahoo_comp_details/raw/yahoo_comp_details_metadata_raw.csv', spark)
)
def raw_yahoo_comp_data(spark,raw,metadata):
    schema=metadata()
    df = (spark
          .read
          .format('json')
          .option('header','true')
          .schema(schema)
          .load('input/yahoo_comp_details/*.json')
         )
    pipeline=Pipeline(df)
    pipeline.show_dimensions()
    pipeline.dataframe.show(10)
    pipeline.write(raw)


    
@transform(spark,
    raw = Output('/state_codes/raw/state_codes_raw.parquet'),
    metadata = Metadata('/state_codes/raw/state_codes_metadata_raw.csv', spark)
)
def raw_state_codes(spark,raw,metadata):
    schema=metadata()
    df = (spark
          .read
          .format('csv')
          .option('header','true')
          .schema(schema)
          .load('input/state_codes.csv')
         )
    pipeline=Pipeline(df)
    pipeline.show_dimensions()
    pipeline.dataframe.show(10)
    pipeline.write(raw)

In [4]:
raw_yahoo_comp_data(spark)
raw_state_codes(spark)

cols: 9 rows: 441
+------+--------------------+--------------------+------------+--------------------+--------------------+------------------+------------+--------------------+
|ticker|        company_name|             address|   telephone|         website_url|            industry|            sector|no_employees|         description|
+------+--------------------+--------------------+------------+--------------------+--------------------+------------------+------------+--------------------+
|   blk|     BlackRock, Inc.|55 East 52nd Stre...|212-810-5300|http://www.blackr...|    Asset Management|Financial Services|      16,300|BlackRock, Inc. i...|
|  jbht|J.B. Hunt Transpo...|615 J.B. Hunt Cor...|479 820 0000|http://www.jbhunt...|Integrated Freigh...|       Industrials|      29,056|J.B. Hunt Transpo...|
|  adbe|          Adobe Inc.|345 Park Avenue~S...|408-536-6000|http://www.adobe.com|Software—Infrastr...|        Technology|      22,634|Adobe Inc. operat...|
|   ibm|International Bus...

## Yahoo Comp Details - PARSED

In [5]:

import json
import enum


@transform(spark,
    raw_yh = Input('/yahoo_comp_details/raw/yahoo_comp_details_raw.parquet',spark),
    raw_state_codes = Input('/state_codes/raw/state_codes_raw.parquet',spark),
    parsed_yh = Output('/yahoo_comp_details/parsed/yahoo_comp_details_parsed.parquet'),
)
def parsed_yahoo_comp_data(spark,raw_yh,parsed_yh,raw_state_codes):
    cols_to_rename = {
        'ticker':'ticker_symbol',
        'website_url':'website_link',
        'no_employees':'num_employees'
    }
    

    SHORTCUT='_yh'
    pipel_yh = Pipeline(raw_yh)
    
    pipel_yh = (pipel_yh
                .rename_columns(cols_to_rename)
                .transform(parse_number)
                .transform(parse_connection_number)
                .transform(parse_company_name)
                .transform(parse_ticker)
                .transform(parse_address,raw_state_codes())
                .select([
                    'ticker_symbol',
                    'company_name',
                    'telephone',
                    'country',
                    'state',
                    'city',
                    'sector',
                    'industry',
                    'num_employees',
                    'description',
                    'website_link'

                
                ])
                .transform(assign_shortcuts,SHORTCUT)
                .show_dimensions()
                
               )
    

    print('Yahoo - Company Details')
    pipel_yh.show_dimensions()
    pipel_yh.dataframe.printSchema()
    pipel_yh.dataframe.show(10)
    

    pipel_yh.write(parsed_yh)
    
    


    
    
def parse_ticker(dataframe):
    dataframe = dataframe.withColumn('ticker_symbol', F.upper(F.col('ticker_symbol')))
    return dataframe

def parse_number(dataframe):
    dataframe = dataframe.withColumn('num_employees',
                                     F.regexp_replace(F.col('num_employees'),',','').cast(IntegerType())
                                    )
    return dataframe

def parse_connection_number(dataframe):
    dataframe = (dataframe
                 .withColumn('telephone',
                             F.when(F.col('telephone').contains('-'),
                                   F.col('telephone')).
                            otherwise(
                                F.concat_ws("-",F.split(F.col('telephone'),' '))
                            
                            ))
                 .withColumn('telephone',F.trim(F.col('telephone')))
                
                )
    return dataframe





def parse_address(dataframe,state_codes=None):
    default_usa_name = 'United States Of America'
    dataframe = (dataframe
                 .withColumn('address_parts',F.reverse(F.split(F.col('address'),'~')))
                 .withColumn('country',F.col('address_parts')[0])
                 .withColumn('city_state',F.col('address_parts')[1])
                 .withColumn('city',F.split(F.col('city_state'),',')[0])
                 .withColumn('state', F.trim(F.split(F.col('city_state'),',')[1]))
                 .withColumn('state', F.split(F.col('state'),' ')[0])
                 .drop('city_state','address_parts',)
                )
    ## take only first part from city in country which are not United States
    dataframe = (dataframe.withColumn('city',
                                     F.when(F.col('country') != F.lit('United States'),
                                           F.split(F.col('city'),' ')[0]
                                           ).
                                      otherwise(F.col('city'))
                                     )
                )
    join_expression = dataframe['state'] == state_codes['Code']
    dataframe = dataframe.join(state_codes,join_expression).withColumn('state',F.col('State_name')).drop('Abbrev','Code','State_name')
    dataframe = (dataframe.withColumn('city',F.trim(F.initcap('city')))
                .withColumn('state',F.trim(F.initcap('state')))
                .withColumn('country',F.trim(F.initcap('country')))
                .withColumn('country',F.when(F.col('country') == F.lit('United States'),
                                            F.lit(default_usa_name))
                            .otherwise(F.col('country'))))
    return dataframe


def parse_company_name(dataframe):
    dataframe = dataframe.withColumn('company_name',F.lower(F.col('company_name')))
    dataframe = dataframe.withColumn('company_name',F.concat(F.col('company_name'),F.lit(' ')))
    dataframe = dataframe.withColumn('company_name',F.regexp_replace('company_name',' inc[.]| inc ',' incorporated '))
    dataframe = dataframe.withColumn('company_name',F.regexp_replace('company_name',' corp[.]',' corporation '))
    dataframe = dataframe.withColumn('company_name',F.regexp_replace('company_name',' co[.]',' company'))
    dataframe = dataframe.withColumn('company_name',F.regexp_replace('company_name',',',''))
    dataframe = dataframe.withColumn('company_name',F.regexp_replace('company_name','\([^)]*\)',''))
    dataframe = dataframe.withColumn('company_name',F.regexp_replace('company_name','  ',' '))
    dataframe = dataframe.withColumn('company_name',F.trim(F.col('company_name')))
    return dataframe
    
    

In [6]:
parsed_yahoo_comp_data(spark)

cols: 11 rows: 423
Yahoo - Company Details
cols: 11 rows: 423
root
 |-- ticker_symbol_yh: string (nullable = true)
 |-- company_name_yh: string (nullable = true)
 |-- telephone_yh: string (nullable = true)
 |-- country_yh: string (nullable = true)
 |-- state_yh: string (nullable = true)
 |-- city_yh: string (nullable = true)
 |-- sector_yh: string (nullable = true)
 |-- industry_yh: string (nullable = true)
 |-- num_employees_yh: integer (nullable = true)
 |-- description_yh: string (nullable = true)
 |-- website_link_yh: string (nullable = true)

+----------------+--------------------+------------+--------------------+-----------+--------------+------------------+--------------------+----------------+--------------------+--------------------+
|ticker_symbol_yh|     company_name_yh|telephone_yh|          country_yh|   state_yh|       city_yh|         sector_yh|         industry_yh|num_employees_yh|      description_yh|     website_link_yh|
+----------------+--------------------+-------

## Yahoo - Company Details Cleaned

In [7]:


@transform(spark,
    parsed = Input('/yahoo_comp_details/parsed/yahoo_comp_details_parsed.parquet', spark),
    metadata = Metadata('/yahoo_comp_details/clean/yahoo_comp_details_metadata_clean.csv', spark),
    clean = Output('/yahoo_comp_details/clean/yahoo_comp_details_clean.parquet'),
    clean_exception = Output('/yahoo_comp_details/exception/yahoo_comp_details_clean_exception.parquet')
)
def yahoo_comp_details_clean(spark, parsed, metadata, clean, clean_exception):
    
    schema = metadata()
    
    print(schema.fieldNames())

    pipe = Pipeline(parsed)
    pipe = (pipe
            .show_dimensions()

           )
    
   


    validated_pipe = ValidatedPipeline(pipe, metadata)
    validated_pipe = (validated_pipe
                      #.add_validation(F.col('home_team_name') == 'Arsenal', 'column is null')
                      .validate()
                     )
    

    validated_pipe.write(clean, clean_exception)

In [8]:
yahoo_comp_details_clean(spark)

['ticker_symbol_yh', 'company_name_yh', 'telephone_yh', 'country_yh', 'state_yh', 'city_yh', 'sector_yh', 'industry_yh', 'num_employees_yh', 'description_yh', 'website_link_yh']
cols: 11 rows: 423
Validated count: 423
Exception count: 0
