In [1]:


import findspark
findspark.init()  # run before the pyspark imports below, as in the original cell order

import numpy as np
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import (
    StructField, StructType, StringType, LongType, DoubleType,
    ArrayType, FloatType, BooleanType,
)

# getOrCreate() makes this cell safe to re-run: constructing a second
# SparkContext in a live kernel raises, whereas the builder reuses the
# existing session/context when one is already active.
spark = SparkSession.builder.appName("company_details_ddl").getOrCreate()
sc = spark.sparkContext

In [2]:
from skainet_spark import Pipeline, transform, Input, Output, Metadata,ValidatedPipeline,print_statistics

In [3]:
def remove_null_records(dataframe):
    """Return ``dataframe`` without the rows dropped by ``dataframe.na.drop()``.

    Used because some records carry no value for a column, e.g. UK/Swiss
    companies have no state.
    """
    return dataframe.na.drop()

# Company Details - DDL
Creates the files for Neo4j based on the first source of information, Company Details, which was later divided into distinct datasets.

### Company

In [4]:
@transform(spark,
    comp_details_ontology = Input('/company_details/ontology/company_details_ontology.parquet', spark),

    metadata = Metadata('/company_details/ddl/company_details_metadata_ddl.csv',spark),
    ddl_csv = Output('/company_details/ddl/comp_details_ddl.csv'),
    ddl_csv_exception = Output('/company_details/exception/comp_details_ddl_exception.csv')
)
def company_ddl(spark, comp_details_ontology, metadata, ddl_csv, ddl_csv_exception):
    """Build the Company file from the company-details ontology.

    Renames ``ticker_symbol`` to ``company_id``, keeps the six company
    columns, de-duplicates, drops rows via ``remove_null_records``, prints
    a short summary, then validates and writes the result.

    Parameters
    ----------
    spark : the Spark session handed in by the caller.
    comp_details_ontology : input wrapped in a ``Pipeline``.
    metadata : callable; ``metadata()`` yields the schema used for validation.
    ddl_csv, ddl_csv_exception : output targets passed to ``write_csv``
        (presumably validated rows and rejected rows respectively - confirm
        against skainet_spark).
    """
    pipe_comp_details = Pipeline(comp_details_ontology)

    # NOTE(review): remove_null_records calls na.drop() on all six selected
    # columns, so a company missing e.g. only `telephone` is dropped here,
    # while files built from fewer columns may still reference its
    # company_id - confirm this is intended.
    pipe_comp_details = (pipe_comp_details
                         .rename_columns({
                             "ticker_symbol": 'company_id'
                         })
                         .select([
                             'company_id',
                             'company_name',
                             'website_link',
                             'num_employees',
                             'description',
                             'telephone'
                         ])
                         .distinct()
                         .transform(remove_null_records)
                        )
    pipe_comp_details.show_dimensions()
    pipe_comp_details.dataframe.printSchema()
    pipe_comp_details.dataframe.show(5)

    schema = metadata()
    validated_pipe = ValidatedPipeline(pipe_comp_details, schema).validate()
    validated_pipe.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [5]:
# Run the Company transform; it prints dimensions, schema and 5 sample rows.
company_ddl(spark)

cols: 6 rows: 418
root
 |-- company_id: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- website_link: string (nullable = true)
 |-- num_employees: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- telephone: string (nullable = true)

+----------+--------------------+--------------------+-------------+--------------------+------------+
|company_id|        company_name|        website_link|num_employees|         description|   telephone|
+----------+--------------------+--------------------+-------------+--------------------+------------+
|       KMI|kinder morgan inc...|http://www.kinder...|        11086|We are one of the...|713-369-9000|
|       NEE|nextera energy in...|http://www.nexter...|        14800|NEE is one of the...|561-694-4000|
|      FITB| fifth third bancorp|   http://www.53.com|        20182|Fifth Third Banco...|513-921-5505|
|       BEN|franklin resource...|http://www.frankl...|         9600|Franklin Resource...|601-939-2

## Sector 

In [6]:
@transform(
    spark,
    comp_details_ontology=Input('/company_details/ontology/company_details_ontology.parquet', spark),
    metadata=Metadata('/company_details/ddl/sector_metadata_ddl.csv', spark),
    ddl_csv=Output('/company_details/ddl/sector_ddl.csv'),
    ddl_csv_exception=Output('/company_details/exception/sector_ddl_exception.csv'),
)
def sector_ddl(spark, comp_details_ontology, metadata, ddl_csv, ddl_csv_exception):
    """Write the distinct, non-null ``sector_id`` values (renamed from ``sector``).

    The result is validated against the schema returned by ``metadata()``
    and both output targets are handed to ``write_csv``.
    """
    sectors = Pipeline(comp_details_ontology)
    sectors = sectors.rename_columns({'sector': 'sector_id'})
    sectors = sectors.select(['sector_id'])
    sectors = sectors.distinct()
    sectors = sectors.transform(remove_null_records)

    # Quick visual check of what is about to be validated.
    sectors.show_dimensions()
    sectors.dataframe.printSchema()

    checked = ValidatedPipeline(sectors, metadata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [7]:
# Run the Sector transform; it prints the result dimensions and schema.
sector_ddl(spark)

cols: 1 rows: 16
root
 |-- sector_id: string (nullable = true)

Validated count: 16
Exception count: 0


## Industry

In [8]:
@transform(
    spark,
    comp_details_ontology=Input('/company_details/ontology/company_details_ontology.parquet', spark),
    metadata=Metadata('/company_details/ddl/industry_metadata_ddl.csv', spark),
    ddl_csv=Output('/company_details/ddl/industry_ddl.csv'),
    ddl_csv_exception=Output('/company_details/exception/industry_ddl_exception.csv'),
)
def industry_ddl(spark, comp_details_ontology, metadata, ddl_csv, ddl_csv_exception):
    """Write the distinct, non-null ``industry_id`` values (renamed from ``industry``).

    The result is validated against the schema returned by ``metadata()``
    and both output targets are handed to ``write_csv``.
    """
    industries = Pipeline(comp_details_ontology)
    industries = industries.rename_columns({'industry': 'industry_id'})
    industries = industries.select(['industry_id'])
    industries = industries.distinct()
    industries = industries.transform(remove_null_records)

    # Quick visual check of what is about to be validated.
    industries.show_dimensions()
    industries.dataframe.printSchema()

    checked = ValidatedPipeline(industries, metadata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [9]:
# Run the Industry transform; it prints the result dimensions and schema.
industry_ddl(spark)

cols: 1 rows: 118
root
 |-- industry_id: string (nullable = true)

Validated count: 118
Exception count: 0


## City

In [10]:
@transform(
    spark,
    comp_details_ontology=Input('/company_details/ontology/company_details_ontology.parquet', spark),
    metadata=Metadata('/company_details/ddl/city_metadata_ddl.csv', spark),
    ddl_csv=Output('/company_details/ddl/city_ddl.csv'),
    ddl_csv_exception=Output('/company_details/exception/city_ddl_exception.csv'),
)
def city_ddl(spark, comp_details_ontology, metadata, ddl_csv, ddl_csv_exception):
    """Write the distinct, non-null ``city_id`` values (renamed from ``city``).

    The result is validated against the schema returned by ``metadata()``
    and both output targets are handed to ``write_csv``.
    """
    cities = Pipeline(comp_details_ontology)
    cities = cities.rename_columns({'city': 'city_id'})
    cities = cities.select(['city_id'])
    cities = cities.distinct()
    cities = cities.transform(remove_null_records)

    # Quick visual check of what is about to be validated.
    cities.show_dimensions()
    cities.dataframe.printSchema()

    checked = ValidatedPipeline(cities, metadata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [11]:
# Run the City transform; it prints the result dimensions and schema.
city_ddl(spark)

cols: 1 rows: 209
root
 |-- city_id: string (nullable = true)

Validated count: 209
Exception count: 0


## State

In [12]:
@transform(
    spark,
    comp_details_ontology=Input('/company_details/ontology/company_details_ontology.parquet', spark),
    metadata=Metadata('/company_details/ddl/state_metadata_ddl.csv', spark),
    ddl_csv=Output('/company_details/ddl/state_ddl.csv'),
    ddl_csv_exception=Output('/company_details/exception/state_ddl_exception.csv'),
)
def state_ddl(spark, comp_details_ontology, metadata, ddl_csv, ddl_csv_exception):
    """Write the distinct, non-null ``state_id`` values (renamed from ``state``).

    The result is validated against the schema returned by ``metadata()``
    and both output targets are handed to ``write_csv``.
    """
    states = Pipeline(comp_details_ontology)
    states = states.rename_columns({'state': 'state_id'})
    states = states.select(['state_id'])
    states = states.distinct()
    states = states.transform(remove_null_records)

    # Quick visual check of what is about to be validated.
    states.show_dimensions()
    states.dataframe.printSchema()

    checked = ValidatedPipeline(states, metadata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)

def remove_null_records(dataframe):
    """Return ``dataframe`` without the rows dropped by ``dataframe.na.drop()``.

    Needed because some records have no state (UK/Swiss companies).
    NOTE(review): this re-defines, with an identical body, the helper that
    is already defined near the top of the notebook.
    """
    return dataframe.na.drop()
    

In [13]:
# Run the State transform; it prints the result dimensions and schema.
state_ddl(spark)

cols: 1 rows: 39
root
 |-- state_id: string (nullable = true)

Validated count: 39
Exception count: 0


## Country

In [14]:
@transform(
    spark,
    comp_details_ontology=Input('/company_details/ontology/company_details_ontology.parquet', spark),
    metadata=Metadata('/company_details/ddl/country_metadata_ddl.csv', spark),
    ddl_csv=Output('/company_details/ddl/country_ddl.csv'),
    ddl_csv_exception=Output('/company_details/exception/country_ddl_exception.csv'),
)
def country_ddl(spark, comp_details_ontology, metadata, ddl_csv, ddl_csv_exception):
    """Write the distinct, non-null ``country_id`` values (renamed from ``country``).

    The result is validated against the schema returned by ``metadata()``
    and both output targets are handed to ``write_csv``.
    """
    countries = Pipeline(comp_details_ontology)
    countries = countries.rename_columns({'country': 'country_id'})
    countries = countries.select(['country_id'])
    countries = countries.distinct()
    countries = countries.transform(remove_null_records)

    # Quick visual check: size, up to 10 rows, then the schema.
    countries.show_dimensions()
    countries.dataframe.show(10)
    countries.dataframe.printSchema()

    checked = ValidatedPipeline(countries, metadata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)

def remove_null_records(dataframe):
    """Return ``dataframe`` without the rows dropped by ``dataframe.na.drop()``.

    Needed because some records have no state (UK/Swiss companies).
    NOTE(review): this re-defines, with an identical body, the helper that
    is already defined near the top of the notebook.
    """
    return dataframe.na.drop()
    

In [15]:
# Run the Country transform; it prints dimensions, up to 10 rows and the schema.
country_ddl(spark)

cols: 1 rows: 5
+--------------------+
|          country_id|
+--------------------+
|             Ireland|
|United States Of ...|
|         Switzerland|
|                Hm12|
|      United Kingdom|
+--------------------+

root
 |-- country_id: string (nullable = true)

Validated count: 5
Exception count: 0


## Shareholders

In [16]:
@transform(spark,
    comp_details_ontology = Input('/company_details/ontology/company_details_ontology.parquet', spark),

    metadata = Metadata('/company_details/ddl/shareholders_metadata_ddl.csv',spark),
    ddl_csv = Output('/company_details/ddl/shareholders_ddl.csv'),
    ddl_csv_exception = Output('/company_details/exception/shareholders_ddl_exception.csv')
)
def shareholder_ddl(spark,comp_details_ontology, metadata, ddl_csv, ddl_csv_exception):
    """Write the distinct, non-null ``shareholder_id`` values.

    ``shareholders_name`` is renamed to ``shareholder_id``; the single-column
    result is de-duplicated, stripped of null rows, validated against the
    schema returned by ``metadata()`` and handed to ``write_csv`` together
    with both output targets.
    """
    pipe_share = Pipeline(comp_details_ontology)
    # The former debug `print(metadata)` was removed: it only emitted the
    # object's default repr and carried no information.
    pipe_share = (pipe_share
                         .rename_columns({
                             "shareholders_name":'shareholder_id'
                         })
                         .select([
                             'shareholder_id',
                         ])
                         .distinct()
                         .transform(remove_null_records)
                        )
    pipe_share.show_dimensions()
    pipe_share.dataframe.show(10,0)
    pipe_share.dataframe.printSchema()

    schema = metadata()
    validated_pipe = ValidatedPipeline(pipe_share, schema).validate()
    validated_pipe.write_csv(ddl_csv,ddl_csv_exception)

def remove_null_records(dataframe):
    """Return ``dataframe`` without the rows dropped by ``dataframe.na.drop()``.

    Needed because some records have no state (UK/Swiss companies).
    NOTE(review): this re-defines, with an identical body, the helper that
    is already defined near the top of the notebook.
    """
    return dataframe.na.drop()
    

In [17]:
# Run the Shareholder transform; it prints dimensions, up to 10 untruncated rows and the schema.
shareholder_ddl(spark)

<skainet_spark.transform.Metadata object at 0x7fdbb4003c10>
cols: 1 rows: 478
+-------------------------------------------------+
|shareholder_id                                   |
+-------------------------------------------------+
|Kohlberg Kravis Roberts & Co. LP                 |
|Vanguard Windsor Funds - Vanguard Windsor II Fund|
|FIL Investments International                    |
|Mantle Ridge LP                                  |
|First Eagle Investment Management LLC            |
|Mairs & Power, Inc.                              |
|Fidelity Growth Company Fund                     |
|MFS International Intrinsic Value Fund           |
|ValueAct Capital Management LP                   |
|Elaine P. Wynn                                   |
+-------------------------------------------------+
only showing top 10 rows

root
 |-- shareholder_id: string (nullable = true)

Validated count: 478
Exception count: 0


## company_IN_INDUSTRY_industry

In [18]:
@transform(
    spark,
    comp_details_ontology=Input('/company_details/ontology/company_details_ontology.parquet', spark),
    metadata=Metadata('/company_details/ddl/company_IN_INDUSTRY_industry_metadata_ddl.csv', spark),
    ddl_csv=Output('/company_details/ddl/company_IN_INDUSTRY_industry_ddl.csv'),
    ddl_csv_exception=Output('/company_details/exception/company_IN_INDUSTRY_industry_ddl_exception.csv'),
)
def company_IN_INDUSTRY_industry_ddl(spark, comp_details_ontology, metadata, ddl_csv, ddl_csv_exception):
    """Write the distinct, null-free (``company_id``, ``industry_id``) pairs.

    ``ticker_symbol`` and ``industry`` are renamed to the two id columns.
    The result is validated against the schema returned by ``metadata()``
    and both output targets are handed to ``write_csv``.
    """
    company_industry = Pipeline(comp_details_ontology)
    company_industry = company_industry.rename_columns({
        'ticker_symbol': 'company_id',
        'industry': 'industry_id',
    })
    company_industry = company_industry.select(['company_id', 'industry_id'])
    company_industry = company_industry.distinct()
    company_industry = company_industry.transform(remove_null_records)

    # Quick visual check: size, schema, then up to 10 untruncated rows.
    company_industry.show_dimensions()
    company_industry.dataframe.printSchema()
    company_industry.dataframe.show(10, 0)

    checked = ValidatedPipeline(company_industry, metadata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [19]:
# Run the company/industry transform; it prints dimensions, schema and up to 10 rows.
company_IN_INDUSTRY_industry_ddl(spark)

cols: 2 rows: 436
root
 |-- company_id: string (nullable = true)
 |-- industry_id: string (nullable = true)

+----------+-----------------------------------+
|company_id|industry_id                        |
+----------+-----------------------------------+
|NTRS      |Asset Management                   |
|WELL      |REIT—Healthcare Facilities         |
|ALL       |Insurance—Property & Casualty      |
|PM        |Tobacco                            |
|RTX       |Aerospace & Defense                |
|HRL       |Packaged Foods                     |
|CAT       |Farm & Heavy Construction Machinery|
|LOW       |Home Improvement Retail            |
|ANSS      |Software—Application               |
|RCL       |Travel Services                    |
+----------+-----------------------------------+
only showing top 10 rows

Validated count: 436
Exception count: 0


## industry_MEMBER_OF_sector

In [20]:
@transform(
    spark,
    comp_details_ontology=Input('/company_details/ontology/company_details_ontology.parquet', spark),
    metadata=Metadata('/company_details/ddl/industry_MEMBER_OF_sector_metadata_ddl.csv', spark),
    ddl_csv=Output('/company_details/ddl/industry_MEMBER_OF_sector_ddl.csv'),
    ddl_csv_exception=Output('/company_details/exception/industry_MEMBER_OF_sector_ddl_exception.csv'),
)
def industry_MEMBER_OF_sector_ddl(spark, comp_details_ontology, metadata, ddl_csv, ddl_csv_exception):
    """Write the distinct, null-free (``industry_id``, ``sector_id``) pairs.

    ``sector`` and ``industry`` are renamed to the two id columns.
    The result is validated against the schema returned by ``metadata()``
    and both output targets are handed to ``write_csv``.
    """
    industry_sector = Pipeline(comp_details_ontology)
    industry_sector = industry_sector.rename_columns({
        'sector': 'sector_id',
        'industry': 'industry_id',
    })
    industry_sector = industry_sector.select(['industry_id', 'sector_id'])
    industry_sector = industry_sector.distinct()
    industry_sector = industry_sector.transform(remove_null_records)

    # Quick visual check: size, schema, then up to 10 untruncated rows.
    industry_sector.show_dimensions()
    industry_sector.dataframe.printSchema()
    industry_sector.dataframe.show(10, 0)

    checked = ValidatedPipeline(industry_sector, metadata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [21]:
# Run the industry/sector transform; it prints dimensions, schema and up to 10 rows.
industry_MEMBER_OF_sector_ddl(spark)

cols: 2 rows: 119
root
 |-- industry_id: string (nullable = true)
 |-- sector_id: string (nullable = true)

+-------------------------------------+----------------------+
|industry_id                          |sector_id             |
+-------------------------------------+----------------------+
|Industrial Distribution              |Industrials           |
|Communication Equipment              |Technology            |
|Staffing & Employment Services       |Industrials           |
|Specialty Chemicals                  |Basic Materials       |
|Telecom Services                     |Communication Services|
|Discount Stores                      |Consumer Defensive    |
|Utilities—Regulated Electric         |Utilities             |
|Medical Devices                      |Healthcare            |
|Utilities—Independent Power Producers|Utilities             |
|Medical Care Facilities              |Healthcare            |
+-------------------------------------+----------------------+
only showi

## company_LOCALIZED_IN_city

In [22]:
@transform(
    spark,
    comp_details_ontology=Input('/company_details/ontology/company_details_ontology.parquet', spark),
    metadata=Metadata('/company_details/ddl/company_LOCALIZED_IN_city_metadata_ddl.csv', spark),
    ddl_csv=Output('/company_details/ddl/company_LOCALIZED_IN_city_ddl.csv'),
    ddl_csv_exception=Output('/company_details/exception/company_LOCALIZED_IN_city_ddl_exception.csv'),
)
def company_LOCALIZED_IN_city_ddl(spark, comp_details_ontology, metadata, ddl_csv, ddl_csv_exception):
    """Write the distinct, null-free (``company_id``, ``city_id``) pairs.

    ``ticker_symbol`` and ``city`` are renamed to the two id columns.
    The result is validated against the schema returned by ``metadata()``
    and both output targets are handed to ``write_csv``.
    """
    company_city = Pipeline(comp_details_ontology)
    company_city = company_city.rename_columns({
        'ticker_symbol': 'company_id',
        'city': 'city_id',
    })
    company_city = company_city.select(['company_id', 'city_id'])
    company_city = company_city.distinct()
    company_city = company_city.transform(remove_null_records)

    # Quick visual check: size, schema, then up to 10 untruncated rows.
    company_city.show_dimensions()
    company_city.dataframe.printSchema()
    company_city.dataframe.show(10, 0)

    checked = ValidatedPipeline(company_city, metadata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [23]:
# Run the company/city transform; it prints dimensions, schema and up to 10 rows.
company_LOCALIZED_IN_city_ddl(spark)

cols: 2 rows: 441
root
 |-- company_id: string (nullable = true)
 |-- city_id: string (nullable = true)

+----------+------------+
|company_id|city_id     |
+----------+------------+
|BSX       |Marlborough |
|PPL       |Allentown   |
|AMD       |Santa Clara |
|VRSK      |Jersey City |
|SBAC      |Boca Raton  |
|USB       |Minneapolis |
|PAYX      |Rochester   |
|AEP       |Columbus    |
|MDT       |Minneapolis |
|TEL       |Schaffhausen|
+----------+------------+
only showing top 10 rows

Validated count: 441
Exception count: 0


## city_IS_IN_state

In [24]:
@transform(
    spark,
    comp_details_ontology=Input('/company_details/ontology/company_details_ontology.parquet', spark),
    metadata=Metadata('/company_details/ddl/city_IS_IN_state_metadata_ddl.csv', spark),
    ddl_csv=Output('/company_details/ddl/city_IS_IN_state_ddl.csv'),
    ddl_csv_exception=Output('/company_details/exception/city_IS_IN_state_ddl_exception.csv'),
)
def city_IS_IN_state_ddl(spark, comp_details_ontology, metadata, ddl_csv, ddl_csv_exception):
    """Write the distinct, null-free (``city_id``, ``state_id``) pairs.

    ``city`` and ``state`` are renamed to the two id columns. Rows without
    a state are removed by ``remove_null_records``. The result is validated
    against the schema returned by ``metadata()`` and both output targets
    are handed to ``write_csv``.
    """
    city_state = Pipeline(comp_details_ontology)
    city_state = city_state.rename_columns({
        'city': 'city_id',
        'state': 'state_id',
    })
    city_state = city_state.select(['city_id', 'state_id'])
    city_state = city_state.distinct()
    city_state = city_state.transform(remove_null_records)

    # Quick visual check: size, schema, then up to 10 untruncated rows.
    city_state.show_dimensions()
    city_state.dataframe.printSchema()
    city_state.dataframe.show(10, 0)

    checked = ValidatedPipeline(city_state, metadata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    
    

In [25]:
# Run the city/state transform; it prints dimensions, schema and up to 10 rows.
city_IS_IN_state_ddl(spark)

cols: 2 rows: 208
root
 |-- city_id: string (nullable = true)
 |-- state_id: string (nullable = true)

+-----------------+-------------+
|city_id          |state_id     |
+-----------------+-------------+
|Cincinnati       |Ohio         |
|Pittsburgh       |Pennsylvania |
|Columbus         |Ohio         |
|Waltham          |Massachusetts|
|Newport Beach    |California   |
|Livonia          |Michigan     |
|Chattanooga      |Tennessee    |
|San Jose         |California   |
|Minneapolis      |Minnesota    |
|Greenwood Village|Colorado     |
+-----------------+-------------+
only showing top 10 rows

Validated count: 208
Exception count: 0


## city_IS_IN_country

In [26]:
@transform(spark,
comp_details_ontology = Input('/company_details/ontology/company_details_ontology.parquet', spark),

    metadata = Metadata('/company_details/ddl/city_IS_IN_country_metadata_ddl.csv',spark),
    ddl_csv = Output('/company_details/ddl/city_IS_IN_country_ddl.csv'),
    ddl_csv_exception = Output('/company_details/exception/city_IS_IN_country_ddl_exception.csv')
)
def city_IS_IN_country_ddl(spark,comp_details_ontology,metadata, ddl_csv,ddl_csv_exception):
    """Write (``city_id``, ``country_id``) pairs for cities that have no state.

    Only rows whose ``state`` is null are kept (via ``filter_city_in_state``).
    ``city`` and ``country`` are then renamed to the id columns, and the
    pairs are de-duplicated, stripped of null rows, validated against the
    schema returned by ``metadata()`` and handed to ``write_csv`` together
    with both output targets.
    """
    pipe_csc = Pipeline(comp_details_ontology)

    pipe_csc = (pipe_csc
                         # Filter on `state` FIRST, while the column still
                         # exists. Previously this ran after select() and
                         # distinct() had already projected `state` away,
                         # so it relied on Spark resolving a dropped column.
                         .transform(filter_city_in_state)
                         .rename_columns({
                             "city":'city_id',
                             "country":'country_id'
                         })
                         .select([
                             'city_id',
                             'country_id',
                         ])
                         .distinct()
                         .transform(remove_null_records)
                        )
    pipe_csc.show_dimensions()
    pipe_csc.dataframe.printSchema()
    pipe_csc.dataframe.show(10,0)

    schema = metadata()
    validated_pipe = ValidatedPipeline(pipe_csc, schema).validate()
    validated_pipe.write_csv(ddl_csv,ddl_csv_exception)
    
def filter_city_in_state(dataframe):
    """Keep only the rows whose ``state`` column is null.

    Despite the name, rows that *do* have a state are the ones removed.
    """
    state_is_missing = F.col('state').isNull()
    return dataframe.where(state_is_missing)

In [27]:
# Run the city/country transform; it prints dimensions, schema and up to 10 rows.
city_IS_IN_country_ddl(spark)

cols: 2 rows: 5
root
 |-- city_id: string (nullable = true)
 |-- country_id: string (nullable = true)

+-------------+--------------+
|city_id      |country_id    |
+-------------+--------------+
|Hertfordshire|United Kingdom|
|Dublin       |Hm12          |
|Schaffhausen |Switzerland   |
|Dublin       |Ireland       |
|London       |United Kingdom|
+-------------+--------------+

Validated count: 5
Exception count: 0


## state_PART_OF_country

In [28]:
@transform(
    spark,
    comp_details_ontology=Input('/company_details/ontology/company_details_ontology.parquet', spark),
    metadata=Metadata('/company_details/ddl/state_PART_OF_country_metadata_ddl.csv', spark),
    ddl_csv=Output('/company_details/ddl/state_PART_OF_country_ddl.csv'),
    ddl_csv_exception=Output('/company_details/exception/state_PART_OF_country_ddl_exception.csv'),
)
def state_PART_OF_country_ddl(spark, comp_details_ontology, metadata, ddl_csv, ddl_csv_exception):
    """Write the distinct, null-free (``state_id``, ``country_id``) pairs.

    ``state`` and ``country`` are renamed to the two id columns. Rows without
    a state are removed by ``remove_null_records``. The result is validated
    against the schema returned by ``metadata()`` and both output targets
    are handed to ``write_csv``.
    """
    state_country = Pipeline(comp_details_ontology)
    state_country = state_country.rename_columns({
        'state': 'state_id',
        'country': 'country_id',
    })
    state_country = state_country.select(['state_id', 'country_id'])
    state_country = state_country.distinct()
    state_country = state_country.transform(remove_null_records)

    # Quick visual check: size, schema, then up to 50 untruncated rows.
    state_country.show_dimensions()
    state_country.dataframe.printSchema()
    state_country.dataframe.show(50, 0)

    checked = ValidatedPipeline(state_country, metadata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    


In [29]:
# Run the state/country transform; it prints dimensions, schema and up to 50 rows.
state_PART_OF_country_ddl(spark)

cols: 2 rows: 39
root
 |-- state_id: string (nullable = true)
 |-- country_id: string (nullable = true)

+--------------------+------------------------+
|state_id            |country_id              |
+--------------------+------------------------+
|Ohio                |United States Of America|
|District Of Columbia|United States Of America|
|Rhode Island        |United States Of America|
|Minnesota           |United States Of America|
|Texas               |United States Of America|
|Pennsylvania        |United States Of America|
|Tennessee           |United States Of America|
|Missouri            |United States Of America|
|Massachusetts       |United States Of America|
|New York            |United States Of America|
|North Carolina      |United States Of America|
|Georgia             |United States Of America|
|Utah                |United States Of America|
|Indiana             |United States Of America|
|Washington          |United States Of America|
|Illinois            |United St

## shareholder_HAS_SHARES_IN_company

In [30]:
@transform(
    spark,
    comp_details_ontology=Input('/company_details/ontology/company_details_ontology.parquet', spark),
    metadata=Metadata('/company_details/ddl/shareholder_HAS_SHARES_IN_company_metadata_ddl.csv', spark),
    ddl_csv=Output('/company_details/ddl/shareholder_HAS_SHARES_IN_company_ddl.csv'),
    ddl_csv_exception=Output('/company_details/exception/shareholder_HAS_SHARES_IN_company_ddl_exception.csv'),
)
def shareholder_HAS_SHARES_IN_company_ddl(spark, comp_details_ontology, metadata, ddl_csv, ddl_csv_exception):
    """Write distinct, null-free (``shareholder_id``, ``company_id``, ``percentage``) rows.

    ``shareholders_name`` and ``ticker_symbol`` are renamed to the id
    columns; ``percentage`` is mapped onto itself. The result is validated
    against the schema returned by ``metadata()`` and both output targets
    are handed to ``write_csv``.
    """
    holdings = Pipeline(comp_details_ontology)
    holdings = holdings.rename_columns({
        'shareholders_name': 'shareholder_id',
        'ticker_symbol': 'company_id',
        'percentage': 'percentage',
    })
    holdings = holdings.select(['shareholder_id', 'company_id', 'percentage'])
    holdings = holdings.distinct()
    holdings = holdings.transform(remove_null_records)

    # Quick visual check: size, schema, then up to 50 untruncated rows.
    holdings.show_dimensions()
    holdings.dataframe.printSchema()
    holdings.dataframe.show(50, 0)

    checked = ValidatedPipeline(holdings, metadata()).validate()
    checked.write_csv(ddl_csv, ddl_csv_exception)
    


In [31]:
# Pass the session explicitly, matching every other *_ddl invocation in this notebook.
shareholder_HAS_SHARES_IN_company_ddl(spark)

cols: 3 rows: 6609
root
 |-- shareholder_id: string (nullable = true)
 |-- company_id: string (nullable = true)
 |-- percentage: double (nullable = true)

+-----------------------------------------------------------+----------+----------+
|shareholder_id                                             |company_id|percentage|
+-----------------------------------------------------------+----------+----------+
|T. Rowe Price Associates, Inc. (Investment Management)     |MET       |1.91      |
|Geode Capital Management LLC                               |BRK-B     |1.91      |
|Norges Bank (13F)                                          |PEP       |1.0       |
|Massachusetts Financial Services Co.                       |CMCSA     |2.97      |
|Hotchkis & Wiley Capital Management LLC                    |CMI       |1.6       |
|Vanguard 500 Index Fund                                    |GS        |1.89      |
|Capital Research & Management Co. (World Investors)        |NKE       |2.59      |
|T. R