In [None]:
# Import the MayStreet Data Python library - this is already provided for you inside Workbench.
import maystreet_data
# Import the well known Python Pandas library (https://pandas.pydata.org)
import pandas as pd


# SQL for a small sample of news items: a handful of columns from mt_news,
# restricted to dt='2023-05-10' and capped at 10 rows by the LIMIT clause.
sample_news_sql = """
    SELECT
        dt, guid, timestamps[1], data.headline, data.body, data.subjects
    FROM 
        "prod_lake.p_mst_data_lake".mt_news
    WHERE 
        dt='2023-05-10'
    LIMIT 10
"""

# Submit the query to the data lake. The result is an iterator that hands
# back one row at a time.
records_iter = maystreet_data.query(maystreet_data.DataSource.DATA_LAKE, sample_news_sql)

# Drain the iterator into a Pandas Data Frame so that all rows can be viewed together.
data = pd.DataFrame(records_iter)

# A bare expression on the last line of the cell is what Jupyter renders as the cell output.
data


In [None]:
# Import the MayStreet Data Python library - this is already provided for you inside Workbench.
import maystreet_data
# Import the well known Python Pandas library (https://pandas.pydata.org)
import pandas as pd

# Given a ticker identifier, in this case "COIN", find all the news items associated with it.
#
# How the query is put together:
#
# 1. news_plus_pids_and_rics_columns (the subquery in the outer FROM clause)
#    Selects every column of the mt_news table and adds two derived array columns, rics and pids.
#    The data column is a struct containing many items; the one used here is data.subjects,
#    an array of subject identifiers related to the news item.
#    There are several types of subjects, but only two matter for this query:
#      - subjects prefixed with "P:" are Refinitiv perm ids (PermIDs);
#      - subjects prefixed with "R:" are Reuters Instrument Codes (RICs).
#    rics / pids keep only the subjects carrying the matching prefix and strip that prefix off.
#    The prefix test is the anchored pattern '^R:' / '^P:'. Do not write '^R:*': that means
#    "R followed by zero or more colons" and would accept any subject that merely starts with R.
#
# 2. ticker_mapping_subquery (the innermost subquery of the WHERE clause)
#    Joins the mt_pid_to_ticker and mt_pid_to_ric mapping tables on permid and keeps the rows
#    for ticker 'COIN'. The permid is cast to VARCHAR (originally an int) so it can be compared
#    with the string pids extracted from the news subjects.
#
# 3. extract_rics_and_pids_from_ticker_mapping_subquery
#    Aggregates the rows from step 2 into two de-duplicated arrays, ticker_rics and ticker_pids.
#
# 4. The scalar subquery wrapped around step 3 concatenates those two arrays, giving a single
#    array holding every ric and pid associated with the ticker.
#
# 5. arrays_overlap compares that array with concat(rics, pids) from step 1; a news row is kept
#    when the two arrays have an element in common.
#
# Only the derived tables in FROM clauses carry aliases. The scalar subquery (step 4) and the
# WHERE predicate (step 5) are expressions, not relations, so they are deliberately left unnamed.
#
# NOTE(review): mt_news is read here without any restriction on dt; if the table is partitioned
# by dt (TODO confirm), adding a date filter to step 1 would reduce the amount of data scanned.
records_iter = maystreet_data.query(
    maystreet_data.DataSource.DATA_LAKE,
    """
    SELECT news_plus_pids_and_rics_columns.*
    FROM (
        SELECT *,
        transform(filter(data.subjects, x -> regexp_like(x, '^R:')), x -> regexp_replace(x, '(^R:)(.*)', '$2')) AS rics,
        transform(filter(data.subjects, x -> regexp_like(x, '^P:')), x -> regexp_replace(x, '(^P:)(.*)', '$2')) AS pids
        FROM mt_news
    ) news_plus_pids_and_rics_columns
    WHERE arrays_overlap(
        concat(rics, pids),
        (
            SELECT concat(ticker_rics, ticker_pids)
            FROM (
                SELECT array_distinct(ARRAY_AGG(ric)) AS ticker_rics, array_distinct(ARRAY_AGG(string_pid)) AS ticker_pids
                FROM (
                    SELECT DISTINCT ric, CAST(mt_pid_to_ticker.permid AS VARCHAR) AS string_pid
                    FROM mt_pid_to_ticker
                    JOIN mt_pid_to_ric ON mt_pid_to_ticker.permid = mt_pid_to_ric.permid
                    WHERE ticker='COIN'
                ) ticker_mapping_subquery
            ) extract_rics_and_pids_from_ticker_mapping_subquery
        )
    )
    LIMIT 10
""",
)

# Create a Pandas Data Frame from the iterator; an iterator will return one row at a time but we'd like
# to see them all.
data = pd.DataFrame(records_iter)

# Display the data in the Jupyter output cell.
data