In [1]:
# LDAP login of the notebook owner; it is interpolated into the HDFS paths
# of the form /user/<login>/... that the cells below read from and write to.
ldap_username = 'malininpa'

In [2]:
import sys, os, re
import pandas as pd
import numpy as np
import sqlalchemy
import subprocess

sys.path.append('/home/shared/')

from common.spark import Spark
from common.database import Database
import pyspark
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
import datetime

# JDBC driver jars; the same list feeds the driver/executor class paths and
# `spark.jars` in the Spark configuration below.
jar_paths = ['/etc/jdbc/*']


# Render every column and the full cell contents of pandas frames.
# The fully qualified option names are required: short names are resolved by
# pattern matching, and 'max_columns' also matches 'styler.render.max_columns'
# on pandas >= 1.4, which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

project_name = "row_count_onco"
feature_name = "row_count_onco"
schema_name = "common_analytics2"  # replace with your own schema
project_path = f"/user/{ldap_username}/{feature_name}"

sc = Spark(project_name)

# Spark environment initialisation

queue = None
ui_enable = 'true'
log_fl = None


spark_config = {
    "spark.dynamicAllocation.maxExecutors": "1000",
    'spark.driver.extraClassPath': ':'.join(jar_paths),
    'spark.executor.extraClassPath': ':'.join(jar_paths),
    'spark.jars': ','.join(jar_paths),
    'spark.ui.enabled': ui_enable,
    'spark.executor.memory': '32g',
    'spark.driver.memory': '32g',
    'spark.scheduler.mode': 'FAIR',
    'spark.driver.maxResultSize': '16g',
    # NOTE(review): `queue` is None here; presumably the Spark wrapper treats
    # that as "default queue" - confirm against common.spark.
    'spark.yarn.queue': queue
}
spark = sc.run("loaded_ui", config=spark_config)  # simple or loaded
# Per the cell output this registers the SIMI views (metadata, tags, class,
# association, tagvalue); `tagvalue` is queried by the load cells below.
sc.load_simi()

# Greenplum connection (sqlalchemy-based helper)
gp = Database('gp_etl', mode='token')

Starting Spark Session...
Spark Session row_count started
metadata <= ods_simi_ora_emias_simi_v2.metadata (hdp_active_flg==1)
tags <= ods_simi_ora_emias_simi_v2.document_tag (hdp_active_flg==1)
class <= ods_simi_ora_emias_simi_v2.document_class (hdp_active_flg==1)
association <= ods_simi_ora_emias_simi_v2.document_association (hdp_active_flg==1)
tagvalue <= ods_simi_docs_xml.docs_tagvalue
Авторизация через персональный SECRET_TOKEN


# Load

In [3]:
# Values matched against `document_class_id` in the `tagvalue` queries below.
# The tuple is interpolated into the SQL text as-is, so it must keep at least
# two items: a one-element tuple renders as "(x,)", which is not valid SQL.
cct_to_cnt = (56080, 81051)

In [4]:
# Snapshot all `tagvalue` rows whose document class is in `cct_to_cnt` into a
# parquet directory under the project path, replacing any previous snapshot.
tagvalue_custom_query = f"""
select a.*
from tagvalue a
where true
and document_class_id in {cct_to_cnt}
"""
tagvalue_custom_path = os.path.join(project_path, 'tagvalue_custom')
spark.sql(tagvalue_custom_query).write.mode('overwrite').parquet(tagvalue_custom_path)

In [21]:
# Same extraction narrowed to a single `id` and creation date; the result is
# written to its own parquet directory (overwritten on each run).
one_doc_query = f"""
select a.*
from tagvalue a
where true
and id = 'e10399d7-6f37-4ceb-8e32-157b84720169'
and document_created_date = '2022-09-05'
and document_class_id in {cct_to_cnt}
"""
one_doc_path = os.path.join(project_path, 'tagvalue_one_doc')
spark.sql(one_doc_query).write.mode('overwrite').parquet(one_doc_path)

In [35]:
# Read the parquet snapshot back, expose it to Spark SQL as the temp view
# `df` (registered with the original column names), then continue with a copy
# whose column names are lower-cased.
tagvalue_snapshot = spark.read.parquet(os.path.join(project_path, 'tagvalue_custom'))
tagvalue_snapshot.createOrReplaceTempView('df')
df = tagvalue_snapshot.toDF(*map(str.lower, tagvalue_snapshot.columns))


# Processing

In [36]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import ArrayType, StringType, IntegerType
# One row per (xpath, id) pair with the number of source rows in `cnt`.
rows_per_xpath = F.count('id').alias('cnt')
df_agg = df.groupBy('xpath', 'id').agg(rows_per_xpath)

In [37]:
# `xpath_star` is `xpath` with every run of non-']' characters removed when
# that run ends in '/' or reaches the end of the string; what is left are the
# bracketed segments glued together, e.g. 'a/b[1]/c' -> 'b[1]'.
xpath_star_expr = F.regexp_replace('xpath', r'[^\]]+/|[^\]]+$', '')
df = df_agg.withColumn('xpath_star', xpath_star_expr)

In [38]:
# Keep a single row per (id, xpath_star). Spark does not specify which of the
# duplicates survives, so the remaining `xpath` / `cnt` values of the kept row
# should not be relied upon downstream.
dedupe_keys = ['id', 'xpath_star']
df = df.dropDuplicates(dedupe_keys)

In [39]:
# Number of '[' characters in `xpath_star`: splitting on '[' yields one more
# piece than there are brackets, hence the "- 1".
bracket_pieces = F.split(F.col('xpath_star'), r"\[")
df = df.withColumn('bracket_cnt', F.size(bracket_pieces) - 1)

In [40]:
# Shortest prefix of `xpath_star` up to its first '['. Group index 0 selects
# the whole match, so the value includes that trailing '['; rows without any
# '[' get an empty string.
xpath_base_expr = F.regexp_extract('xpath_star', r'^(.*?)\[', 0)
df = df.withColumn('xpath_base', xpath_base_expr)

In [41]:
# Discard rows for which no bracketed prefix was extracted.
df = df.where(F.col('xpath_base') != '')

In [42]:
# Largest `bracket_cnt` seen for every (xpath_base, id) pair.
deepest_bracket_cnt = F.max('bracket_cnt').alias('max_bracket_cnt')
df_max_bracket_cnt = df.groupBy('xpath_base', 'id').agg(deepest_bracket_cnt)

In [43]:
# Give the key columns distinct names so that they do not clash with the
# columns of `df` when the two frames are joined.
for old_name, new_name in (('xpath_base', 'xpath_base_max'), ('id', 'id_max')):
    df_max_bracket_cnt = df_max_bracket_cnt.withColumnRenamed(old_name, new_name)

In [44]:
# Inner join that keeps only the rows whose bracket count equals the maximum
# for their (id, xpath_base) pair.
is_deepest_row = (
    (df.id == df_max_bracket_cnt.id_max)
    & (df.xpath_base == df_max_bracket_cnt.xpath_base_max)
    & (df.bracket_cnt == df_max_bracket_cnt.max_bracket_cnt)
)
df = df.join(df_max_bracket_cnt, is_deepest_row)

In [45]:
# Number of remaining `xpath_star` values per (xpath_base, id); this replaces
# the earlier `cnt` column with a new one.
# NOTE: the cell rebinds `df` to the aggregate, so it cannot be re-run on its
# own without re-running the cells above.
star_count = F.count('xpath_star').alias('cnt')
df = df.groupBy('xpath_base', 'id').agg(star_count)


In [46]:
# Per-id product of the `cnt` values, computed as exp(sum(log(cnt))) because
# the aggregation is expressed through sum; the result is rounded and floored
# before being exposed as `product`.
cnt_product = F.floor(F.round(F.exp(F.sum(F.log('cnt'))))).alias('product')
onco_consilium_row_count = df.groupBy('id').agg(cnt_product)

In [48]:
# Target schema/table of the result. `project_path` is rebound here to the
# output location, so the load cells above must not be re-run after this one
# without re-running the configuration cell first.
schema_name = 'common_analytics2'
table_name = 'onco_consilium_row_count'
project_path = os.path.join(f"/user/{ldap_username}/{schema_name}", table_name)

# Persist the per-id products as parquet, replacing any previous output.
onco_consilium_row_count.write.mode('overwrite').parquet(project_path)

In [49]:
def get_external_table_ddl(path, table, spark_session=None):
    """Build the DDL that exposes a parquet directory as an external table.

    The parquet data at ``path`` is opened only to read its column names and
    types; those are translated into a column list and substituted into a
    ``DROP EXTERNAL TABLE IF EXISTS`` / ``CREATE EXTERNAL TABLE`` script that
    points at ``path`` through the ``hdfs:parquet`` PXF profile.

    Args:
        path: Location of the parquet data; inserted verbatim after
            ``pxf:///`` in the LOCATION clause.
        table: Table name (optionally schema-qualified) used in the DDL.
        spark_session: Object whose ``read.format('parquet').load(path)``
            returns a frame with a ``dtypes`` attribute. When omitted, the
            notebook-level ``spark`` session is used.

    Returns:
        The DDL script as a single string.
    """
    tpl = """DROP EXTERNAL TABLE IF EXISTS {table};
    CREATE EXTERNAL TABLE {table} ( 
    {fields} 
    )
    LOCATION ('pxf:///{path}?PROFILE=hdfs:parquet') ON ALL
    FORMAT 'CUSTOM' ( FORMATTER='pxfwritable_import' )
    ENCODING 'UTF8';"""
    session = spark_session if spark_session is not None else spark
    frame = session.read.format('parquet').load(path)
    # Spark type strings that are not valid (or not equivalent) type names on
    # the database side; anything not listed is passed through unchanged.
    spark_to_db_types = {
        "string": "text",
        "int": "integer",
        "tinyint": "smallint",
        "float": "real",
        "double": "double precision",
        "binary": "bytea",
    }
    # `dtypes` yields (column name, type string) pairs.
    fields = ',\n'.join(
        f"{column} {spark_to_db_types.get(spark_type, spark_type)}"
        for column, spark_type in frame.dtypes
    )
    return tpl.format(table=table, fields=fields, path=path)

In [50]:
# Echo the output location as a visual check before it is embedded in the DDL.
project_path

'/user/malininpa/common_analytics2/onco_consilium_row_count'

In [51]:
# DDL for the external table over the parquet output, followed by a grant on
# that table to `gpuser`; everything is sent as one script.
qualified_table = f'{schema_name}.{table_name}'
q = get_external_table_ddl(project_path, qualified_table)
q += f" GRANT ALL PRIVILEGES ON {schema_name}.{table_name} TO gpuser;"

In [52]:
# Run the script on a connection opened with the autocommit execution option;
# the `with` block closes the connection afterwards.
# NOTE(review): passing a raw SQL string to `execute` and the `autocommit`
# option are SQLAlchemy 1.x idioms - confirm the installed version before
# upgrading.
autocommit_connection = gp.con.connect().execution_options(autocommit=True)
with autocommit_connection as conn:
    conn.execute(q)