## 1. mac_addr csv 가져오기

In [0]:
from pyspark.sql import functions as F

# Workspace location of the input CSV that carries the raw MAC addresses.
csv_path = "file:/Workspace/EIC_데이터엔지니어/databricks_jira/eic/1218/mac_addr.csv"

# Load the CSV into a Spark DataFrame.
# NOTE(review): with inferSchema=True a MAC_ADDRESS column whose values all look
# numeric would be read as a number and lose leading zeros -- confirm the column
# is inferred as string for this file.
df = (spark.read
      .format("csv")
      .option("header", True)           # first row holds the column names
      .option("inferSchema", True)      # let Spark infer the column types
      # .option("delimiter", ",")       # default delimiter is ','; set explicitly if needed
      # .option("encoding", "utf-8")    # set the encoding explicitly if needed
      .load(csv_path)
     )

# Re-format the 12-character MAC into colon-separated pairs (AA:BB:CC:DD:EE:FF).
mac_pair_starts = (1, 3, 5, 7, 9, 11)
df = df.withColumn(
    "mac_addr_2",
    F.concat_ws(":", *[F.substring("MAC_ADDRESS", start, 2) for start in mac_pair_starts]),
)

# Salt used for hashing, read from the Databricks secret scope.
# NOTE(review): F.lit(tv_salt) places the salt value into the query plan as a literal.
tv_salt = dbutils.secrets.get('admin', 'salt')

# A blank (empty / whitespace-only) MAC_ADDRESS makes concat_ws return only
# separators (e.g. ":::::"), which is neither NULL nor '' and would therefore be
# hashed as if it were a real address. Guard on the raw column as well.
raw_mac_is_blank = (
    F.col("MAC_ADDRESS").isNull()
    | (F.trim(F.col("MAC_ADDRESS").cast("string")) == '')
)
formatted_mac_is_blank = F.col("mac_addr_2").isNull() | (F.col("mac_addr_2") == '')

# SHA-256 of "<formatted mac><salt>"; NULL when there is no usable MAC.
df = df.withColumn(
    "mac_addr_hashed",
    F.when(raw_mac_is_blank | formatted_mac_is_blank, None)
     .otherwise(F.sha2(F.concat(F.col("mac_addr_2"), F.lit(tv_salt)), 256)),
)

display(df)


In [0]:
import pandas as pd

# Pull the Spark DataFrame to the driver as a pandas DataFrame.
pdf = df.toPandas()

# Column labels: replace every space with an underscore.
pdf.columns = [label.replace(' ', '_') for label in pdf.columns]

# Keep only the columns needed downstream, in this order.
selected_columns = [
    'Production_Date',
    'Model.Suffix',
    'SET_ID',
    'MAC_ADDRESS',
    'mac_addr_hashed',
]
pdf = pdf[selected_columns]

# Rebuild the index; the previous index is kept as a regular column.
pdf = pdf.reset_index()
display(pdf)

In [0]:
# Expose only the hashed MAC column to Spark SQL as the temp view "tmp_mac_addr".
hashed_mac_pdf = pdf[['mac_addr_hashed']]
hashed_mac_sdf = spark.createDataFrame(hashed_mac_pdf)
hashed_mac_sdf.createOrReplaceTempView("tmp_mac_addr")

## 2. activation_date - min(crt_date) 구하기

In [0]:
# Per mac_addr, over activation rows whose mac_addr matches a hashed MAC in
# tmp_mac_addr: earliest crt_date, a platform code, latest last_chg_date and
# that latest date formatted as yyyy-MM (date_ym).
# NOTE(review): first() without an explicit ordering returns an arbitrary row's
# value -- confirm platform_code is constant per mac_addr.
activation_summary_query = ''' 
    
    select raw.mac_addr
           , min(raw.crt_date) as min_crt_date -- 제공해야함
           , first(raw.Platform_code) as platform_code -- 제공해야함
           , max(raw.last_chg_date) as max_last_chg_date
           , date_format(max(raw.last_chg_date), 'yyyy-MM') as date_ym
    from   eic_data_ods.tlamp.activation_date raw
    inner join tmp_mac_addr tmp on raw.mac_addr = tmp.mac_addr_hashed
    group by mac_addr
'''
df_result_1 = spark.sql(activation_summary_query)
display(df_result_1)

In [0]:
# Intermediate save: overwrite the sandbox table with the activation summary.
activation_writer = df_result_1.write.mode('overwrite')
activation_writer.saveAsTable("sandbox.z_eunmi1_ko.temp_1218_1")

## 3. 노말로그 값 구하기
> 3-1) max_last_chg_date를 yyyy-MM 로 바꾸고, date_ym별 mac_addr 대한 마지막 normal_log 추출 (union 활용) <br/>
> 3-2) 단, max_last_chg_date < '2025-12' 이면, 데이터 없음으로 추출 (2년 보관 중)

In [0]:
# Build df_result_2: for every month (date_ym) present in the intermediate
# activation table, pick the latest normal_log row per mac_addr from that
# month's data and union the monthly results together.
df_result_2 = None

df_result_1 = spark.table("sandbox.z_eunmi1_ko.temp_1218_1")
for p_date_ym in df_result_1.select("date_ym").distinct().collect():
    date_ym = p_date_ym.date_ym
    print(date_ym)

    if date_ym is None:
        # A NULL date_ym cannot match any nl.date_ym value, so skip the scan.
        continue

    # MACs whose latest change falls in this month, exposed to SQL as a temp view.
    t_df = df_result_1.where(f"date_ym = '{date_ym}'")
    t_df.createOrReplaceTempView("df_result_1_tmp")

    # Every ORDER BY key carries its own direction: log_create_time must be
    # DESC explicitly, otherwise rn = 1 selects the OLDEST row per mac_addr.
    _sqldf = spark.sql(f'''
    
        -- 조건 필터링 먼저
        WITH filtered AS (
            SELECT nl.mac_addr, nl.context_name, nl.message_id, nl.normal_log, nl.log_create_time
            FROM eic_data_ods.tlamp.normal_log_webos25 nl
            INNER JOIN df_result_1_tmp mac using(mac_addr)
            WHERE nl.date_ym = '{date_ym}'
            AND nl.X_device_country = 'TR'
        )

        -- 최신 1건 선택 (윈도우 함수)
        SELECT
            mac_addr,
            context_name,          -- 최신행의 값
            message_id,            -- 최신행의 값
            normal_log,            -- 최신행의 값
            log_create_time
            -- X_device_platform   AS `platform_code`,      -- 필수 열
            -- X_device_product    AS `platform_version`,   -- 필수 열
            -- X_Device_Country    AS `country_code`,       -- 필수/조건 열
            -- log_create_time,                              -- 필수/조건 열
            -- X_Device_SDK_VERSION AS `dpv`,
            -- x_device_sales_model AS `sales_model_code`   -- 필수 열
        FROM (
        SELECT
            filtered.*,
            ROW_NUMBER() OVER (
                PARTITION BY mac_addr
                ORDER BY log_create_time DESC, context_name, message_id DESC
            ) AS rn
        FROM filtered
        ) t
        WHERE rn = 1;
    ''')

    # count() launches a Spark job; run it once and reuse the number.
    month_row_count = _sqldf.count()
    print(month_row_count)

    if month_row_count > 0:
        if df_result_2 is None:
            df_result_2 = _sqldf
        else:
            df_result_2 = df_result_2.union(_sqldf)

# df_result_2 stays None when no month produced any row.
if df_result_2 is None:
    print("No matching normal_log rows were found; df_result_2 is None.")
else:
    display(df_result_2)

In [0]:
# Overwrite the sandbox table with the collected normal_log rows.
normal_log_writer = df_result_2.write.mode('overwrite')
normal_log_writer.saveAsTable("sandbox.z_eunmi1_ko.temp_1218_2")

#### 확인

In [0]:
%sql
-- Inspect the saved normal_log result table.
SELECT *
FROM   sandbox.z_eunmi1_ko.temp_1218_2

In [0]:
%sql
-- Inspect the saved activation summary table.
-- Columns of interest: mac_addr, min_crt_date, platform_code
SELECT *
FROM   sandbox.z_eunmi1_ko.temp_1218_1

In [0]:
%sql
-- Final result: one row per input hashed MAC, enriched with the activation
-- summary (t1) and the selected normal_log row (t2). Left joins keep MACs
-- that have no match in either table.
-- log_create_time is a column of temp_1218_2 (t2); temp_1218_1 (t1) does not have it.
select mac.mac_addr_hashed, t1.min_crt_date, t2.log_create_time, t1.platform_code, t2.normal_log
from   tmp_mac_addr mac
left join sandbox.z_eunmi1_ko.temp_1218_1 t1 on mac.mac_addr_hashed = t1.mac_addr
left join sandbox.z_eunmi1_ko.temp_1218_2 t2 on mac.mac_addr_hashed = t2.mac_addr