In [1]:
# [+] Imports
from pyspark.sql import SparkSession
# Imported for registering a UDF with the annotation (decorator, @) style
from pyspark.sql.functions import udf

# [+] SparkSession setup: local master, application name 'udf'.
# getOrCreate() hands back the session produced by this builder chain.
spark = (
    SparkSession.builder
    .master('local')
    .appName('udf')
    .getOrCreate()
)

In [2]:
# Sample data: one (name, age) tuple per person
age_category = [
    ('김일', 16), ('박이', 28),
    ('강삼', 50), ('나사', 39),
    ('권오', 42), ('현육', 73),
    ('서칠', 20), ('명팔', 80),
]


In [3]:
# [+] Schema definition: column names, in the same order as the tuple fields
schema = [
    'name',
    'age',
]

In [4]:
# [+] Build the DataFrame from the sample rows and the column-name schema
df = spark.createDataFrame(data=age_category, schema=schema)

In [5]:
# Expose the DataFrame to SQL queries as the temporary view 'name_age'
df.createOrReplaceTempView(name='name_age')

In [6]:
# [+] Print the DataFrame
df.show()

+----+---+
|name|age|
+----+---+
|김일| 16|
|박이| 28|
|강삼| 50|
|나사| 39|
|권오| 42|
|현육| 73|
|서칠| 20|
|명팔| 80|
+----+---+



In [7]:
# UDF function definition
#
# NOTE: this `def` rebinds the notebook-level name `age_category`, which the
# sample-data cell also uses for the list of (name, age) rows. Re-running the
# DataFrame-creation cell after this cell would pass this function instead of
# the list.
def age_category(age):
    """Classify an age into a coarse life-stage label.

    Args:
        age: Age in years, or None (a SQL NULL arrives in a Python UDF as None).

    Returns:
        'young' for ages under 35, 'adult' for ages from 35 up to (not
        including) 60, 'senior' for ages 60 and above, and None when no
        label applies (missing age, or a value that matches no range).
    """
    # Comparing None with an int raises TypeError in Python 3, which would
    # fail the whole query on a single NULL row; pass the null through instead.
    if age is None:
        return None

    if age < 35:      # under 35 -> 'young'
        return 'young'
    if age < 60:      # 35 up to (not including) 60 -> 'adult'
        return 'adult'
    if age >= 60:     # 60 and above -> 'senior'
        return 'senior'
    # Reached only by values for which every comparison is False (e.g. NaN).
    return None

In [8]:
# Register the Python function as a SQL-callable UDF named 'age_category'
spark.udf.register(name='age_category', f=age_category)

<function __main__.age_category(age)>

In [9]:
# Run the SQL statement: apply the registered UDF to every row of the view.
# Adjacent string literals are concatenated into a single query string.
spark.sql(
    'SELECT name, age_category(age) AS age '
    '           FROM name_age'
).show()

+----+------+
|name|   age|
+----+------+
|김일| young|
|박이| young|
|강삼| adult|
|나사| adult|
|권오| adult|
|현육|senior|
|서칠| young|
|명팔|senior|
+----+------+

