In [1]:
# Configure the SparkSession: local master, application name 'udf'.
# getOrCreate() hands back an already-running session when there is one.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master('local')
    .appName('udf')
    .getOrCreate()
)

In [2]:
# Sample data: one (name, age) tuple per person.
person_info = [
    ('신짱구', 5),
    ('김철수', 18),
    ('이훈', 36),
    ('신유리', 44),
    ('맹구', 71),
]

In [3]:
# Define the schema.
# Only column names are given, so the column types are inferred from the data
# (the printSchema() output recorded in this notebook shows age as long).
# NOTE(review): the type classes imported below are not used in the cells shown here.
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType
schema = ['name','age']

In [4]:
# Create the DataFrame
df = spark.createDataFrame(data = person_info, schema = schema) # built from the in-memory data itself rather than read from a CSV file

In [5]:
# Print the DataFrame contents
df.show()

+------+---+
|  name|age|
+------+---+
|신짱구|  5|
|김철수| 18|
|  이훈| 36|
|신유리| 44|
|  맹구| 71|
+------+---+



In [6]:
# Print the DataFrame schema (column names, types, nullability)
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [7]:
# Create a temporary view named 'name_age' so the DataFrame can be queried with spark.sql
df.createOrReplaceTempView('name_age')

In [8]:
# Import `udf`, the helper for registering a UDF in annotation (decorator) style.
# NOTE(review): the cells shown here register the UDF via spark.udf.register instead;
# `udf` itself is not used in the visible cells.
from pyspark.sql.functions import udf

In [9]:
# UDF: classify a person into an age group
def age_category(age):
    """Return the age-group label for ``age``.

    Args:
        age: Age as a number, or ``None`` for a missing value (the ``age``
            column of the DataFrame is nullable).

    Returns:
        ``'young'`` when ``age < 35``, ``'adult'`` when ``35 <= age <= 59``,
        ``'senior'`` for any other non-null value, and ``None`` when ``age``
        is ``None``.
    """
    # Comparing None with an int raises TypeError in Python 3, which would
    # fail the whole Spark job on the first null age; pass nulls through.
    if age is None:
        return None
    if age < 35:
        return 'young'
    # Reaching this point already means age is not below 35, so only the
    # upper bound needs checking.
    if age <= 59:
        return 'adult'
    return 'senior'

In [10]:
# Register the Python function as a UDF under the SQL name "age_category".
# No return type is passed here — presumably PySpark's default (string) applies; confirm against the docs.
spark.udf.register("age_category",age_category)

<function __main__.age_category(age)>

In [11]:
# Run the SQL statement: apply the registered age_category UDF to every row
# of the name_age view and print the result.
spark.sql(
    "SELECT name, age_category(age) AS age_category FROM name_age"
).show()

+------+------------+
|  name|age_category|
+------+------------+
|신짱구|       young|
|김철수|       young|
|  이훈|       adult|
|신유리|       adult|
|  맹구|      senior|
+------+------------+

