이를 위해 pyspark와 Py4J 패키지를 설치한다. Py4J 패키지는 파이썬 프로그램이 자바 가상 머신(JVM)상의 오브젝트들에 접근할 수 있게 해준다. 여기서는 Local Standalone Spark를 사용한다.

In [1]:
!pip install pyspark==3.3.1 py4j==0.10.9.5 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**Spark Session**을 하나 만든다

In [1]:
from pyspark.sql import SparkSession

# Obtain the session used by the rest of this notebook under the application
# name "Python Spark Unit Test".
spark = (
    SparkSession.builder
    .appName("Python Spark Unit Test")
    .getOrCreate()
)

In [3]:
!wget https://s3-geospatial.s3-us-west-2.amazonaws.com/name_gender.csv

--2023-07-22 05:58:49--  https://s3-geospatial.s3-us-west-2.amazonaws.com/name_gender.csv
Resolving s3-geospatial.s3-us-west-2.amazonaws.com (s3-geospatial.s3-us-west-2.amazonaws.com)... 3.5.87.129, 52.92.137.178, 52.218.236.225, ...
Connecting to s3-geospatial.s3-us-west-2.amazonaws.com (s3-geospatial.s3-us-west-2.amazonaws.com)|3.5.87.129|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 997 [text/csv]
Saving to: ‘name_gender.csv’


2023-07-22 05:58:50 (18.4 MB/s) - ‘name_gender.csv’ saved [997/997]



In [4]:
# Read the downloaded CSV, using its first line as the column names.
# No schema or inferSchema option is given here.
df = spark.read.option("header", True).csv("name_gender.csv")
# Print the column names and types that Spark assigned.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)



In [5]:
# Number of rows in the loaded DataFrame.
df.count()

100

In [6]:
# Expose the DataFrame to Spark SQL under the view name "namegender".
df.createOrReplaceTempView("namegender")
# Count rows per gender; "GROUP BY 1" groups by the first selected column.
spark.sql("SELECT gender, COUNT(1) count FROM namegender GROUP BY 1").show()

+------+-----+
|gender|count|
+------+-----+
|     F|   65|
|     M|   28|
|Unisex|    7|
+------+-----+



In [7]:
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import *
import pandas as pd

# Pandas UDF declared with a string return type.
@pandas_udf(StringType())
def upper_udf_f(s: pd.Series) -> pd.Series:
    """Return ``s`` with every string value converted to upper case.

    Args:
        s: A pandas Series of strings.

    Returns:
        A pandas Series holding the upper-cased values of ``s``.
    """
    upper_cased = s.str.upper()
    return upper_cased

# Register the UDF under the SQL name "upper_udf" and keep the returned
# handle, which later cells call directly inside DataFrame expressions.
upperUDF = spark.udf.register("upper_udf", upper_udf_f)

In [8]:
def load_gender(spark, file_path):
    """Load a CSV file, treating its first line as the header row.

    Args:
        spark: Session object whose ``read`` attribute provides the reader.
        file_path: Path of the CSV file to load.

    Returns:
        Whatever the reader's ``csv`` call returns for ``file_path``.
    """
    reader = spark.read
    header_aware_reader = reader.option("header", True)
    return header_aware_reader.csv(file_path)

def get_gender_count(spark, df, field_to_count):
    """Count the rows of ``df`` grouped by ``field_to_count`` using Spark SQL.

    Side effect: ``df`` is (re)registered as the temporary view
    ``namegender_test`` before the query runs.

    Args:
        spark: Session object used to run the SQL query.
        df: DataFrame to aggregate.
        field_to_count: Column (or SQL expression) to group by. It is
            interpolated into the SQL text as-is, so it must come from a
            trusted source.

    Returns:
        The result of ``spark.sql`` for the grouping query; the row count of
        each group is in a column named ``count``.
    """
    df.createOrReplaceTempView("namegender_test")
    query = f"SELECT {field_to_count}, COUNT(1) count FROM namegender_test GROUP BY 1"
    return spark.sql(query)

In [9]:
# Repeat the earlier load and per-gender count, this time through the helpers.
df = load_gender(spark, "name_gender.csv")
get_gender_count(spark, df, "gender").show()
# Apply the registered UDF to the "name" column and label the result "NAME".
df.select(upperUDF("name").alias("NAME")).show()

+------+-----+
|gender|count|
+------+-----+
|     F|   65|
|     M|   28|
|Unisex|    7|
+------+-----+

+----------+
|      NAME|
+----------+
|  ADALEIGH|
|     AMRYN|
|    APURVA|
|    ARYION|
|    ALIXIA|
|ALYSSAROSE|
|    ARVELL|
|     AIBEL|
|   ATIYYAH|
|     ADLIE|
|    ANYELY|
|    AAMONI|
|     AHMAN|
|    ARLANE|
|   ARMONEY|
|   ATZHIRY|
| ANTONETTE|
|   AKEELAH|
| ABDIKADIR|
|    ARINZE|
+----------+
only showing top 20 rows



In [10]:
# collect() returns the rows as a Python list of Row objects, whereas the
# show() call in the previous cell printed only the top 20 rows.
df.select(upperUDF("name").alias("NAME")).collect()

[Row(NAME='ADALEIGH'),
 Row(NAME='AMRYN'),
 Row(NAME='APURVA'),
 Row(NAME='ARYION'),
 Row(NAME='ALIXIA'),
 Row(NAME='ALYSSAROSE'),
 Row(NAME='ARVELL'),
 Row(NAME='AIBEL'),
 Row(NAME='ATIYYAH'),
 Row(NAME='ADLIE'),
 Row(NAME='ANYELY'),
 Row(NAME='AAMONI'),
 Row(NAME='AHMAN'),
 Row(NAME='ARLANE'),
 Row(NAME='ARMONEY'),
 Row(NAME='ATZHIRY'),
 Row(NAME='ANTONETTE'),
 Row(NAME='AKEELAH'),
 Row(NAME='ABDIKADIR'),
 Row(NAME='ARINZE'),
 Row(NAME='ARSHAUN'),
 Row(NAME='ALEXANDRO'),
 Row(NAME='AYRIAUNA'),
 Row(NAME='AQIB'),
 Row(NAME='ALLEYA'),
 Row(NAME='AAVAH'),
 Row(NAME='ANESTI'),
 Row(NAME='ADALAIDE'),
 Row(NAME='ANALENA'),
 Row(NAME='ALAEYAH'),
 Row(NAME='ALBENA'),
 Row(NAME='AIMI'),
 Row(NAME='ADWAITH'),
 Row(NAME='ARKADY'),
 Row(NAME='ASTYN'),
 Row(NAME='ADELEE'),
 Row(NAME='AGATA'),
 Row(NAME='ALEGNA'),
 Row(NAME='ALTAN'),
 Row(NAME='AHNALEIGH'),
 Row(NAME='ALGIE'),
 Row(NAME='ASHANTI'),
 Row(NAME='AISLYN'),
 Row(NAME='ADALEINE'),
 Row(NAME='ANTHNOY'),
 Row(NAME='ALGERNON'),
 Row(NAME='

## 유닛 테스트 코드 붙여보기

In [11]:
from unittest import TestCase

# 일반적으로는 아래 함수가 정의된 모듈을 임포트하고 그걸 테스트
#  - upper_udf_f
#  - load_gender
#  - get_gender_count

class UtilsTestCase(TestCase):
    """Unit tests for ``load_gender``, ``get_gender_count`` and ``upper_udf_f``.

    One SparkSession is obtained for the whole class in ``setUpClass`` and
    stopped in ``tearDownClass``.
    """

    # Session shared by all tests; assigned in setUpClass.
    spark = None

    @classmethod
    def setUpClass(cls) -> None:
        """Obtain the SparkSession shared by every test in this class."""
        builder = SparkSession.builder.appName("Spark Unit Test")
        cls.spark = builder.getOrCreate()

    @classmethod
    def tearDownClass(cls) -> None:
        """Stop the shared SparkSession after the last test has run."""
        # NOTE(review): getOrCreate() is documented to return an existing
        # session when there is one, so this presumably also stops a session
        # created earlier in the same process - confirm if that matters.
        cls.spark.stop()

    def test_datafile_loading(self):
        """The sample CSV should load as exactly 100 records."""
        loaded = load_gender(self.spark, "name_gender.csv")
        self.assertEqual(loaded.count(), 100, "Record count should be 100")

    def test_gender_count(self):
        """Per-gender counts should match the expected sample distribution."""
        loaded = load_gender(self.spark, "name_gender.csv")
        rows = get_gender_count(self.spark, loaded, "gender").collect()
        counts = {row["gender"]: row["count"] for row in rows}
        self.assertEqual(counts["F"], 65, "Count for F should be 65")
        self.assertEqual(counts["M"], 28, "Count for M should be 28")
        self.assertEqual(counts["Unisex"], 7, "Count for Unisex should be 7")

    def test_upper_udf(self):
        """The UDF should upper-case letters and leave digits unchanged."""
        source_rows = [
            {"name": "John Kim"},
            {"name": "Johnny Kim"},
            {"name": "1234"},
        ]
        expected = ["JOHN KIM", "JOHNNY KIM", "1234"]

        registered_udf = self.spark.udf.register("upper_udf", upper_udf_f)
        frame = self.spark.createDataFrame(source_rows)
        collected = frame.select("name", registered_udf("name").alias("NAME")).collect()
        actual = [row["NAME"] for row in collected]
        # Order-insensitive comparison of the produced values.
        self.assertCountEqual(actual, expected)

In [12]:
import unittest

# argv=[''] supplies an explicit argument list so unittest does not parse the
# process's own sys.argv; exit=False keeps unittest from calling sys.exit()
# when the run finishes; verbosity=2 prints one line per test.
unittest.main(argv=[''], verbosity=2, exit=False)

test_datafile_loading (__main__.UtilsTestCase) ... ok
test_gender_count (__main__.UtilsTestCase) ... ok
test_upper_udf (__main__.UtilsTestCase) ... ok

----------------------------------------------------------------------
Ran 3 tests in 4.486s

OK


<unittest.main.TestProgram at 0x7f41a42a5fa0>