# PySparkのTips

細かいTips、テクニックをまとめる。

In [8]:
from glob import glob

import polars as pl
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.functions import vector_to_array
import numpy as np

# Using Spark from Python requires a SparkSession, so build one
# (getOrCreate returns the already-running session when there is one).
session_builder = SparkSession.builder.appName("Testing PySpark Example")
spark = session_builder.getOrCreate()

# The default log level produces a flood of log output, so restrict it to errors.
spark.sparkContext.setLogLevel("ERROR")

In [3]:
# Load each source table. Every parquet file lives in the same directory,
# so the directory is defined once instead of being repeated in each path.
DATA_DIR = "../../../100knocks-preprocess/docker/work/data"

# Receipt data
df_receipt = spark.read.parquet(f"{DATA_DIR}/receipt.parquet")

# Store data
df_store = spark.read.parquet(f"{DATA_DIR}/store.parquet")

# Customer data
df_customer = spark.read.parquet(f"{DATA_DIR}/customer.parquet")

# Product data
df_product = spark.read.parquet(f"{DATA_DIR}/product.parquet")

# Category data
df_category = spark.read.parquet(f"{DATA_DIR}/category.parquet")

                                                                                

## 特定カラムのユニークな値をリストとしてすべて取得
下記の2通りの方法がある。どちらの方法でも、`collect`した結果から値を取り出す一手間が必要で、やや面倒である。

In [4]:
# collect() returns Row objects; each one can be indexed by the selected
# column name, much like a dictionary keyed by that column.
gender_rows = df_customer.select("gender_cd").distinct().collect()
[row["gender_cd"] for row in gender_rows]

                                                                                

['0', '9', '1']

In [5]:
# Alternative: drop duplicate rows, then flatten each single-column Row
# through the underlying RDD so collect() yields the bare values.
(
    df_customer
    .select("gender_cd")
    .dropDuplicates()
    .rdd
    .flatMap(lambda row: row)
    .collect()
)

                                                                                

['0', '9', '1']

In [6]:
# When the unique values are wanted as a DataFrame, .distinct() alone is enough.
gender_codes = df_customer.select("gender_cd").distinct()
gender_codes.show()

+---------+
|gender_cd|
+---------+
|        0|
|        9|
|        1|
+---------+



## joinの結合条件にcontainsを用いる
あるカラムの文字列に別テーブルのキーワードが含まれているか（部分一致）で紐づけたい場合に便利。

In [29]:
# Build the sample data.

# Left side: free-text messages to be searched.
columns_a = ["id", "message"]
data_a = [
    (1, "This is a sample message"),
    (2, "Another example message"),
    (3, "Message with a keyword"),
    (4, "No match here"),
    (5, "samplesample"),
    (6, "sample keyword"),
]
df_a = spark.createDataFrame(data_a, schema=columns_a)

# Right side: keywords to look for, each paired with a message number.
columns_b = ["id", "message_key", "MSG_No"]
data_b = [
    (1, "sample", "MSG001"),
    (2, "example", "MSG002"),
    (3, "keyword", "MSG003"),
]
df_b = spark.createDataFrame(data_b, schema=columns_b)


In [30]:
# truncate=True is show()'s default: long strings are cut off and end with "...".
df_a.show(truncate=True)

+---+--------------------+
| id|             message|
+---+--------------------+
|  1|This is a sample ...|
|  2|Another example m...|
|  3|Message with a ke...|
|  4|       No match here|
|  5|        samplesample|
|  6|      sample keyword|
+---+--------------------+



In [31]:
# Display the keyword table (truncate=True is show()'s default).
df_b.show(truncate=True)

+---+-----------+------+
| id|message_key|MSG_No|
+---+-----------+------+
|  1|     sample|MSG001|
|  2|    example|MSG002|
|  3|    keyword|MSG003|
+---+-----------+------+



In [32]:
# Use contains as the join condition: a df_a row is linked to a df_b row
# whenever its message includes that row's message_key.
message_has_key = F.contains(df_a.message, df_b.message_key)
df_a.join(df_b, on=message_has_key, how="left").show()

+---+--------------------+----+-----------+------+
| id|             message|  id|message_key|MSG_No|
+---+--------------------+----+-----------+------+
|  1|This is a sample ...|   1|     sample|MSG001|
|  2|Another example m...|   2|    example|MSG002|
|  3|Message with a ke...|   3|    keyword|MSG003|
|  4|       No match here|NULL|       NULL|  NULL|
|  5|        samplesample|   1|     sample|MSG001|
|  6|      sample keyword|   1|     sample|MSG001|
|  6|      sample keyword|   3|    keyword|MSG003|
+---+--------------------+----+-----------+------+



"sample keyword"のように複数のキーワードを含むメッセージは、一致したキーワードの数だけ行が生成される（この例では2行）。