### 정규표현식을 이용하여 텍스트에서 문자 추출하기

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Build the Spark configuration in one step: the application name and a
# local master URL.
conf = SparkConf().setAll([
    ('spark.app.name', 'PySpark DataFrame 3'),
    ('spark.master', 'local[*]'),
])

# getOrCreate() returns the already-active session if there is one;
# otherwise it starts a new session using `conf`.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
from pyspark.sql.types import StructField, StructType, StringType

# Single nullable string column named 'text' that receives each input line.
schema = StructType().add('text', StringType(), True)

# Load the file with the plain-text source, applying the schema above.
df = (
    spark.read
    .schema(schema)
    .format('text')
    .load('transfer_cost.txt')
)

# Print the first five rows without shortening long strings.
df.show(n=5, truncate=False)

+---------------------------------------------------------------------------+
|text                                                                       |
+---------------------------------------------------------------------------+
|On 2021-01-04 the cost per ton from 85001 to 85002 is $28.32 at ABC Hauling|
|On 2021-01-04 the cost per ton from 85001 to 85004 is $25.68 at ABC Hauling|
|On 2021-01-04 the cost per ton from 85001 to 85007 is 19.86 at ABC Hauling |
|On 2021-01-04 the cost per ton from 85001 to 85007 is 20.52 at Haul Today  |
|On 2021-01-04 the cost per ton from 85001 to 85010 is 20.72 at Haul Today  |
+---------------------------------------------------------------------------+
only showing top 5 rows



정규표현식을 사용하여 일정한 패턴의 텍스트 데이터에서 원하는 부분을 가져올 수 있다.
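- `\S`: 공백이 아닌 문자 1개
- `\d`: 숫자 1개
- `+`: 바로 앞의 패턴이 1회 이상 반복
- `.`: 줄바꿈을 제외한 임의의 문자 1개
- `*`: 바로 앞의 패턴이 0회 이상 반복
- `( )`: 캡처 그룹. `regexp_extract`의 세 번째 인자로 그룹 번호(1부터 시작)를 지정하면 해당 그룹에 매칭된 문자열만 가져온다.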

In [None]:
import pyspark.sql.functions as f

# One capture group per field, in order: date, departure ZIP, arrival ZIP,
# cost, vendor.
regex_str = r'On (\S+) the cost per ton from (\d+) to (\d+) is (\S+) at (.*)'

# New column names listed in capture-group order (group 1 first).
extracted_names = ['week', 'departure_zipcode', 'arrival_zipcode', 'cost', 'vendor']

# Keep every existing column and append one extracted column per group.
# Group 4 is taken verbatim, so a leading '$' remains part of the cost string.
df_with_new_cols = df.select(
    '*',
    *[
        f.regexp_extract('text', regex_str, group_idx).alias(col_name)
        for group_idx, col_name in enumerate(extracted_names, start=1)
    ]
)

In [5]:
# Preview the first five rows; long values are truncated by default.
df_with_new_cols.show(n=5)

+--------------------+----------+-----------------+---------------+------+-----------+
|                text|      week|departure_zipcode|arrival_zipcode|  cost|     vendor|
+--------------------+----------+-----------------+---------------+------+-----------+
|On 2021-01-04 the...|2021-01-04|            85001|          85002|$28.32|ABC Hauling|
|On 2021-01-04 the...|2021-01-04|            85001|          85004|$25.68|ABC Hauling|
|On 2021-01-04 the...|2021-01-04|            85001|          85007| 19.86|ABC Hauling|
|On 2021-01-04 the...|2021-01-04|            85001|          85007| 20.52| Haul Today|
|On 2021-01-04 the...|2021-01-04|            85001|          85010| 20.72| Haul Today|
+--------------------+----------+-----------------+---------------+------+-----------+
only showing top 5 rows



In [6]:
# Print each column's name, type and nullability as a tree.
# regexp_extract yields strings, so every extracted column is a string.
df_with_new_cols.printSchema()

root
 |-- text: string (nullable = true)
 |-- week: string (nullable = true)
 |-- departure_zipcode: string (nullable = true)
 |-- arrival_zipcode: string (nullable = true)
 |-- cost: string (nullable = true)
 |-- vendor: string (nullable = true)



In [7]:
# The raw line is no longer needed once its fields are extracted, so keep
# every column except 'text'.
kept_columns = [name for name in df_with_new_cols.columns if name != 'text']
df_final = df_with_new_cols.select(*kept_columns)

df_final.show(n=5)

+----------+-----------------+---------------+------+-----------+
|      week|departure_zipcode|arrival_zipcode|  cost|     vendor|
+----------+-----------------+---------------+------+-----------+
|2021-01-04|            85001|          85002|$28.32|ABC Hauling|
|2021-01-04|            85001|          85004|$25.68|ABC Hauling|
|2021-01-04|            85001|          85007| 19.86|ABC Hauling|
|2021-01-04|            85001|          85007| 20.52| Haul Today|
|2021-01-04|            85001|          85010| 20.72| Haul Today|
+----------+-----------------+---------------+------+-----------+
only showing top 5 rows



### 데이터프레임 저장하기

In [None]:
# Save the extracted columns as CSV files under the 'extracted' directory.
df_final.write.format('csv').save('extracted')

In [9]:
# Write the JSON copy to its own directory. The CSV cell above already
# targets 'extracted', and with the default save mode a write to a path that
# already exists raises an error instead of overwriting it.
df_final.write.json('extracted_json')