In [None]:
!pip install pyspark==3.3.1 py4j==0.10.9.5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Regex를 이용해서 아래와 같이 변환해보는 것이 목표

- 입력: “On 2021-01-04 the cost per ton from 85001 to 85002 is $28.32 at ABC Hauling”
    - regex 패턴: `On (\S+) the cost per ton from (\d+) to (\d+) is (\S+) at (.*)`
        -  \S (non-whitespace character), \d (numeric character)
- 출력: 

    | week | departure_zipcode | arrival_zipcode | cost | vendor |
    |---|---|---|---|---|
    | 2021-01-04 | 85001 | 85002 | $28.32 | ABC Hauling |

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Spark settings for this notebook: application name and a local master
# ("local[*]" runs Spark inside this process instead of on a cluster).
conf = (
    SparkConf()
    .set("spark.app.name", "PySpark DataFrame #3")
    .set("spark.master", "local[*]")
)

# getOrCreate() hands back the active session if one exists, otherwise starts one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [2]:
!wget https://s3-geospatial.s3.us-west-2.amazonaws.com/transfer_cost.txt

--2023-07-04 15:57:54--  https://s3-geospatial.s3.us-west-2.amazonaws.com/transfer_cost.txt
Resolving s3-geospatial.s3.us-west-2.amazonaws.com (s3-geospatial.s3.us-west-2.amazonaws.com)... 52.218.224.1, 3.5.79.188, 3.5.77.186, ...
Connecting to s3-geospatial.s3.us-west-2.amazonaws.com (s3-geospatial.s3.us-west-2.amazonaws.com)|52.218.224.1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 286779 (280K) [text/plain]
Saving to: ‘transfer_cost.txt’


2023-07-04 15:57:56 (297 KB/s) - ‘transfer_cost.txt’ saved [286779/286779]



In [3]:
# List the working directory (most recently modified first) to confirm that
# transfer_cost.txt is present.
!ls -tl

total 788
-rw-r--r-- 1 jovyan users  13214 Jul  4 15:53 PySpark_DataFrame_3.ipynb
-rw-r--r-- 1 jovyan users  24246 Jul  4 15:23 PySpark_DataFrame_2.ipynb
-rw-r--r-- 1 jovyan users  30849 Jul  4 14:58 PySpark_DataFrame_1.ipynb
-rw-r--r-- 1 jovyan users  22881 Jul  4 14:30 PySpark_설치_및_테스트.ipynb
-rw-r--r-- 1 jovyan users  22056 Jul  4 12:19 PySpark_DataFrame_5.ipynb
-rw-r--r-- 1 jovyan users  32483 Jul  4 12:19 PySpark_DataFrame_4.ipynb
-rw-r--r-- 1 jovyan users  64553 Jul  4 12:19 1800.csv
-rw-r--r-- 1 jovyan users 286779 Apr 24  2022 transfer_cost.txt
-rw-r--r-- 1 jovyan users 146855 Apr 10  2022 customer-orders.csv
-rw-r--r-- 1 jovyan users 146855 Apr 10  2022 customer-orders.csv.1


In [4]:
# Peek at the first five lines to see the sentence format the regex has to parse.
!head -5 transfer_cost.txt

On 2021-01-04 the cost per ton from 85001 to 85002 is $28.32 at ABC Hauling
On 2021-01-04 the cost per ton from 85001 to 85004 is $25.68 at ABC Hauling
On 2021-01-04 the cost per ton from 85001 to 85007 is 19.86 at ABC Hauling
On 2021-01-04 the cost per ton from 85001 to 85007 is 20.52 at Haul Today
On 2021-01-04 the cost per ton from 85001 to 85010 is 20.72 at Haul Today


In [5]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Define the schema, then read the text file: every input line becomes one row
# holding a single nullable string column named "text".
schema = StructType(
    fields=[
        StructField(name="text", dataType=StringType(), nullable=True),
    ]
)
transfer_cost_df = (
    spark.read
    .schema(schema)
    .text("transfer_cost.txt")
)

In [10]:
# Display the raw rows; truncate=False prints each line in full instead of
# cutting long strings short.
transfer_cost_df.show(truncate=False)

+---------------------------------------------------------------------------+
|text                                                                       |
+---------------------------------------------------------------------------+
|On 2021-01-04 the cost per ton from 85001 to 85002 is $28.32 at ABC Hauling|
|On 2021-01-04 the cost per ton from 85001 to 85004 is $25.68 at ABC Hauling|
|On 2021-01-04 the cost per ton from 85001 to 85007 is 19.86 at ABC Hauling |
|On 2021-01-04 the cost per ton from 85001 to 85007 is 20.52 at Haul Today  |
|On 2021-01-04 the cost per ton from 85001 to 85010 is 20.72 at Haul Today  |
|On 2021-01-04 the cost per ton from 85001 to 85012 is $18.98 at ABC Hauling|
|On 2021-01-04 the cost per ton from 85001 to 85013 is 26.64 at Haul Today  |
|On 2021-01-04 the cost per ton from 85001 to 85020 is 26.34 at ABC Hauling |
|On 2021-01-04 the cost per ton from 85001 to 85021 is $20.15 at ABC Hauling|
|On 2021-01-04 the cost per ton from 85002 to 85001 is 21.57 at 

In [11]:
from pyspark.sql.functions import *
# Pattern with one capture group per output column.
regex_str = r'On (\S+) the cost per ton from (\d+) to (\d+) is (\S+) at (.*)'

# Output column names, listed in the same order as the capture groups above.
extracted_column_names = ['week', 'departure_zipcode', 'arrival_zipcode', 'cost', 'vendor']

# The second argument of withColumn is a column expression produced by a function
# such as regexp_extract. The source column may be given as 'text', col('text'),
# column('text') or transfer_cost_df.text; col('text') is used throughout here.
df_with_new_columns = transfer_cost_df
for group_idx, column_name in enumerate(extracted_column_names, start=1):
    df_with_new_columns = df_with_new_columns.withColumn(
        column_name,
        regexp_extract(col('text'), regex_str, group_idx),
    )

In [12]:
# Check that the five extracted columns were added next to the original text column.
df_with_new_columns.printSchema()

root
 |-- text: string (nullable = true)
 |-- week: string (nullable = true)
 |-- departure_zipcode: string (nullable = true)
 |-- arrival_zipcode: string (nullable = true)
 |-- cost: string (nullable = true)
 |-- vendor: string (nullable = true)



In [13]:
# Drop the raw "text" column so only the extracted columns remain.
final_df = df_with_new_columns.drop("text")

In [20]:
# Save the refined DataFrame as CSV.
# mode("overwrite") replaces output left behind by an earlier run; with the default
# save mode the write raises an error when the target path already exists, so the
# notebook could not be re-run from the top.
final_df.write.mode("overwrite").csv("extracted_transfer_cost.csv")

In [21]:
# List the working directory again to locate the CSV output that was just written.
!ls -tl

total 648
drwxr-xr-x 6 jovyan users    192 Jul  4 16:08 extracted_transfer_cost.csv
-rw-r--r-- 1 jovyan users  19506 Jul  4 16:07 PySpark_DataFrame_3.ipynb
drwxr-xr-x 5 jovyan users    160 Jul  4 16:06 extracted.csv
-rw-r--r-- 1 jovyan users  24246 Jul  4 15:23 PySpark_DataFrame_2.ipynb
-rw-r--r-- 1 jovyan users  30849 Jul  4 14:58 PySpark_DataFrame_1.ipynb
-rw-r--r-- 1 jovyan users  22881 Jul  4 14:30 PySpark_설치_및_테스트.ipynb
-rw-r--r-- 1 jovyan users  22056 Jul  4 12:19 PySpark_DataFrame_5.ipynb
-rw-r--r-- 1 jovyan users  32483 Jul  4 12:19 PySpark_DataFrame_4.ipynb
-rw-r--r-- 1 jovyan users  64553 Jul  4 12:19 1800.csv
-rw-r--r-- 1 jovyan users 286779 Apr 24  2022 transfer_cost.txt
-rw-r--r-- 1 jovyan users 146855 Apr 10  2022 customer-orders.csv


In [23]:
# When a large DataFrame is saved to HDFS, it is stored split into blocks.
# That is why the output is a directory rather than a single file, and why one
# DataFrame may end up spread across several files.
# Here the data is small, so it is stored as a single file inside the directory.
# The file whose name starts with "part" is the first data block.
!ls -tl extracted_transfer_cost.csv/

total 156
-rw-r--r-- 1 jovyan users      0 Jul  4 16:08 _SUCCESS
-rw-r--r-- 1 jovyan users 156423 Jul  4 16:08 part-00000-5b782253-00d3-4493-9c09-2f9b58baa464-c000.csv


In [24]:
# 처음 레코드 5개 읽기
!head -5 extracted_transfer_cost.csv/part-00000-5b782253-00d3-4493-9c09-2f9b58baa464-c000.csv

2021-01-04,85001,85002,$28.32,ABC Hauling
2021-01-04,85001,85004,$25.68,ABC Hauling
2021-01-04,85001,85007,19.86,ABC Hauling
2021-01-04,85001,85007,20.52,Haul Today
2021-01-04,85001,85010,20.72,Haul Today


In [25]:
# Save the same DataFrame as JSON (one JSON object per line).
# mode("overwrite") keeps the cell re-runnable: with the default save mode the
# write raises an error when "extracted.json" already exists.
final_df.write.format("json").mode("overwrite").save("extracted.json")

In [26]:
# Inspect the contents of the JSON output directory.
!ls -tl extracted.json/

total 428
-rw-r--r-- 1 jovyan users      0 Jul  4 16:11 _SUCCESS
-rw-r--r-- 1 jovyan users 436305 Jul  4 16:11 part-00000-2be98280-167e-4bce-b077-6a32c9382762-c000.json


In [27]:
# 처음 레코드 하나만 읽기
!head -1 extracted.json/part-00000-2be98280-167e-4bce-b077-6a32c9382762-c000.json

{"week":"2021-01-04","departure_zipcode":"85001","arrival_zipcode":"85002","cost":"$28.32","vendor":"ABC Hauling"}
