In [2]:
from pyspark.sql import SparkSession

# Obtain the SparkSession (application name "PySpark Example") that every
# later cell in this notebook uses through the global `spark`.
spark = (
    SparkSession.builder
    .appName("PySpark Example")
    .getOrCreate()
)

24/12/03 10:27:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Create an RDD (Resilient Distributed Dataset), Spark's distributed data object,
# from a local Python list. The bare name on the last line displays the RDD's repr.
rdd = spark.sparkContext.parallelize([1,2,3,4,5])
rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [4]:
# Derive a new RDD in which every element is multiplied by itself.
# Displaying the name shows the RDD object, not the squared values.
squared_rdd = rdd.map(lambda value: value * value)
squared_rdd

PythonRDD[1] at RDD at PythonRDD.scala:53

In [11]:
# take(n): fetch the first n elements of the RDD as a Python list
rdd.take(5)

[1, 2, 3, 4, 5]

In [12]:
# First five elements of the squared RDD
squared_rdd.take(5)

[1, 4, 9, 16, 25]

In [14]:
# collect(): return all elements of the RDD as a Python list
squared_rdd.collect()

[1, 4, 9, 16, 25]

# 데이터프레임 객체

In [15]:
# Sample (Name, Value) rows used to build the first DataFrame.
data = [
    ("Alice", 1),
    ("Bob", 2),
    ("Charlie", 3),
]
df = spark.createDataFrame(data, schema=["Name", "Value"])
df

DataFrame[Name: string, Value: bigint]

In [16]:
df.show() # prints the rows of the distributed DataFrame as a text table

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [24]:
# Keep only the rows whose Value column equals 2 and print them.
# The result is bound to `value_2_df` rather than `filter`: assigning to `filter`
# would shadow Python's built-in filter() for the rest of the kernel session.
value_2_df = df.filter(df['Value'] == 2)
value_2_df.show()

+----+-----+
|Name|Value|
+----+-----+
| Bob|    2|
+----+-----+



# RDBMS 데이터 - 테이블

In [25]:
# Register the DataFrame as the temporary view "people" so it can be queried with SQL
df.createOrReplaceTempView("people")

In [27]:
select_sql = "SELECT * FROM people WHERE Value > 2" # SQL text to run against the "people" temp view

In [28]:
# Run the SQL text through the session; the result is a DataFrame, printed with show()
result_sql = spark.sql(select_sql)
result_sql.show()

+-------+-----+
|   Name|Value|
+-------+-----+
|Charlie|    3|
+-------+-----+



In [34]:
# Practice: build a DataFrame, then extract subsets with select / filter / SQL.
# NOTE: this cell rebinds `df` and replaces the "people" temp view that an
# earlier cell registered, so re-running earlier cells afterwards gives different output.

# Fresh sample rows
new_data = [
    ("Dave", 4),
    ("Eve", 5),
    ("Frank", 6),
]
df = spark.createDataFrame(new_data, schema=["Name", "Value"])
df.show()

# Project only the 'Name' column
selected_df = df.select(df["Name"])
selected_df.show()

# Rows whose 'Value' is at least 5, using the DataFrame API
filtered_df = df.where(df.Value >= 5)
filtered_df.show()

# The same rows again, this time through a SQL query on a temp view
df.createOrReplaceTempView("people")
filtered_df_sql = spark.sql("SELECT * FROM people WHERE Value >= 5")
filtered_df_sql.show()

+-----+-----+
| Name|Value|
+-----+-----+
| Dave|    4|
|  Eve|    5|
|Frank|    6|
+-----+-----+

+-----+
| Name|
+-----+
| Dave|
|  Eve|
|Frank|
+-----+

+-----+-----+
| Name|Value|
+-----+-----+
|  Eve|    5|
|Frank|    6|
+-----+-----+

+-----+-----+
| Name|Value|
+-----+-----+
|  Eve|    5|
|Frank|    6|
+-----+-----+



# MLlib

In [35]:
!pip install numpy

Collecting numpy
  Downloading numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m174.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.24.4


In [36]:
import numpy as np

In [37]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [39]:
# Build a small (name, age) DataFrame
df = spark.createDataFrame(
    [("Alice", 25), ("Bob", 30), ("Charlie", 35)],
    schema=["name", "age"],
)

# DataFrame operation: keep only the rows with age greater than 28
df_filtered = df.where(df["age"] > 28)
df_filtered.show()

+-------+---+
|   name|age|
+-------+---+
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [42]:
# Pack the 'age' column into a single vector column named 'features',
# then append that column to every row of `df`.
assembler = VectorAssembler(
    inputCols=['age'],
    outputCol='features',
)
vector_df = assembler.transform(df)

In [43]:
# Fit a linear regression on the assembled vectors.
# NOTE(review): the label column ('age') is the very column that was assembled
# into 'features', so the fit can only learn the identity mapping. This is
# fine as an API walkthrough but is not a meaningful model.
lr = LinearRegression(
    labelCol='age',
    featuresCol='features',
)
model = lr.fit(vector_df)

24/12/03 11:34:11 WARN Instrumentation: [3355206a] regParam is zero, which might cause numerical instability and overfitting.
24/12/03 11:34:11 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/12/03 11:34:11 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/12/03 11:34:11 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
24/12/03 11:34:11 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


In [44]:
# Apply the fitted model to the same rows it was trained on; transform() adds a
# 'prediction' column, which show() prints next to the inputs.
pred = model.transform(vector_df)
pred.show()

+-------+---+--------+------------------+
|   name|age|features|        prediction|
+-------+---+--------+------------------+
|  Alice| 25|  [25.0]|25.000000000000036|
|    Bob| 30|  [30.0]|30.000000000000004|
|Charlie| 35|  [35.0]| 34.99999999999997|
+-------+---+--------+------------------+



In [45]:
# Shut down the Spark session now that the walkthrough is finished
spark.stop()