In [1]:
from pyspark.sql import SparkSession

# Create the notebook's SparkSession, or reuse an existing one (getOrCreate),
# against the "local" master under the application name "spark-dataframe".
spark = (
    SparkSession.builder
    .master("local")
    .appName("spark-dataframe")
    .getOrCreate()
)

파일이나 다른 데이터 소스로부터 스파크 데이터 프레임을 만드는 방법
* `spark.read.xxx(DataSource 경로)`

In [2]:
import os

# Location of the course data set.
# The directory defaults to the original course path, but can be overridden with the
# SPARK_COURSE_DATA_DIR environment variable so the notebook also runs where the data
# lives elsewhere. Keep it an absolute path: a later cell embeds it in a file: URI.
directory = os.environ.get("SPARK_COURSE_DATA_DIR", "/home/tutor/SparkCourse/data")
filename = "titanic_train.csv"

Pandas로 csv 데이터 불러오기

In [3]:
import pandas as pd

# Read the Titanic CSV into a pandas DataFrame; header='infer' (the pandas default)
# takes the column names from the first line of the file.
titanic_pdf = pd.read_csv(f"{directory}/{filename}", header='infer')
# Preview the leading rows of the frame.
titanic_pdf.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Spark로 csv 데이터 불러오기

In [4]:
# header=True      : use the first CSV line as the column names
# inferSchema=True : let Spark infer each column's data type from the data
# `directory` is an absolute path (it starts with "/"), so "file://" + directory gives the
# canonical file:///home/... URI; a "file:///" prefix would yield file:////home/... with a
# redundant slash.
titanic_sdf = spark.read.csv(f"file://{directory}/{filename}", header=True, inferSchema=True)
titanic_sdf

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]

In [5]:
# show() prints the leading rows as a text table; long string values are truncated with "...".
titanic_sdf.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

# Pandas DataFrame과 Spark DataFrame의 주요 차이

## Spark DataFrame은 SQL 연산과 비슷한 연산자를 제공
- `spark_dataframe.select('컬럼명')`
- `spark_dataframe.select('컬럼명').filter(...)`
    - `filter` : where절에 해당!
- `spark_dataframe.groupBy('컬럼명').count()`
- `spark_dataframe.withColumn('컬럼명', ...)`

## Spark Dataframe의 연산의 특징
- Spark DataFrame의 연산은 대부분 새로운 DataFrame 객체를 반환하는 형태로 구성
- 특히 DataFrame 객체에 직접 수정을 허용하지 않는다.
    - Spark DataFrame도 RDD의 Immutable 특징을 그대로 가져간다.
- **pandas의 경우**
    - `pandas_dataframe.drop('컬럼명', axis=1 ,inplace=True)` 호출하면 `pandas_dataframe` 객체 자체에서 `'컬럼명'`을 `drop` 시킨다.
- **Spark Dataframe의 경우**
    - `spark_dataframe_new = spark_dataframe.drop('컬럼명')`과 같이 `inplace`인자가 아예 없음
    

## Spark Dataframe은 `[ ]` 연산자 활용이 제한적이다.
- **Pandas의 경우**
    - 특정 컬럼값을 가져오거나, 새로운 컬럼을 만들기 위해서 사용
    - `pandas_dataframe['new_column'] = pandas_dataframe['column'] * 10`
- **Spark DataFrame의 경우**
    - `withColumn()` 메소드를 활용해야 한다.
        - `update` 효과가 있다.
    - `spark_dataframe.withColumn('new_column', col('column') * 10 )`
    - `[ ]` 연산자는 `withColumn()`, `filter()` 메소드에서 컬럼을 지정하기 위해서만 사용

## Spark Dataframe은 여러 컬럼을 접근(access)하기 위해 리스트(`[ ] `)를 사용하지 않는다.
- **Pandas의 경우**
    - `pandas_dataframe.drop(['컬럼1', '컬럼2'], ...)`
- **Spark DataFrame의 경우**
    - `spark_dataframe.drop('컬럼1', '컬럼2')`

Pandas DataFrame의 head(), Spark DataFrame의 head() 비교

In [8]:
# pandas head(N) returns a DataFrame holding the first N records (rows).
titanic_pdf.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [11]:
# Spark DataFrame head(N) returns the first N rows as a Python list of Row objects,
# not as a DataFrame.
titanic_sdf.head(10)

[Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S'),
 Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C'),
 Row(PassengerId=3, Survived=1, Pclass=3, Name='Heikkinen, Miss. Laina', Sex='female', Age=26.0, SibSp=0, Parch=0, Ticket='STON/O2. 3101282', Fare=7.925, Cabin=None, Embarked='S'),
 Row(PassengerId=4, Survived=1, Pclass=1, Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Sex='female', Age=35.0, SibSp=1, Parch=0, Ticket='113803', Fare=53.1, Cabin='C123', Embarked='S'),
 Row(PassengerId=5, Survived=0, Pclass=3, Name='Allen, Mr. William Henry', Sex='male', Age=35.0, SibSp=0, Parch=0, Ticket='373450', Fare=8.05, Cabin=None, Embarked='S'),
 Row(PassengerId=6, Survived=0, Pclass=3, Name='Moran, Mr. James', Sex='male',

In [13]:
# Spark DataFrame limit(N) is the call that returns a DataFrame holding the first N records.
titanic_sdf.limit(10).show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [14]:
# Evaluating a pandas DataFrame displays its data (abbreviated with "..." for long frames).
titanic_pdf

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [15]:
# Evaluating/printing a Spark DataFrame only shows its schema (column names and types),
# not its data.
titanic_sdf

DataFrame[PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]

# info()에 대응하는 spark DataFrame의 로직
- `pandas dataframe`의 `info()`는 컬럼명, Data Type, not null 건수도 나온다.
- `spark dataframe`은 `info()`가 없으며, 대신 `printSchema()` 메소드로 스키마(컬럼명, data type)만 출력 (`describe()`는 스키마가 아니라 요약 통계 DataFrame을 반환)
- not null 건수를 위해서는 별도의 SQL 쿼리 작성이 필요.

In [16]:
# pandas info(): column names, non-null counts and dtypes in a single summary.
titanic_pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [17]:
# describe() returns a new DataFrame (a `summary` column plus one string column per input
# column); without .show() only that DataFrame's schema is displayed, not its contents.
titanic_sdf.describe()

DataFrame[summary: string, PassengerId: string, Survived: string, Pclass: string, Name: string, Sex: string, Age: string, SibSp: string, Parch: string, Ticket: string, Fare: string, Cabin: string, Embarked: string]

In [18]:
# printSchema() prints every column's name, data type and nullability as a tree.
titanic_sdf.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [19]:
# Count the MISSING values (null, plus NaN for floating-point columns) in every column.
# This is the complement of the non-null counts that pandas info() reports.
from pyspark.sql.functions import count, isnan, when, col

# isnan() is only defined for floating-point data, so it is applied to float/double columns
# only; all other column types are checked with isNull() alone. This keeps the query valid
# for frames that contain e.g. date, timestamp or boolean columns.
titanic_sdf.select([
    count(
        when(
            (isnan(c) | col(c).isNull()) if dtype in ("float", "double") else col(c).isNull(),
            c,
        )
    ).alias(c)
    for c, dtype in titanic_sdf.dtypes
]).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



**select()**

In [20]:
# Small column-oriented sample data set: five people with a Name, a Year and a Gender each.
dict_01 = dict(
    Name=['민석', '민호', '성현', '현주', '상기'],
    Year=[2011, 2016, 2015, 2015, 2011],
    Gender=['Male', 'Male', 'Male', 'Female', 'Male'],
)

In [21]:
# Build a pandas DataFrame from the sample dict, then convert that pandas DataFrame
# into a Spark DataFrame.
data_pdf = pd.DataFrame(data=dict_01)
data_sdf = spark.createDataFrame(data=data_pdf)

In [22]:
# pandas: indexing with a single column name returns that column as a Series.
data_pdf['Name']

0    민석
1    민호
2    성현
3    현주
4    상기
Name: Name, dtype: object

In [23]:
# pandas: indexing with a list of column names returns a DataFrame with just those columns.
data_pdf[['Name', "Year"]]

Unnamed: 0,Name,Year
0,민석,2011
1,민호,2016
2,성현,2015
3,현주,2015
4,상기,2011


In [25]:
# Spark: select() with a column name returns a DataFrame containing only that column.
data_sdf.select("Name").show()

+----+
|Name|
+----+
|민석|
|민호|
|성현|
|현주|
|상기|
+----+



In [26]:
# Several columns are passed as separate arguments rather than as a list.
data_sdf.select("Name", "Year").show() # SQL: select Name, Year from data_sdf

+----+----+
|Name|Year|
+----+----+
|민석|2011|
|민호|2016|
|성현|2015|
|현주|2015|
|상기|2011|
+----+----+



In [27]:
# "*" selects every column, just like in SQL.
data_sdf.select("*").show() # SQL: select * from data_sdf

+----+----+------+
|Name|Year|Gender|
+----+----+------+
|민석|2011|  Male|
|민호|2016|  Male|
|성현|2015|  Male|
|현주|2015|Female|
|상기|2011|  Male|
+----+----+------+



In [28]:
# Attribute access on a Spark DataFrame yields a Column object (a reference), not the data.
data_sdf.Name

Column<'Name'>

In [29]:
# Bracket access with a column name yields the same kind of Column object.
data_sdf["Name"]

Column<'Name'>

In [30]:
# Columns referenced as DataFrame attributes can be passed as select() arguments.
data_sdf.select(data_sdf.Name, data_sdf.Year).show()

+----+----+
|Name|Year|
+----+----+
|민석|2011|
|민호|2016|
|성현|2015|
|현주|2015|
|상기|2011|
+----+----+



In [31]:
# Columns referenced with bracket syntax can be passed as select() arguments as well.
data_sdf.select(data_sdf["Name"], data_sdf["Year"]).show()

+----+----+
|Name|Year|
+----+----+
|민석|2011|
|민호|2016|
|성현|2015|
|현주|2015|
|상기|2011|
+----+----+



In [33]:
from pyspark.sql.functions import col

# col() builds an explicit column reference from a column name; the references are
# generated per name and unpacked as select()'s positional arguments.
data_sdf.select(*(col(name) for name in ("Name", "Year"))).show()

+----+----+
|Name|Year|
+----+----+
|민석|2011|
|민호|2016|
|성현|2015|
|현주|2015|
|상기|2011|
+----+----+



In [36]:
from pyspark.sql.functions import upper, lower, col

# select() can also emit derived columns computed from existing ones.
gender_upper = upper(col("Gender"))

# SQL: select *, upper(Gender) from data_sdf
data_sdf.select("*", gender_upper).show()

# SQL: select *, upper(Gender) as CAP_GENDER from data_sdf
data_sdf.select("*", gender_upper.alias("CAP_GENDER")).show()

+----+----+------+-------------+
|Name|Year|Gender|upper(Gender)|
+----+----+------+-------------+
|민석|2011|  Male|         MALE|
|민호|2016|  Male|         MALE|
|성현|2015|  Male|         MALE|
|현주|2015|Female|       FEMALE|
|상기|2011|  Male|         MALE|
+----+----+------+-------------+

+----+----+------+----------+
|Name|Year|Gender|CAP_GENDER|
+----+----+------+----------+
|민석|2011|  Male|      MALE|
|민호|2016|  Male|      MALE|
|성현|2015|  Male|      MALE|
|현주|2015|Female|    FEMALE|
|상기|2011|  Male|      MALE|
+----+----+------+----------+

