## Google Drive 연동

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## JAVA 설치
- JVM 실행 위해서는 JAVA 설치 필수

In [None]:
# Install the headless OpenJDK 8 JDK; -qq and the redirect to /dev/null keep the cell output quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

## Spark 설치
- 기존 : Web Link를 통해서 다운로드 받은 후, 압축파일 풀기
- 오늘 : 구글 드라이브에서 파일을 가져오기

In [None]:
# Change into the Drive folder that holds the Spark archive (spark-3.1.1-bin-hadoop2.7.tgz).
%cd /content/drive/MyDrive/Colab Notebooks/2023/멀티캠퍼스/spark

/content/drive/MyDrive/Colab Notebooks/2023/멀티캠퍼스/spark


In [None]:
!pwd

/content/drive/MyDrive/Colab Notebooks/2023/멀티캠퍼스/spark


In [None]:
!ls

 ch02   chapter02		       'Spark Tutorial Template.ipynb'
 ch03   ml			        스파크00프로젝트.ipynb
 ch04   spark-3.1.1-bin-hadoop2.7.tgz  '스파크 구글코랩 템플릿.ipynb'


In [None]:
# Copy the Spark archive from the current Drive folder to /content.
!cp -r spark-3.1.1-bin-hadoop2.7.tgz /content

In [None]:
# Continue working from /content.
%cd /content/

/content


In [None]:
!pwd

/content


In [None]:
!ls

drive  ngrok-stable-linux-amd64.zip  spark-3.1.1-bin-hadoop2.7
ngrok  sample_data		     spark-3.1.1-bin-hadoop2.7.tgz


In [None]:
# Extract the Spark archive in the current directory.
!tar xf spark-3.1.1-bin-hadoop2.7.tgz > /dev/null

## 환경변수 설정
- 일반적으로 vi 편집기를 활용해서 작업
- 구글코랩 : os 라이브러리 사용해서 환경변수 지정

In [None]:
import os

# Point this process at the Java installation and the extracted Spark directory.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop2.7",
    }
)

## PySpark 설치

In [None]:
# Install PySpark 3.1.1, the same version as the Spark 3.1.1 archive that SPARK_HOME points to.
!pip install -q pyspark==3.1.1

## Ngrok 설정
- 회원가입해서 Token을 받는다.
- 사이트 : https://ngrok.com/


In [None]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip &> /dev/null
!unzip ngrok-stable-linux-amd64.zip &> /dev/null

In [None]:
# Install pyngrok. NOTE(review): the cells shown here never import pyngrok (the tunnel is
# started with the ./ngrok binary) and the version is not pinned; confirm this is still needed.
!pip install pyngrok



In [None]:
!./ngrok authtoken 7551hxosxCMUhFXL2A18g_7WXBHeNkUZ1Sg9eSgM7Kd

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


## 테스트

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the session named 'mulCamp28' with the Spark UI on port 4050.
spark = (
    SparkSession.builder
    .appName('mulCamp28')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)
spark

In [None]:
# Load Spark's README as a DataFrame, then print the DataFrame and its type.
strings = spark.read.text("/content/spark-3.1.1-bin-hadoop2.7/README.md")
print(strings, type(strings), sep="\n")

DataFrame[value: string]
<class 'pyspark.sql.dataframe.DataFrame'>


In [None]:
# Keep only the rows whose text contains "Spark", then print the result and its type.
filtered = strings.filter(strings["value"].contains("Spark"))
print(filtered, type(filtered), sep="\n")

DataFrame[value: string]
<class 'pyspark.sql.dataframe.DataFrame'>


In [None]:
!ls

drive  ngrok-stable-linux-amd64.zip    sample_data		  spark-3.1.1-bin-hadoop2.7.tgz
ngrok  ngrok-stable-linux-amd64.zip.1  spark-3.1.1-bin-hadoop2.7


In [None]:
# !cat spark-3.1.1-bin-hadoop2.7/README.md

In [None]:
# Number of rows in the filtered DataFrame (README lines containing "Spark").
filtered.count()

19

## CSV 파일 불러오기
- CSV 포맷으로 파일 읽어서 데이터프레임에 저장함
- 스키마를 추론하도록 하고, 쉼표로 구분된 컬럼 이름이 담긴 헤더가 있음을 지정함

### 교재

In [None]:
# Reference only: the standalone script version checks its command-line argument before
# reading the file. The original line here was an incomplete `if` statement (no condition
# body, and `sys` was not imported), so it is kept as a comment; in this notebook the CSV
# path is assigned directly to a variable instead.
# Sketch of the script-side check (presumably what the textbook does; confirm against it):
#
#   import sys
#   if len(sys.argv) != 2:
#       print("Usage: mnmcount <file>", file=sys.stderr)
#       sys.exit(-1)
#
# Run from a terminal as: mnmcount.py data/mnm_dataset.csv (e.g. on GCP, as a quick test).

In [None]:
# Location of the M&M dataset on the mounted Drive.
mnm_file = '/content/drive/MyDrive/Colab Notebooks/2023/멀티캠퍼스/spark/chapter02/data/mnm_dataset.csv'

# Read the CSV with a header row and let Spark infer the column types.
mnm_df = (
    spark.read.format('csv')
    .option("header", "true")
    .option("inferSchema", "true")
    .load(mnm_file)
)

# Show the first 10 rows without truncating values (similar to pandas head()).
mnm_df.show(n=10, truncate=False)

+-----+------+-----+
|State|Color |Count|
+-----+------+-----+
|TX   |Red   |20   |
|NV   |Blue  |66   |
|CO   |Blue  |79   |
|OR   |Blue  |71   |
|WA   |Yellow|93   |
|WY   |Blue  |16   |
|CA   |Yellow|53   |
|WA   |Green |60   |
|OR   |Green |71   |
|TX   |Green |68   |
+-----+------+-----+
only showing top 10 rows



- GROUP BY State별, Color별 개수를 요약해보자

In [None]:
# Sum Count per (State, Color) pair and sort by the total, largest first.
count_mnm_df = (
    mnm_df
    .select("State", "Color", "Count")
    .groupBy("State", "Color")
    .sum("Count")
    .orderBy("sum(Count)", ascending=False)
)

# Show the top two groups.
count_mnm_df.show(n = 2, truncate=False)

+-----+------+----------+
|State|Color |sum(Count)|
+-----+------+----------+
|CA   |Yellow|100956    |
|WA   |Green |96486     |
+-----+------+----------+
only showing top 2 rows



In [None]:
count_mnm_df.count() # number of rows, i.e. one per (State, Color) group

60

- CA만 확인하자

In [None]:
# Same aggregation as above, restricted to rows where State is 'CA'.
ca_count_mnm_df = (
    mnm_df
    .select('*')
    .where(mnm_df.State == 'CA')
    .groupBy("State", "Color")
    .sum("Count")
    .orderBy("sum(Count)", ascending=False)
)

ca_count_mnm_df.show()

+-----+------+----------+
|State| Color|sum(Count)|
+-----+------+----------+
|   CA|Yellow|    100956|
|   CA| Brown|     95762|
|   CA| Green|     93505|
|   CA|   Red|     91527|
|   CA|Orange|     90311|
|   CA|  Blue|     89123|
+-----+------+----------+



In [None]:
# Number of rows in the CA-only summary (one per Color).
ca_count_mnm_df.count()

6

In [None]:
get_ipython().system_raw('./ngrok http 4050 &')
!curl -s http://localhost:4040/api/tunnels

In [None]:
# Fetch the tunnel list as JSON and print the public URL of the first tunnel.
# NOTE(review): indexing ['tunnels'][0] fails if no tunnel is up yet.
!curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

http://4c95-34-139-255-88.ngrok-free.app


## Spark 종료

In [None]:
# Left disabled; uncomment to stop the SparkSession when finished.
# spark.stop()