# Start Hadoop-YARN

In [1]:
# Hadoop (HDFS) and YARN must be running before Spark can use YARN as its cluster manager (master "yarn")

In [2]:
# ! start-all.sh

In [1]:
# Initialise findspark with the Spark installation path before pyspark is imported.
import findspark

findspark.init(spark_home="/opt/manual/spark/")

In [2]:
from pyspark.sql import SparkSession, functions as F

In [3]:
# Note: the session uses "yarn" as its master and is built with Hive support enabled.

spark = (
    SparkSession.builder
    .appName("Spark YARN Example")
    .master("yarn")
    .enableHiveSupport()
    .getOrCreate()
)

# Read from HDFS

In [6]:
#! hdfs dfs -put ~/datasets/Advertising.csv hdfs://localhost:9000/user/train/datasets/

In [4]:
# Load the advertising CSV from HDFS: the first line is treated as the header,
# column types are inferred, and fields are comma-separated.
df_hdfs = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True, sep=",")
    .load("hdfs://localhost:9000/user/train/datasets/Advertising.csv")
)

In [5]:
# Preview the first five rows as a pandas DataFrame (the limit is applied before the conversion).
df_hdfs.limit(5).toPandas()

Unnamed: 0,ID,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


# Read from Hive

In [6]:
# List the databases visible to this Spark session.
spark.sql("show databases").show()

+----------+
| namespace|
+----------+
| bookstore|
|   default|
|homecredit|
|    retail|
|     test1|
|     test2|
|     train|
+----------+



In [8]:
# ! hdfs dfs -ls hdfs://localhost:9000/user/hive/warehouse/test1.db

In [12]:
# List the tables of the test1 database. The database is named explicitly so the
# result does not depend on which database happens to be current in the session
# (no cell in this notebook switches the current database).
spark.sql("show tables in test1").show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
|   test1|         advertising|      false|
|   test1|     advertising_ext|      false|
|   test1|advertising_sales...|      false|
|   test1|               churn|      false|
|   test1|      churn_exited_1|      false|
|   test1|          hotels_orc|      false|
|   test1|   hotels_orc_snappy|      false|
|   test1|      hotels_parquet|      false|
|   test1|     hotels_parquet2|      false|
|   test1|hotels_parquet_sn...|      false|
|   test1|          hotels_prt|      false|
|   test1|         hotels_text|      false|
|   test1|  hotels_text_snappy|      false|
|   test1|     mytable_renamed|      false|
|   test1|sales_part_countr...|      false|
|   test1|sales_partitioned...|      false|
|   test1|      world_classics|      false|
+--------+--------------------+-----------+



In [10]:
# Select every column of the advertising table; the table name is qualified with its database (test1).
df_hive = spark.sql("select * from test1.advertising")

In [13]:
# Preview the first five rows as a pandas DataFrame.
# NOTE(review): the first row in the saved output is entirely empty — presumably the CSV
# header line was loaded into the Hive table as a data row; verify the table definition
# (e.g. the skip.header.line.count table property) before relying on this data.
df_hive.limit(5).toPandas()

Unnamed: 0,id,tv,radio,newspaper,sales
0,,,,,
1,1.0,230.100006,37.799999,69.199997,22.1
2,2.0,44.5,39.299999,45.099998,10.4
3,3.0,17.200001,45.900002,69.300003,9.3
4,4.0,151.5,41.299999,58.5,18.5


In [14]:
# Stop the Spark session now that the notebook no longer needs it.
spark.stop()

# Stop Hadoop-YARN

In [None]:
# ! stop-all.sh