In [1]:
# List the files that make up the employees dataset in HDFS.
!hadoop fs -ls /public/hr_db/employees

Found 5 items
-rw-r--r--   2 hdfs supergroup          0 2021-01-28 11:23 /public/hr_db/employees/_SUCCESS
-rw-r--r--   2 hdfs supergroup       2123 2021-01-28 09:29 /public/hr_db/employees/part-m-00000
-rw-r--r--   2 hdfs supergroup       2159 2021-01-28 10:54 /public/hr_db/employees/part-m-00001
-rw-r--r--   2 hdfs supergroup       2145 2021-01-28 08:16 /public/hr_db/employees/part-m-00002
-rw-r--r--   2 hdfs supergroup       2121 2021-01-28 09:29 /public/hr_db/employees/part-m-00003


In [2]:
# Peek at the start of one part file to see the raw record layout
# (tab-separated fields, no header row in the sample shown).
!hadoop fs -head /public/hr_db/employees/part-m-00002

154	Nanette	Cambrault	NCAMBRAU	011.44.1344.987668	1998-12-09	SA_REP	7500.00	0.20	145	80
155	Oliver	Tuvault	OTUVAULT	011.44.1344.486508	1999-11-23	SA_REP	7000.00	0.15	145	80
156	Janette	King	JKING	011.44.1345.429268	1996-01-30	SA_REP	10000.00	0.35	146	80
157	Patrick	Sully	PSULLY	011.44.1345.929268	1996-03-04	SA_REP	9500.00	0.35	146	80
158	Allan	McEwen	AMCEWEN	011.44.1345.829268	1996-08-01	SA_REP	9000.00	0.35	146	80
159	Lindsey	Smith	LSMITH	011.44.1345.729268	1997-03-10	SA_REP	8000.00	0.30	146	80
160	Louise	Doran	LDORAN	011.44.1345.629268	1997-12-15	SA_REP	7500.00	0.30	146	80
161	Sarath	Sewall	SSEWALL	011.44.1345.529268	1998-11-03	SA_REP	7000.00	0.25	146	80
162	Clara	Vishney	CVISHNEY	011.44.1346.129268	1997-11-11	SA_REP	10500.00	0.25	147	80
163	Danielle	Greene	DGREENE	011.44.1346.229268	1999-03-19	SA_REP	9500.00	0.15	147	80
164	Mattea	Marvins	MMARVINS	011.44.1346.329268	2000-01-24	SA_REP	7200.00	0.10	147	80
165	David	Lee	DLEE	011.44.1346.529268	2000-02-23	SA_REP	6800.00	0.10	147	80
166	S

In [3]:
import getpass as gp

from pyspark.sql import SparkSession, functions as F, types as T
from pyspark.sql.utils import AnalysisException

In [4]:
# Login name of the current user; used below to namespace the Spark app name
# and the HDFS paths this notebook writes to.
user = gp.getuser()
user

'itv005077'

In [5]:
# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
# The application name and the SQL warehouse directory are both namespaced by
# the current user.
# The catalog setting was previously misspelled as 'saprk.sql.catalogImplementation',
# which only registered a stray key; the correct key is used here.
spark = (
    SparkSession.builder
    .appName(f'{user}-dataframe-writer-example')
    .master('yarn')
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse')
    .config('spark.sql.catalogImplementation', 'hive')
    .enableHiveSupport()
    .getOrCreate()
)

In [6]:
# Display the session object to confirm it was created.
spark

# Read from a Parquet file directly, without using the reader APIs

In [7]:
# Query the Parquet data in place: the `parquet.` prefix followed by a
# back-quoted path lets Spark SQL read the files without a registered table.
orders_parquet_sql = "SELECT * FROM parquet.`/public/trendytech/datasets/ordersparquet`"
spark.sql(orders_parquet_sql).show()

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
|      11318|2013-07-25 00:00:...|       5|       COMPLETE|
|       7130|2013-07-25 00:00:...|       6|       COMPLETE|
|       4530|2013-07-25 00:00:...|       7|       COMPLETE|
|       2911|2013-07-25 00:00:...|       8|     PROCESSING|
|       5657|2013-07-25 00:00:...|       9|PENDING_PAYMENT|
|       5648|2013-07-25 00:00:...|      10|PENDING_PAYMENT|
|        918|2013-07-25 00:00:...|      11| PAYMENT_REVIEW|
|       1837|2013-07-25 00:00:...|      12|         CLOSED|
|       9149|2013-07-25 00:00:...|      13|PENDING_PAYMENT|
|       9842|2013-07-25 00:00:...|      

In [8]:
# Explicit schema for the employees data, declared as (column name, Spark type)
# pairs in field order. It is passed to the reader in place of schema inference.
schema = T.StructType([
    T.StructField(column, dtype)
    for column, dtype in (
        ('employee_id', T.IntegerType()),
        ('first_name', T.StringType()),
        ('last_name', T.StringType()),
        ('email', T.StringType()),
        ('phone_number', T.StringType()),
        ('hire_date', T.DateType()),
        ('job_id', T.StringType()),
        ('salary', T.FloatType()),
        ('commission_pct', T.FloatType()),
        ('manager_id', T.IntegerType()),
        ('department_id', T.IntegerType()),
    )
])

In [9]:
# Load every part file under the employees directory as tab-delimited CSV,
# applying the explicit schema defined above.
df_emp = (
    spark.read
    .schema(schema)
    .option('delimiter', '\t')
    .csv('/public/hr_db/employees')
)

In [10]:
# Preview the loaded rows to check that the schema was applied as expected.
df_emp.show()

+-----------+----------+----------+--------+------------------+----------+--------+-------+--------------+----------+-------------+
|employee_id|first_name| last_name|   email|      phone_number| hire_date|  job_id| salary|commission_pct|manager_id|department_id|
+-----------+----------+----------+--------+------------------+----------+--------+-------+--------------+----------+-------------+
|        127|     James|    Landry| JLANDRY|      650.124.1334|1999-01-14|ST_CLERK| 2400.0|          null|       120|           50|
|        128|    Steven|    Markle| SMARKLE|      650.124.1434|2000-03-08|ST_CLERK| 2200.0|          null|       120|           50|
|        129|     Laura|    Bissot| LBISSOT|      650.124.5234|1997-08-20|ST_CLERK| 3300.0|          null|       121|           50|
|        130|     Mozhe|  Atkinson|MATKINSO|      650.124.6234|1997-10-30|ST_CLERK| 2800.0|          null|       121|           50|
|        131|     James|    Marlow| JAMRLOW|      650.124.7234|1997-02-16|ST

In [11]:
# Number of partitions backing the DataFrame; the non-partitioned write below
# produced the same number of part files in the recorded run.
df_emp.rdd.getNumPartitions()

2

In [12]:
# The SparkContext's default minimum partition count, shown for comparison
# with the DataFrame's partition count above.
spark.sparkContext.defaultMinPartitions

2

In [14]:
# Write the DataFrame as CSV with a header row, replacing anything that is
# already at the target path.
(
    df_emp.write
    .mode('overwrite')
    .option('header', True)
    .csv(f'/user/{user}/spark_write/non-part/default/')
)

In [15]:
# List the files produced by the write, with human-readable sizes. The relative
# path appears to resolve against the user's HDFS home directory.
!hadoop fs -ls -h spark_write/non-part/default

Found 3 items
-rw-r--r--   3 itv005077 supergroup          0 2023-07-02 15:49 spark_write/non-part/default/_SUCCESS
-rw-r--r--   3 itv005077 supergroup      4.2 K 2023-07-02 15:49 spark_write/non-part/default/part-00000-3c9f5af5-def1-4997-b1e6-fbac2b33c3db-c000.csv
-rw-r--r--   3 itv005077 supergroup      4.1 K 2023-07-02 15:49 spark_write/non-part/default/part-00001-3c9f5af5-def1-4997-b1e6-fbac2b33c3db-c000.csv


In [17]:
# Inspect the start of one written part file to confirm the header row and the
# comma delimiter.
# NOTE(review): the part-file name embeds a per-write UUID, so this exact path
# only exists for the recorded run; update it after re-running the write.
!hadoop fs -head spark_write/non-part/default/part-00001-3c9f5af5-def1-4997-b1e6-fbac2b33c3db-c000.csv

employee_id,first_name,last_name,email,phone_number,hire_date,job_id,salary,commission_pct,manager_id,department_id
100,Steven,King,SKING,515.123.4567,1987-06-17,AD_PRES,24000.0,"","",90
101,Neena,Kochhar,NKOCHHAR,515.123.4568,1989-09-21,AD_VP,17000.0,"",100,90
102,Lex,De Haan,LDEHAAN,515.123.4569,1993-01-13,AD_VP,17000.0,"",100,90
103,Alexander,Hunold,AHUNOLD,590.423.4567,1990-01-03,IT_PROG,9000.0,"",102,60
104,Bruce,Ernst,BERNST,590.423.4568,1991-05-21,IT_PROG,6000.0,"",103,60
105,David,Austin,DAUSTIN,590.423.4569,1997-06-25,IT_PROG,4800.0,"",103,60
106,Valli,Pataballa,VPATABAL,590.423.4560,1998-02-05,IT_PROG,4800.0,"",103,60
107,Diana,Lorentz,DLORENTZ,590.423.5567,1999-02-07,IT_PROG,4200.0,"",103,60
108,Nancy,Greenberg,NGREENBE,515.124.4569,1994-08-17,FI_MGR,12000.0,"",101,100
109,Daniel,Faviet,DFAVIET,515.124.4169,1994-08-16,FI_ACCOUNT,9000.0,"",108,100
110,John,Chen,JCHEN,515.124.4269,1997-09-28,FI_ACCOUNT,8200.0,"",108,100
111,Ismael,Sciarra,ISCIARRA,515.124.4369,1997-09-30,FI_AC

In [18]:
# The default write mode is `errorIfExists`, so saving to a path that already
# exists fails with:
#   AnalysisException: path <PATH> already exists.;
# The failure is the point of this cell, so it is caught and printed instead of
# being left to abort a "Restart & Run All" pass.
try:
    df_emp.write \
        .format('csv') \
        .option('header', True) \
        .save(f'/user/{user}/spark_write/non-part/default/')
except AnalysisException as err:
    print(f'AnalysisException: {err}')

AnalysisException: path hdfs://m01.itversity.com:9000/user/itv005077/spark_write/non-part/default already exists.;

In [20]:
# Append mode: add a fresh set of part files next to the ones already present
# at the target path instead of failing or replacing them.
(
    df_emp.write
    .mode('append')
    .option('header', True)
    .csv(f'/user/{user}/spark_write/non-part/default/')
)

In [21]:
# List the output directory again: after the append, the earlier part files
# should still be there alongside the newly written ones.
!hadoop fs -ls -h spark_write/non-part/default

Found 5 items
-rw-r--r--   3 itv005077 supergroup          0 2023-07-02 15:50 spark_write/non-part/default/_SUCCESS
-rw-r--r--   3 itv005077 supergroup      4.2 K 2023-07-02 15:49 spark_write/non-part/default/part-00000-3c9f5af5-def1-4997-b1e6-fbac2b33c3db-c000.csv
-rw-r--r--   3 itv005077 supergroup      4.2 K 2023-07-02 15:50 spark_write/non-part/default/part-00000-f226e1ae-146f-451c-ba01-111a17cf6548-c000.csv
-rw-r--r--   3 itv005077 supergroup      4.1 K 2023-07-02 15:49 spark_write/non-part/default/part-00001-3c9f5af5-def1-4997-b1e6-fbac2b33c3db-c000.csv
-rw-r--r--   3 itv005077 supergroup      4.1 K 2023-07-02 15:50 spark_write/non-part/default/part-00001-f226e1ae-146f-451c-ba01-111a17cf6548-c000.csv


In [22]:
# Clean up: recursively remove everything this notebook wrote. In the recorded
# run the directory was moved to the user's HDFS trash.
!hadoop fs -rm -r spark_write

2023-07-02 15:50:44,370 INFO fs.TrashPolicyDefault: Moved: 'hdfs://m01.itversity.com:9000/user/itv005077/spark_write' to trash at: hdfs://m01.itversity.com:9000/user/itv005077/.Trash/Current/user/itv005077/spark_write1688327444343


In [23]:
# List the HDFS home directory to confirm that spark_write is gone.
!hadoop fs -ls 

Found 3 items
drwx------   - itv005077 supergroup          0 2023-05-30 14:03 .Trash
drwxr-xr-x   - itv005077 supergroup          0 2023-07-02 15:48 .sparkStaging
drwxr-xr-x   - itv005077 supergroup          0 2023-06-25 15:30 warehouse


In [24]:
# Stop the Spark session now that the notebook is finished with it.
spark.stop()