In [1]:
%matplotlib inline
import pyspark.sql.functions as F
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

In [2]:
aisles = spark.read \
    .options(header=True, inferSchema=True) \
    .csv("D:/DADOS USUARIO/Documents/springboard/capstone project/Instacart Kaggle/aisles.csv")
dptmts = sqlContext.read \
    .options(header=True, inferSchema=True) \
    .csv("D:/DADOS USUARIO/Documents/springboard/capstone project/Instacart Kaggle/departments.csv")
prod_in_orders = sqlContext.read \
    .options(header=True, inferSchema=True) \
    .csv("D:/DADOS USUARIO/Documents/springboard/capstone project/Instacart Kaggle/order_products__prior.csv")
all_orders = sqlContext.read \
    .options(header=True, inferSchema=True) \
    .csv("D:/DADOS USUARIO/Documents/springboard/capstone project/Instacart Kaggle/orders.csv")
train = sqlContext.read \
    .options(header=True, inferSchema=True) \
    .csv("D:/DADOS USUARIO/Documents/springboard/capstone project/Instacart Kaggle/order_products__train.csv")
products = sqlContext.read \
    .options(header=True, inferSchema=True) \
    .csv("D:/DADOS USUARIO/Documents/springboard/capstone project/Instacart Kaggle/products.csv")

# 1 - Merge of products, aisles and departments and delete dptmns & aisles

In [3]:
#The first join will connect the products tables
prod_full = products.join(aisles, on='aisle_id')
prod_full = prod_full.join(dptmts, on='department_id')
#delete aisles and dptmts dataframes
aisles.unpersist()
dptmts.unpersist()

DataFrame[department_id: int, department: string]

# 2 - Add the User_id to the prior and train dataset

In [4]:
#add user_id to train
train = train.join(all_orders, on='order_id')
train = train.drop('eval_set','order_number','order_dow','order_hour_of_day','days_since_prior_order')

# 3 - Inner join entre all orders e prod in orders

In [5]:
#add user_id to prior
orders_prod = prod_in_orders.join(all_orders, on='order_id')
prod_in_orders.unpersist()

DataFrame[order_id: int, product_id: int, add_to_cart_order: int, reordered: int]

In [6]:
orders_prod.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- add_to_cart_order: integer (nullable = true)
 |-- reordered: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- eval_set: string (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: integer (nullable = true)
 |-- order_hour_of_day: integer (nullable = true)
 |-- days_since_prior_order: double (nullable = true)



# 4 - Features from Products

In [7]:
from pyspark.sql import Window
windowval2 = Window.partitionBy('user_id', 'product_id').orderBy(['user_id', 'order_number', 'product_id']).rangeBetween(Window.unboundedPreceding, 0)
orders_prod = orders_prod.withColumn('prod_user_times', F.count('order_id').over(windowval2))

In [8]:
windowval3 = Window.partitionBy('product_id').orderBy('product_id').rangeBetween(Window.unboundedPreceding, 0)
orders_prod = orders_prod.withColumn('n_times_prod_ordered', F.count('order_id').over(windowval3))
orders_prod = orders_prod.withColumn('n_times_prod_reordered', F.sum('reordered').over(windowval3))

In [9]:
prod = orders_prod
prod1 = prod.select('product_id', 'prod_user_times').filter(prod.prod_user_times == 1).groupby('product_id').sum('prod_user_times')
prod2 = prod.select('product_id', 'prod_user_times').filter(prod.prod_user_times == 2).groupby('product_id').sum('prod_user_times')

In [10]:
prod1 = prod1.sort("product_id")
prod2 = prod2.sort("product_id")
prod1 = prod1.withColumnRenamed("sum(prod_user_times)", "first_ord_prod")
prod2 = prod2.withColumnRenamed("sum(prod_user_times)", "second_ord_prod")

In [11]:
prod = prod.drop('order_id','add_to_cart_order', 'reordered','user_id','eval_set','order_number','order_dow','order_hour_of_day','days_since_prior_order','prod_user_times')
prod = prod.groupby('product_id').agg({'n_times_prod_ordered': 'max', 'n_times_prod_reordered': 'max'})

In [12]:
prod = prod.sort("product_id")
prod = prod.withColumnRenamed("max(n_times_prod_ordered)", "times_prod_ordered")
prod = prod.withColumnRenamed("max(n_times_prod_reordered)", "times_prod_reordered")
prod = prod.join(prod1, on='product_id')
prod = prod.join(prod2, on='product_id')

In [13]:
prod = prod.withColumn('prod_reorder_probability', prod.second_ord_prod / prod.first_ord_prod)
prod = prod.withColumn('prod_reorder_times', 1 + prod.times_prod_reordered / prod.first_ord_prod)
prod = prod.withColumn('prod_reorder_ratio', prod.times_prod_reordered / prod.times_prod_ordered)

In [14]:
prod1.unpersist()
prod2.unpersist()

DataFrame[product_id: int, second_ord_prod: bigint]

In [15]:
prod = prod.drop('times_prod_reordered', 'first_ord_prod', 'second_ord_prod')

In [16]:
prod.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- times_prod_ordered: long (nullable = true)
 |-- prod_reorder_probability: double (nullable = true)
 |-- prod_reorder_times: double (nullable = true)
 |-- prod_reorder_ratio: double (nullable = true)



In [20]:
clusters = spark.read \
    .options(header=True, inferSchema=True) \
    .csv('D:/DADOS USUARIO/Documents/springboard/capstone project/Instacart Kaggle/modularity_class_user_product.csv')
clusters = clusters.drop('Label', 'timeset', 'd0')
clusters = clusters.withColumnRenamed('Id', 'product_id')
clusters = clusters.withColumnRenamed('Degree', 'product_degree_centrality')
clusters = clusters.withColumnRenamed('modularity_class', 'product_cluster')

In [21]:
clusters.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_degree_centrality: integer (nullable = true)
 |-- product_cluster: integer (nullable = true)



In [22]:
prod = prod.join(clusters, on=['product_id'], how='outer')

In [23]:
prod.show()

+----------+------------------+------------------------+------------------+-------------------+-------------------------+---------------+
|product_id|times_prod_ordered|prod_reorder_probability|prod_reorder_times| prod_reorder_ratio|product_degree_centrality|product_cluster|
+----------+------------------+------------------------+------------------+-------------------+-------------------------+---------------+
|       148|              4903|      0.8348235294117647| 2.307294117647059| 0.5665918825209055|                        3|              1|
|       463|                32|     0.07142857142857142|1.1428571428571428|              0.125|                     null|           null|
|       471|               138|      0.8333333333333334|               2.3| 0.5652173913043478|                     null|           null|
|       496|                37|      0.6086956521739131| 1.608695652173913| 0.3783783783783784|                     null|           null|
|       833|                12|   

# 5 - Features from Users

In [24]:
users = all_orders.filter(all_orders.eval_set == 'prior')
users = users.withColumn('dspo', users.days_since_prior_order)

In [25]:
users.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- eval_set: string (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: integer (nullable = true)
 |-- order_hour_of_day: integer (nullable = true)
 |-- days_since_prior_order: double (nullable = true)
 |-- dspo: double (nullable = true)



In [26]:
users = users.groupby('user_id').agg({'order_number': 'max', 'days_since_prior_order': 'sum', 'dspo': 'mean'})

In [27]:
us = orders_prod.groupby('user_id').count()
us = us.withColumnRenamed("count", "user_total_prod")

In [28]:
us1 = orders_prod.select('user_id', 'reordered').filter(orders_prod.reordered == 1).groupby('user_id').sum('reordered')
us2 = orders_prod.select('user_id', 'order_number').filter(orders_prod.order_number > 1).groupby('user_id').sum('order_number')
us3 = us1.join(us2, on='user_id')

In [29]:
us3 = us3.withColumnRenamed("sum(reordered)", "reord")
us3 = us3.withColumnRenamed("sum(order_number)", "on")

In [30]:
us3 = us3.withColumn('user_reorder_ratio', us3.reord / us3.on)

In [31]:
us3 = us3.drop('reord', 'on')

In [32]:
from pyspark.sql.functions import countDistinct
us4 = orders_prod.groupBy("user_id").agg(countDistinct("product_id"))
us4 = us4.withColumnRenamed('count(DISTINCT product_id)', 'distinct')

In [33]:
us = us.join(us1, on='user_id')
us = us.join(us2, on='user_id')
us = us.join(us3, on='user_id')
us = us.join(us4, on='user_id')
users = users.join(us, on='user_id')
users = users.withColumnRenamed('max(order_number)', 'user_orders')
users = users.withColumnRenamed('avg(dspo)', 'user_mean_days_since_prior')
users = users.withColumnRenamed('sum(days_since_prior_order)', 'user_period')
users = users.withColumnRenamed('sum(reordered)', 'sum_reordered')
users = users.withColumnRenamed('sum(order_number)', 'sum_order_number')
users = users.drop('sum_reordered', 'sum_order_number')

In [34]:
users = users.withColumn('user_average_basket', users.user_total_prod / users.user_orders)

In [35]:
us_orders = all_orders.select('user_id', 'order_id', 'eval_set', 'days_since_prior_order').filter(all_orders.eval_set != 'prior')

In [36]:
users = users.join(us_orders, on='user_id')

In [37]:
us.unpersist()
us1.unpersist()
us2.unpersist()
us3.unpersist()
us4.unpersist()
us_orders.unpersist()

DataFrame[user_id: int, order_id: int, eval_set: string, days_since_prior_order: double]

In [38]:
users.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- user_mean_days_since_prior: double (nullable = true)
 |-- user_orders: integer (nullable = true)
 |-- user_period: double (nullable = true)
 |-- user_total_prod: long (nullable = false)
 |-- user_reorder_ratio: double (nullable = true)
 |-- distinct: long (nullable = false)
 |-- user_average_basket: double (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- eval_set: string (nullable = true)
 |-- days_since_prior_order: double (nullable = true)



# 6 - Create a Database

In [41]:
data = orders_prod.withColumn('order_number2', orders_prod.order_number)

In [42]:
windowval4 = Window.partitionBy('user_id', 'product_id').rangeBetween(Window.unboundedPreceding, 0)
data = data.withColumn('count_order_id', F.min('order_id').over(windowval4))
data = data.withColumn('min_order_number', F.min('order_number').over(windowval4))
data = data.withColumn('max_order_number', F.max('order_number2').over(windowval4))
data = data.withColumn('mean_add_to_cart_order', F.avg('add_to_cart_order').over(windowval4))

In [43]:
data = data.drop('eval_set', 'order_id', 'reordered')

In [44]:
data = data.join(prod, on='product_id')
data = data.join(users, on='user_id')

In [45]:
data = data.withColumn('up_order_rate', data.count_order_id / data.user_orders)
data = data.withColumn('up_orders_since_last_order', data.user_orders - data.max_order_number)
data = data.withColumn('up_order_rate_since_first_order', data.count_order_id / (data.user_orders - data.min_order_number + 1))

In [46]:
windowval5 = Window.partitionBy('user_id').orderBy('order_number').rangeBetween(Window.unboundedPreceding, 0)
windowval6 = Window.partitionBy('user_id', 'product_id').orderBy('order_number')
orders_prod = orders_prod.withColumn('dspo_cum_sum', F.sum('days_since_prior_order').over(windowval5))

In [47]:
opf = orders_prod
opf = opf.withColumn('MIN_dspo_cum_sum', F.min('dspo_cum_sum').over(windowval6))
opf = opf.withColumn('MAX_dspo_cum_sum', F.max('dspo_cum_sum').over(windowval6))
opf = opf.withColumn('COUNT_dspo_cum_sum', F.count('dspo_cum_sum').over(windowval6))
opf = opf.withColumn('SND_MAX_dspo_cum_sum', F.lag('dspo_cum_sum',1,0).over(windowval6))
opf = opf.withColumn('DIFF_dspo_cum_sum', opf.MAX_dspo_cum_sum - opf.SND_MAX_dspo_cum_sum)
opf = opf.withColumn('flex_freq', opf.DIFF_dspo_cum_sum / (opf.COUNT_dspo_cum_sum - 1))
opf = opf.withColumn('avg_flex_freq', F.mean('flex_freq').over(windowval6))
opf = opf.withColumn('std_flex_freq', F.stddev('flex_freq').over(windowval6))

In [48]:
opf = opf.select('user_id', 'product_id', 'avg_flex_freq', 'std_flex_freq')
data = data.join(opf, on=['user_id','product_id'], how='outer')

In [49]:
train = train.select('user_id', 'product_id', 'reordered')

In [50]:
data = data.join(train, on=['user_id','product_id'], how='outer')

In [51]:
data = data.drop("days_since_prior_order", "order_number2", 'add_to_cart_order', 'order_dow', 'order_hour_of_day')

In [52]:
data.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- prod_user_times: long (nullable = true)
 |-- n_times_prod_ordered: long (nullable = true)
 |-- n_times_prod_reordered: long (nullable = true)
 |-- count_order_id: integer (nullable = true)
 |-- min_order_number: integer (nullable = true)
 |-- max_order_number: integer (nullable = true)
 |-- mean_add_to_cart_order: double (nullable = true)
 |-- times_prod_ordered: long (nullable = true)
 |-- prod_reorder_probability: double (nullable = true)
 |-- prod_reorder_times: double (nullable = true)
 |-- prod_reorder_ratio: double (nullable = true)
 |-- product_degree_centrality: integer (nullable = true)
 |-- product_cluster: integer (nullable = true)
 |-- user_mean_days_since_prior: double (nullable = true)
 |-- user_orders: integer (nullable = true)
 |-- user_period: double (nullable = true)
 |-- user_total_prod: long (nullable = true)
 |-- user_reorder_rat

In [53]:
all_orders.unpersist()
train.unpersist()
products.unpersist()
prod_full.unpersist()
orders_prod.unpersist()
prod.unpersist()
users.unpersist()

DataFrame[user_id: int, user_mean_days_since_prior: double, user_orders: int, user_period: double, user_total_prod: bigint, user_reorder_ratio: double, distinct: bigint, user_average_basket: double, order_id: int, eval_set: string, days_since_prior_order: double]

# 7 - Train and Test datasets

In [54]:
train = data.filter(data.eval_set == 'train')

In [55]:
train = train.drop('eval_set', 'user_id', 'product_id', 'order_id')

In [56]:
train = train.na.fill(0, subset=['reordered'])

In [57]:
test = data.filter(data.eval_set == 'test')

In [58]:
test = test.drop('eval_set', 'user_id', 'reordered')

In [59]:
data.unpersist()

DataFrame[user_id: int, product_id: int, order_number: int, prod_user_times: bigint, n_times_prod_ordered: bigint, n_times_prod_reordered: bigint, count_order_id: int, min_order_number: int, max_order_number: int, mean_add_to_cart_order: double, times_prod_ordered: bigint, prod_reorder_probability: double, prod_reorder_times: double, prod_reorder_ratio: double, product_degree_centrality: int, product_cluster: int, user_mean_days_since_prior: double, user_orders: int, user_period: double, user_total_prod: bigint, user_reorder_ratio: double, distinct: bigint, user_average_basket: double, order_id: int, eval_set: string, up_order_rate: double, up_orders_since_last_order: int, up_order_rate_since_first_order: double, avg_flex_freq: double, std_flex_freq: double, reordered: int]

In [60]:
train.printSchema()

root
 |-- order_number: integer (nullable = true)
 |-- prod_user_times: long (nullable = true)
 |-- n_times_prod_ordered: long (nullable = true)
 |-- n_times_prod_reordered: long (nullable = true)
 |-- count_order_id: integer (nullable = true)
 |-- min_order_number: integer (nullable = true)
 |-- max_order_number: integer (nullable = true)
 |-- mean_add_to_cart_order: double (nullable = true)
 |-- times_prod_ordered: long (nullable = true)
 |-- prod_reorder_probability: double (nullable = true)
 |-- prod_reorder_times: double (nullable = true)
 |-- prod_reorder_ratio: double (nullable = true)
 |-- product_degree_centrality: integer (nullable = true)
 |-- product_cluster: integer (nullable = true)
 |-- user_mean_days_since_prior: double (nullable = true)
 |-- user_orders: integer (nullable = true)
 |-- user_period: double (nullable = true)
 |-- user_total_prod: long (nullable = true)
 |-- user_reorder_ratio: double (nullable = true)
 |-- distinct: long (nullable = true)
 |-- user_averag

In [61]:
test.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- prod_user_times: long (nullable = true)
 |-- n_times_prod_ordered: long (nullable = true)
 |-- n_times_prod_reordered: long (nullable = true)
 |-- count_order_id: integer (nullable = true)
 |-- min_order_number: integer (nullable = true)
 |-- max_order_number: integer (nullable = true)
 |-- mean_add_to_cart_order: double (nullable = true)
 |-- times_prod_ordered: long (nullable = true)
 |-- prod_reorder_probability: double (nullable = true)
 |-- prod_reorder_times: double (nullable = true)
 |-- prod_reorder_ratio: double (nullable = true)
 |-- product_degree_centrality: integer (nullable = true)
 |-- product_cluster: integer (nullable = true)
 |-- user_mean_days_since_prior: double (nullable = true)
 |-- user_orders: integer (nullable = true)
 |-- user_period: double (nullable = true)
 |-- user_total_prod: long (nullable = true)
 |-- user_reorder_ratio: double (nullable = true)
 |-- distin

# 8 - Write the data file

In [62]:
train.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("train.csv")

Py4JJavaError: An error occurred while calling o570.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:213)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:166)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:166)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:166)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:145)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.datasources.DataSource.writeInFileFormat(DataSource.scala:435)
	at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:471)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:50)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:609)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:233)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:217)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 102.0 failed 1 times, most recent failure: Lost task 0.0 in stage 102.0 (TID 8602, localhost, executor driver): org.apache.spark.SparkException: Task failed while writing rows
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:270)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:189)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:188)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.lang.OutOfMemoryError: Java heap space

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1505)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1504)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1504)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1732)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:186)
	... 44 more
Caused by: org.apache.spark.SparkException: Task failed while writing rows
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:270)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:189)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:188)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: java.lang.OutOfMemoryError: Java heap space


In [257]:
test.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("test.csv")