## initialization

### imports

In [1]:
from initailFunctionsPath import *

### Spark instaniation

In [2]:
conf = SparkConf()
(conf
.set('spark.driver.memory', '140g')
.set('spark.executer.cores', '58')
.set('spark.shuffle.service.index.cache.size', '3g')
.setAppName('Practice') )
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

## data inputs

### load daily price and shrout data

In [3]:
price_df = (
    spark.read.parquet(PRICE_PATH + '/Cleaned_Stock_Prices_14001127.parquet')
    .filter(F.col('jalaliDate').between(MIN_ANALYSIS_DATE, MAX_ANALYSIS_DATE))
    .select(
        F.col('jalaliDate').alias('date'),
        F.col('name').alias('symbol'),
        'close_price',
        'close_price_adjusted',
        'shrout',
        (F.col('MarketCap') / 10**7).alias('mktcap')
    )
    .dropDuplicates()
)

display_df(price_df)

1055114
+--------+------+-----------+--------------------+------+---------+
|date    |symbol|close_price|close_price_adjusted|shrout|mktcap   |
+--------+------+-----------+--------------------+------+---------+
|13980221|آ س پ |1366.0     |1249.0              |9.0E8 |122940.0 |
|13990624|آ س پ |15262.0    |15127.0             |1.0E9 |1526200.0|
|13990904|آ س پ |11771.0    |11667.0             |1.0E9 |1177100.0|
+--------+------+-----------+--------------------+------+---------+
only showing top 3 rows



In [4]:
MIN_PRICE_DATE = price_df.agg(F.min('date')).collect()[0][0]
MAX_PRICE_DATE = price_df.agg(F.max('date')).collect()[0][0]
min_max(price_df)

+--------+--------+
|min_date|max_date|
+--------+--------+
|13980105|14001127|
+--------+--------+



### load valid symbols data

In [5]:
valid_symbols_df = (
    spark.read.parquet(VALID_SYMBOLS_PATH + '/validSymbols.parquet')
)

display_df(valid_symbols_df)

1279
+------+
|symbol|
+------+
|ختوقا |
|همراه |
|بالاس |
+------+
only showing top 3 rows



### load daily trade data

In [6]:
# trade_df = (
#     spark.read.parquet(PATH_PORTFOLIO + '/trade_df.parquet')
#     .join(valid_symbols_df, on = ['symbol'], how = 'inner')
# )

# display_df(trade_df)

In [7]:
# print('missing nTradeShares: ', round(trade_df.filter(F.col('nTradeShares') == 0).count() / trade_df.count(), 5))
# print('missing tradeSettlementValue: ', round(trade_df.filter(F.col('tradeSettlementValue') == 0).count() / trade_df.count(), 5))

In [8]:
# (
#     trade_df
#     .agg(F.max('date').alias('maxDate'))
#     .show()
# )

### load initial portfolio data

In [9]:
# portfolio_df = (
#     spark.read.parquet(PATH_PORTFOLIO + '/portfolio_df.parquet')
#     .join(valid_symbols_df, on = ['symbol'], how = 'inner')
# )

# display_df(portfolio_df)

### load daily portfolio data

In [10]:
daily_portfolio_df = (
    spark.read.parquet(PATH_PORTFOLIO + '/daily_portfolio_df.parquet')
    .join(valid_symbols_df, on = ['symbol'], how = 'inner')
)

display_df(daily_portfolio_df)

729153558
+------+--------+---------+------------------+----------+----------+
|symbol|date    |accountId|heldShares        |netCashOut|netCashIn |
+------+--------+---------+------------------+----------+----------+
|سیتا  |13990411|2        |333.1922995557436 |0.0       |-0.4725   |
|شاروم |13990515|2        |15.992553731595871|0.0       |-0.00945  |
|وکبهمن|13990919|11       |58.323434241748764|0.0       |-0.0811104|
+------+--------+---------+------------------+----------+----------+
only showing top 3 rows



### load adjusted initial portfolio data

In [11]:
adjusted_portfolio_df = (
    spark.read.parquet(PATH_PORTFOLIO + '/adjusted_portfolio_df.parquet')
    .join(valid_symbols_df, on = ['symbol'], how = 'inner')
)

display_df(adjusted_portfolio_df)

8859415
+------+--------+---------+-----------------+
|symbol|date    |accountId|nHeldShares      |
+------+--------+---------+-----------------+
|بزاگرس|13980105|9817865  |925.1959619952494|
|بزاگرس|13980105|13793671 |925.1959619952494|
|بزاگرس|13980105|3732476  |927.2565320665083|
+------+--------+---------+-----------------+
only showing top 3 rows



In [12]:
# Why count(adjusted_portfolio_df) > count(portfolio_df)?

# (
#     portfolio_df
#     .join(adjusted_portfolio_df, on = ['symbol', 'date', 'accountId'], how = 'left_anti')
#     .count()
# )

### load invalid holdings data (negative number of shares!)

In [13]:
# accountIds with negative holdings of a stock?

invalid_holdings_df = (
    spark.read.parquet(PATH_PORTFOLIO + '/invalid_holdings_df.parquet')
    .join(valid_symbols_df, on = ['symbol'], how = 'inner')
)

display_df(invalid_holdings_df)

70544770
+------+---------+--------------+
|symbol|accountId|invalidHolding|
+------+---------+--------------+
|اپال  |5        |1             |
|سبزوا |17       |1             |
|کویر  |28       |1             |
+------+---------+--------------+
only showing top 3 rows



### load flat daily trade data

In [14]:
# flat_trade_df = (
#     spark.read.parquet(PATH_PORTFOLIO + '/flat_trade_df.parquet')
#     .join(valid_symbols_df, on = ['symbol'], how = 'inner')
# )

# display_df(flat_trade_df)

### load adjusted flat trade data

In [15]:
adjusted_raw_flat_trade_df = (
    spark.read.parquet(PATH_PORTFOLIO + '/adjusted_raw_flat_trade_df.parquet')
    .join(valid_symbols_df, on = ['symbol'], how = 'inner')
)

display_df(adjusted_raw_flat_trade_df)

1829163066
+------+--------+---------+-----------------+---------------+-------+------+
|symbol|date    |accountId|nTradeShares     |settlementValue|cashOut|cashIn|
+------+--------+---------+-----------------+---------------+-------+------+
|تبرک  |13980105|93649    |5461.316674675636|-1.12          |0.0    |-1.12 |
|تبرک  |13980105|93649    |5461.316674675636|-1.121         |0.0    |-1.121|
|تبرک  |13980105|93649    |5461.316674675636|-1.125         |0.0    |-1.125|
+------+--------+---------+-----------------+---------------+-------+------+
only showing top 3 rows



In [16]:
# wwhy count(adjusted_raw_flat_trade_df) > count(flat_trade_df)?

## data preparation

### find new entrants

In [17]:
# new_entrant_account_ids_df = (
#     flat_trade_df
#     .groupBy('accountId')
#     .agg(
#         F.min('date').alias('firstDate')
#     )
#     .join(portfolio_df.select('accountId').distinct(), on = ['accountId'], how = 'left_anti')
# )

# display_df(new_entrant_account_ids_df)

In [18]:
PATH_OUTPUT

'C:/Users/Administrator/Heidari_Ra/Outputs/'

In [19]:
# (
#     new_entrant_account_ids_df
#     .write.mode('overwrite').parquet(PATH_OUTPUT + '{}'.format('/new_entrant_account_ids.parquet'))
# )

### time series of new entrants

In [20]:
# new_entrants_time_series_df = (
#     flat_trade_df
#     .select('date', 'accountId')
#     .dropDuplicates()
#     .join(new_entrant_account_ids_df, on = 'accountId', how = 'inner')
#     .withColumn('rank', F.row_number().over(Window.partitionBy('accountId').orderBy('date')))
#     .filter(F.col('rank') == 1)
#     .drop('rank')
#     .groupBy('date')
#     .count()
#     .orderBy('date')
# )

# display_df(new_entrants_time_series_df)

In [21]:
# (
#     new_entrants_time_series_df
#     .write.mode('overwrite').parquet(PATH_OUTPUT + '/new_entrantd_time_series_df.parquet')
# )

### calculate gain from trade

In [22]:
gain_from_trade_df = (
    adjusted_raw_flat_trade_df
    .groupBy('accountId')
    .agg(
        F.sum('cashOut').alias('netCashOut'),
        F.sum('cashIn').alias('netCashIn')
    )
)

display_df(gain_from_trade_df)

10733148
+---------+-----------------+-------------------+
|accountId|netCashOut       |netCashIn          |
+---------+-----------------+-------------------+
|9460558  |86.83102269999999|-92.51729630000001 |
|2088053  |19287.23959359999|-18874.875769300008|
|7625478  |6092.705459899998|-6088.591398299999 |
+---------+-----------------+-------------------+
only showing top 3 rows



### calculate value of the initial portfolio

In [23]:
initial_portfolio_value_df = (
    adjusted_portfolio_df
    .join(price_df.select('date', 'symbol', 'close_price_adjusted'), on = ['date', 'symbol'], how = 'left')
    .withColumnRenamed('close_price_adjusted','close_price')
    .dropna(subset = ['close_price'])
    .join(invalid_holdings_df, on = ['accountId', 'symbol'], how = 'left')
    .filter(F.col('invalidHolding').isNull())
    .withColumn('value', F.col('nHeldShares') * F.col('close_price'))
    .groupBy('accountId')
    .agg(
        (F.sum('value') / 10**7).alias('initialPortfolioValue')
    )
)

display_df(initial_portfolio_value_df)

4146519
+---------+---------------------+
|accountId|initialPortfolioValue|
+---------+---------------------+
|50219    |0.5334175            |
|81085    |51.522831599999996   |
|133042   |0.045924000000000006 |
+---------+---------------------+
only showing top 3 rows



In [24]:
print(initial_portfolio_value_df.filter(F.col('initialPortfolioValue').isNull()).count())
print(initial_portfolio_value_df.filter(F.col('initialPortfolioValue') <= 0).count())

0
0


### calculate value of the final portfolio

In [25]:
final_portfolio_value_df = (
    daily_portfolio_df
    .withColumn('rowNumber', F.row_number().over(Window.partitionBy('accountId', 'symbol').orderBy('date')))
    .withColumn('maxRowNumber', F.max('rowNumber').over(Window.partitionBy('accountId', 'symbol')))
    .filter(F.col('rowNumber') == F.col('maxRowNumber'))
    .filter(F.col('heldShares') > 0)
    .withColumn('date', F.lit(MAX_PRICE_DATE))
    .join(price_df.select('date', 'symbol', 'close_price_adjusted'), on = ['date', 'symbol'], how = 'left')
    .withColumnRenamed('close_price_adjusted','close_price')
    .dropna(subset = ['close_price'])
    .withColumn('value', F.col('heldShares') * F.col('close_price'))
    .groupBy('accountId')
    .agg(
        (F.sum('value') / 10**7).alias('finalPortfolioValue')
    )   
)

display_df(final_portfolio_value_df)
# count after join?

12472786
+---------+-------------------+
|accountId|finalPortfolioValue|
+---------+-------------------+
|3305619  |55.282747828171026 |
|3873383  |27.090303378168368 |
|3914262  |27.081233002856596 |
+---------+-------------------+
only showing top 3 rows



In [26]:
print(final_portfolio_value_df.filter(F.col('finalPortfolioValue').isNull()).count())
print(final_portfolio_value_df.filter(F.col('finalPortfolioValue') <= 0).count())

0
0


### symbols in with-one-symbol portfolios

In [27]:
stock_count_within_portfolio_df = ( 
    daily_portfolio_df
    .dropDuplicates(subset= ['accountId','symbol'])
    .withColumn('numberOfStocks', F.row_number().over(Window.partitionBy('accountId').orderBy(F.desc('heldShares'))))
    .withColumn('numberOfStocks', F.max('numberOfStocks').over(Window.partitionBy('accountId')))
)

display_df(stock_count_within_portfolio_df)

277399243
+------+--------+---------+------------------+----------+---------+--------------+
|symbol|date    |accountId|heldShares        |netCashOut|netCashIn|numberOfStocks|
+------+--------+---------+------------------+----------+---------+--------------+
|وسپهر |13990702|26       |1222.8704010606561|0.0       |-1.111   |2             |
|سکارون|13980105|26       |864.5322381930185 |0.0       |0.0      |2             |
|وسپهر |13990702|29       |1133.9343718926086|0.0       |-1.0302  |10            |
+------+--------+---------+------------------+----------+---------+--------------+
only showing top 3 rows



In [28]:
symbols_in_one_symbol_portfolios_df = (
    stock_count_within_portfolio_df
    .filter(F.col('numberOfStocks') == 1)
    .withColumn('accountNumbers', F.row_number().over(Window.partitionBy('symbol').orderBy(F.desc('heldShares'))))
    .withColumn('accountNumbers', F.max('accountNumbers').over(Window.partitionBy('symbol')))
    .dropDuplicates(subset = ['symbol'])
    .select(
        F.col('symbol'),
        F.col('accountNumbers'),
    )
    .orderBy(F.desc('accountNumbers'))
)

display_df(symbols_in_one_symbol_portfolios_df)

897
+------+--------------+
|symbol|accountNumbers|
+------+--------------+
|سمایه |157888        |
|وتوصا |136985        |
|دی    |96943         |
+------+--------------+
only showing top 3 rows



In [29]:
display_df(symbols_in_one_symbol_portfolios_df)

897
+------+--------------+
|symbol|accountNumbers|
+------+--------------+
|سمایه |157888        |
|وتوصا |136985        |
|دی    |96943         |
+------+--------------+
only showing top 3 rows



In [31]:
(
    symbols_in_one_symbol_portfolios_df
    .write.mode('overwrite').parquet(r"C:\Users\Administrator\Heidari_Ra\Outputs\\" + 'symbols_in_one_symbol_portfolios_df.parquet')
)

### time series of the net cash-in

In [32]:
max_portfolio_value_df = (
    final_portfolio_value_df
    .join(initial_portfolio_value_df, on = ['accountId'], how = 'outer')
    .fillna(0)
    .withColumn('maxPortfolioValue', F.greatest(F.col('initialPortfolioValue'), F.col('finalPortfolioValue')))
    .withColumn('type', F.when(F.col('maxPortfolioValue') < 10, 'lessThan10MT')
                         .when(F.col('maxPortfolioValue').between(10, 20), 'between10MTand20MT')
                         .when(F.col('maxPortfolioValue').between(20, 50), 'between20MTand50MT')
                         .otherwise('greaterThan50MT')
               )
    .select('accountId', 'type')
    .dropDuplicates()
)

display_df(max_portfolio_value_df)

12490421
+---------+------------+
|accountId|type        |
+---------+------------+
|2250     |lessThan10MT|
|15057    |lessThan10MT|
|29089    |lessThan10MT|
+---------+------------+
only showing top 3 rows



In [33]:
dates_list = price_df.select('date').distinct().orderBy('date').rdd.flatMap(lambda x: x).collect()

cash_time_series_df = (
    adjusted_raw_flat_trade_df
    .withColumn('netCash', F.col('cashIn') + F.col('cashOut'))
    .join(max_portfolio_value_df, on = 'accountId', how = 'inner')
    .groupBy('type', 'date')
    .agg(
        F.round((-F.sum('netCash'))).alias('netCash'),
        F.countDistinct('accountId').alias('nAccounts')
    )
    .orderBy('date', 'type')
)

display_df(cash_time_series_df)

2764
+------------------+--------+-------+---------+
|type              |date    |netCash|nAccounts|
+------------------+--------+-------+---------+
|between10MTand20MT|13980105|-2878.0|9837     |
|between20MTand50MT|13980105|-2674.0|13598    |
|greaterThan50MT   |13980105|19691.0|51532    |
+------------------+--------+-------+---------+
only showing top 3 rows



In [34]:
PATH_OUTPUT

'C:/Users/Administrator/Heidari_Ra/Outputs/'

In [36]:
(
    cash_time_series_df.write.mode('overwrite').parquet(PATH_OUTPUT + 'cash_time_series.parquet')
)

## calculate returns

In [37]:
return_df = (
    gain_from_trade_df
    .join(initial_portfolio_value_df, on = 'accountId', how = 'outer')
    .join(final_portfolio_value_df, on = 'accountId', how = 'outer')
    .fillna(0, subset = ['netCashIn', 'netCashOut', 'initialPortfolioValue', 'finalPortfolioValue'])
    .withColumn('return', 
                ((F.col('finalPortfolioValue') + F.col('netCashOut')) / (F.col('initialPortfolioValue') + (-F.col('netCashIn')))) - 1)
    .filter(F.col('return').isNotNull())
    .withColumn('returnDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('return')))
)

display_df(return_df)
# null returns?

12612568
+---------+----------+---------+---------------------+-------------------+------+------------+
|accountId|netCashOut|netCashIn|initialPortfolioValue|finalPortfolioValue|return|returnDecile|
+---------+----------+---------+---------------------+-------------------+------+------------+
|1187801  |0.0       |-0.5124  |0.0                  |0.0                |-1.0  |1           |
|12187596 |0.0       |-4.002426|0.0                  |0.0                |-1.0  |1           |
|2124119  |0.0       |-0.5015  |0.0                  |0.0                |-1.0  |1           |
+---------+----------+---------+---------------------+-------------------+------+------------+
only showing top 3 rows



In [38]:
(
    return_df
    .filter(F.col('return') == 0)
    .count()
)

88980

In [39]:
(
    return_df
    .groupBy('returnDecile')
    .agg(
        F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianReturn')
    )
    .show()
)

+------------+------------+
|returnDecile|medianReturn|
+------------+------------+
|           1|      -0.387|
|           2|      -0.172|
|           3|      -0.052|
|           4|       0.024|
|           5|       0.123|
|           6|       0.267|
|           7|       0.508|
|           8|       1.328|
|           9|       3.712|
|          10|       8.485|
+------------+------------+



In [41]:
(
    return_df
    .agg(
       F.round(F.min('return'), 2).alias('min'),
        F.round(F.expr('percentile(return, array(0.01))')[0], 2).alias('1%'),
        F.round(F.expr('percentile(return, array(0.1))')[0], 2).alias('10%'),
        F.round(F.expr('percentile(return, array(0.25))')[0], 2).alias('25%'),
        F.round(F.expr('percentile(return, array(0.5))')[0], 2).alias('50%'),
        F.round(F.mean('return'), 2).alias('mean'),
        F.round(F.expr('percentile(return, array(0.75))')[0], 2).alias('75%'),
        F.round(F.expr('percentile(return, array(0.9))')[0], 2).alias('90%'),
        F.round(F.expr('percentile(return, array(0.99))')[0], 2).alias('99%'),
        F.round(F.expr('percentile(return, array(0.999))')[0], 2).alias('99.9%'),
    )
    .show()
)

Py4JJavaError: An error occurred while calling o552.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 319.0 failed 1 times, most recent failure: Lost task 0.0 in stage 319.0 (TID 9445) (WIN-LN8FQNN8ISE executor driver): java.lang.IllegalArgumentException: Cannot grow BufferHolder by size 272947336 because the size after growing exceeds size limitation 2147483632
	at org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder.grow(BufferHolder.java:71)
	at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.grow(UnsafeWriter.java:63)
	at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.writeUnalignedBytes(UnsafeWriter.java:127)
	at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:118)
	at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:114)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_2$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateResultProjection$6(AggregationIterator.scala:282)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.next(ObjectAggregationIterator.scala:85)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.next(ObjectAggregationIterator.scala:32)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.processInputs(ObjectAggregationIterator.scala:151)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.<init>(ObjectAggregationIterator.scala:77)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$2(ObjectHashAggregateExec.scala:107)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$2$adapted(ObjectHashAggregateExec.scala:85)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2(RDD.scala:885)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2$adapted(RDD.scala:885)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2261)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:390)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
	at jdk.internal.reflect.GeneratedMethodAccessor649.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.IllegalArgumentException: Cannot grow BufferHolder by size 272947336 because the size after growing exceeds size limitation 2147483632
	at org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder.grow(BufferHolder.java:71)
	at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.grow(UnsafeWriter.java:63)
	at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.writeUnalignedBytes(UnsafeWriter.java:127)
	at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:118)
	at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:114)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_2$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.execution.aggregate.AggregationIterator.$anonfun$generateResultProjection$6(AggregationIterator.scala:282)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.next(ObjectAggregationIterator.scala:85)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.next(ObjectAggregationIterator.scala:32)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.processInputs(ObjectAggregationIterator.scala:151)
	at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.<init>(ObjectAggregationIterator.scala:77)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$2(ObjectHashAggregateExec.scala:107)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$2$adapted(ObjectHashAggregateExec.scala:85)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2(RDD.scala:885)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2$adapted(RDD.scala:885)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


In [None]:
(
    return_df.write.mode('overwrite').parquet(PATH_OUTPUT + '/return_output.parquet')
)

In [None]:
# return_df.filter(F.col('return') == return_df.select('return').rdd.max()[0]).show(20,False)

In [None]:
# (
#     price_df
#     .filter((F.col("symbol") == 'ودانا')&(F.col("date") > 13980305))
#     .orderBy("date")
#     .select(
#         F.col("date"),
#         F.col("close_price"),
#         F.col("close_price_adjusted"),
#     )
#     .show(20,False)
# )

### final portfolio value output

In [None]:
output_final_portfolio_value = (
    final_portfolio_value_df
    .join(return_df.select('accountId', 'return'), on = 'accountId')
    .withColumn('finalPortfolioValueDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('finalPortfolioValue')))
)

display_df(output_final_portfolio_value)

In [None]:
(
    output_final_portfolio_value
    .groupBy('finalPortfolioValueDecile')
    .agg(
        F.round(F.expr('percentile(finalPortfolioValue, array(0.5))')[0], 3).alias('medianFinalPortfolioValue'),
        F.round(F.mean('return'), 2).alias('meanReturn'),
        F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianReturn')
    )
    .orderBy('finalPortfolioValueDecile')
    .show()
)

In [None]:
(
    output_final_portfolio_value.write.mode('overwrite').parquet(PATH_OUTPUT + '/final_portfolio_output.parquet')
)

### initial portfolio value output

In [None]:
output_initial_portfolio_value = (
    initial_portfolio_value_df
    .join(return_df.select('accountId', 'return'), on = 'accountId')
    .withColumn('initialPortfolioValueDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('initialPortfolioValue')))
)

display_df(output_initial_portfolio_value)

In [None]:
(
    output_initial_portfolio_value
    .groupBy('initialPortfolioValueDecile')
    .agg(
        F.round(F.expr('percentile(initialPortfolioValue, array(0.5))')[0], 3).alias('medianInitialPortfolioValue'),
        F.round(F.mean('return'), 2).alias('meanReturn'),
        F.round(F.expr('percentile(return, array(0.5))')[0], 5).alias('medianReturn')
    )
    .orderBy('initialPortfolioValueDecile')
    .show()
)

In [None]:
(
    output_initial_portfolio_value.write.mode('overwrite').parquet(PATH_OUTPUT + '/inital_portfolio_output.parquet')
)

### calculate frequency of trades and active days

In [None]:
active_days_df = (
    adjusted_raw_flat_trade_df
    .groupBy('accountId', 'date')
    .agg(
        F.sum('cashIn').alias('netCashIn'),
        F.sum('cashOut').alias('netCashOut')
    )
    .withColumn('netCash', F.col('netCashIn') + F.col('netCashOut'))
    .groupBy('accountId')
    .agg(
        F.count(F.when(F.col('netCash') < 0, F.lit(1))).alias('nBuyDays'),
        F.count(F.when(F.col('netCash') > 0, F.lit(1))).alias('nSellDays')
    )
    .fillna(0, subset = ['nBuyDays', 'nSellDays'])
)

display_df(active_days_df)

In [None]:
# buy_trade_df = (
#     trade_df
#         .select(
#         'date',
#         'symbol',
#         F.col('buyerAccountId').alias('accountId'),
#         'nTradeShares',
#         (-F.col('tradeSettlementValue')).alias('settlementValue'),
#         )
# )

# sell_trade_df = (
#     trade_df
#         .select(
#             'date',
#             'symbol',
#             F.col('sellerAccountId').alias('accountId'),
#             (-F.col('nTradeShares')).alias('nTradeShares'),
#             F.col('tradeSettlementValue').alias('settlementValue')
#         )
# )

In [None]:
# trade_kpi_df = (
#     buy_trade_df
#     .union(sell_trade_df)
#     .groupBy('accountId')
#     .agg(
#         F.count(F.lit(1)).alias('tradeFrequency'),
#         F.mean(F.abs('settlementValue')).alias('meanTradeValue'),
#         F.sum('settlementValue').alias('netSumTradeValue'),
#         F.sum(F.abs('settlementValue')).alias('absSumTradeValue'),
#         F.countDistinct('date').alias('activeDays'),
#     )
#     .join(active_days_df, on = 'accountId')
# )

# display_df(trade_kpi_df)

In [None]:
# (
#     trade_kpi_df
#     .agg(
#         F.round(F.expr('percentile(tradeFrequency, array(0.25))')[0], 2).alias('25% percentile'),
#         F.round(F.expr('percentile(tradeFrequency, array(0.5))')[0], 2).alias('50% percentile'),
#         F.round(F.mean('tradeFrequency'), 2).alias('mean'),
#         F.round(F.expr('percentile(tradeFrequency, array(0.75))')[0], 2).alias('75% percentile'),
#         F.round(F.expr('percentile(tradeFrequency, array(0.9))')[0], 2).alias('90% percentile'),
#         F.round(F.expr('percentile(tradeFrequency, array(0.99))')[0], 2).alias('99% percentile'),
#         F.round(F.expr('percentile(tradeFrequency, array(0.999))')[0], 2).alias('99.9% percentile'),
#     )
#     .show()
# )

In [None]:
# (
#     trade_kpi_df
#     .agg(
#         F.round(F.expr('percentile(activeDays, array(0.25))')[0], 2).alias('25% percentile'),
#         F.round(F.expr('percentile(activeDays, array(0.5))')[0], 2).alias('50% percentile'),
#         F.round(F.mean('activeDays'), 2).alias('mean'),
#         F.round(F.expr('percentile(activeDays, array(0.75))')[0], 2).alias('75% percentile'),
#         F.round(F.expr('percentile(activeDays, array(0.9))')[0], 2).alias('90% percentile'),
#         F.round(F.expr('percentile(activeDays, array(0.99))')[0], 2).alias('99% percentile'),
#         F.round(F.expr('percentile(activeDays, array(0.999))')[0], 2).alias('99.9% percentile'),
#     )
#     .show()
# )

In [None]:
# (
#     trade_kpi_df
#     .agg(
#         F.round(F.expr('percentile(nBuyDays, array(0.25))')[0], 2).alias('25% percentile'),
#         F.round(F.expr('percentile(nBuyDays, array(0.5))')[0], 2).alias('50% percentile'),
#         F.round(F.mean('nBuyDays'), 2).alias('mean'),
#         F.round(F.expr('percentile(nBuyDays, array(0.75))')[0], 2).alias('75% percentile'),
#         F.round(F.expr('percentile(nBuyDays, array(0.9))')[0], 2).alias('90% percentile'),
#         F.round(F.expr('percentile(nBuyDays, array(0.99))')[0], 2).alias('99% percentile'),
#         F.round(F.expr('percentile(nBuyDays, array(0.999))')[0], 2).alias('99.9% percentile'),
#     )
#     .show()
# )

In [None]:
# (
#     trade_kpi_df
#     .agg(
#         F.round(F.expr('percentile(nSellDays, array(0.25))')[0], 2).alias('25% percentile'),
#         F.round(F.expr('percentile(nSellDays, array(0.5))')[0], 2).alias('50% percentile'),
#         F.round(F.mean('nSellDays'), 2).alias('mean'),
#         F.round(F.expr('percentile(nSellDays, array(0.75))')[0], 2).alias('75% percentile'),
#         F.round(F.expr('percentile(nSellDays, array(0.9))')[0], 2).alias('90% percentile'),
#         F.round(F.expr('percentile(nSellDays, array(0.99))')[0], 2).alias('99% percentile'),
#         F.round(F.expr('percentile(nSellDays, array(0.999))')[0], 2).alias('99.9% percentile'),
#     )
#     .show()
# )

In [None]:
# print(trade_kpi_df.count() - trade_kpi_df.dropna().count())

In [None]:
# trade_output_df = (
#     trade_kpi_df
#     .join(return_df.select('accountId', 'return').dropDuplicates(), on = ['accountId'])
#     .dropna()
#     .withColumn('tradeFrequencyDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('tradeFrequency')))
#     .withColumn('meanTradeValueDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('meanTradeValue')))
#     .withColumn('netSumTradeValueDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('netSumTradeValue')))
#     .withColumn('absSumTradeValueDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('absSumTradeValue')))
#     .withColumn('activeDaysDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('activeDays')))
#     .withColumn('nBuyDaysDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('nBuyDays')))
#     .withColumn('nSellDaysDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('nSellDays')))
# )

# display_df(trade_output_df)

In [None]:
# (
#     trade_output_df
#     .agg(
#         F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianReturn')
#     )
#     .show()
# )

In [None]:
# (
#     trade_output_df
#     .groupBy('tradeFrequencyDecile')
#     .agg(
#         F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianReturn')
#     )
#     .orderBy('tradeFrequencyDecile')
#     .show()
# )

In [None]:
# (
#     trade_output_df.write.mode('overwrite').parquet(PATH_OUTPUT + '/trade_output.parquet')
# )

### identify block holders

In [None]:
bh_df = (
    daily_portfolio_df
    .select('date', 'symbol', 'accountId', 'heldShares')
    .join(price_df.select('date', 'symbol', 'shrout'), on = ['date', 'symbol'])
    .withColumn('ownership', F.col('heldShares') / F.col('shrout'))
    .filter( (F.col('ownership') >= 0.01) & F.col('ownership').isNotNull() )
    .select('accountId')
    .distinct()
    .withColumn('isBH', F.lit(1))
)

display_df(bh_df)

In [None]:
bh_output_df = (
    return_df
    .select('accountId', 'return')
    .dropna()
    .join(bh_df, on = 'accountId', how = 'left')
    .fillna(0, 'isBH')
)

display_df(bh_output_df)

In [None]:
(
    bh_output_df
    .groupBy('isBH')
    .agg(
        F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianTradeFrequency')
    )
    .show()
)

In [None]:
(
    bh_output_df.write.mode('overwrite').parquet(PATH_OUTPUT + '/bhOutput.parquet')
)

### number of stocks within initial portfolio

In [None]:
# n_stocks_within_initial_portfolio_df = (
#     portfolio_df
#     .groupBy('accountId')
#     .agg(
#         F.count(F.lit(1)).alias('nStocksWithinInitialPortfolio')
#     )
#     .dropna()
# )

# display_df(n_stocks_within_initial_portfolio_df)

In [None]:
# n_stocks_within_initial_portfolio_output_df = (
#     return_df
#     .select('accountId', 'return')
#     .dropna()
#     .join(n_stocks_within_initial_portfolio_df, on = 'accountId', how = 'inner')
#     .withColumn('nStocksWithinInitialPortfolioDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('nStocksWithinInitialPortfolio')))
# )

# display_df(n_stocks_within_initial_portfolio_output_df)

In [None]:
# (
#     n_stocks_within_initial_portfolio_output_df
#     .groupBy('nStocksWithinInitialPortfolioDecile')
#     .agg(
#         F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianReturn')
#     )
#     .orderBy('nStocksWithinInitialPortfolioDecile')
#     .show()
# )

In [None]:
# (
#     n_stocks_within_initial_portfolio_output_df.write.mode('overwrite').parquet(PATH_OUTPUT + '/n_initial_portfolio.parquet')
# )

### number of stocks within final portfolio

In [None]:
n_stocks_within_final_portfolio_df = (
    daily_portfolio_df
    .withColumn('rowNumber', F.row_number().over(Window.partitionBy('accountId', 'symbol').orderBy('date')))
    .withColumn('maxRowNumber', F.max('rowNumber').over(Window.partitionBy('accountId', 'symbol')))
    .filter(F.col('rowNumber') == F.col('maxRowNumber'))
    .filter(F.col('heldShares') > 0)
    .withColumn('date', F.lit(MAX_PRICE_DATE))
    .join(price_df.select('date', 'symbol', 'close_price'), on = ['date', 'symbol'], how = 'left')
    .dropna(subset = ['close_price'])
    .groupBy('accountId')
    .agg(
        F.countDistinct('symbol').alias('nStocksWithinFinalPortfolio')
    )   
)

display_df(n_stocks_within_final_portfolio_df)

In [None]:
n_stocks_within_final_portfolio_output_df = (
    return_df
    .select('accountId', 'return')
    .dropna()
    .join(n_stocks_within_final_portfolio_df, on = 'accountId', how = 'inner')
    .withColumn('nStocksWithinFinalPortfolioDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('nStocksWithinFinalPortfolio')))
)

display_df(n_stocks_within_final_portfolio_output_df)

In [None]:
(
    n_stocks_within_final_portfolio_output_df
    .groupBy('nStocksWithinFinalPortfolioDecile')
    .agg(
        F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianReturn')
    )
    .orderBy('nStocksWithinFinalPortfolioDecile')
    .show()
)

In [None]:
(
    n_stocks_within_final_portfolio_output_df.write.mode('overwrite').parquet(PATH_OUTPUT + '/n_final_portfolio.parquet')
)

### turnover

In [None]:
# turnover_df = (
#     trade_kpi_df
#     .join(final_portfolio_value_df, on =['accountId'], how = 'right')
#     .fillna(0, subset = ['absSumTradeValue'])
#     .withColumn('turnover', F.col('absSumTradeValue') / F.col('finalPortfolioValue'))
#     .join(return_df.select('accountId', 'return'), on = 'accountId')
#     .withColumn('turnoverDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy(F.col('turnover'))))
#     .select(
#         'accountId',
#         'turnover',
#         'turnoverDecile',
#         'return'
#     )
# )

# display_df(turnover_df)

In [None]:
# (
#     turnover_df
#     .groupBy('turnoverDecile')
#     .agg(
#         F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianReturn')
#     )
#     .orderBy('turnoverDecile')
#     .show()
# )

In [None]:
# (
#     turnover_df.write.mode('overwrite').parquet(PATH_OUTPUT + '/turnover.parquet')
# )