## initialization

### imports

In [1]:
from pyspark import SparkConf, SparkContext
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import os
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F
from functools import reduce
from pyspark.sql.window import Window

### configs

In [2]:
PATH_TRADE = '/home/user1/Data/'
PATH_PORTFOLIO = '/home/user1/Data/Portfolio/'
PRICE_PATH =  '/home/user1/Data/'
VALID_SYMBOLS_PATH = '/home/user1/Data/'

HOUR_SECONDS = 60 * 60
MINUTE_SECONDS = 60

MIN_ANALYSIS_DATE = 13980101
MAX_ANALYSIS_DATE = 13980331

N_QUANTILES = 10

### general functions

In [3]:
def display_df(df):
    df.persist()
    print(df.count())
    df.show(3, False)

def min_max(df):
    return df.agg(F.min('date').alias('min_date'), F.max('date').alias('max_date')).show()

def modify_time(x):
    hour = x // 10000
    minute = (x % 10000) // 100
    second = x % 100
    return HOUR_SECONDS * 3600 + MINUTE_SECONDS * 60 + second
modify_time_udf = F.udf(modify_time, T.IntegerType())

dropSpace = F.udf(lambda x: x.replace(' ', ''), T.StringType())

mappingDict = {
              'ما  ' : 'ما',
              'جم  ' : 'جم',
              'جمپیلن' : 'جم پیلن',
              'افقملت' : 'افق ملت',
              'آسپ' : 'آ س پ',
              'آپ  ' : 'آپ',
              'سپ  ' : 'سپ',
              'غپاذر' : 'غپآذر',
              'هدشت' : 'دهدشت',
              'نگان' : 'زنگان',
              'فبورس' : 'فرابورس',
              'شیری' : 'دشیری',
              'وتعان' : 'وتعاون',
              'آس پ' : 'آ س پ',
              'انرژی1': 'انرژی 1',
              'انرژی2' : 'انرژی 2',
              'انرژی3' : 'انرژی 3',
              'انرژیح1' : 'انرژیح 1',
              'انرژیح2' : 'انرژیح 2',
              'انرژیح3' : 'انرژیح 3',
              'فناوا' : 'فن آوا',
              'فنآوا' : 'فن آوا',
              'امینیکم' : 'امین یکم',
              'هایوب' : 'های وب',
              'کیبیسی' : 'کی بی سی',
              'کیبیسیح' : 'کی بی سیح',
              'واتوس' : 'وآتوس'
              }

def replace_arabic_characters_and_correct_symbol_names(data):
    mapping = {
        'ك': 'ک',
        'گ': 'گ',
        'دِ': 'د',
        'بِ': 'ب',
        'زِ': 'ز',
        'ذِ': 'ذ',
        'شِ': 'ش',
        'سِ': 'س',
        'ى': 'ی',
        'ي': 'ی',
    }
    for i in mapping:
        data = (
            data
            .withColumn('symbol', F.regexp_replace('symbol', i, mapping[i]))
        )
    data = (
        data
        .withColumn(
        'symbol',
        F.when((F.col('symbol').substr(1, 1) == 'ذ') & (F.col('symbol') != 'ذوب'), F.col('symbol').substr(2, 30)).otherwise(
            F.col('symbol'))
        )
        .withColumn(
        'symbol',
        F.when(F.col('symbol').substr(1, 2) == 'گژ', F.col('symbol').substr(3, 30)).otherwise(
            F.col('symbol'))
        )
        .withColumn(
        'symbol',
        F.when(F.col('symbol').substr(1, 1) == 'ژ', F.col('symbol').substr(2, 30)).otherwise(
            F.col('symbol'))
        )
        .replace(mappingDict,subset=['symbol'])
    )
    return data

spaceDeleteUDF1 = F.udf(lambda s: s.replace('\u200d', ''), T.StringType())
spaceDeleteUDF2 = F.udf(lambda s: s.replace('\u200c', ''), T.StringType())

### Spark instaniation

In [4]:
conf = SparkConf()
conf.set('spark.driver.memory', '130g').set('spark.shuffle.service.index.cache.size', '1g').setAppName('Practice') #.set('spark.executer.cores', '58')
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

22/02/22 17:12:02 WARN Utils: Your hostname, user1-ubuntu resolves to a loopback address: 127.0.1.1; using 172.16.32.107 instead (on interface eth0)
22/02/22 17:12:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/22 17:12:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## data inputs

### load daily trade data

In [5]:
raw_trade_df = spark.read.parquet(PATH_TRADE + "tradeData.parquet")

display_df(raw_trade_df)
# capital increas?
# make sure trade value and number of shares are not zero!

                                                                                

27464132
+--------+------+------+------------------------------------+------------------------------------+------------+----------+--------------------+
|date    |time  |symbol|buyerAccountId                      |sellerAccountId                     |nTradeShares|tradePrice|tradeSettlementValue|
+--------+------+------+------------------------------------+------------------------------------+------------+----------+--------------------+
|13980110|115203|ثامان |ECC77A89-C9AD-494F-8B49-4F178C2D7F3E|2AF253F2-7044-44EA-858B-09DA6A224E86|24928       |3390.0    |8.450592            |
|13980110|120816|ثاباد |9288482E-9715-4DE3-AAB6-D20D2FB157DE|02F7AA8E-29E7-4594-ACD4-FDAEE6BA957B|2000        |2320.0    |0.464               |
|13980110|122054|آسیا  |5D9B391F-C8F1-48E8-A9D0-2215FFCB9FCB|C7470FDD-F4DA-472D-895A-EBD1ED249BAD|7573        |1876.0    |1.4206948           |
+--------+------+------+------------------------------------+------------------------------------+------------+----------+-----

In [6]:
min_max(raw_trade_df)



+--------+--------+
|min_date|max_date|
+--------+--------+
|13980105|13980329|
+--------+--------+



                                                                                

### load portfolio data

In [7]:
raw_portfolio_df = (
    spark.read.parquet(PATH_PORTFOLIO + '{}'.format('portfolio.parquet'))
)

for i in raw_portfolio_df.columns:
    raw_portfolio_df = (
        raw_portfolio_df
        .withColumn(i, spaceDeleteUDF1(i))
        .withColumn(i, spaceDeleteUDF2(i))
    )
    
for i in ['SPTROH', 'SPBYNS', 'SPBYNR', 'SPSLNS', 'SPPLGE']:
    raw_portfolio_df = (
        raw_portfolio_df
        .withColumn(i, F.col(i).cast('int'))
    )
    
raw_portfolio_df = (
    raw_portfolio_df
    .withColumn('SPTROH', F.col('SPTROH') + F.col('SPBYNS') + F.col('SPBYNR') + F.col('SPSLNS') + F.col('SPPLGE'))
    .select(
        'SPSYMB',
        'SPDATE',
        'SPACC#',
        'SPTROH'
    )
    .dropDuplicates()
    )

display_df(raw_portfolio_df)

                                                                                

12109854
+------+--------+------------------------------------+------+
|SPSYMB|SPDATE  |SPACC#                              |SPTROH|
+------+--------+------------------------------------+------+
|خساپا |13980105|7432095E-8BDB-41E5-B841-AAD78C46548B|12956 |
|خساپا |13980105|58EC36FA-E468-4D74-9C95-14B630E6C68F|7561  |
|ثامان |13980105|0DCCFEE2-FA77-44D8-A27D-139B38EDD72A|3824  |
+------+--------+------------------------------------+------+
only showing top 3 rows



### load daily price and shrout data

In [8]:
price_df = (
    spark.read.parquet(PRICE_PATH.format('Cleaned_Stock_Prices_14001116.parquet'))
    .filter(F.col('jalaliDate').between(MIN_ANALYSIS_DATE, MAX_ANALYSIS_DATE))
    .select(
        F.col('jalaliDate').alias('date'),
        F.col('name').alias('symbol'),
        'close_price',
        'close_price_adjusted',
        'shrout',
        (F.col('MarketCap') / 10**7).alias('mktcap')
    )
    .dropDuplicates()
)

price_df = replace_arabic_characters_and_correct_symbol_names(price_df)

display_df(price_df)

22/02/22 17:15:30 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

57091


22/02/22 17:15:47 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB


+--------+------+-----------+--------------------+------+---------+
|date    |symbol|close_price|close_price_adjusted|shrout|mktcap   |
+--------+------+-----------+--------------------+------+---------+
|13980221|آ س پ |1366.0     |1249.0              |9.0E8 |122940.0 |
|13980125|آتیمس |30009.0    |30380.0             |1.0E9 |3000900.0|
|13980202|آتیمس |30880.0    |31262.0             |1.0E9 |3088000.0|
+--------+------+-----------+--------------------+------+---------+
only showing top 3 rows



In [9]:
added_price_df = spark.createDataFrame(pd.DataFrame({
                                                        'date' : [13980105],
                                                        'symbol' : ['ومشان'],
                                                        'close_price' : [561],
                                                        'close_price_adjusted' : [np.nan],
                                                        'shrout' : [20000000],
                                                        'mktcap' : [1122]
                                                    })
                                      )

price_df = price_df.union(added_price_df)

In [10]:
min_max(price_df)

22/02/22 17:15:48 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB

+--------+--------+
|min_date|max_date|
+--------+--------+
|13980105|13980329|
+--------+--------+



                                                                                

In [11]:
MIN_PRICE_DATE = price_df.agg(F.min('date')).collect()[0][0]
MAX_PRICE_DATE = price_df.agg(F.max('date')).collect()[0][0]

price_df.agg(F.countDistinct('symbol')).show()
price_df.filter(F.col('date') == MIN_PRICE_DATE).agg(F.countDistinct('symbol')).show()
price_df.filter(F.col('date') == MAX_PRICE_DATE).agg(F.countDistinct('symbol')).show()

22/02/22 17:15:52 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:15:56 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:15:58 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:16:06 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

+-------------+
|count(symbol)|
+-------------+
|         1031|
+-------------+



22/02/22 17:16:07 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:16:10 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

+-------------+
|count(symbol)|
+-------------+
|         1014|
+-------------+



22/02/22 17:16:11 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:16:13 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB


+-------------+
|count(symbol)|
+-------------+
|         1024|
+-------------+



                                                                                

### load valid symbols data

In [12]:
valid_symbols_df = (
    spark.read.parquet(VALID_SYMBOLS_PATH + '{}'.format('Symbols_14001116.parquet'))
    .select('Ticker')
    .withColumnRenamed('Ticker','symbol')
    .dropDuplicates()
)

valid_symbols_df = replace_arabic_characters_and_correct_symbol_names(valid_symbols_df)

display_df(valid_symbols_df)

22/02/22 17:16:22 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

701


22/02/22 17:16:26 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:16:26 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB


+------+
|symbol|
+------+
|دسبحان|
|فن آوا|
|دلقما |
+------+
only showing top 3 rows



In [13]:
ETFs = ['آتیمس',
 'آرمانی',
 'آساس',
 'آسام',
 'آسامید',
 'آوا',
 'آکورد',
 'آگاس',
 'ارزش',
 'اطلس',
 'اعتماد',
 'افران',
 'افق ملت',
 'الماس',
 'امین یکم',
 'انار',
 'اهرم',
 'اوج',
 'اوصتا',
 'بذر',
 'تاراز',
 'تصمیم',
 'ثبات',
 'ثروتم',
 'ثمین',
 'ثهام',
 'خاتم',
 'دارا',
 'دارا یکم',
 'داریوش',
 'داریک',
 'رماس',
 'رویش',
 'زر',
 'زرین',
 'زیتون',
 'سبز',
 'سحرخیز',
 'سخند',
 'سرو',
 'سپاس',
 'سپر',
 'سپیدما',
 'سیناد',
 'صایند',
 'صغرب',
 'صنم',
 'صنوین',
 'طلا',
 'عیار',
 'فراز',
 'فردا',
 'فیروزا',
 'مانی',
 'مثقال',
 'مدیر',
 'نارون',
 'نسیم',
 'نهال',
 'هامرز',
 'همای',
 'وبازار',
 'ویستا',
 'پادا',
 'پارند',
 'پالایش',
 'کارا',
 'کاردان',
 'کاریس',
 'کارین',
 'کامیاب',
 'کمند',
 'کهربا',
 'کیان',
 'گنبد',
 'گنجین',
 'گنجینه',
 'گوهر',
 'یارا',
 'یاقوت',
 'فیروزه',]
ETFs = [(i,) for i in ETFs]
ETFs = spark.createDataFrame(data= ETFs,  schema= valid_symbols_df.schema)
valid_symbols_df = valid_symbols_df.union(ETFs).dropDuplicates()
display_df(valid_symbols_df)


right_offers = ['آ س پح',
 'آرمانح',
 'آریانح',
 'آرینح',
 'آکنتورح',
 'اتکامح',
 'اتکایح',
 'اخابرح',
 'ارفعح',
 'اعتلاح',
 'افراح',
 'افقح',
 'البرزح',
 'امیدح',
 'امینح',
 'اوانح',
 'بالبرح',
 'بایکاح',
 'بترانسح',
 'بتکح',
 'بدکوح',
 'برکتح',
 'بزاگرسح',
 'بساماح',
 'بسویچح',
 'بشهابح',
 'بصباح',
 'بفجرح',
 'بموتوح',
 'بمیلاح',
 'بنوح',
 'بنیروح',
 'بهپاکح',
 'بپاسح',
 'بکابح',
 'بکامح',
 'بکهنوجح',
 'تاصیکوح',
 'تاپکیشح',
 'تاپیکوح',
 'تایراح',
 'تجلیح',
 'تشتادح',
 'تفیروح',
 'تلیسهح',
 'تماوندح',
 'تمحرکهح',
 'تملتح',
 'تنوینح',
 'توریلح',
 'تپمپیح',
 'تپکوح',
 'تکشاح',
 'تکمباح',
 'تکنارح',
 'تکنوح',
 'تیپیکوح',
 'ثابادح',
 'ثاختح',
 'ثاصفاح',
 'ثالوندح',
 'ثامانح',
 'ثاژنح',
 'ثباغح',
 'ثترانح',
 'ثرودح',
 'ثشاهدح',
 'ثشرقح',
 'ثعتماح',
 'ثعمراح',
 'ثغربح',
 'ثفارسح',
 'ثقزویح',
 'ثمسکنح',
 'ثنورح',
 'ثنوساح',
 'ثپردیسح',
 'جمح',
 'جهرمح',
 'حبندرح',
 'حتایدح',
 'حتوکاح',
 'حخزرح',
 'حسیناح',
 'حفاریح',
 'حپارساح',
 'حپتروح',
 'حکشتیح',
 'حکمتح',
 'خاذینح',
 'خاهنح',
 'خاورح',
 'خبهمنح',
 'ختراکح',
 'ختورح',
 'ختوقاح',
 'خدیزلح',
 'خریختح',
 'خرینگح',
 'خزامیاح',
 'خزرح',
 'خساپاح',
 'خشرقح',
 'خصدراح',
 'خفناورح',
 'خفنرح',
 'خفولاح',
 'خلنتح',
 'خمحرکهح',
 'خمحورح',
 'خمهرح',
 'خموتورح',
 'خنصیرح',
 'خودروح',
 'خوسازح',
 'خپارسح',
 'خپویشح',
 'خچرخشح',
 'خکارح',
 'خکاوهح',
 'خکمکح',
 'خگسترح',
 'دابورح',
 'دارابح',
 'داروح',
 'داسوهح',
 'دالبرح',
 'دامینح',
 'داناح',
 'دبالکح',
 'دتمادح',
 'دتهرانح',
 'دتوزیعح',
 'دتولیح',
 'دتولیدح',
 'دجابرح',
 'ددامح',
 'درازکح',
 'درهآورح',
 'دروزح',
 'دزهراویح',
 'دسانکوح',
 'دسبحاح',
 'دسبحانح',
 'دسیناح',
 'دشیریح',
 'دشیمیح',
 'دعبیدح',
 'دفاراح',
 'دفراح',
 'دقاضیح',
 'دلرح',
 'دلقماح',
 'دهدشتح',
 'دپارسح',
 'دکوثرح',
 'دکپسولح',
 'دکیمیح',
 'دیرانح',
 'رانفورح',
 'رتاپح',
 'رتکوح',
 'رمپناح',
 'رنیکح',
 'رپارسح',
 'رکیشح',
 'زفکاح',
 'زقیامح',
 'زملاردح',
 'زمگساح',
 'زنجانح',
 'زنگانح',
 'زگلدشتح',
 'ساذریح',
 'سارابح',
 'ساربیلح',
 'ساروجح',
 'سارومح',
 'سامانح',
 'سباقرح',
 'سبجنوح',
 'سبحانح',
 'سبهانح',
 'سترانح',
 'سجامح',
 'سخاشح',
 'سخزرح',
 'سخوافح',
 'سخوزح',
 'سدبیرح',
 'سدشتح',
 'سدورح',
 'سرودح',
 'سرچشمهح',
 'سشرقح',
 'سشمالح',
 'سصفهاح',
 'سصوفیح',
 'سغربح',
 'سفارح',
 'سفارسح',
 'سفارودح',
 'سفاسیتح',
 'سفانوح',
 'سقاینح',
 'سلارح',
 'سمازنح',
 'سمایهح',
 'سمگاح',
 'سنوینح',
 'سنیرح',
 'سهرمزح',
 'سهگمتح',
 'سپاهاح',
 'سپح',
 'سپرمیح',
 'سکارونح',
 'سکردح',
 'سکرماح',
 'سیدکوح',
 'سیلامح',
 'شاراکح',
 'شاملاح',
 'شبریزح',
 'شبهرنح',
 'شتهرانح',
 'شتولیح',
 'شتوکاح',
 'شجمح',
 'شخارکح',
 'شدوصح',
 'شرانلح',
 'شرنگیح',
 'شزنگح',
 'شسمح',
 'شسیناح',
 'شصدفح',
 'شصفهاح',
 'شفاراح',
 'شفارسح',
 'شفنح',
 'شلردح',
 'شلعابح',
 'شموادح',
 'شنفتح',
 'شپارسح',
 'شپاسح',
 'شپاکساح',
 'شپتروح',
 'شپدیسح',
 'شپلیح',
 'شپناح',
 'شکبیرح',
 'شکربنح',
 'شکفح',
 'شکلرح',
 'شگلح',
 'شیرازح',
 'شیرانح',
 'صباح',
 'غاذرح',
 'غالبرح',
 'غبشهرح',
 'غبهنوشح',
 'غبهپاکح',
 'غدامح',
 'غدشتح',
 'غسالمح',
 'غشاذرح',
 'غشانح',
 'غشصفاح',
 'غشهدابح',
 'غشهدح',
 'غشوکوح',
 'غصینوح',
 'غمارگح',
 'غمشهدح',
 'غمهراح',
 'غمینوح',
 'غنابح',
 'غنوشح',
 'غنیلیح',
 'غویتاح',
 'غپاکح',
 'غپینوح',
 'غچینح',
 'غگرجیح',
 'غگرگح',
 'غگلح',
 'فاذرح',
 'فاراکح',
 'فاسمینح',
 'فافزاح',
 'فالبرح',
 'فالومح',
 'فاماح',
 'فاهوازح',
 'فایراح',
 'فباهنرح',
 'فبیراح',
 'فجامح',
 'فجرح',
 'فجوشح',
 'فخاسح',
 'فخوزح',
 'فرآورح',
 'فرومح',
 'فزرینح',
 'فساح',
 'فسازانح',
 'فسدیدح',
 'فسربح',
 'فسپاح',
 'فلاتح',
 'فلامیح',
 'فلولهح',
 'فماکح',
 'فملیح',
 'فن آواح',
 'فنفتح',
 'فنوالح',
 'فنوردح',
 'فولادح',
 'فولاژح',
 'فولایح',
 'فوکاح',
 'فپنتاح',
 'قاسمح',
 'قجامح',
 'قرنح',
 'قزوینح',
 'قشرینح',
 'قشهدح',
 'قشکرح',
 'قشیرح',
 'قصفهاح',
 'قلرستح',
 'قنقشح',
 'قنیشاح',
 'قپارسح',
 'لابساح',
 'لازماح',
 'لبوتانح',
 'لخانهح',
 'لخزرح',
 'لراداح',
 'لسرماح',
 'لوتوسح',
 'لپیامح',
 'لکماح',
 'مادیراح',
 'مدارانح',
 'مرقامح',
 'معیارح',
 'ملتح',
 'میدکوح',
 'میهنح',
 'نبروجح',
 'نتوسح',
 'نشیراح',
 'نمرینوح',
 'نوینح',
 'نیروح',
 'هجرتح',
 'همراهح',
 'وآذرح',
 'وآرینح',
 'وآفریح',
 'وآیندح',
 'واتیح',
 'وارسح',
 'واعتبارح',
 'والبرح',
 'وامیدح',
 'وانصارح',
 'وبانکح',
 'وبشهرح',
 'وبملتح',
 'وبهمنح',
 'وبوعلیح',
 'وبیمهح',
 'وتجارتح',
 'وتعاونح',
 'وتوسح',
 'وتوسمح',
 'وتوسکاح',
 'وتوشهح',
 'وتوصاح',
 'وتوکاح',
 'وثوقح',
 'وحافظح',
 'وخارزمح',
 'وخاورح',
 'وداناح',
 'ودیح',
 'ورازیح',
 'ورناح',
 'وزمینح',
 'وساختح',
 'وساپاح',
 'وسدیدح',
 'وسرمدح',
 'وسناح',
 'وسپهح',
 'وسکابح',
 'وسیناح',
 'وسینح',
 'وشمالح',
 'وصناح',
 'وصندوقح',
 'وصنعتح',
 'وغدیرح',
 'وقوامح',
 'ولبهمنح',
 'ولتجارح',
 'ولرازح',
 'ولساپاح',
 'ولشرقح',
 'ولصنمح',
 'ولغدرح',
 'ولقمانح',
 'ولملتح',
 'ولیزح',
 'ومعادنح',
 'وملتح',
 'ومللح',
 'وملیح',
 'ونفتح',
 'ونوینح',
 'ونیروح',
 'ونیکیح',
 'وهامونح',
 'وهورح',
 'وپارسح',
 'وپاسارح',
 'وپتروح',
 'وپخشح',
 'وکادوح',
 'وکارح',
 'وکوثرح',
 'وگردشح',
 'پارتاح',
 'پارسانح',
 'پارسیانح',
 'پاساح',
 'پاکشوح',
 'پتایرح',
 'پترولح',
 'پخشح',
 'پدرخشح',
 'پرداختح',
 'پردیسح',
 'پسهندح',
 'پشاهنح',
 'پلاستح',
 'پلاسکح',
 'پلولهح',
 'پنکاح',
 'پکرمانح',
 'پکویرح',
 'پکیانح',
 'چافستح',
 'چفیبرح',
 'چکارلح',
 'چکارنح',
 'چکاوهح',
 'کابگنح',
 'کاذرح',
 'کازروح',
 'کاسپینح',
 'کالبرح',
 'کاماح',
 'کاوهح',
 'کایتاح',
 'کبافقح',
 'کترامح',
 'کتوکاح',
 'کحافظح',
 'کخاکح',
 'کدماح',
 'کرازیح',
 'کرماشاح',
 'کرویح',
 'کزغالح',
 'کساوهح',
 'کساپاح',
 'کسراح',
 'کسرامح',
 'کسعدیح',
 'کطبسح',
 'کفرآورح',
 'کفراح',
 'کفپارسح',
 'کقزویح',
 'کلوندح',
 'کماسهح',
 'کمرجانح',
 'کمنگنزح',
 'کمیناح',
 'کنورح',
 'کهرامح',
 'کهمداح',
 'کوثرح',
 'کورزح',
 'کویرح',
 'کپارسح',
 'کپاناح',
 'کپرورح',
 'کپشیرح',
 'کچادح',
 'کگازح',
 'کگلح',
 'کگهرح',
 'کی بی سیح',
 'کیسونح',
 'گوهرانح',
 'گکیشح']
right_offers = [(i,) for i in right_offers]
right_offers = spark.createDataFrame(data= right_offers,  schema= valid_symbols_df.schema)
valid_symbols_df = valid_symbols_df.union(right_offers).dropDuplicates()
display_df(valid_symbols_df)

handly_collected_valid_symbols = [
    'فسلير',
    'نگین',
    'نیرو',
    'غگز',
    'آینده'
]
handly_collected_valid_symbols = [(i,) for i in handly_collected_valid_symbols]
handly_collected_valid_symbols = spark.createDataFrame(data= handly_collected_valid_symbols,  schema= valid_symbols_df.schema)
valid_symbols_df = valid_symbols_df.union(handly_collected_valid_symbols).dropDuplicates()
display_df(valid_symbols_df)


invalid_symbols = [
                    'وکوثر',
                    'حکمت',
                    'ومهر',
                    'جوین',
                    'وقوام',
                    'ممسنی',
                    'غناب',
                    'کارا',
                    'کاوه',
                    'گنگین',
                    'وپویا',
                    'ولپارس',
                    'نوری',
                    'شجی',
                    'ثعمسا',
                    'بگیلان',
                    'بجهرم',
                    'آرمانح',
                    'شساخت',
                    'ثخوز',
                    'قاروم', # capital increase
                    'امید',
                    'وهنر',
                    'تنوین',
                    'وگردش',
                    'آریان', # two different stocks with the same symbol
                    'وآتوس', 
                    'همراه' # capital increase!
    
]

valid_symbols_df = valid_symbols_df.filter(~F.col('symbol').isin(invalid_symbols))
display_df(valid_symbols_df)

22/02/22 17:16:27 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:16:28 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

782


22/02/22 17:16:30 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:16:31 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB


+------+
|symbol|
+------+
|دسبحان|
|فن آوا|
|دلقما |
+------+
only showing top 3 rows



22/02/22 17:16:32 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:16:33 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

1299


22/02/22 17:16:35 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:16:35 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB


+------+
|symbol|
+------+
|دسبحان|
|فن آوا|
|بکابح |
+------+
only showing top 3 rows



22/02/22 17:16:36 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:16:39 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

1304


22/02/22 17:16:40 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:16:41 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB


+------+
|symbol|
+------+
|دسبحان|
|فن آوا|
|بکابح |
+------+
only showing top 3 rows



22/02/22 17:16:42 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

1282


22/02/22 17:16:44 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:16:44 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB


+------+
|symbol|
+------+
|دسبحان|
|فن آوا|
|بکابح |
+------+
only showing top 3 rows



## data preparation

### prepare trade data

In [14]:
trade_df = (
    raw_trade_df
    .withColumn('secondsWithinDay', modify_time_udf('time'))
    .join(valid_symbols_df, on = ['symbol'], how = 'inner')
    .select(
        'date',
        'time',
        'secondsWithinDay',
        'symbol',
        'nTradeShares',
        'tradePrice',
        'tradeSettlementValue',
        dropSpace(F.col('buyerAccountId')).alias('buyerAccountId'),
        dropSpace(F.col('sellerAccountId')).alias('sellerAccountId')
    )
)

display_df(trade_df)
# Note: 'time' columns is not reliable!

22/02/22 17:16:45 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:16:47 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

26223382


22/02/22 17:17:02 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB


+--------+------+----------------+------+------------+----------+--------------------+------------------------------------+------------------------------------+
|date    |time  |secondsWithinDay|symbol|nTradeShares|tradePrice|tradeSettlementValue|buyerAccountId                      |sellerAccountId                     |
+--------+------+----------------+------+------------+----------+--------------------+------------------------------------+------------------------------------+
|13980110|115203|12963603        |ثامان |24928       |3390.0    |8.450592            |ECC77A89-C9AD-494F-8B49-4F178C2D7F3E|2AF253F2-7044-44EA-858B-09DA6A224E86|
|13980110|120816|12963616        |ثاباد |2000        |2320.0    |0.464               |9288482E-9715-4DE3-AAB6-D20D2FB157DE|02F7AA8E-29E7-4594-ACD4-FDAEE6BA957B|
|13980110|122054|12963654        |آسیا  |7573        |1876.0    |1.4206948           |5D9B391F-C8F1-48E8-A9D0-2215FFCB9FCB|C7470FDD-F4DA-472D-895A-EBD1ED249BAD|
+--------+------+----------------+

In [15]:
print('missing nTradeShares: ', round(trade_df.filter(F.col('nTradeShares') == 0).count() / trade_df.count(), 5))
print('missing tradeSettlementValue: ', round(trade_df.filter(F.col('tradeSettlementValue') == 0).count() / trade_df.count(), 5))

22/02/22 17:17:03 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:17:04 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

missing nTradeShares:  0.0


22/02/22 17:17:06 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:17:07 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB


missing tradeSettlementValue:  0.0


### prepare initial portfolio data

In [16]:
mapping = (
    dict(
    zip(
        ['SPDATE', 'SPSYMB', 'SPACC#', 'SPTROH'],
        ['date', 'symbol', 'accountId', 'nHeldShares'],
    )
    )
)

portfolio_df = (
    raw_portfolio_df
    .select(
    [F.col(c).alias(mapping.get(c, c)) for c in raw_portfolio_df.columns]
    )
    .select(
        'date',
        'symbol',
        dropSpace(F.col('accountId')).alias('accountId'),
        'nHeldShares'
    )
)

portfolio_df = replace_arabic_characters_and_correct_symbol_names(portfolio_df)
display_df(portfolio_df)


replaceChar = F.udf(lambda s: s[:-1], T.StringType())

def agg(x):
    t = ''
    for i in x.split(' '):
        t += i
    return t

def cleaning(data):
    data = (
        data
        .withColumn(
        'symbol',
        F.when(F.col('symbol').endswith('ج'),
         replaceChar(F.col('symbol'))).otherwise(
            F.col('symbol')
        )
        )
    )
    for i in ['اوج', 'بکهنوج', 'ساروج', 'نبروج', 'وسخراج']:
            data = (
                data
                .withColumn(
                    'symbol', F.when(F.col('symbol') == i[:-1], i).otherwise(F.col('symbol'))
                )
            )
    data = data.dropDuplicates()
    return data

portfolio_df = cleaning(portfolio_df)

                                                                                

12109854
+--------+------+------------------------------------+-----------+
|date    |symbol|accountId                           |nHeldShares|
+--------+------+------------------------------------+-----------+
|13980105|خساپا |7432095E-8BDB-41E5-B841-AAD78C46548B|12956      |
|13980105|خساپا |58EC36FA-E468-4D74-9C95-14B630E6C68F|7561       |
|13980105|ثامان |0DCCFEE2-FA77-44D8-A27D-139B38EDD72A|3824       |
+--------+------+------------------------------------+-----------+
only showing top 3 rows



In [17]:
portfolio_df = (
    portfolio_df
    .join(valid_symbols_df, on = ['symbol'], how = 'inner')
    .groupBy(['accountId','date','symbol'])
    .agg(
        F.sum('nHeldShares').alias('nHeldShares')
    )
)

display_df(portfolio_df)

22/02/22 17:17:21 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:17:38 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:17:48 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
                                                                                

8769334


22/02/22 17:17:53 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB


+------------------------------------+--------+------+-----------+
|accountId                           |date    |symbol|nHeldShares|
+------------------------------------+--------+------+-----------+
|D29753D9-5B38-4CCE-8AEB-51AA71A12851|13980105|توریل |22970      |
|24AC333D-8E93-42F6-8CD4-C1D8AE4AA12A|13980105|وایران|5449       |
|3494AD79-A6C4-4864-854B-7951C2049D9C|13980105|کهرام |948        |
+------------------------------------+--------+------+-----------+
only showing top 3 rows



In [18]:
(
    portfolio_df
    .filter(F.col('nHeldShares') < 0)
    .count()
)

22/02/22 17:17:54 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
                                                                                

0

#### check symbols

In [19]:
price_symbols = price_df.select('symbol').distinct().withColumn('price', F.lit(1))
trade_symbols = trade_df.select('symbol').distinct().withColumn('trade', F.lit(1))
portfolio_symbols = portfolio_df.select('symbol').distinct().withColumn('portfolio', F.lit(1))

symbols_df = (
    trade_symbols
    .join(portfolio_symbols, on = ['symbol'], how = 'outer')
    .join(price_symbols, on = ['symbol'], how = 'outer')
)

print(symbols_df.filter(F.col('price').isNull()).select('symbol').rdd.flatMap(lambda x: x).collect())

22/02/22 17:17:57 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:17:57 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:17:58 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:18:10 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:18:15 WARN DAGScheduler: Broadcasting large task binary with size 35.9 MiB


[]


                                                                                

In [20]:
w = Window().partitionBy("symbol").orderBy('date')
w2 = Window().partitionBy("symbol").orderBy(price_df.date.desc())

price_return_df = (price_df.select('symbol', 'mktcap',
   ((
       F.first("close_price",True).over(w2) - F.first("close_price",True).over(w)
       ) 
       / F.first("close_price",True).over(w)
       ).alias('price_return'),
       )
   .dropDuplicates(['symbol'])
)
tempt = price_df.na.drop(
    how = 'any',
)
large_small_stocks =( 
    tempt[tempt.date    == 13980328]
    .withColumn('sizeDecile', F.ntile(10).over(Window.partitionBy().orderBy('mktcap')))
) 
price_return_df = price_return_df .join(
    large_small_stocks.select(
        F.col('symbol'),
    F.col('sizeDecile'),
    ) , on =['symbol']
).select(
    F.col('symbol'),
    F.col('mktcap'),
    F.col('price_return'),
    F.col('sizeDecile'),
)


In [21]:
(
    price_return_df
    .groupBy('sizeDecile')
    .agg(
        F.round(F.expr('percentile(mktcap, array(0.5))')[0], 3).alias('medianmktcap'),
        F.round(F.mean('price_return'), 2).alias('meanReturn'),
        F.round(F.expr('percentile(price_return, array(0.5))')[0], 3).alias('medianReturn')
    )
    .orderBy('sizeDecile')
    .show()
)

22/02/22 17:18:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:18:17 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:18:18 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:18:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:18:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:18:26 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:18:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:18:28 WARN DAGScheduler: Broadcasting large task binary with size 1

+----------+------------+----------+------------+
|sizeDecile|medianmktcap|meanReturn|medianReturn|
+----------+------------+----------+------------+
|         1|    33021.78|      0.52|       0.044|
|         2|     69850.0|      0.49|       0.389|
|         3|    111675.0|       0.6|       0.493|
|         4|    147260.0|      0.61|       0.514|
|         5|    203036.0|      0.62|       0.562|
|         6|    296800.0|      0.57|       0.519|
|         7|    435278.0|      0.69|       0.545|
|         8|    703566.0|      0.78|       0.558|
|         9|   1570800.0|      0.62|       0.383|
|        10|   8119500.0|      0.25|       0.205|
+----------+------------+----------+------------+



### general insights

#### check compatibility of the two datasets

In [22]:
common_investors_df = (
    trade_df
    .select(F.col('buyerAccountId').alias('accountId'))
    .union(trade_df.select(F.col('sellerAccountId').alias('accountId')))
    .dropDuplicates()
    .withColumn('trade', F.lit(1))
    .join(portfolio_df.select('accountId', F.lit(1).alias('portfolio')).dropDuplicates(), on = ['accountId'], how = 'outer')
    .fillna(0, subset = ['trade', 'portfolio'])
)

display_df(common_investors_df)

22/02/22 17:18:32 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:18:33 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:18:43 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:18:53 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:19:19 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

4371963


22/02/22 17:19:25 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB


+------------------------------------+-----+---------+
|accountId                           |trade|portfolio|
+------------------------------------+-----+---------+
|00019AFD-C89B-4B63-98BB-18BF5A112C6F|0    |1        |
|000217BA-3C48-4458-8CAA-691CA19C7187|0    |1        |
|000249D5-649B-4C99-AE9E-82AB979E80C9|0    |1        |
+------------------------------------+-----+---------+
only showing top 3 rows



In [23]:
trade_only = common_investors_df.filter( (F.col('trade') == 1) & (F.col('portfolio') == 0)).count()
all_trade = common_investors_df.filter(F.col('trade') == 1).count()

print('share of missing portfolio accounts among traders:', round(100 * trade_only / all_trade, 2), '%')
# It seems reasonable to attribute this missing portion to the new entrants!

22/02/22 17:19:27 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:19:31 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB

share of missing portfolio accounts among traders: 21.13 %


                                                                                

In [24]:
portfolio_only = common_investors_df.filter( (F.col('trade') == 0) & (F.col('portfolio') == 1)).count()
all_portfolio = common_investors_df.filter(F.col('portfolio') == 1).count()

print('share of missing trades among investors who have nitial portfolio:', round(100 * portfolio_only / all_portfolio, 2), '%')
# It seems reasonable to attribute this missing portion to the new entrants!

22/02/22 17:19:34 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:19:38 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB

share of missing trades among investors who have nitial portfolio: 86.21 %


                                                                                

#### number of unique investors

In [25]:
(
    trade_df
    .select(F.col('buyerAccountId').alias('accountId'))
    .union(trade_df.select(F.col('sellerAccountId').alias('accountId')))
    .dropDuplicates()
    .count()
)

22/02/22 17:19:40 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:19:48 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:19:51 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:20:04 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

737297

#### number of stocks within investors' initial portfolios

In [26]:
(
    portfolio_df
    .groupBy('accountId')
    .count()
    .agg(
        F.expr('percentile(count, array(0.25))')[0].alias('25%'),
        F.expr('percentile(count, array(0.50))')[0].alias('50%'),
        F.round(F.mean('count'), 4).alias('mean'),
        F.expr('percentile(count, array(0.75))')[0].alias('75%'),
        F.expr('percentile(count, array(0.9))')[0].alias('90%'),
        F.expr('percentile(count, array(0.99))')[0].alias('99%'),
    )
    .show()
)

22/02/22 17:20:06 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:20:14 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:20:17 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB


+---+---+------+---+---+----+
|25%|50%|  mean|75%|90%| 99%|
+---+---+------+---+---+----+
|1.0|1.0|2.0799|2.0|4.0|15.0|
+---+---+------+---+---+----+



                                                                                

#### distribution of trade value

In [27]:
(
    trade_df
    .agg(
        F.expr('percentile(tradeSettlementValue, array(0.1))')[0].alias('10%'),
        F.expr('percentile(tradeSettlementValue, array(0.25))')[0].alias('25%'),
        F.expr('percentile(tradeSettlementValue, array(0.50))')[0].alias('50%'),
        F.round(F.mean('tradeSettlementValue'), 4).alias('mean'),
        F.expr('percentile(tradeSettlementValue, array(0.75))')[0].alias('75%'),
        F.expr('percentile(tradeSettlementValue, array(0.90))')[0].alias('90%'),
        F.expr('percentile(tradeSettlementValue, array(0.95))')[0].alias('95%'),
        F.expr('percentile(tradeSettlementValue, array(0.99))')[0].alias('99%')
    )
    .show()
)

22/02/22 17:20:18 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:20:25 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
[Stage 376:>                                                        (0 + 1) / 1]

+-------+-------+-------+------+-----------+------------------+------+-----------------+
|    10%|    25%|    50%|  mean|        75%|               90%|   95%|              99%|
+-------+-------+-------+------+-----------+------------------+------+-----------------+
|0.09855|0.25481|1.00125|3.3809|2.965605225|7.5548333000000145|12.528|32.33864667300008|
+-------+-------+-------+------+-----------+------------------+------+-----------------+



                                                                                

#### compare trade value of new entrants with other investors

In [28]:
(
    common_investors_df
    .select(F.col('accountId').alias('buyerAccountId'), F.col('portfolio').alias('hasPortfolio'))
    .join(trade_df, on = ['buyerAccountId'], how = 'right')
    .groupBy('hasPortfolio')
    .agg(
        F.round(F.expr('percentile(tradeSettlementValue, array(0.5))')[0], 2).alias('median_buyValue'),
        F.round(F.mean('tradeSettlementValue'), 2).alias('mean_buyValue')
    )
    .show()
)

22/02/22 17:25:42 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:25:47 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:25:59 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:26:06 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:26:49 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
[Stage 429:>                                                        (0 + 1) / 1]

+------------+---------------+-------------+
|hasPortfolio|median_buyValue|mean_buyValue|
+------------+---------------+-------------+
|           1|            1.0|         3.44|
|           0|           0.78|         2.57|
+------------+---------------+-------------+



                                                                                

## make daily portfolios

### flatten trade data

In [29]:
buy_trade_df = (
    trade_df
        .select(
        'date',
        'symbol',
        F.col('buyerAccountId').alias('accountId'),
        'nTradeShares',
        (-F.col('tradeSettlementValue')).alias('settlementValue'),
        )
)

sell_trade_df = (
    trade_df
        .select(
            'date',
            'symbol',
            F.col('sellerAccountId').alias('accountId'),
            (-F.col('nTradeShares')).alias('nTradeShares'),
            F.col('tradeSettlementValue').alias('settlementValue')
        )
)

raw_flat_trade_df = (
    buy_trade_df
    .union(sell_trade_df)
    .groupBy(['date', 'symbol', 'accountId'])
    .agg(
        F.sum('nTradeShares').alias('nTradeShares'),
        F.sum(F.when(F.col('settlementValue') > 0, F.col('settlementValue'))).alias('cashOut'),
        F.sum(F.when(F.col('settlementValue') < 0, F.col('settlementValue'))).alias('cashIn')
    )
    .fillna(0, subset = ['cashOut', 'cashIn'])
    .orderBy('date', 'accountId')
)

display_df(raw_flat_trade_df)

22/02/22 17:26:56 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:27:10 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:29:09 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:29:12 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:29:30 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

15170298


22/02/22 17:29:35 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB


+--------+------+------------------------------------+------------+---------+-------+
|date    |symbol|accountId                           |nTradeShares|cashOut  |cashIn |
+--------+------+------------------------------------+------------+---------+-------+
|13980105|غصینو |0000448F-BDB5-48FF-9539-DB56E27ACCF9|1775        |0.0      |-1.42  |
|13980105|ثنام  |0002412C-87BD-4EEB-8E14-00FCD817C052|7105        |0.0      |-0.9947|
|13980105|ومعلم |0002412C-87BD-4EEB-8E14-00FCD817C052|-229        |0.0717915|0.0    |
+--------+------+------------------------------------+------------+---------+-------+
only showing top 3 rows



In [30]:
print(round(100*raw_flat_trade_df.filter(
    (F.col('cashIn') != 0)&
    (F.col('cashOut') != 0) ).count() / raw_flat_trade_df.count(), 2),'%')

22/02/22 17:29:36 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:29:38 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB

3.42 %


                                                                                

In [31]:
print(raw_flat_trade_df.filter(F.col('nTradeShares') == 0).count())
print(trade_df.filter(F.col('tradeSettlementValue') == 0).count())

22/02/22 17:29:40 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

187070


22/02/22 17:29:42 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB

0


                                                                                

In [32]:
print(raw_flat_trade_df.filter(F.col('cashIn') > 0 ).count())
print(raw_flat_trade_df.filter(F.col('cashOut') < 0 ).count())

22/02/22 17:29:44 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

0


22/02/22 17:29:45 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB

0


                                                                                

### Active days

In [33]:
# active_days_df = (
#     raw_flat_trade_df
#     .groupBy('accountId', 'date')
#     .agg(
#         F.sum('cashIn').alias('netCashIn'),
#         F.sum('cashOut').alias('netCashOut'),
#     )
#     .withColumn('netCash', F.col('netCashIn') + F.col('netCashOut'))
#     .groupBy('accountId')
#     .agg(
#         F.count(F.when(F.col('netCash') < 0, F.lit(1))).alias('nBuyDays'),
#         F.count(F.when(F.col('netCash') > 0, F.lit(1))).alias('nSellDays'),
#     )
#     .fillna(0, subset = ['nBuyDays', 'nSellDays'])
# )

# display_df(active_days_df)

In [34]:
# active_days_df.write.mode('overwrite').parquet('/home/user1/Data/activeDays.parquet')

### make daily portfolios

In [35]:
def make_daily_portfolio():
    window = (
        Window.partitionBy('accountId', 'symbol')
        .orderBy('date')
        .rowsBetween(Window.unboundedPreceding, Window.currentRow)
    )
    return (F.sum('nHeldShares').over(window), F.sum('cashOut').over(window), F.sum('cashIn').over(window))

raw_daily_portfolio_df = (
    portfolio_df
    .select('date',
            'symbol', 
            'accountId', 
            'nHeldShares', 
            F.lit(0).alias('cashOut'),
            F.lit(0).alias('cashIn')
           )
    .union(
        raw_flat_trade_df
        .withColumnRenamed('nTradeShares', 'nHeldShares')
    )
    .groupBy('date', 'symbol', 'accountId')
    .agg(
        F.sum('nHeldShares').alias('nHeldShares'),
        F.sum('cashOut').alias('cashOut'),
        F.sum('cashIn').alias('cashIn')
    )
    .orderBy('accountId', 'date')
    .withColumn('heldShares', make_daily_portfolio()[0])
    .withColumn('netCashOut', make_daily_portfolio()[1])
    .withColumn('netCashIn', make_daily_portfolio()[2])
    .drop('nHeldShares', 'settlementValue', 'cashIn', 'cashOut')
)

display_df(raw_daily_portfolio_df)

22/02/22 17:29:51 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:30:28 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:30:49 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:30:53 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:31:20 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:31:48 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

23765673


22/02/22 17:31:59 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB


+--------+------+------------------------------------+----------+----------+---------+
|date    |symbol|accountId                           |heldShares|netCashOut|netCashIn|
+--------+------+------------------------------------+----------+----------+---------+
|13980105|چکاپا |00026661-733B-49E0-AC93-ED5812290A9B|1740      |0.0       |0.0      |
|13980221|چکاپا |00026661-733B-49E0-AC93-ED5812290A9B|0         |0.83346   |0.0      |
|13980105|ارفع  |00029878-6EF9-49A7-B231-61DBED05BDE7|32796     |0.0       |0.0      |
+--------+------+------------------------------------+----------+----------+---------+
only showing top 3 rows



#### invalid holdings

In [36]:
invalid_holdings_df = (
    raw_daily_portfolio_df
    .filter(F.col('heldShares') < 0)
    .select('accountId', 'symbol')
    .dropDuplicates()
    .withColumn('invalidHolding', F.lit(1))
)
display_df(invalid_holdings_df)


flat_trade_df = (
    raw_flat_trade_df
    .join(invalid_holdings_df, on = ['accountId', 'symbol'], how = 'left')
    .filter(F.col('invalidHolding').isNull())
    .drop('invalidHolding')
)
display_df(flat_trade_df)


daily_portfolio_df = (
    raw_daily_portfolio_df
    .join(invalid_holdings_df, on = ['accountId', 'symbol'], how = 'left')
    .filter(F.col('invalidHolding').isNull())
    .drop('invalidHolding')
)
display_df(daily_portfolio_df)

22/02/22 17:32:03 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

171376


22/02/22 17:32:06 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB


+------------------------------------+-------+--------------+
|accountId                           |symbol |invalidHolding|
+------------------------------------+-------+--------------+
|006BE8C1-A950-44B8-B2B5-FD293FAFD7B6|وساپا  |1             |
|0122EC9B-B0A1-4510-8D41-55731733AE20|ومعلم  |1             |
|01ED3611-405C-432F-B2A0-E0CC337FA36F|فرابورس|1             |
+------------------------------------+-------+--------------+
only showing top 3 rows



22/02/22 17:32:09 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:32:11 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                

14601858


22/02/22 17:32:16 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB


+------------------------------------+------+--------+------------+---------+-------+
|accountId                           |symbol|date    |nTradeShares|cashOut  |cashIn |
+------------------------------------+------+--------+------------+---------+-------+
|0000448F-BDB5-48FF-9539-DB56E27ACCF9|غصینو |13980105|1775        |0.0      |-1.42  |
|0002412C-87BD-4EEB-8E14-00FCD817C052|ثنام  |13980105|7105        |0.0      |-0.9947|
|0002412C-87BD-4EEB-8E14-00FCD817C052|ومعلم |13980105|-229        |0.0717915|0.0    |
+------------------------------------+------+--------+------------+---------+-------+
only showing top 3 rows



22/02/22 17:32:20 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:32:21 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

23160625


22/02/22 17:32:29 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB


+------------------------------------+------+--------+----------+----------+---------+
|accountId                           |symbol|date    |heldShares|netCashOut|netCashIn|
+------------------------------------+------+--------+----------+----------+---------+
|00026661-733B-49E0-AC93-ED5812290A9B|چکاپا |13980105|1740      |0.0       |0.0      |
|00026661-733B-49E0-AC93-ED5812290A9B|چکاپا |13980221|0         |0.83346   |0.0      |
|00029878-6EF9-49A7-B231-61DBED05BDE7|ارفع  |13980105|32796     |0.0       |0.0      |
+------------------------------------+------+--------+----------+----------+---------+
only showing top 3 rows



### calculate gain from trade

In [37]:
gain_from_trade_df = (
    flat_trade_df
    .groupBy('accountId')
    .agg(
        F.sum('cashOut').alias('netCashOut'),
        F.sum('cashIn').alias('netCashIn'),
    )
#     .withColumn('sumTradeValue', F.col('netCashOut') + F.col('netCashIn'))
#     .withColumn('absSumTradeValue', F.col('netCashOut') - F.col('netCashIn'))
)

display_df(gain_from_trade_df)

22/02/22 17:32:31 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:32:39 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
                                                                                

727023


22/02/22 17:32:42 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB


+------------------------------------+------------------+-------------------+
|accountId                           |netCashOut        |netCashIn          |
+------------------------------------+------------------+-------------------+
|00359890-DDBC-4ED6-85EF-6265CF05DF25|20.6662591        |-46.4365793        |
|018942A6-B259-41C2-951E-0EB9BB59A917|434.3139919       |-485.2588124       |
|019CED96-7DAB-4AEA-88C0-0FBA460524CD|443.46564280000007|-400.21939330000015|
+------------------------------------+------------------+-------------------+
only showing top 3 rows



In [38]:
# print(round(gain_from_trade_df.filter(F.col('sumTradeValue') < 0).count() / gain_from_trade_df.count() , 2))

### calculate value of the initial portfolio

In [39]:
initial_portfolio_value_df = (
    portfolio_df
    .join(price_df.select('date', 'symbol', 'close_price'), on = ['date', 'symbol'], how = 'left')
    .dropna(subset = ['close_price'])
    .join(invalid_holdings_df, on = ['accountId', 'symbol'], how = 'left')
    .filter(F.col('invalidHolding').isNull())
    .withColumn('value', F.col('nHeldShares') * F.col('close_price'))
    .groupBy('accountId')
    .agg(
        (F.sum('value') / 10**7).alias('initialPortfolioValue')
    )
)

display_df(initial_portfolio_value_df)
# count after join?

22/02/22 17:32:46 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:32:49 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:32:49 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:33:02 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
22/02/22 17:33:12 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
                                                                                

4215194


22/02/22 17:33:17 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB


+------------------------------------+---------------------+
|accountId                           |initialPortfolioValue|
+------------------------------------+---------------------+
|C25D209E-2DCB-410C-9106-59E3C3E4C65A|97.1563532           |
|0EAD58BA-E047-4097-B5D8-C3EF111727CE|253.3049908          |
|4DFC1FC2-675B-4AB7-84A3-A54BE91FFF72|1.4946859            |
+------------------------------------+---------------------+
only showing top 3 rows



                                                                                

In [40]:
print(initial_portfolio_value_df.filter(F.col('initialPortfolioValue').isNull()).count())
print(initial_portfolio_value_df.filter(F.col('initialPortfolioValue') <= 0).count())

22/02/22 17:33:20 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
                                                                                

0


22/02/22 17:33:26 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB

0


                                                                                

In [41]:
(
    initial_portfolio_value_df
    .agg(
        F.round(F.min('initialPortfolioValue'), 2).alias('min'),
        F.round(F.expr('percentile(initialPortfolioValue, array(0.01))')[0], 2).alias('1%'),
        F.round(F.expr('percentile(initialPortfolioValue, array(0.25))')[0], 2).alias('25%'),
        F.round(F.expr('percentile(initialPortfolioValue, array(0.5))')[0], 2).alias('50%'),
        F.round(F.mean('initialPortfolioValue'), 2).alias('mean'),
        F.round(F.expr('percentile(initialPortfolioValue, array(0.75))')[0], 2).alias('75%'),
        F.round(F.expr('percentile(initialPortfolioValue, array(0.9))')[0], 2).alias('90%'),
        F.round(F.expr('percentile(initialPortfolioValue, array(0.99))')[0], 2).alias('99%'),
        F.round(F.expr('percentile(initialPortfolioValue, array(0.999))')[0], 2).alias('99.9%'),
    )
    .show()
)

22/02/22 17:33:32 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
22/02/22 17:33:37 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
[Stage 841:>                                                        (0 + 1) / 1]

+---+----+----+----+-----+----+-----+------+-------+
|min|  1%| 25%| 50%| mean| 75%|  90%|   99%|  99.9%|
+---+----+----+----+-----+----+-----+------+-------+
|0.0|0.01|0.22|0.77|202.0|3.36|13.91|246.35|7846.94|
+---+----+----+----+-----+----+-----+------+-------+



                                                                                

In [42]:
# initial_portfolio_value_df.write.mode('overwrite').parquet('/home/user1/Data/initial_portfolio_value_df.parquet')

### calculate value of the final portfolio

In [43]:
final_portfolio_value_df = (
    daily_portfolio_df
    .withColumn('rowNumber', F.row_number().over(Window.partitionBy('accountId', 'symbol').orderBy('date')))
    .withColumn('maxRowNumber', F.max('rowNumber').over(Window.partitionBy('accountId', 'symbol')))
    .filter(F.col('rowNumber') == F.col('maxRowNumber'))
    .filter(F.col('heldShares') > 0)
    .withColumn('date', F.lit(MAX_PRICE_DATE))
    .join(price_df.select('date', 'symbol', 'close_price'), on = ['date', 'symbol'], how = 'left')
    .dropna(subset = ['close_price'])
    .withColumn('value', F.col('heldShares') * F.col('close_price'))
    .groupBy('accountId')
    .agg(
        (F.sum('value') / 10**7).alias('finalPortfolioValue')
    )   
)

display_df(final_portfolio_value_df)
# count after join?

22/02/22 17:34:25 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:34:26 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:34:36 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
22/02/22 17:34:45 WARN DAGScheduler: Broadcasting large task binary with size 35.9 MiB
                                                                                

4313618


22/02/22 17:34:51 WARN DAGScheduler: Broadcasting large task binary with size 35.9 MiB


+------------------------------------+-------------------+
|accountId                           |finalPortfolioValue|
+------------------------------------+-------------------+
|5B291534-2FE5-4A4A-B75B-B4D44060A1AF|30.643591          |
|0F6CC9D2-EF36-4537-BEBF-32F9386A468B|55.4984274         |
|33046B40-C4EB-4CEE-974F-D6F1185F3434|15.5446366         |
+------------------------------------+-------------------+
only showing top 3 rows



                                                                                

In [44]:
print(final_portfolio_value_df.filter(F.col('finalPortfolioValue').isNull()).count())
print(final_portfolio_value_df.filter(F.col('finalPortfolioValue') <= 0).count())

22/02/22 17:34:54 WARN DAGScheduler: Broadcasting large task binary with size 35.9 MiB
                                                                                

0


22/02/22 17:35:00 WARN DAGScheduler: Broadcasting large task binary with size 35.9 MiB

0


                                                                                

In [45]:
(
    final_portfolio_value_df
    .agg(
        F.round(F.min('finalPortfolioValue'), 2).alias('min'),
        F.round(F.expr('percentile(finalPortfolioValue, array(0.1))')[0], 2).alias('10%'),
        F.round(F.expr('percentile(finalPortfolioValue, array(0.25))')[0], 2).alias('25%'),
        F.round(F.expr('percentile(finalPortfolioValue, array(0.5))')[0], 2).alias('50%'),
        F.round(F.mean('finalPortfolioValue'), 2).alias('mean'),
        F.round(F.expr('percentile(finalPortfolioValue, array(0.75))')[0], 2).alias('75%'),
        F.round(F.expr('percentile(finalPortfolioValue, array(0.9))')[0], 2).alias('90%'),
        F.round(F.expr('percentile(finalPortfolioValue, array(0.99))')[0], 2).alias('99%'),
        F.round(F.expr('percentile(finalPortfolioValue, array(0.999))')[0], 2).alias('99.9%'),
    )
    .show()
)

22/02/22 17:35:07 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
22/02/22 17:35:11 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
[Stage 1007:>                                                       (0 + 1) / 1]

+---+---+----+----+------+----+-----+------+--------+
|min|10%| 25%| 50%|  mean| 75%|  90%|   99%|   99.9%|
+---+---+----+----+------+----+-----+------+--------+
|0.0|0.1|0.39|1.31|253.34|5.34|20.99|359.29|10445.95|
+---+---+----+----+------+----+-----+------+--------+



                                                                                

In [46]:
# final_portfolio_value_df.write.mode('overwrite').parquet('/home/user1/Data/final_portfolio_value_df.parquet')

## calculate returns

In [47]:
return_df = (
    gain_from_trade_df
    .join(initial_portfolio_value_df, on = 'accountId', how = 'outer')
    .join(final_portfolio_value_df, on = 'accountId', how = 'outer')
    .fillna(0, subset = ['netCashIn', 'netCashOut', 'initialPortfolioValue', 'finalPortfolioValue'])
    .withColumn('return', 
                ((F.col('finalPortfolioValue') + F.col('netCashOut')) / (F.col('initialPortfolioValue') + (-F.col('netCashIn')))) - 1)
    .filter(F.col('return').isNotNull())
    .withColumn('returnDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('return')))
)

display_df(return_df)
# null returns?

22/02/22 17:35:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:36:04 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
22/02/22 17:36:08 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
22/02/22 17:36:11 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
                                                                                

4362652


22/02/22 17:36:27 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
[Stage 1059:>                                                       (0 + 1) / 1]

+------------------------------------+-------------+---------+---------------------+-------------------+-------------------+------------+
|accountId                           |netCashOut   |netCashIn|initialPortfolioValue|finalPortfolioValue|return             |returnDecile|
+------------------------------------+-------------+---------+---------------------+-------------------+-------------------+------------+
|92C26116-9961-4133-BAD7-793AA7BEF8F1|16183.5065146|0.0      |106698.0908808       |0.0                |-0.8483243103882736|1           |
|D227A812-A177-4D00-8C99-2F95F6229465|0.0          |-1762.5  |379.5506             |765.051            |-0.64284177040449  |1           |
|FD0C12C3-368D-4EF8-A6C5-DA93CD0DF1BC|0.0          |-1762.5  |379.5506             |765.051            |-0.64284177040449  |1           |
+------------------------------------+-------------+---------+---------------------+-------------------+-------------------+------------+
only showing top 3 rows



                                                                                

In [48]:
(
    return_df
    .filter(F.col('return') == 0)
    .count()
)


22/02/22 17:36:33 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
                                                                                

100530

In [49]:
(
    return_df
    .groupBy('returnDecile')
    .agg(
        F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianReturn')
    )
    .show()
)

22/02/22 17:36:40 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
[Stage 1111:>                                                       (0 + 1) / 1]

+------------+------------+
|returnDecile|medianReturn|
+------------+------------+
|           1|      -0.003|
|           2|       0.074|
|           3|       0.125|
|           4|       0.199|
|           5|       0.234|
|           6|       0.366|
|           7|       0.456|
|           8|       0.568|
|           9|       0.778|
|          10|       1.328|
+------------+------------+



                                                                                

In [50]:
(
    return_df
    .agg(
       F.round(F.min('return'), 2).alias('min'),
        F.round(F.expr('percentile(return, array(0.01))')[0], 2).alias('1%'),
        F.round(F.expr('percentile(return, array(0.1))')[0], 2).alias('10%'),
        F.round(F.expr('percentile(return, array(0.25))')[0], 2).alias('25%'),
        F.round(F.expr('percentile(return, array(0.5))')[0], 2).alias('50%'),
        F.round(F.mean('return'), 2).alias('mean'),
        F.round(F.expr('percentile(return, array(0.75))')[0], 2).alias('75%'),
        F.round(F.expr('percentile(return, array(0.9))')[0], 2).alias('90%'),
        F.round(F.expr('percentile(return, array(0.99))')[0], 2).alias('99%'),
        F.round(F.expr('percentile(return, array(0.999))')[0], 2).alias('99.9%'),
    )
    .show()
)

22/02/22 17:36:52 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
[Stage 1137:>                                                       (0 + 1) / 1]

+-----+-----+----+----+----+----+----+----+----+-----+
|  min|   1%| 10%| 25%| 50%|mean| 75%| 90%| 99%|99.9%|
+-----+-----+----+----+----+----+----+----+----+-----+
|-0.85|-0.08|0.04|0.12|0.29|0.51|0.57|0.97|5.21| 5.34|
+-----+-----+----+----+----+----+----+----+----+-----+



                                                                                

In [51]:
return_df.write.mode('overwrite').parquet('/home/user1/Data/Esmaeil/return_output.parquet')

22/02/22 17:37:55 WARN DAGScheduler: Broadcasting large task binary with size 36.3 MiB
                                                                                

### final portfolio value output

In [52]:
output_final_portfolio_value = (
    final_portfolio_value_df
    .join(return_df.select('accountId', 'return'), on = 'accountId')
    .withColumn('finalPortfolioValueDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('finalPortfolioValue')))
)

display_df(output_final_portfolio_value)

22/02/22 17:38:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:38:13 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
22/02/22 17:38:16 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
22/02/22 17:38:20 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
                                                                                

4313613


22/02/22 17:38:35 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB


+------------------------------------+-------------------+-------------------+-------------------------+
|accountId                           |finalPortfolioValue|return             |finalPortfolioValueDecile|
+------------------------------------+-------------------+-------------------+-------------------------+
|82F6B832-AF99-4DEF-B80B-EAEA734EC0B3|1.7E-6             |0.14925041323980337|1                        |
|CEBDD90D-C07D-4500-A436-32550D6E9556|2.52E-5            |0.0                |1                        |
|16900519-92CB-4EF5-BCC5-87B095791742|2.52E-5            |0.0                |1                        |
+------------------------------------+-------------------+-------------------+-------------------------+
only showing top 3 rows



                                                                                

In [53]:
(
    output_final_portfolio_value
    .groupBy('finalPortfolioValueDecile')
    .agg(
        F.round(F.expr('percentile(finalPortfolioValue, array(0.5))')[0], 3).alias('medianFinalPortfolioValue'),
        F.round(F.mean('return'), 2).alias('meanReturn'),
        F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianReturn')
    )
    .orderBy('finalPortfolioValueDecile')
    .show()
)

22/02/22 17:38:43 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
[Stage 1247:>                                                       (0 + 1) / 1]

+-------------------------+-------------------------+----------+------------+
|finalPortfolioValueDecile|medianFinalPortfolioValue|meanReturn|medianReturn|
+-------------------------+-------------------------+----------+------------+
|                        1|                    0.028|      0.22|       0.074|
|                        2|                    0.176|      0.39|       0.366|
|                        3|                    0.392|      0.41|       0.366|
|                        4|                    0.605|      0.42|       0.366|
|                        5|                    1.097|      0.42|       0.222|
|                        6|                    1.685|      0.79|       0.337|
|                        7|                    2.848|      0.72|       0.335|
|                        8|                    5.343|      0.73|       0.415|
|                        9|                   11.776|      0.62|        0.37|
|                       10|                   55.482|      0.41|

                                                                                

In [54]:
output_final_portfolio_value.write.mode('overwrite').parquet('/home/user1/Data/Esmaeil/final_portfolio_output.parquet')

22/02/22 17:38:58 WARN DAGScheduler: Broadcasting large task binary with size 36.3 MiB
                                                                                

### initial portfolio value output

In [55]:
output_initial_portfolio_value = (
    initial_portfolio_value_df
    .join(return_df.select('accountId', 'return'), on = 'accountId')
    .withColumn('initialPortfolioValueDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('initialPortfolioValue')))
)

display_df(output_initial_portfolio_value)

22/02/22 17:39:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:39:12 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
22/02/22 17:39:16 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
22/02/22 17:39:19 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
                                                                                

4215194


22/02/22 17:39:34 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB


+------------------------------------+---------------------+-------------------+---------------------------+
|accountId                           |initialPortfolioValue|return             |initialPortfolioValueDecile|
+------------------------------------+---------------------+-------------------+---------------------------+
|82F6B832-AF99-4DEF-B80B-EAEA734EC0B3|1.7E-6               |0.14925041323980337|1                          |
|CEBDD90D-C07D-4500-A436-32550D6E9556|2.52E-5              |0.0                |1                          |
|16900519-92CB-4EF5-BCC5-87B095791742|2.52E-5              |0.0                |1                          |
+------------------------------------+---------------------+-------------------+---------------------------+
only showing top 3 rows



                                                                                

In [56]:
(
    output_initial_portfolio_value
    .groupBy('initialPortfolioValueDecile')
    .agg(
        F.round(F.expr('percentile(initialPortfolioValue, array(0.5))')[0], 3).alias('medianInitialPortfolioValue'),
        F.round(F.mean('return'), 2).alias('meanReturn'),
        F.round(F.expr('percentile(return, array(0.5))')[0], 5).alias('medianReturn')
    )
    .orderBy('initialPortfolioValueDecile')
    .show()
)

22/02/22 17:39:43 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
[Stage 1359:>                                                       (0 + 1) / 1]

+---------------------------+---------------------------+----------+------------+
|initialPortfolioValueDecile|medianInitialPortfolioValue|meanReturn|medianReturn|
+---------------------------+---------------------------+----------+------------+
|                          1|                      0.026|       0.3|     0.07377|
|                          2|                      0.122|      0.43|     0.36643|
|                          3|                      0.225|      0.75|     0.32882|
|                          4|                      0.379|      0.73|     0.43381|
|                          5|                      0.563|      0.51|     0.31347|
|                          6|                      1.032|      0.68|     0.38178|
|                          7|                      1.816|      0.58|     0.32526|
|                          8|                      3.359|      0.49|     0.37011|
|                          9|                      7.661|      0.41|     0.34505|
|               

                                                                                

In [57]:
output_initial_portfolio_value.write.mode('overwrite').parquet('/home/user1/Data/Esmaeil/inital_portfolio_output.parquet')

22/02/22 17:39:57 WARN DAGScheduler: Broadcasting large task binary with size 36.3 MiB
                                                                                

### calculate frequency of trades and active days

In [58]:
active_days_df = (
    raw_flat_trade_df
    .groupBy('accountId', 'date')
    .agg(
        F.sum('cashIn').alias('netCashIn'),
        F.sum('cashOut').alias('netCashOut')
    )
    .withColumn('netCash', F.col('netCashIn') + F.col('netCashOut'))
    .groupBy('accountId')
    .agg(
        F.count(F.when(F.col('netCash') < 0, F.lit(1))).alias('nBuyDays'),
        F.count(F.when(F.col('netCash') > 0, F.lit(1))).alias('nSellDays')
    )
    .fillna(0, subset = ['nBuyDays', 'nSellDays'])
)

display_df(active_days_df)

22/02/22 17:40:04 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:40:15 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
                                                                                

737297


22/02/22 17:40:17 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB


+------------------------------------+--------+---------+
|accountId                           |nBuyDays|nSellDays|
+------------------------------------+--------+---------+
|00359890-DDBC-4ED6-85EF-6265CF05DF25|12      |6        |
|018942A6-B259-41C2-951E-0EB9BB59A917|12      |14       |
|019CED96-7DAB-4AEA-88C0-0FBA460524CD|12      |14       |
+------------------------------------+--------+---------+
only showing top 3 rows



In [59]:
trade_kpi_df = (
    buy_trade_df
    .union(sell_trade_df)
    .groupBy('accountId')
    .agg(
        F.count(F.lit(1)).alias('tradeFrequency'),
        F.mean(F.abs('settlementValue')).alias('meanTradeValue'),
        F.sum('settlementValue').alias('netSumTradeValue'),
        F.sum(F.abs('settlementValue')).alias('absSumTradeValue'),
        F.countDistinct('date').alias('activeDays'),
    )
    .join(active_days_df, on = 'accountId')
)

display_df(trade_kpi_df)

22/02/22 17:40:20 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:41:30 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:44:13 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 128689 ms exceeds timeout 120000 ms
22/02/22 17:44:21 WARN SparkContext: Killing executors is not supported by current scheduler.
22/02/22 17:45:38 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:45:48 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
                                                                                

737297


22/02/22 17:45:51 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB


+------------------------------------+--------------+-------------------+--------------------+-----------------+----------+--------+---------+
|accountId                           |tradeFrequency|meanTradeValue     |netSumTradeValue    |absSumTradeValue |activeDays|nBuyDays|nSellDays|
+------------------------------------+--------------+-------------------+--------------------+-----------------+----------+--------+---------+
|00066819-D00A-433B-9638-13C5933EC09F|7             |0.4669592285714286 |-0.7658454          |3.2687146        |6         |4       |2        |
|000E126E-9959-4796-A329-C9839A3C0FED|5             |0.45314329999999997|0.043616500000000086|2.2657165        |5         |3       |2        |
|00187BB3-23F8-4225-A743-1E4C5015024E|34            |2.2187337499999997 |-58.936947499999995 |75.43694749999999|6         |5       |1        |
+------------------------------------+--------------+-------------------+--------------------+-----------------+----------+--------+---------+

In [60]:
(
    trade_kpi_df
    .agg(
        F.round(F.expr('percentile(tradeFrequency, array(0.25))')[0], 2).alias('25% percentile'),
        F.round(F.expr('percentile(tradeFrequency, array(0.5))')[0], 2).alias('50% percentile'),
        F.round(F.mean('tradeFrequency'), 2).alias('mean'),
        F.round(F.expr('percentile(tradeFrequency, array(0.75))')[0], 2).alias('75% percentile'),
        F.round(F.expr('percentile(tradeFrequency, array(0.9))')[0], 2).alias('90% percentile'),
        F.round(F.expr('percentile(tradeFrequency, array(0.99))')[0], 2).alias('99% percentile'),
        F.round(F.expr('percentile(tradeFrequency, array(0.999))')[0], 2).alias('99.9% percentile'),
    )
    .show()
)

22/02/22 17:45:52 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:45:55 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB


+--------------+--------------+-----+--------------+--------------+--------------+----------------+
|25% percentile|50% percentile| mean|75% percentile|90% percentile|99% percentile|99.9% percentile|
+--------------+--------------+-----+--------------+--------------+--------------+----------------+
|           5.0|          10.0|71.13|          36.0|         125.0|         973.0|          4172.7|
+--------------+--------------+-----+--------------+--------------+--------------+----------------+



                                                                                

In [61]:
(
    trade_kpi_df
    .agg(
        F.round(F.expr('percentile(meanTradeValue, array(0.25))')[0], 2).alias('25% percentile'),
        F.round(F.expr('percentile(meanTradeValue, array(0.5))')[0], 2).alias('50% percentile'),
        F.round(F.mean('meanTradeValue'), 2).alias('mean'),
        F.round(F.expr('percentile(meanTradeValue, array(0.75))')[0], 2).alias('75% percentile'),
        F.round(F.expr('percentile(meanTradeValue, array(0.9))')[0], 2).alias('90% percentile'),
        F.round(F.expr('percentile(meanTradeValue, array(0.99))')[0], 2).alias('99% percentile'),
        F.round(F.expr('percentile(meanTradeValue, array(0.999))')[0], 2).alias('99.9% percentile'),
    )
    .show()
)

22/02/22 17:45:56 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:45:58 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
[Stage 1506:>                                                       (0 + 1) / 1]

+--------------+--------------+----+--------------+--------------+--------------+----------------+
|25% percentile|50% percentile|mean|75% percentile|90% percentile|99% percentile|99.9% percentile|
+--------------+--------------+----+--------------+--------------+--------------+----------------+
|          0.42|          0.71|3.48|          1.76|          3.62|         12.21|           32.28|
+--------------+--------------+----+--------------+--------------+--------------+----------------+



                                                                                

In [62]:
(
    trade_kpi_df
    .agg(
        F.round(F.expr('percentile(netSumTradeValue, array(0.25))')[0], 2).alias('25% percentile'),
        F.round(F.expr('percentile(netSumTradeValue, array(0.5))')[0], 2).alias('50% percentile'),
        F.round(F.mean('netSumTradeValue'), 2).alias('mean'),
        F.round(F.expr('percentile(netSumTradeValue, array(0.75))')[0], 2).alias('75% percentile'),
        F.round(F.expr('percentile(netSumTradeValue, array(0.9))')[0], 2).alias('90% percentile'),
        F.round(F.expr('percentile(netSumTradeValue, array(0.99))')[0], 2).alias('99% percentile'),
        F.round(F.expr('percentile(netSumTradeValue, array(0.999))')[0], 2).alias('99.9% percentile'),
    )
    .show()
)

22/02/22 17:46:14 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:46:16 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
[Stage 1533:>                                                       (0 + 1) / 1]

+--------------+--------------+----+--------------+--------------+--------------+----------------+
|25% percentile|50% percentile|mean|75% percentile|90% percentile|99% percentile|99.9% percentile|
+--------------+--------------+----+--------------+--------------+--------------+----------------+
|         -2.56|         -0.67| 0.0|          0.43|          8.89|        160.36|         1738.53|
+--------------+--------------+----+--------------+--------------+--------------+----------------+



                                                                                

In [63]:
(
    trade_kpi_df
    .agg(
        F.round(F.expr('percentile(absSumTradeValue, array(0.25))')[0], 2).alias('25% percentile'),
        F.round(F.expr('percentile(absSumTradeValue, array(0.5))')[0], 2).alias('50% percentile'),
        F.round(F.mean('absSumTradeValue'), 2).alias('mean'),
        F.round(F.expr('percentile(absSumTradeValue, array(0.75))')[0], 2).alias('75% percentile'),
        F.round(F.expr('percentile(absSumTradeValue, array(0.9))')[0], 2).alias('90% percentile'),
        F.round(F.expr('percentile(absSumTradeValue, array(0.99))')[0], 2).alias('99% percentile'),
        F.round(F.expr('percentile(absSumTradeValue, array(0.999))')[0], 2).alias('99.9% percentile'),
    )
    .show()
)

22/02/22 17:46:33 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:46:35 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
[Stage 1560:>                                                       (0 + 1) / 1]

+--------------+--------------+------+--------------+--------------+--------------+----------------+
|25% percentile|50% percentile|  mean|75% percentile|90% percentile|99% percentile|99.9% percentile|
+--------------+--------------+------+--------------+--------------+--------------+----------------+
|          2.93|          6.25|240.49|         45.11|        257.59|       3311.74|        22552.29|
+--------------+--------------+------+--------------+--------------+--------------+----------------+



                                                                                

In [64]:
(
    trade_kpi_df
    .agg(
        F.round(F.expr('percentile(activeDays, array(0.25))')[0], 2).alias('25% percentile'),
        F.round(F.expr('percentile(activeDays, array(0.5))')[0], 2).alias('50% percentile'),
        F.round(F.mean('activeDays'), 2).alias('mean'),
        F.round(F.expr('percentile(activeDays, array(0.75))')[0], 2).alias('75% percentile'),
        F.round(F.expr('percentile(activeDays, array(0.9))')[0], 2).alias('90% percentile'),
        F.round(F.expr('percentile(activeDays, array(0.99))')[0], 2).alias('99% percentile'),
        F.round(F.expr('percentile(activeDays, array(0.999))')[0], 2).alias('99.9% percentile'),
    )
    .show()
)

22/02/22 17:46:50 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:46:52 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB


+--------------+--------------+-----+--------------+--------------+--------------+----------------+
|25% percentile|50% percentile| mean|75% percentile|90% percentile|99% percentile|99.9% percentile|
+--------------+--------------+-----+--------------+--------------+--------------+----------------+
|           3.0|           7.0|10.22|          13.0|          25.0|          49.0|            55.0|
+--------------+--------------+-----+--------------+--------------+--------------+----------------+



                                                                                

In [65]:
(
    trade_kpi_df
    .agg(
        F.round(F.expr('percentile(nBuyDays, array(0.25))')[0], 2).alias('25% percentile'),
        F.round(F.expr('percentile(nBuyDays, array(0.5))')[0], 2).alias('50% percentile'),
        F.round(F.mean('nBuyDays'), 2).alias('mean'),
        F.round(F.expr('percentile(nBuyDays, array(0.75))')[0], 2).alias('75% percentile'),
        F.round(F.expr('percentile(nBuyDays, array(0.9))')[0], 2).alias('90% percentile'),
        F.round(F.expr('percentile(nBuyDays, array(0.99))')[0], 2).alias('99% percentile'),
        F.round(F.expr('percentile(nBuyDays, array(0.999))')[0], 2).alias('99.9% percentile'),
    )
    .show()
)

22/02/22 17:46:53 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:46:55 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB


+--------------+--------------+----+--------------+--------------+--------------+----------------+
|25% percentile|50% percentile|mean|75% percentile|90% percentile|99% percentile|99.9% percentile|
+--------------+--------------+----+--------------+--------------+--------------+----------------+
|           2.0|           4.0|5.66|           7.0|          14.0|          27.0|            35.0|
+--------------+--------------+----+--------------+--------------+--------------+----------------+



                                                                                

In [66]:
(
    trade_kpi_df
    .agg(
        F.round(F.expr('percentile(nSellDays, array(0.25))')[0], 2).alias('25% percentile'),
        F.round(F.expr('percentile(nSellDays, array(0.5))')[0], 2).alias('50% percentile'),
        F.round(F.mean('nSellDays'), 2).alias('mean'),
        F.round(F.expr('percentile(nSellDays, array(0.75))')[0], 2).alias('75% percentile'),
        F.round(F.expr('percentile(nSellDays, array(0.9))')[0], 2).alias('90% percentile'),
        F.round(F.expr('percentile(nSellDays, array(0.99))')[0], 2).alias('99% percentile'),
        F.round(F.expr('percentile(nSellDays, array(0.999))')[0], 2).alias('99.9% percentile'),
    )
    .show()
)

22/02/22 17:46:57 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:46:58 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB


+--------------+--------------+----+--------------+--------------+--------------+----------------+
|25% percentile|50% percentile|mean|75% percentile|90% percentile|99% percentile|99.9% percentile|
+--------------+--------------+----+--------------+--------------+--------------+----------------+
|           1.0|           3.0|4.56|           6.0|          12.0|          25.0|            34.0|
+--------------+--------------+----+--------------+--------------+--------------+----------------+



                                                                                

In [67]:
print(trade_kpi_df.count() - trade_kpi_df.dropna().count())

22/02/22 17:46:59 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:47:02 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB

0


                                                                                

In [68]:
trade_output_df = (
    trade_kpi_df
    .join(return_df.select('accountId', 'return').dropDuplicates(), on = ['accountId'])
    .dropna()
    .withColumn('tradeFrequencyDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('tradeFrequency')))
    .withColumn('meanTradeValueDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('meanTradeValue')))
    .withColumn('netSumTradeValueDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('netSumTradeValue')))
    .withColumn('absSumTradeValueDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('absSumTradeValue')))
    .withColumn('activeDaysDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('activeDays')))
    .withColumn('nBuyDaysDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('nBuyDays')))
    .withColumn('nSellDaysDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('nSellDays')))
)

display_df(trade_output_df)

22/02/22 17:47:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:47:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:47:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:47:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:47:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:47:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 1

727991


22/02/22 17:47:41 WARN DAGScheduler: Broadcasting large task binary with size 36.2 MiB


+------------------------------------+--------------+--------------+----------------+----------------+----------+--------+---------+------------------+--------------------+--------------------+----------------------+----------------------+----------------+--------------+---------------+
|accountId                           |tradeFrequency|meanTradeValue|netSumTradeValue|absSumTradeValue|activeDays|nBuyDays|nSellDays|return            |tradeFrequencyDecile|meanTradeValueDecile|netSumTradeValueDecile|absSumTradeValueDecile|activeDaysDecile|nBuyDaysDecile|nSellDaysDecile|
+------------------------------------+--------------+--------------+----------------+----------------+----------+--------+---------+------------------+--------------------+--------------------+----------------------+----------------------+----------------+--------------+---------------+
|F323D25A-C1B3-44FA-A147-ED2AA4466413|2             |0.012146      |0.0             |0.024292        |1         |0       |0        |0.0 

                                                                                

In [69]:
print(round(trade_output_df.filter(F.col('return') >= 0.35).count() / trade_output_df.count() ,2))

22/02/22 17:47:49 WARN DAGScheduler: Broadcasting large task binary with size 36.2 MiB
22/02/22 17:47:57 WARN DAGScheduler: Broadcasting large task binary with size 36.2 MiB
[Stage 1831:>                                                       (0 + 1) / 1]

0.1


                                                                                

In [70]:
(
    trade_output_df
    .agg(
        F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianReturn')
    )
    .show()
)

22/02/22 17:48:05 WARN DAGScheduler: Broadcasting large task binary with size 36.2 MiB
[Stage 1865:>                                                       (0 + 1) / 1]

+------------+
|medianReturn|
+------------+
|        0.15|
+------------+



                                                                                

In [71]:
(
    trade_output_df
    .groupBy('tradeFrequencyDecile')
    .agg(
        F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianReturn')
    )
    .orderBy('tradeFrequencyDecile')
    .show()
)

22/02/22 17:48:14 WARN DAGScheduler: Broadcasting large task binary with size 36.2 MiB
[Stage 1899:>                                                       (0 + 1) / 1]

+--------------------+------------+
|tradeFrequencyDecile|medianReturn|
+--------------------+------------+
|                   1|       0.093|
|                   2|        0.17|
|                   3|        0.18|
|                   4|       0.187|
|                   5|       0.182|
|                   6|       0.164|
|                   7|        0.13|
|                   8|       0.108|
|                   9|       0.096|
|                  10|       0.082|
+--------------------+------------+



                                                                                

In [72]:
trade_output_df.write.mode('overwrite').parquet('/home/user1/Data/Esmaeil/trade_output.parquet')

22/02/22 17:48:20 WARN DAGScheduler: Broadcasting large task binary with size 36.4 MiB
                                                                                

### identify block holders

In [73]:
bh_df = (
    daily_portfolio_df
    .select('date', 'symbol', 'accountId', 'heldShares')
    .join(price_df.select('date', 'symbol', 'shrout'), on = ['date', 'symbol'])
    .withColumn('ownership', F.col('heldShares') / F.col('shrout'))
    .filter( (F.col('ownership') >= 0.01) & F.col('ownership').isNotNull() )
    .select('accountId')
    .distinct()
    .withColumn('isBH', F.lit(1))
)

display_df(bh_df)

22/02/22 17:48:27 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:48:27 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:48:40 WARN DAGScheduler: Broadcasting large task binary with size 35.9 MiB
22/02/22 17:48:47 WARN DAGScheduler: Broadcasting large task binary with size 35.9 MiB
                                                                                

4470


22/02/22 17:48:53 WARN DAGScheduler: Broadcasting large task binary with size 35.9 MiB


+------------------------------------+----+
|accountId                           |isBH|
+------------------------------------+----+
|ADB8AE92-2497-4A8E-BDE9-B4B3CE40F1BF|1   |
|6BE23C8C-C5EA-4535-BC26-00ED4DDD036F|1   |
|3C20A6C2-BE54-4E86-A94D-C038F59EF6C6|1   |
+------------------------------------+----+
only showing top 3 rows



                                                                                

In [74]:
bh_output_df = (
    return_df
    .select('accountId', 'return')
    .dropna()
    .join(bh_df, on = 'accountId', how = 'left')
    .fillna(0, 'isBH')
)

display_df(bh_output_df)

22/02/22 17:49:13 WARN DAGScheduler: Broadcasting large task binary with size 35.9 MiB
22/02/22 17:49:16 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
                                                                                

4362652


22/02/22 17:49:27 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB


+------------------------------------+-------------------+----+
|accountId                           |return             |isBH|
+------------------------------------+-------------------+----+
|92C26116-9961-4133-BAD7-793AA7BEF8F1|-0.8483243103882736|1   |
|D227A812-A177-4D00-8C99-2F95F6229465|-0.64284177040449  |1   |
|FD0C12C3-368D-4EF8-A6C5-DA93CD0DF1BC|-0.64284177040449  |1   |
+------------------------------------+-------------------+----+
only showing top 3 rows



                                                                                

In [75]:
(
    bh_output_df
    .groupBy('isBH')
    .agg(
        F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianTradeFrequency')
    )
    .show()
)

22/02/22 17:49:36 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
[Stage 2084:>                                                       (0 + 1) / 1]

+----+--------------------+
|isBH|medianTradeFrequency|
+----+--------------------+
|   1|               0.275|
|   0|               0.286|
+----+--------------------+



                                                                                

In [76]:
bh_output_df.write.mode('overwrite').parquet('/home/user1/Data/Esmaeil/bhOutput.parquet')

22/02/22 17:49:50 WARN DAGScheduler: Broadcasting large task binary with size 36.3 MiB
                                                                                

### number of stocks within initial portfolio

In [77]:
n_stocks_within_initial_portfolio_df = (
    portfolio_df
    .groupBy('accountId')
    .agg(
        F.count(F.lit(1)).alias('nStocksWithinInitialPortfolio')
    )
    .dropna()
)

display_df(n_stocks_within_initial_portfolio_df)

22/02/22 17:49:56 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:50:04 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
                                                                                

4216142


22/02/22 17:50:07 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB


+------------------------------------+-----------------------------+
|accountId                           |nStocksWithinInitialPortfolio|
+------------------------------------+-----------------------------+
|59EAFFF2-1231-40B6-8D0F-1742E1596108|1                            |
|87C3D7FD-E2A3-4E03-B2E6-161672D2C9ED|1                            |
|16A0B31E-95A7-41F4-A98B-E8EFC4EDC514|1                            |
+------------------------------------+-----------------------------+
only showing top 3 rows



In [78]:
(
    n_stocks_within_initial_portfolio_df
    .agg(
        F.expr('percentile(nStocksWithinInitialPortfolio, array(0.25))')[0].alias('25%'),
        F.expr('percentile(nStocksWithinInitialPortfolio, array(0.50))')[0].alias('50%'),
        F.round(F.mean('nStocksWithinInitialPortfolio'), 4).alias('mean'),
        F.expr('percentile(nStocksWithinInitialPortfolio, array(0.75))')[0].alias('75%'),
        F.expr('percentile(nStocksWithinInitialPortfolio, array(0.9))')[0].alias('90%'),
        F.expr('percentile(nStocksWithinInitialPortfolio, array(0.99))')[0].alias('99%'),
        F.expr('percentile(nStocksWithinInitialPortfolio, array(0.999))')[0].alias('99.9%'),
    )
    .show()
)

22/02/22 17:50:09 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 17:50:11 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB


+---+---+------+---+---+----+-----+
|25%|50%|  mean|75%|90%| 99%|99.9%|
+---+---+------+---+---+----+-----+
|1.0|1.0|2.0799|2.0|4.0|15.0| 45.0|
+---+---+------+---+---+----+-----+



                                                                                

In [79]:
n_stocks_within_initial_portfolio_output_df = (
    return_df
    .select('accountId', 'return')
    .dropna()
    .join(n_stocks_within_initial_portfolio_df, on = 'accountId', how = 'inner')
    .withColumn('nStocksWithinInitialPortfolioDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('nStocksWithinInitialPortfolio')))
)

display_df(n_stocks_within_initial_portfolio_output_df)

22/02/22 17:50:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:50:21 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
22/02/22 17:50:25 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
22/02/22 17:50:29 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
                                                                                

4215840


22/02/22 17:50:43 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB


+------------------------------------+-------------------+-----------------------------+-----------------------------------+
|accountId                           |return             |nStocksWithinInitialPortfolio|nStocksWithinInitialPortfolioDecile|
+------------------------------------+-------------------+-----------------------------+-----------------------------------+
|00019AFD-C89B-4B63-98BB-18BF5A112C6F|0.07377049180327866|1                            |1                                  |
|000217BA-3C48-4458-8CAA-691CA19C7187|0.3664259927797835 |1                            |1                                  |
|000249D5-649B-4C99-AE9E-82AB979E80C9|0.07377049180327866|1                            |1                                  |
+------------------------------------+-------------------+-----------------------------+-----------------------------------+
only showing top 3 rows



                                                                                

In [80]:
(
    n_stocks_within_initial_portfolio_output_df
    .groupBy('nStocksWithinInitialPortfolioDecile')
    .agg(
        F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianReturn')
    )
    .orderBy('nStocksWithinInitialPortfolioDecile')
    .show()
)

22/02/22 17:50:52 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
[Stage 2224:>                                                       (0 + 1) / 1]

+-----------------------------------+------------+
|nStocksWithinInitialPortfolioDecile|medianReturn|
+-----------------------------------+------------+
|                                  1|       0.366|
|                                  2|       0.366|
|                                  3|       0.366|
|                                  4|       0.366|
|                                  5|       0.366|
|                                  6|       0.366|
|                                  7|       0.366|
|                                  8|       0.343|
|                                  9|       0.268|
|                                 10|       0.222|
+-----------------------------------+------------+



                                                                                

In [81]:
n_stocks_within_initial_portfolio_output_df.write.mode('overwrite').parquet('/home/user1/Data/Esmaeil/n_initial_portfolio.parquet')

22/02/22 17:51:01 WARN DAGScheduler: Broadcasting large task binary with size 36.3 MiB
                                                                                

### number of stocks within final portfolio

In [82]:
n_stocks_within_final_portfolio_df = (
    daily_portfolio_df
    .withColumn('rowNumber', F.row_number().over(Window.partitionBy('accountId', 'symbol').orderBy('date')))
    .withColumn('maxRowNumber', F.max('rowNumber').over(Window.partitionBy('accountId', 'symbol')))
    .filter(F.col('rowNumber') == F.col('maxRowNumber'))
    .filter(F.col('heldShares') > 0)
    .withColumn('date', F.lit(MAX_PRICE_DATE))
    .join(price_df.select('date', 'symbol', 'close_price'), on = ['date', 'symbol'], how = 'left')
    .dropna(subset = ['close_price'])
    .groupBy('accountId')
    .agg(
        F.countDistinct('symbol').alias('nStocksWithinFinalPortfolio')
    )   
)

display_df(n_stocks_within_final_portfolio_df)

22/02/22 17:51:11 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:51:11 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:51:23 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
22/02/22 17:51:33 WARN DAGScheduler: Broadcasting large task binary with size 35.9 MiB
                                                                                

4313618


22/02/22 17:51:39 WARN DAGScheduler: Broadcasting large task binary with size 35.9 MiB
[Stage 2308:>                                                       (0 + 1) / 1]

+------------------------------------+---------------------------+
|accountId                           |nStocksWithinFinalPortfolio|
+------------------------------------+---------------------------+
|5B291534-2FE5-4A4A-B75B-B4D44060A1AF|14                         |
|0F6CC9D2-EF36-4537-BEBF-32F9386A468B|56                         |
|33046B40-C4EB-4CEE-974F-D6F1185F3434|7                          |
+------------------------------------+---------------------------+
only showing top 3 rows



                                                                                

In [83]:
n_stocks_within_final_portfolio_output_df = (
    return_df
    .select('accountId', 'return')
    .dropna()
    .join(n_stocks_within_final_portfolio_df, on = 'accountId', how = 'inner')
    .withColumn('nStocksWithinFinalPortfolioDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy('nStocksWithinFinalPortfolio')))
)

display_df(n_stocks_within_final_portfolio_output_df)

22/02/22 17:51:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:51:55 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
22/02/22 17:51:58 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
22/02/22 17:52:01 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
                                                                                

4313613


22/02/22 17:52:16 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB


+------------------------------------+-------------------+---------------------------+---------------------------------+
|accountId                           |return             |nStocksWithinFinalPortfolio|nStocksWithinFinalPortfolioDecile|
+------------------------------------+-------------------+---------------------------+---------------------------------+
|00019AFD-C89B-4B63-98BB-18BF5A112C6F|0.07377049180327866|1                          |1                                |
|000217BA-3C48-4458-8CAA-691CA19C7187|0.3664259927797835 |1                          |1                                |
|000249D5-649B-4C99-AE9E-82AB979E80C9|0.07377049180327866|1                          |1                                |
+------------------------------------+-------------------+---------------------------+---------------------------------+
only showing top 3 rows



In [84]:
(
    n_stocks_within_final_portfolio_output_df
    .groupBy('nStocksWithinFinalPortfolioDecile')
    .agg(
        F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianReturn')
    )
    .orderBy('nStocksWithinFinalPortfolioDecile')
    .show()
)

22/02/22 17:52:24 WARN DAGScheduler: Broadcasting large task binary with size 36.2 MiB
[Stage 2401:>                                                       (0 + 1) / 1]

+---------------------------------+------------+
|nStocksWithinFinalPortfolioDecile|medianReturn|
+---------------------------------+------------+
|                                1|       0.366|
|                                2|       0.366|
|                                3|       0.366|
|                                4|       0.366|
|                                5|       0.366|
|                                6|       0.366|
|                                7|        0.35|
|                                8|       0.314|
|                                9|       0.264|
|                               10|       0.197|
+---------------------------------+------------+



                                                                                

In [85]:
n_stocks_within_final_portfolio_output_df.write.mode('overwrite').parquet('/home/user1/Data/Esmaeil/n_final_portfolio.parquet')

22/02/22 17:52:35 WARN DAGScheduler: Broadcasting large task binary with size 36.3 MiB
                                                                                

### turnover

In [86]:
turnover_df = (
    trade_kpi_df
    .join(final_portfolio_value_df, on =['accountId'], how = 'left')
    .withColumn('turnover', F.col('absSumTradeValue') / F.col('finalPortfolioValue'))
    .join(return_df.select('accountId', 'return'), on = 'accountId')
    .withColumn('turnoverDecile', F.ntile(N_QUANTILES).over(Window.partitionBy().orderBy(F.col('turnover'))))
    .select(
        'accountId',
        'turnover',
        'turnoverDecile',
        'return'
    )
)

display_df(turnover_df)

22/02/22 17:52:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/22 17:52:51 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
22/02/22 17:52:55 WARN DAGScheduler: Broadcasting large task binary with size 36.2 MiB
22/02/22 17:52:59 WARN DAGScheduler: Broadcasting large task binary with size 36.2 MiB
                                                                                

727991


22/02/22 17:53:06 WARN DAGScheduler: Broadcasting large task binary with size 36.2 MiB
[Stage 2500:>                                                       (0 + 1) / 1]

+------------------------------------+--------+--------------+-------------------+
|accountId                           |turnover|turnoverDecile|return             |
+------------------------------------+--------+--------------+-------------------+
|010B09FF-38E2-4440-9E3D-460E44548D88|null    |1             |0.183003099402258  |
|029D5E79-ABF9-4EBE-B113-F43D9245588E|null    |1             |0.19860501119716534|
|042E34A4-2946-4DB8-8AA0-E0787EC0F3A5|null    |1             |0.0                |
+------------------------------------+--------+--------------+-------------------+
only showing top 3 rows



                                                                                

In [87]:
(
    turnover_df
    .groupBy('turnoverDecile')
    .agg(
        F.round(F.expr('percentile(return, array(0.5))')[0], 3).alias('medianReturn')
    )
    .orderBy('turnoverDecile')
    .show()
)

22/02/22 17:53:16 WARN DAGScheduler: Broadcasting large task binary with size 36.2 MiB
[Stage 2534:>                                                       (0 + 1) / 1]

+--------------+------------+
|turnoverDecile|medianReturn|
+--------------+------------+
|             1|       0.163|
|             2|       0.295|
|             3|        0.17|
|             4|       0.093|
|             5|       0.156|
|             6|       0.142|
|             7|       0.168|
|             8|       0.152|
|             9|       0.089|
|            10|       0.082|
+--------------+------------+



                                                                                

In [88]:
turnover_df.write.mode('overwrite').parquet('/home/user1/Data/Esmaeil/turnover.parquet')

22/02/22 17:53:23 WARN DAGScheduler: Broadcasting large task binary with size 36.4 MiB
                                                                                

### time series of the number of stocks within portfolio

In [89]:
dates_list = (
    price_df
    .select('date')
    .distinct()
    .orderBy('date')
    .rdd.flatMap(lambda x: x).collect()
)

print(dates_list)

22/02/22 17:53:25 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:53:29 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:53:30 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
22/02/22 17:53:32 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB


[13980105, 13980106, 13980107, 13980110, 13980111, 13980117, 13980118, 13980119, 13980120, 13980121, 13980124, 13980125, 13980126, 13980127, 13980128, 13980131, 13980202, 13980203, 13980204, 13980207, 13980208, 13980209, 13980210, 13980211, 13980214, 13980215, 13980216, 13980217, 13980218, 13980221, 13980222, 13980223, 13980224, 13980225, 13980228, 13980229, 13980230, 13980231, 13980301, 13980304, 13980305, 13980307, 13980308, 13980311, 13980312, 13980313, 13980318, 13980319, 13980320, 13980321, 13980322, 13980325, 13980326, 13980327, 13980328, 13980329]


In [90]:
nStocksWithinPortfolioOfAllInvestors = []
nInvestors = []

for date in dates_list:
    print(date)
    result = (
        daily_portfolio_df
        .filter(F.col('date') <= date)
        .withColumn('rowNumber', F.row_number().over(Window.partitionBy('accountId', 'symbol').orderBy('date')))
        .withColumn('maxRowNumber', F.max('rowNumber').over(Window.partitionBy('accountId', 'symbol')))
        .filter(F.col('rowNumber') == F.col('maxRowNumber'))
        .filter( (F.col('heldShares') > 0) & (F.col('heldShares').isNotNull()) )
        .groupBy('accountId')
        .agg(
            F.count(F.lit(1)).alias('nStocksWithinPortfolioOfAllInvestors'),
        )
        .agg(
            F.round(F.mean('nStocksWithinPortfolioOfAllInvestors'), 3).alias('nStocksWithinPortfolioOfAllInvestors'),
            F.count(F.lit(1)).alias('nInvestors')
        )
    )
    nStocksWithinPortfolioOfAllInvestors.append(result.collect()[0][0])
    nInvestors.append(result.collect()[0][1])

13980105


22/02/22 17:53:36 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:53:46 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

13980106


22/02/22 17:53:52 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:54:02 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

13980107


22/02/22 17:54:08 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:54:17 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

13980110


22/02/22 17:54:22 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:54:32 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

13980111


22/02/22 17:54:38 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:54:47 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

13980117


22/02/22 17:54:54 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:55:04 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

13980118


22/02/22 17:55:10 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:55:20 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

13980119


22/02/22 17:55:26 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:55:35 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

13980120


22/02/22 17:55:41 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:55:51 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

13980121


22/02/22 17:55:56 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:56:06 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

13980124


22/02/22 17:56:12 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:56:22 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

13980125


22/02/22 17:56:28 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
22/02/22 17:56:38 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
                                                                                

13980126


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/user1/.local/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/user1/.local/lib/python3.8/site-packages/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt
22/02/22 17:56:44 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB

KeyboardInterrupt: 

22/02/22 17:57:05 WARN DAGScheduler: Broadcasting large task binary with size 18.1 MiB
Exception in thread "serve-DataFrame" java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:458)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)


In [None]:
n_stocks_df = spark.createDataFrame(
    pd.DataFrame({
        'date' : dates_list,
        'nStocksWithinPortfolioOfAllInvestors' : nStocksWithinPortfolioOfAllInvestors,
        'nInvestors' : nInvestors
    })
)

display_df(n_stocks_df)

In [None]:
n_stocks_df.write.mode('overwrite').parquet('/home/user1/Data/Esmaeil/mean_number_of_stocks_within_portfolio.parquet')

In [100]:
initial_ids =  [row['accountId'] for row in portfolio_df.select('accountId').distinct().collect()]
initial_ids = set(initial_ids)

22/02/22 18:05:30 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
22/02/22 18:05:38 WARN DAGScheduler: Broadcasting large task binary with size 18.0 MiB
                                                                                

In [109]:
unique_id_trade = flat_trade_df.dropDuplicates(subset=['accountId','date'])
unique_id_trade.count()
result = {}
for date in dates_list:
    print(initial_ids)
    tempt = unique_id_trade.filter(F.col('date') == date).select('accountId').distinct().collect()
    teades_ids = set([row['accountId'] for row in tempt])
    result[date] = len(teades_ids -initial_ids )
    
    initial_ids =  set.union(initial_ids, teades_ids)

22/02/22 18:17:09 WARN DAGScheduler: Broadcasting large task binary with size 17.9 MiB
                                                                                