In [1]:
import pyspark
import os
import sys
from pyspark import SparkContext
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
import numpy as np
import pyspark.sql.functions as fun
spark = SparkSession.builder.config("spark.driver.memory", "16g").appName('chapter_8').getOrCreate()

In [2]:
stocks = spark.read.option("header", "true").option("inferSchema", "true").csv("Datasets/stocks/AAME.csv")
stocks.show(2)

+---------+----+----+----+-----+------+
|     Date|Open|High| Low|Close|Volume|
+---------+----+----+----+-----+------+
|31-Dec-13|4.04|4.13|3.96| 4.09| 30735|
|30-Dec-13|4.05|4.05|3.84|  3.9| 14646|
+---------+----+----+----+-----+------+
only showing top 2 rows



In [4]:
params = {}
for col_name in stocks.columns:
    if col_name != "Date" and col_name != "Symbol":
        std_dev_val = stocks.select(fun.stddev(col_name)).collect()[0][0]
        mean_val = stocks.select(fun.mean(col_name)).collect()[0][0]
        params[col_name] = {'mean':mean_val,'std_dev':std_dev_val}

for col_name,val in params.items():
    print(f"column : {col_name}, mean:{val['mean']}, std_dev_val:{val['std_dev']}")


column : Open, mean:2.3776905972045754, std_dev_val:0.8594838332913094
column : High, mean:2.445060355781448, std_dev_val:0.855811640978281
column : Low, mean:2.3282719186785252, std_dev_val:0.8412114675601051
column : Close, mean:2.3474244152880743, std_dev_val:0.8579680474599332
column : Volume, mean:6616.119794637764, std_dev_val:14076.882853421579


In [5]:
num_simulations = 1000
mean_open = 2.479037434833667
std_dev_open = 1.792742719789236
mean_close = 3.5762375610480097
std_dev_close = 2.3617920735737727
mean_high = 3.6387547097464177
std_dev_high = 2.3921509846515936
mean_low = 3.512713408899274
std_dev_low = 102.3491839152505065
mean_vol=7977.757971875619
std_dev_vol=16239.129044271978

@udf(FloatType())

def generate_random_open():
    return np.random.normal(mean_open,std_dev_open)

@udf(FloatType())
def generate_random_close():
    return np.random.normal(mean_close, std_dev_close)

@udf(FloatType())
def generate_random_high():
    return np.random.normal(mean_high, std_dev_high)

@udf(FloatType())
def generate_random_low():
    return np.random.normal(mean_low, std_dev_low)

@udf(FloatType())
def generate_random_vol():
  return np.random.normal(mean_vol,std_dev_vol)

simulated_prices_df = stocks.select(
    "*",
    generate_random_open().alias("simulated_open"),
    generate_random_close().alias("simulated_close"),
    generate_random_high().alias("simulated_high"),
    generate_random_low().alias("simulated_low"),
    generate_random_vol().alias("simulated_vol")
)

# Show the results
simulated_prices_df.show()


+---------+----+----+----+-----+------+--------------+---------------+--------------+-------------+-------------+
|     Date|Open|High| Low|Close|Volume|simulated_open|simulated_close|simulated_high|simulated_low|simulated_vol|
+---------+----+----+----+-----+------+--------------+---------------+--------------+-------------+-------------+
|31-Dec-13|4.04|4.13|3.96| 4.09| 30735|     4.0679626|      2.4697208|     0.7865059|    159.87758|    11601.194|
|30-Dec-13|4.05|4.05|3.84|  3.9| 14646|      5.104902|      6.1638117|     6.2224193|   -44.121853|   -12360.729|
|27-Dec-13|4.02|4.05|3.99| 4.05|  5047|      5.202862|      3.8933122|     0.5958919|    -118.4245|     3247.335|
|26-Dec-13|3.99|4.04|3.70| 4.01|  6309|      2.545513|      4.4349184|    0.72486526|   -7.7052746|    6412.2915|
|24-Dec-13|3.90|3.97|3.84| 3.95| 13592|      2.435408|      0.6400224|     1.1327266|    171.63959|   -1541.5275|
|23-Dec-13|3.96|3.97|3.64| 3.92| 29745|     2.7796378|      3.1084414|     3.1870482|   