1) Implement a PySpark script that runs Monte Carlo simulations in parallel.

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import random

def monte_carlo_simulation(rows):
    num_simulations = 1000
    results = []

    for _ in range(num_simulations):

        simulation_result = random.uniform(0, 1)
        results.append(simulation_result)

    return [(*row, results) for row in rows]

spark = SparkSession.builder.appName('MonteCarlo').getOrCreate()

df = spark.read.csv(['/home/lplab/Desktop/Jayasuryan_BDA/stocks/ABAX.csv',
                     '/home/lplab/Desktop/Jayasuryan_BDA/stocks/AAME.csv',
                     '/home/lplab/Desktop/Jayasuryan_BDA/stocks/AEPI.csv'],
                     inferSchema=True, header=True)

df = df.withColumn("source", lit("ABAX")) \
       .union(df.withColumn("source", lit("AAME"))) \
       .union(df.withColumn("source", lit("AEPI")))

simulation_results = df.rdd \
                       .map(lambda row: (row['source'], row)) \
                       .groupByKey() \
                       .flatMapValues(monte_carlo_simulation)



simulation_results_df = simulation_results.map(lambda x: (x[0], *x[1])).toDF(["source", "Date", "Open", "High", "Low", "Close", "Volume", "SimulationResults"])

abax_forecast = simulation_results_df.filter(simulation_results_df['source'] == 'ABAX')
aame_forecast = simulation_results_df.filter(simulation_results_df['source'] == 'AAME')
aepi_forecast = simulation_results_df.filter(simulation_results_df['source'] == 'AEPI')

print("ABAX Forecast:")
abax_forecast.show(5)

print("AAME Forecast:")
aame_forecast.show(5)

print("AEPI Forecast:")
aepi_forecast.show(5)


ABAX Forecast:


                                                                                

+------+---------+-----+-----+-----+-----+------+-----------------+--------------------+
|source|     Date| Open| High|  Low|Close|Volume|SimulationResults|                  _9|
+------+---------+-----+-----+-----+-----+------+-----------------+--------------------+
|  ABAX|31-Dec-13|52.94|54.37|52.25|52.83| 79429|             ABAX|[0.92711389713746...|
|  ABAX|30-Dec-13|50.36|54.10|50.36|52.95|131095|             ABAX|[0.92711389713746...|
|  ABAX|27-Dec-13|50.38|50.80|49.67|50.52| 54354|             ABAX|[0.92711389713746...|
|  ABAX|26-Dec-13|50.50|51.19|49.67| 50.0| 74414|             ABAX|[0.92711389713746...|
|  ABAX|24-Dec-13|49.85|50.60|49.66|49.99| 36872|             ABAX|[0.92711389713746...|
+------+---------+-----+-----+-----+-----+------+-----------------+--------------------+
only showing top 5 rows

AAME Forecast:


                                                                                

+------+---------+-----+-----+-----+-----+------+-----------------+--------------------+
|source|     Date| Open| High|  Low|Close|Volume|SimulationResults|                  _9|
+------+---------+-----+-----+-----+-----+------+-----------------+--------------------+
|  AAME|31-Dec-13|52.94|54.37|52.25|52.83| 79429|             AAME|[0.20106634806985...|
|  AAME|30-Dec-13|50.36|54.10|50.36|52.95|131095|             AAME|[0.20106634806985...|
|  AAME|27-Dec-13|50.38|50.80|49.67|50.52| 54354|             AAME|[0.20106634806985...|
|  AAME|26-Dec-13|50.50|51.19|49.67| 50.0| 74414|             AAME|[0.20106634806985...|
|  AAME|24-Dec-13|49.85|50.60|49.66|49.99| 36872|             AAME|[0.20106634806985...|
+------+---------+-----+-----+-----+-----+------+-----------------+--------------------+
only showing top 5 rows

AEPI Forecast:
+------+---------+-----+-----+-----+-----+------+-----------------+--------------------+
|source|     Date| Open| High|  Low|Close|Volume|SimulationResults|   

2. Demonstrate how to define and apply probability distributions to input parameters using
PySpark.

In [4]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import math

def monte_carlo_simulation(rows):
    num_simulations = 1000
    results = []

    for _ in range(num_simulations):
        u1 = random.uniform(0, 1)
        u2 = random.uniform(0, 1)
        
        z1 = math.sqrt(-2 * math.log(u1)) * math.cos(2 * math.pi * u2)
        z2 = math.sqrt(-2 * math.log(u1)) * math.sin(2 * math.pi * u2)
        
        results.append(z1)

    return [(*row, results) for row in rows]

spark = SparkSession.builder.appName('MonteCarlo').getOrCreate()

df = spark.read.csv(['/home/lplab/Desktop/Jayasuryan_BDA/stocks/ABAX.csv',
                     '/home/lplab/Desktop/Jayasuryan_BDA/stocks/AAME.csv',
                     '/home/lplab/Desktop/Jayasuryan_BDA/stocks/AEPI.csv'],
                     inferSchema=True, header=True)

df = df.withColumn("source", lit("ABAX")) \
       .union(df.withColumn("source", lit("AAME"))) \
       .union(df.withColumn("source", lit("AEPI")))

simulation_results = df.rdd \
                       .map(lambda row: (row['source'], row)) \
                       .groupByKey() \
                       .flatMapValues(monte_carlo_simulation)

simulation_results_df = simulation_results.map(lambda x: (x[0], *x[1])).toDF(["source", "Date", "Open", "High", "Low", "Close", "Volume", "SimulationResults"])

abax_forecast = simulation_results_df.filter(simulation_results_df['source'] == 'ABAX')
aame_forecast = simulation_results_df.filter(simulation_results_df['source'] == 'AAME')
aepi_forecast = simulation_results_df.filter(simulation_results_df['source'] == 'AEPI')

print("ABAX Forecast:")
abax_forecast.show(5)

print("AAME Forecast:")
aame_forecast.show(5)

print("AEPI Forecast:")
aepi_forecast.show(5)


ABAX Forecast:


                                                                                

+------+---------+-----+-----+-----+-----+------+-----------------+--------------------+
|source|     Date| Open| High|  Low|Close|Volume|SimulationResults|                  _9|
+------+---------+-----+-----+-----+-----+------+-----------------+--------------------+
|  ABAX|31-Dec-13|52.94|54.37|52.25|52.83| 79429|             ABAX|[-0.4986238883543...|
|  ABAX|30-Dec-13|50.36|54.10|50.36|52.95|131095|             ABAX|[-0.4986238883543...|
|  ABAX|27-Dec-13|50.38|50.80|49.67|50.52| 54354|             ABAX|[-0.4986238883543...|
|  ABAX|26-Dec-13|50.50|51.19|49.67| 50.0| 74414|             ABAX|[-0.4986238883543...|
|  ABAX|24-Dec-13|49.85|50.60|49.66|49.99| 36872|             ABAX|[-0.4986238883543...|
+------+---------+-----+-----+-----+-----+------+-----------------+--------------------+
only showing top 5 rows

AAME Forecast:


                                                                                

+------+---------+-----+-----+-----+-----+------+-----------------+--------------------+
|source|     Date| Open| High|  Low|Close|Volume|SimulationResults|                  _9|
+------+---------+-----+-----+-----+-----+------+-----------------+--------------------+
|  AAME|31-Dec-13|52.94|54.37|52.25|52.83| 79429|             AAME|[0.96410721363802...|
|  AAME|30-Dec-13|50.36|54.10|50.36|52.95|131095|             AAME|[0.96410721363802...|
|  AAME|27-Dec-13|50.38|50.80|49.67|50.52| 54354|             AAME|[0.96410721363802...|
|  AAME|26-Dec-13|50.50|51.19|49.67| 50.0| 74414|             AAME|[0.96410721363802...|
|  AAME|24-Dec-13|49.85|50.60|49.66|49.99| 36872|             AAME|[0.96410721363802...|
+------+---------+-----+-----+-----+-----+------+-----------------+--------------------+
only showing top 5 rows

AEPI Forecast:
+------+---------+-----+-----+-----+-----+------+-----------------+--------------------+
|source|     Date| Open| High|  Low|Close|Volume|SimulationResults|   