In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.conf import SparkConf
config = SparkConf()
config.setMaster("local").setAppName("Assigment2")

from pyspark.sql import SparkSession
spark = SparkSession.builder\
                    .config(conf=config)\
                    .getOrCreate()

sc = spark.sparkContext

22/03/15 01:52:02 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.80.128 instead (on interface ens33)
22/03/15 01:52:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/03/15 01:52:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/15 01:52:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
stockDf = spark.read.format('csv')\
                    .option('header', True)\
                    .option('inferSchema', True)\
                    .load("hdfs://localhost:9000/assigment2")\
                    .drop("_c13")

stockDf.printSchema()

                                                                                

root
 |-- SYMBOL: string (nullable = true)
 |-- SERIES: string (nullable = true)
 |-- OPEN: string (nullable = true)
 |-- HIGH: string (nullable = true)
 |-- LOW: string (nullable = true)
 |-- CLOSE: string (nullable = true)
 |-- LAST: string (nullable = true)
 |-- PREVCLOSE: string (nullable = true)
 |-- TOTTRDQTY: string (nullable = true)
 |-- TOTTRDVAL: string (nullable = true)
 |-- TIMESTAMP: string (nullable = true)
 |-- TOTALTRADES: string (nullable = true)
 |-- ISIN: string (nullable = true)



In [4]:
# 1. Convert Daily Stock data in CSV format to Parquet format. Partitions "Year" "Month" "Day"
from pyspark.sql.functions import col, date_format

stockDfParquet = stockDf.withColumn("year", date_format(col("TIMESTAMP"), "yyyy"))\
                        .withColumn("month", date_format(col("TIMESTAMP"), "MM"))\
                        .withColumn("day", date_format(col("TIMESTAMP"), "dd"))\
                        .write\
                        .partitionBy("Year", "Month", "Day")\
                        .format("parquet")\
                        .mode("overwrite")\
                        .save("hdfs://localhost:9000/stock-Parquet")

                                                                                

In [5]:
#2.	Find the PriceGain, PriceGainP, VolumeGain, VolumeGainP of each stock for 10 days [Refer Point 1] from historical data, write the results to JSON, Parquet, ORC, 
#   CSV format in Hadoop. 
#   The column should be “Date”, “Symbol”, “PriceGain”, “PriceGainP”, “VolumnGain”, “VolumnGainP”, sorted by PriceGainP in descending order 
from pyspark.sql.window import Window
from pyspark.sql.functions import col, desc, lead, row_number

windowSpec = Window.partitionBy("SYMBOL").orderBy("TIMESTAMP")

stockDf = stockDf.withColumn("PRICEGAIN", col("CLOSE") - col("OPEN"))\
                 .withColumn("PRICEGAINP", (col("PRICEGAIN") / col("OPEN")) * 100)\
                 .withColumn("VOLUMNGAIN", lead("TOTTRDVAL", 2).over(windowSpec))\
                 .withColumn("VOLUMNGAINp", col("VOLUMNGAIN") / col("TOTTRDVAL") * 100)\
                 .select("TIMESTAMP", "SYMBOL", "PRICEGAIN", "PRICEGAINP", "VOLUMNGAIN", "VOLUMNGAINP")\
                 .sort(desc("PRICEGAINP"))

# to find the volumnGain we need to take the lead or lag
stockDf.show(20)



+-----------+----------+-------------------+------------------+------------+------------------+
|  TIMESTAMP|    SYMBOL|          PRICEGAIN|        PRICEGAINP|  VOLUMNGAIN|       VOLUMNGAINP|
+-----------+----------+-------------------+------------------+------------+------------------+
|07-MAR-2022|  ATNINTER|0.05000000000000002| 33.33333333333335|        null|              null|
|25-FEB-2022|SREIBNPNCD|               40.0|              32.0|    33260.93|156.70638398115432|
|25-FEB-2022|      RHFL|  67.69999999999999|28.771780705482357|         882| 0.576096690187349|
|28-FEB-2022|    FELDVR| 2.3499999999999996| 22.59615384615384|        null|              null|
|28-FEB-2022|  MODISNME| 13.399999999999999|22.558922558922557|        null|              null|
|22-FEB-2022|  STEELCAS|  53.30000000000001| 22.39495798319328|    768336.1| 74.88968715701235|
|25-FEB-2022|  FILDF2GP|0.21999999999999997| 22.22222222222222|        null|              null|
|02-MAR-2022|KOTHARIPRO|  19.35000000000

                                                                                

In [8]:
# 3. Use the Intraday 1 Min data from parquet format, calculate Gain, GainP for every hourly window for each stock. The possible column output should be, 
#    “SYMBOL”, “DATEFROM”, “DATETO”, "Gain", “GAINP” where are DATEFROM and DATETO are timestamp/date columns example, DATEFROM from Feb 01, 2022 10:00 AM to Feb 01, 2022 11:00 
from pyspark.sql.functions import row_number, col, date_trunc, col, to_timestamp, concat, lit, date_format, sum, avg, max, min, mean, count
from pyspark.sql.types import StructType, StringType, DoubleType, LongType

intraDaySchema = StructType() \
        .add("Symbol", StringType(), True)\
        .add("Date", StringType(), True)\
        .add("Time", StringType(), True)\
        .add("Open", DoubleType(), True)\
        .add("High", DoubleType(), True)\
        .add("Low", DoubleType(), True)\
        .add("Close", DoubleType(), True)\
        .add("Volume", LongType(), True)\
        .add("OI", LongType(), True)

intraDayDf = spark.read.format('csv')\
                    .option("head", True)\
                    .schema(intraDaySchema)\
                    .load("hdfs://localhost:9000/raw/*/*/*.txt")

intraDayDf = intraDayDf.withColumn("DateTimeStr", concat( col("Date"), lit(" "), col("Time")))\
    .withColumn("DateTime", to_timestamp(col("DateTimeStr"), "yyyMMdd hh:mm" ))\
    .withColumn("DateFrom", date_format(col("DateTime").cast("timestamp"), "yyyy-MM-dd HH:00"))\
 

intraDayDf.groupBy("Symbol","DateFrom") \
    .agg(sum("Open").alias("SumOpen"), \
        sum("Close").alias("SumClose"))\
    .withColumn("Gain", col("SumClose") - col("SumOpen"))\
    .withColumn("GainP", (col("Gain") / col("SumOpen")) * 100)\
    .drop("SumOpen")\
    .drop("SumClose")\
    .show()



+------------+----------------+-------------------+--------------------+
|      Symbol|        DateFrom|               Gain|               GainP|
+------------+----------------+-------------------+--------------------+
|SILVERMIC_F1|2021-12-14 10:00|              -39.0|-0.00105241484682...|
|SILVERMIC_F1|2021-09-28 10:00|               48.0|0.001316207282903948|
|  SILVERM_F1|2021-03-01 09:00|              292.0|0.007111817746518255|
|SILVERMIC_F1|2021-10-19 00:00|              361.0|0.009386770841621513|
|  MCXMETLDEX|2021-12-23 09:00| 37.030000000377186|0.003628982546228337|
|SILVERMIC_F2|2021-02-15 10:00|              208.0|0.005061003339045616|
|  MCXCOMPDEX|2021-06-21 09:00|  6.739999999641441|0.001029807770587...|
|  MCXCOMPDEX|2021-06-23 09:00|  32.16999999945983|0.004850218703019227|
|  MCXCOMPDEX|2021-06-23 11:00| -10.67000000027474|-0.00158212243760...|
|  MCXMETLDEX|2021-06-23 09:00|  92.94000000017695| 0.01062346175413482|
| CRUDEOIL_F1|2021-03-08 00:00|                4.0|

                                                                                