In [33]:
%%HTML
<div>
    <P>
        <label>
            <strong>
            Introduction to PySpark
            </strong>
        </label>
        <ul>
            <li>Set up your working environment (Conda is recommended for package management, as it replaces both pip and virtual environments).</li>
            <li>I used WSL 2 to set up a Linux subsystem on Windows 11 in order to use GPU acceleration.</li>
            <li>You can install PySpark by following the instructions provided <a href="https://spark.apache.org/docs/latest/api/python/getting_started/install.html">here</a>.
            <br> Don't forget to install Java SE 8 or above.</li>
        </ul>
        <div>
            <span style="font-weight: 500; color: red;">Note:</span>
            I used Spark version 3.5.1, and PySpark worked well with Python 3.11.9; earlier versions ran into errors while executing some of the code.
        </div>
    </P>
</div>
<div>
    <P>
        <label>
            <strong>
            Dataset
            </strong>
        </label>
        <ul>
            <li>
                 Previously used: Medicare Part D Prescribers - by Provider and Drug |
                <a href="https://data.cms.gov/provider-summary-by-type-of-service/medicare-part-d-prescribers/medicare-part-d-prescribers-by-provider-and-drug"> Download </a>
            </li>
            <li>
                The new dataset is available in the datasets directory. It contains real data on the overall sales of medicines from different distributors and manufacturers in Iran.
            </li>
        </ul>
    </P>

    <p>
        <label>
            <strong>
                CUDA
            </strong>
        </label>
        <div>
            For GPU-accelerated tasks:
            Install or update the CUDA Toolkit if your NVIDIA graphics card supports it.
            <ul>
            <li><a href="https://developer.nvidia.com/cuda-gpus">Supported graphics cards</a></li>
            <li><a href="https://developer.nvidia.com/cuda-toolkit">Download</a></li>
            </ul>
        </div>
    </p>
</div>

In [34]:
# CUDA toolkit Installation Instructions (Ubuntu):
# (Reference only: the shell commands below are commented out, so this cell does not run them.)
# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
# sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
# wget https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb
# sudo dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb
# sudo cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/
# sudo apt-get update
# sudo apt-get -y install cuda-toolkit-12-4

# Export the location of the RAPIDS Accelerator jar as an environment variable;
# the Spark session cell reads SPARK_RAPIDS_PLUGIN_JAR to put the jar on the driver class path.
# NOTE(review): this is an absolute, machine-specific path — adjust it to where the jar lives locally.
# NOTE(review): the "_2.13" in the jar name is presumably the Scala version — confirm it matches the Spark build in use.
%env SPARK_RAPIDS_PLUGIN_JAR=/opt/spark/jars/rapids-4-spark_2.13-24.04.1.jar

env: SPARK_RAPIDS_PLUGIN_JAR=/opt/spark/jars/rapids-4-spark_2.13-24.04.1.jar


In [35]:
from pyspark.sql import SparkSession
import os


# Location of the RAPIDS Accelerator jar, taken from the SPARK_RAPIDS_PLUGIN_JAR
# environment variable. Default to "" so the value is always a str.
spark_rapids: str = os.getenv("SPARK_RAPIDS_PLUGIN_JAR", "")
if not spark_rapids:
    # Fail fast with an actionable message: without the jar on the class path the
    # "com.nvidia.spark.SQLPlugin" plugin configured below cannot be loaded.
    raise RuntimeError(
        "SPARK_RAPIDS_PLUGIN_JAR is not set; point it at the rapids-4-spark jar "
        "before starting the Spark session."
    )

# SPARK SESSION CONFIGURATION
print("Running Spark session...")
session_builder = (
    SparkSession.builder
    # Run locally, using every available core.
    .master("local[*]")
    .appName("Learning Spark")
    # Spark web UI, served on localhost:8080.
    .config("spark.ui.enabled", True)
    .config("spark.driver.bindAddress", "localhost")
    .config("spark.ui.port", "8080")
    # RAPIDS Accelerator: put the jar on the driver class path and enable the SQL plugin.
    .config("spark.driver.extraClassPath", spark_rapids)
    .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
    .config("spark.rapids.memory.gpu.pooling.enabled", True)
    .config("spark.rapids.sql.enabled", True)
    .config("spark.rapids.sql.explain", "NONE")
)
# getOrCreate() returns an already-running session when one exists; restart the
# kernel after changing driver-level options such as the class path.
spark = session_builder.getOrCreate()
print("Session started")

Running Spark session...
Session started


In [36]:
from pyspark.sql import DataFrameReader, DataFrame
import pandas as pd

# Relative path of the CSV dataset inside the datasets directory;
# `dataset` is a second name bound to the same path string.
amarnamehFile = "datasets/amarnameh1400.csv"
dataset = amarnamehFile

def readData(format: str, fromPath: str) -> "DataFrame":
    """Load a dataset from ``fromPath`` into a Spark DataFrame.

    Uses the notebook-level ``spark`` session to do the reading.

    Args:
        format: File format of the dataset; either ``"csv"`` or ``"parquet"``.
        fromPath: Path of the file (or directory) to read.

    Returns:
        The DataFrame produced by the Spark reader.

    Raises:
        ValueError: If ``format`` is not one of the supported formats.
    """
    supported_formats = ("csv", "parquet")
    if format not in supported_formats:
        # Raise instead of printing and returning None, so the caller fails here
        # with a clear message rather than later with an AttributeError on None.
        expected = ", ".join(supported_formats)
        raise ValueError(
            f"NOT SUPPORTED RIGHT NOW: format {format!r} (supported formats: {expected})"
        )

    print("Reading the dataset...")
    df_reader: DataFrameReader = spark.read
    if format == "csv":
        # header=True takes the column names from the first row;
        # inferSchema=True lets Spark derive the column types from the file contents.
        df: DataFrame = df_reader.csv(path=fromPath, header=True, inferSchema=True)
    else:
        df = df_reader.parquet(fromPath)
    print("Completed")
    return df
    
# Converting csv file into parquet format.
# Parquet offers optimized columnar storage and efficient compression.
def convertToParquet(df: "DataFrame", write_path: str) -> "DataFrame":
    """Write ``df`` to ``write_path`` as Parquet and return the Parquet-backed DataFrame.

    Args:
        df: DataFrame to persist.
        write_path: Destination path of the Parquet output.

    Returns:
        A DataFrame read back from ``write_path`` using the session that owns ``df``.
    """
    # mode="ignore" makes the write a silent no-op when data already exists at
    # write_path, so re-running the notebook does not rewrite the output.
    df.write.parquet(path=write_path, mode="ignore")
    # DataFrameWriter.parquet() returns None, so read the written data back to
    # hand the caller a usable DataFrame instead of None.
    return df.sparkSession.read.parquet(write_path)

# The following lines were executed once, converting a 3.5 GB CSV file into a 520 MB Parquet file.
# csv_df = readData(format="csv", fromPath=file_path)
# df = convertToParquet(csv_df, parquet_path)
# csv.printSchema() is Deprecated

# Load the CSV through Spark and print its first 10 rows.
spark_df = readData(format="csv", fromPath=amarnamehFile)
spark_df.show(10)

# Load the same file with pandas; the summary statistics of the numeric
# columns are the cell's displayed result.
pd_df = pd.read_csv(filepath_or_buffer=amarnamehFile)
pd_df.describe()

Reading the dataset...
Completed
+---+--------------------+--------------------+--------------------+-----------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+-------------+---------+---------------------+--------+---+--------+--------+
|_c0|نام شرکت تولید کننده|نام شرکت تامین کننده|       نام صاحب برند|توزیع کننده|کشور تولید  کننده|            نام برند|      نام لاتین برند|           نام ژنریک|      نام ماده موثره|تعداد فروش (بسته)|تعداد در بسته|فروش عددی|فروش ریالی مصرف کننده|کد ژنریک|OTC|بیولوژیک|ATC Code|
+---+--------------------+--------------------+--------------------+-----------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+-------------+---------+---------------------+--------+---+--------+--------+
|  0|   Actero middleeast|            اکتوورکو|            اکتوورکو|  الیت دارو|            ایران|فاویپیراویر   قرص...|FAVIPIRAVIR   

Unnamed: 0.1,Unnamed: 0,تعداد فروش (بسته),تعداد در بسته,فروش عددی,فروش ریالی مصرف کننده,کد ژنریک,OTC,بیولوژیک
count,52445.0,52445.0,52445.0,52445.0,52445.0,52445.0,52445.0,52445.0
mean,26222.0,37130.15,35.771761,949718.2,12120210000.0,7637.450224,0.162933,0.020135
std,15139.711771,173310.7,42.962913,6226750.0,91262010000.0,14113.649829,0.369308,0.140465
min,0.0,1.0,1.0,1.0,17000.0,1.0,0.0,0.0
25%,13111.0,1045.0,1.0,7850.0,245400000.0,638.0,0.0,0.0
50%,26222.0,5057.0,21.0,56280.0,1162718000.0,1822.0,0.0,0.0
75%,39333.0,19734.0,60.0,347300.0,4844706000.0,6975.0,0.0,0.0
max,52444.0,10705760.0,2400.0,494913300.0,8023067000000.0,52909.0,1.0,1.0


In [37]:
# Print the column names and types of the Spark DataFrame, followed by its total row count.
spark_df.printSchema()
row_count = spark_df.count()
print(f'Number of rows: {row_count}')

root
 |-- _c0: integer (nullable = true)
 |-- نام شرکت تولید کننده: string (nullable = true)
 |-- نام شرکت تامین کننده: string (nullable = true)
 |-- نام صاحب برند: string (nullable = true)
 |-- توزیع کننده: string (nullable = true)
 |-- کشور تولید  کننده: string (nullable = true)
 |-- نام برند: string (nullable = true)
 |-- نام لاتین برند: string (nullable = true)
 |-- نام ژنریک: string (nullable = true)
 |-- نام ماده موثره: string (nullable = true)
 |-- تعداد فروش (بسته): integer (nullable = true)
 |-- تعداد در بسته: integer (nullable = true)
 |-- فروش عددی: integer (nullable = true)
 |-- فروش ریالی مصرف کننده: double (nullable = true)
 |-- کد ژنریک: integer (nullable = true)
 |-- OTC: integer (nullable = true)
 |-- بیولوژیک: integer (nullable = true)
 |-- ATC Code: string (nullable = true)

Number of rows: 52445
