In [1]:
%%HTML
<div>
    <P>
        <label>
            <strong>
            Introduction to PySpark
            </strong>
        </label>
        <ul>
            <li>Set up your working environment (Conda package management is recommended, as it replaces both pip and virtual environments).</li>
            <li>I used WSL 2 to set up a Linux subsystem on Windows 11 in order to use GPU acceleration.</li>
            <li>You can install PySpark by following the official installation instructions <a href="https://spark.apache.org/docs/latest/api/python/getting_started/install.html">here</a>.
            <br> Don't forget to install Java SE 8 or above.</li>
            <li>I highly recommend installing Spark first, 
                then adding the "SPARK_HOME" environment variable to your ~/.bashrc and reloading it with "source ~/.bashrc", and only then installing PySpark with conda.</li>
            <li>If you want to open Excel files, you can read them with pandas (with openpyxl installed as its dependency) and then convert them to CSV, <br>
                or alternatively use the Excel extension for Spark (available for Spark 3.5.0 at the time of writing). If your Spark and Scala versions are supported by the jar file, you can open the Excel file directly with Spark.</li>
        </ul>
        <div>
            <a style="font-weight: 500; color: red;">Note:</a>
            I used Spark version 3.5.1 (Scala 2.13 &amp; CUDA 12.5), and PySpark worked well with Python 3.11.9; earlier Python versions ran into errors while executing some of the code.
        </div>
    </P>
</div>
<div>
    <P>
        <label>
            <strong>
            Dataset
            </strong>
        </label>
        <ul>
            <li>
                 Previously used: Medicare Part D Prescribers - by Provider and Drug |
                <a href="https://data.cms.gov/provider-summary-by-type-of-service/medicare-part-d-prescribers/medicare-part-d-prescribers-by-provider-and-drug"> Download </a>
            </li>
            <li>
                The new dataset is available in the datasets directory. It contains real data on the overall sales of medicines from different distributors and manufacturers in Iran.
            </li>
        </ul>
    </P>

    <p>
        <label>
            <strong>
                CUDA
            </strong>
        </label>
        <div>
            For GPU Accelerated tasks: <br>
            Install or update the CUDA toolkit if your NVIDIA graphics card supports it.
            <br>
            Then download a supported spark-rapids jar file and add it to the Spark plugins. <br>
            <a style="font-weight: 500; color: red;">Note:</a>
            Spark RAPIDS is built on top of cuDF; by installing them you can leverage GPU acceleration for pandas as well, without any code changes.
            <ul>
            <li><a href="https://developer.nvidia.com/cuda-gpus">Supported graphics cards</a></li>
            <li><a href="https://developer.nvidia.com/cuda-toolkit">Download</a></li>
            </ul>
        </div>
    </p>
</div>

In [2]:
# CUDA toolkit installation instructions (Ubuntu 22.04, CUDA toolkit 12.4.1).
# The shell commands below are kept as comments for reference only:
# run them in a terminal, not in this notebook cell.
# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
# sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
# wget https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb
# sudo dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb
# sudo cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/
# sudo apt-get update
# sudo apt-get -y install cuda-toolkit-12-4

# Export the location of the RAPIDS Accelerator jar as an environment variable
# so that the Spark session cell can look it up with os.getenv("SPARK_RAPIDS_PLUGIN_JAR").
%env SPARK_RAPIDS_PLUGIN_JAR=/opt/spark/jars/rapids-4-spark_2.13-24.04.1.jar

env: SPARK_RAPIDS_PLUGIN_JAR=/opt/spark/jars/rapids-4-spark_2.13-24.04.1.jar


In [6]:
from pyspark.sql import SparkSession
import os


# Path to the RAPIDS Accelerator jar, taken from the SPARK_RAPIDS_PLUGIN_JAR
# environment variable. os.getenv returns None when the variable is not set,
# so the value is checked before it is handed to Spark.
spark_rapids = os.getenv("SPARK_RAPIDS_PLUGIN_JAR")

# SPARK SESSION CONFIGURATION
print("Running Spark session...")
session_builder = SparkSession.Builder()

# Settings that apply to both CPU-only and GPU-accelerated sessions.
session_builder.master("local[*]")
session_builder.appName("Learning Spark")
session_builder.config("spark.ui.enabled", True)
session_builder.config("spark.driver.bindAddress", "localhost")
session_builder.config("spark.ui.port", "8080")

if spark_rapids:
    # GPU acceleration: the plugin class lives in the RAPIDS jar, so the jar
    # must be on the driver classpath whenever the plugin is requested.
    session_builder.config("spark.driver.extraClassPath", spark_rapids)
    session_builder.config("spark.plugins", "com.nvidia.spark.SQLPlugin")
    session_builder.config("spark.rapids.memory.gpu.pooling.enabled", True)
    session_builder.config("spark.rapids.sql.enabled", True)
    session_builder.config("spark.rapids.sql.explain", "NONE")
else:
    # Without a jar path the plugin cannot be loaded; requesting it anyway
    # would make session start-up fail, so fall back to a CPU-only session.
    print("SPARK_RAPIDS_PLUGIN_JAR is not set; starting Spark without GPU acceleration")

spark = session_builder.getOrCreate()
print("Session started")

Running Spark session...
Session started


In [4]:
from pyspark.sql import DataFrameReader, DataFrame


# Relative path of the sales CSV; `dataset` and `amarnamehFile` both refer to it.
amarnamehFile = "datasets/amarnameh1400.csv"
dataset = amarnamehFile

def readData(format: str, fromPath: str):
    """Load a dataset through the notebook-level `spark` session.

    Args:
        format: Storage format of the input; "csv" and "parquet" are handled.
        fromPath: Location of the file or directory to read.

    Returns:
        The object produced by the Spark reader for the requested format,
        or None (after printing a notice) when the format is not handled.
    """
    print("Reading the dataset...")
    reader = spark.read

    if format == "csv":
        # The first row holds the column names; inferSchema asks Spark to
        # derive the column types from the file contents.
        loaded = reader.csv(path=fromPath, header=True, inferSchema=True)
    elif format == "parquet":
        loaded = reader.parquet(fromPath)
    else:
        print("NOT SUPPORTED RIGHT NOW")
        return None

    print("Completed")
    return loaded
    
# Parquet is a columnar storage format with efficient compression, which is
# why the CSV data is converted to it.
def convertToParquet(df: DataFrame, write_path: str):
    """Write `df` in Parquet format to `write_path`.

    The write uses mode="ignore", so if data already exists at `write_path`
    the operation is skipped and the existing data is left untouched.

    Args:
        df: DataFrame whose contents are written.
        write_path: Destination path of the Parquet output.

    Returns:
        Whatever the writer's `parquet` call returns (None in PySpark, so
        this function does not hand back a DataFrame).
    """
    writer = df.write
    return writer.parquet(path=write_path, mode="ignore")

# Target column names, in the same order as the columns of the source file.
renamed_list = [
    "index",
    "manufacturer_name",
    "supplier_name",
    "brand_owner",
    "distributer_name",
    "origin_country",
    "drug_brand_fa",
    "drug_brand_en",
    "drug_generic_name",
    "active_ingredient",
    "total_box_sold",
    "pills_count_in_box",
    "total_sale_amount",
    "total_sale_value_rial",
    "generic_code",
    "otc",
    "biologic",
    "atc_code",
]

# Load the CSV dataset and keep its original column names for the rename step.
spark_df = readData("csv", amarnamehFile)
df_cols = spark_df.columns

def columns_mapper(key: list, value: list) -> dict:
    """Pair each entry of `key` with the entry of `value` at the same position.

    Args:
        key: Items used as dictionary keys (e.g. current column names).
        value: Items used as dictionary values (e.g. new column names).

    Returns:
        A dict mapping key[i] to value[i] for every position i of `key`.
        Extra entries of `value` are ignored; a repeated key keeps the value
        of its last occurrence.

    Raises:
        IndexError: If `value` has fewer entries than `key`.
    """
    return {name: value[position] for position, name in enumerate(key)}

# The rename is positional: the i-th column of the file receives the i-th name
# of renamed_list. Verify that both lists have the same length first, so a
# changed file layout fails loudly here instead of silently mislabelling
# columns or raising an opaque IndexError inside columns_mapper.
assert len(df_cols) == len(renamed_list), (
    f"Expected {len(renamed_list)} columns, but the dataset has {len(df_cols)}"
)

colsMap = columns_mapper(df_cols, renamed_list)
spark_df = spark_df.withColumnsRenamed(colsMap)

# Report the size and the resulting schema of the renamed dataset.
print(f'Number of rows: {spark_df.count()}')
spark_df.printSchema()

Reading the dataset...


                                                                                

Completed
Number of rows: 52445
root
 |-- index: integer (nullable = true)
 |-- manufacturer_name: string (nullable = true)
 |-- supplier_name: string (nullable = true)
 |-- brand_owner: string (nullable = true)
 |-- distributer_name: string (nullable = true)
 |-- origin_country: string (nullable = true)
 |-- drug_brand_fa: string (nullable = true)
 |-- drug_brand_en: string (nullable = true)
 |-- drug_generic_name: string (nullable = true)
 |-- active_ingredient: string (nullable = true)
 |-- total_box_sold: integer (nullable = true)
 |-- pills_count_in_box: integer (nullable = true)
 |-- total_sale_amount: integer (nullable = true)
 |-- total_sale_value_rial: double (nullable = true)
 |-- generic_code: integer (nullable = true)
 |-- otc: integer (nullable = true)
 |-- biologic: integer (nullable = true)
 |-- atc_code: string (nullable = true)



In [5]:
# Count the rows per manufacturer and order the result from most to fewest rows.
# NOTE: Duplicates are not handled yet; some manufacturers appear under several different names.
manufacturer_counts = spark_df.groupby("manufacturer_name").count()
manufacturers = manufacturer_counts.sort("count", ascending=False)

print(f"Number of drug manufacturers in Pharmaceutical market of Iran: {manufacturers.count()}")

# Export the sorted counts to an Excel sheet via pandas.
manufacturers_pdf = manufacturers.toPandas()
manufacturers_pdf.to_excel("datasets/sorted_manufacturers.xlsx", index=True, sheet_name="manufacturers")

Number of drug manufacturers in Pharmaceutical market of Iran: 587
