In [1]:
%%HTML
<div>
    <P>
        <label>
            <strong>
            Introduction to PySpark
            </strong>
        </label>
        <ul>
            <li>Setup your working environment (Jupyter notebook is recommended)</li>
            <li>Install PySpark from provided Instruction. <a href="https://spark.apache.org/docs/latest/api/python/getting_started/install.html">here</a>
            <br> Don't forget to install Java SE 8 or above</li>
        </ul>
        <div>
            <a style="font-weight: 500; color: red;">Note:</a>
            PySpark works with Python 3.11 or older otherwise you may encounter some issues like me.
        </div>
    </P>
</div>
<div>
    <P>
        <label>
            <strong>
            Dataset
            </strong>
        </label>
        <ul>
            <li>
                Medicare Part D Prescribers - by Provider and Drug |
                <a href="https://data.cms.gov/provider-summary-by-type-of-service/medicare-part-d-prescribers/medicare-part-d-prescribers-by-provider-and-drug"> Download </a>
            </li>
        </ul>
    </P>

    <p>
        <label>
            <strong>
                CUDA
            </strong>
        </label>
        <div>
            For GPU Accelerated tasks:
            Install or update CUDA toolkit if your nvidia graphic card supports.
            <ul>
            <li><a href="https://developer.nvidia.com/cuda-gpus">Supported Graphic cards</a></li>
            <li><a href="https://developer.nvidia.com/cuda-toolkit">Download</a></li>
            </ul>
        </div>
    </p>
</div>

In [2]:
# CUDA toolkit Installation Instructions (Ubuntu):
# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
# sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
# wget https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb
# sudo dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb
# sudo cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/
# sudo apt-get update
# sudo apt-get -y install cuda-toolkit-12-4


# It is essential to provide the address of SPARK RAPIDS Plugin for Spark
# Using environment variable can be a nice option to provide them.
# %env SPARK_RAPIDS_DIR=
# %env SPARK_RAPIDS_PLUGIN_JAR=

In [1]:
from pyspark.sql import SparkSession
import os

spark_rapids:str = os.getenv("SPARK_RAPIDS_PLUGIN_JAR")

# Creating Spark session to start using spark
# When you are working in clouds you can create multiple instances
print("Running Spark session...")
session_builder = SparkSession.Builder()
session_builder.appName("First Practice")
session_builder.master("local[*]")
session_builder.config("spark.ui.enabled", True)
session_builder.config("spark.driver.bindAddress", "localhost")
session_builder.config("spark.ui.port", "8080")
# Make sure you are running on a Linux Operation system, otherwise spark rapids not supported
session_builder.config("spark.driver.extraClassPath", spark_rapids)
session_builder.config("spark.plugins", "com.nvidia.spark.SQLPlugin")
session_builder.config("spark.rapids.memory.gpu.pooling.enabled", True)
session_builder.config("spark.rapids.sql.enabled", True)
session = session_builder.getOrCreate()
print("Session started")

# Renaming our variable name for better readability
spark = session

Running Spark session...


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "c:\Projects\Data-analytics-practice\venv\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\muham\AppData\Local\Programs\Python\Python311\Lib\socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Projects\Data-analytics-practice\venv\Lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Projects\Data-analytics-practice\venv\Lib\site-packages\py4j\clientserver.py", line 539, in send_com

Py4JError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext

In [None]:
from pyspark.sql import DataFrameReader, DataFrame


file_path = "datasets/Medicare-PD/MUP_DPR_RY23_P04_V10_DY21_NPIBN.csv"
parquet_path = "datasets/Medicare-PD-Parquet"

def readData(format: str, fromPath: str):
    print("Reading the dataset...")
    df_reader: DataFrameReader = spark.read
    if format == "csv":
        # Reading the data from csv file.
        # inferSchema idenifies the schema of provided file from it contents.
        df: DataFrame = df_reader.csv(header=True, path=fromPath, inferSchema=True)
        print("Completed")
        return df
    if format == "parquet":
        df = df_reader.parquet(fromPath)
        print("Completed")
        return df
    return print("NOT SUPPORTED RIGHT NOW")
    
# Converting csv file into parquet format.
# Its optimized columnar storage and efficient compression
def convertToParquet(df: DataFrame, write_path: str):
    df = df.write.parquet(path=write_path, mode="ignore")
    return df

# Follow line executed once, converting 3.5Gb csv file into 520Mb parquet
# csv_df = readData(format="csv", filePath=file_path)
# df = convertToParquet(csv_df, parquet_path)
# csv.printSchema() is Deprecated

df = readData(format="parquet", fromPath=parquet_path)

In [11]:
# Getting the schema of the dataset
df.printSchema()
print(f'Number of rows: {df.count()}')

root
 |-- Prscrbr_NPI: integer (nullable = true)
 |-- Prscrbr_Last_Org_Name: string (nullable = true)
 |-- Prscrbr_First_Name: string (nullable = true)
 |-- Prscrbr_City: string (nullable = true)
 |-- Prscrbr_State_Abrvtn: string (nullable = true)
 |-- Prscrbr_State_FIPS: string (nullable = true)
 |-- Prscrbr_Type: string (nullable = true)
 |-- Prscrbr_Type_Src: string (nullable = true)
 |-- Brnd_Name: string (nullable = true)
 |-- Gnrc_Name: string (nullable = true)
 |-- Tot_Clms: integer (nullable = true)
 |-- Tot_30day_Fills: double (nullable = true)
 |-- Tot_Day_Suply: integer (nullable = true)
 |-- Tot_Drug_Cst: double (nullable = true)
 |-- Tot_Benes: integer (nullable = true)
 |-- GE65_Sprsn_Flag: string (nullable = true)
 |-- GE65_Tot_Clms: integer (nullable = true)
 |-- GE65_Tot_30day_Fills: double (nullable = true)
 |-- GE65_Tot_Drug_Cst: double (nullable = true)
 |-- GE65_Tot_Day_Suply: integer (nullable = true)
 |-- GE65_Bene_Sprsn_Flag: string (nullable = true)
 |-- GE65_T

In [5]:
# Pandas is much slower
# Reading the dataframe from it may cause out-off-memory error