In [1]:
%%HTML
<div>
    <P>
        <label>Introduction to PySpark</label>
        <ul>
            <li>Set up your working environment (Jupyter Notebook is recommended)</li>
            <li>Install PySpark by following the instructions provided <a href="https://spark.apache.org/docs/latest/api/python/getting_started/install.html">here</a>.
            <br> Don't forget to install Java SE 8 or above.</li>
        </ul>
        <div>
            <a style="font-weight: 500; color: red;">Note:</a>
            PySpark works with Python 3.11 or older; otherwise you may encounter some issues, as I did.
        </div>
    </P>
</div>
<div>
    <P>
        <label>Dataset</label>
        <ul>
            <li>
                Medicare Part D Prescribers - by Provider and Drug |
                <a href="https://data.cms.gov/provider-summary-by-type-of-service/medicare-part-d-prescribers/medicare-part-d-prescribers-by-provider-and-drug"> Download </a>
            </li>
        </ul>
    </P>
</div>

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import DataFrameReader, DataFrame

# Start (or reuse) a Spark session; everything else in the notebook goes through it.
# In a cloud deployment several such instances can be created.
print("Running Spark session...")
session_builder = SparkSession.Builder().appName("First Practise")
session = session_builder.getOrCreate()
print("Session started")

# Expose the session under the conventional short name `spark`
# and leave it as the cell's last expression so it is displayed.
spark = session
spark

Running Spark session...
Session started


In [3]:
# Directory holding the Parquet copy of the dataset (the one actually loaded below).
parquet_path = "datasets/Medicare-PD-Parquet"
# Original CSV download; only needed for the one-off CSV -> Parquet conversion.
file_path = "datasets/Medicare-PD/MUP_DPR_RY23_P04_V10_DY21_NPIBN.csv"

def readData(format: str, fromPath: str) -> "DataFrame":
    """Load a dataset from disk into a Spark DataFrame.

    Uses the notebook-level ``spark`` session to obtain the reader.

    Args:
        format: Storage format of the data at ``fromPath``; either ``"csv"``
            or ``"parquet"``.
        fromPath: Path handed to the Spark reader.

    Returns:
        The DataFrame produced by ``spark.read.csv`` or ``spark.read.parquet``.

    Raises:
        ValueError: If ``format`` is neither ``"csv"`` nor ``"parquet"``.
    """
    # Fail loudly and early: previously an unknown format printed a message and
    # returned None, which only surfaced later as an AttributeError on `df`.
    if format not in ("csv", "parquet"):
        raise ValueError(
            f"NOT SUPPORTED RIGHT NOW: {format!r} (expected 'csv' or 'parquet')"
        )

    print("Reading the dataset...")
    df_reader: DataFrameReader = spark.read
    if format == "csv":
        # header=True treats the first row as column names;
        # inferSchema=True asks Spark to infer column types from the file contents.
        df: DataFrame = df_reader.csv(path=fromPath, header=True, inferSchema=True)
    else:
        df = df_reader.parquet(fromPath)
    print("Completed")
    return df
    
# Converting a DataFrame (e.g. one read from CSV) into Parquet format,
# which offers optimized columnar storage and efficient compression.
def convertToParquet(df: "DataFrame", write_path: str) -> "DataFrame":
    """Write ``df`` to ``write_path`` as Parquet and return ``df``.

    Args:
        df: DataFrame to persist.
        write_path: Destination path passed to ``df.write.parquet``.

    Returns:
        The same ``df`` that was passed in, so the result can be assigned or
        chained. The writer call itself returns ``None``; the previous version
        rebound ``df`` to that result and therefore always returned ``None``.
    """
    # mode="ignore": per the PySpark writer docs, the write is silently skipped
    # when data already exists at the destination, so re-running is harmless.
    df.write.parquet(path=write_path, mode="ignore")
    return df

# The following lines were executed once, converting the 3.5 GB CSV file into 520 MB of Parquet
# csv_df = readData(format="csv", fromPath=file_path)
# df = convertToParquet(csv_df, parquet_path)
# csv.printSchema() is Deprecated

# Load the dataset from the Parquet copy rather than the original CSV.
df = readData(fromPath=parquet_path, format="parquet")

Reading the dataset...


                                                                                

Completed


In [4]:
# Show the dataset's column names and types, then report how many rows it has.
df.printSchema()
row_count = df.count()
print(f'Number of rows: {row_count}')

root
 |-- Prscrbr_NPI: integer (nullable = true)
 |-- Prscrbr_Last_Org_Name: string (nullable = true)
 |-- Prscrbr_First_Name: string (nullable = true)
 |-- Prscrbr_City: string (nullable = true)
 |-- Prscrbr_State_Abrvtn: string (nullable = true)
 |-- Prscrbr_State_FIPS: string (nullable = true)
 |-- Prscrbr_Type: string (nullable = true)
 |-- Prscrbr_Type_Src: string (nullable = true)
 |-- Brnd_Name: string (nullable = true)
 |-- Gnrc_Name: string (nullable = true)
 |-- Tot_Clms: integer (nullable = true)
 |-- Tot_30day_Fills: double (nullable = true)
 |-- Tot_Day_Suply: integer (nullable = true)
 |-- Tot_Drug_Cst: double (nullable = true)
 |-- Tot_Benes: integer (nullable = true)
 |-- GE65_Sprsn_Flag: string (nullable = true)
 |-- GE65_Tot_Clms: integer (nullable = true)
 |-- GE65_Tot_30day_Fills: double (nullable = true)
 |-- GE65_Tot_Drug_Cst: double (nullable = true)
 |-- GE65_Tot_Day_Suply: integer (nullable = true)
 |-- GE65_Bene_Sprsn_Flag: string (nullable = true)
 |-- GE65_T

[Stage 1:>                                                          (0 + 8) / 9]

Number of rows: 25231862


                                                                                

In [5]:
# Pandas is much slower
# Loading the whole dataset with it may cause an out-of-memory error