In [34]:
%env SPARK_RAPIDS_PLUGIN_JAR=/opt/spark/jars/rapids-4-spark_2.13-24.04.1.jar

env: SPARK_RAPIDS_PLUGIN_JAR=/opt/spark/jars/rapids-4-spark_2.13-24.04.1.jar


In [35]:
from pyspark.sql import SparkSession
import os


# Location of the RAPIDS Accelerator jar, read from the environment
# (SPARK_RAPIDS_PLUGIN_JAR) rather than being hard-coded in this cell.
spark_rapids = os.getenv("SPARK_RAPIDS_PLUGIN_JAR")
if not spark_rapids:
    # os.getenv() returns None for an unset variable; passing that on as the
    # driver class path would only fail later, when the plugin class is loaded.
    raise RuntimeError(
        "SPARK_RAPIDS_PLUGIN_JAR is not set; point it at the RAPIDS Accelerator "
        "jar before starting the Spark session."
    )

# SPARK SESSION CONFIGURATION
print("Running Spark session...")
session_builder = (
    SparkSession.builder
    .master("local[*]")
    .appName("Learning Spark")
    # Web UI, bound to localhost on port 8080.
    .config("spark.ui.enabled", True)
    .config("spark.driver.bindAddress", "localhost")
    .config("spark.ui.port", "8080")
    .config("spark.driver.memory", "10g")
    .config("spark.executor.memory", "10g")
    # RAPIDS plugin: put the jar on the driver class path and enable the plugin.
    .config("spark.driver.extraClassPath", spark_rapids)
    .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
    .config("spark.rapids.memory.gpu.pooling.enabled", True)
    .config("spark.rapids.sql.enabled", True)
    .config("spark.rapids.sql.explain", "NONE")
)
spark = session_builder.getOrCreate()
print("Session started")

Running Spark session...
Session started


In [36]:
# Entry names (not full paths) found in the Excel export directory, in the
# order the OS returns them.
datasets_excel = os.listdir("datasets-private/amarnameh-excel")

# Entry names found in the CSV export directory, sorted by name.
datasets_csv = os.listdir("datasets-private/amarnameh-csv")
datasets_csv.sort()

In [37]:
from pyspark.sql import DataFrame
import pandas as pd


def readData(format: str, fromPath: str) -> "DataFrame | pd.DataFrame":
    """Load the dataset at ``fromPath`` with the reader selected by ``format``.

    Args:
        format: One of ``"csv"``, ``"parquet"`` or ``"xlsx"``.
        fromPath: Path of the file to read.

    Returns:
        A Spark ``DataFrame`` for ``"csv"`` and ``"parquet"``; a pandas
        ``DataFrame`` for ``"xlsx"``.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    supported_formats = ("csv", "parquet", "xlsx")
    if format not in supported_formats:
        # Fail loudly instead of returning None, which would only surface later
        # as an unrelated AttributeError in the caller.
        raise ValueError(
            f"[ERROR] NOT SUPPORTED RIGHT NOW: format {format!r} "
            f"(supported: {', '.join(supported_formats)})"
        )

    print("Reading the dataset...")
    if format == "csv":
        # inferSchema derives the column types from the file's contents;
        # header=True takes the column names from the first row.
        df = spark.read.csv(header=True, path=fromPath, inferSchema=True)
    elif format == "parquet":
        df = spark.read.parquet(fromPath)
    else:
        # Excel files are read with pandas, so this branch returns a pandas
        # DataFrame rather than a Spark one.
        print("Switching to Pandas, data frame reader")
        df = pd.read_excel(fromPath)
    print("Completed")
    return df

# Spark DataFrames keyed by the CSV file name without its extension
# (e.g. "1398.csv" -> "1398").
dataframes: dict[str, DataFrame] = {}
for data in datasets_csv:
    # splitext removes exactly one trailing extension. str.rstrip(".csv") would
    # instead strip any trailing run of the characters ".", "c", "s" and "v",
    # mangling names such as "specs.csv".
    df_name, extension = os.path.splitext(data)
    if extension.lower() != ".csv":
        # Ignore directory entries that are not CSV exports.
        continue
    print("---- starting ----")
    print(f"File: {str(data)}")
    filePath = f"datasets-private/amarnameh-csv/{data}"
    df = readData(format="csv", fromPath=filePath)
    dataframes[df_name] = df
    print("---- Done ----")


---- starting ----
File: 1398.csv
Reading the dataset...
Completed
---- Done ----
---- starting ----
File: 1399.csv
Reading the dataset...


                                                                                

Completed
---- Done ----
---- starting ----
File: 1400.csv
Reading the dataset...
Completed
---- Done ----
---- starting ----
File: 1401.csv
Reading the dataset...
Completed
---- Done ----
---- starting ----
File: 1402.csv
Reading the dataset...
Completed
---- Done ----


In [38]:
# Print the row count, the number of columns and the column names of every
# loaded dataset.
for df_name, df in dataframes.items():
    print(f"Dataset[{df_name}]:")
    row_total = df.count()
    print(f'-> Contains: {row_total} Rows')
    headers = df.columns
    print(f"-> Headers Len: {len(headers)}")
    print(headers)

Dataset[1398]:
-> Contains: 36999 Rows
-> Headers Len: 27
['تولید کننده / وارد کننده', 'صاحب پروانه', 'توزیع کننده', 'کد فرآورده', 'کد ATC', 'نام انگلیسی فرآورده', 'نام فارسی فرآورده', 'نام ژنریک', 'Column1', 'کد ژنریک', 'تولیدی/وارداتی', 'فوریتی/غیرفوریتی', 'نوع فرآورده', 'نام انگلیسی فهرست', 'کد ATC سطح1', 'عنوان ATC سطح 1', 'کد ATC سطح 2', 'عنوان ATC سطح 2', 'کد ATC سطح 3', 'عنوان ATC سطح 3', 'کد ATC سطح 4', 'عنوان ATC سطح 4', 'حجم فروش', 'تعداد در بسته', ' فروش عددی ', ' فروش ریالی ', '_c26']
Dataset[1399]:
-> Contains: 1048575 Rows
-> Headers Len: 20
['صاحب پروانه', 'توزیع کننده', 'IRC', 'نام فرآورده (برند)', 'تولیدی/وارداتی', 'OTC', 'بیولوژیک', 'کشور تولید کننده', 'تحت لیسانس', '111', 'نام ژنریک', 'فرآورده', 'کد ژنریک', 'ماده موثره', 'ATC Code', ' تعداد فروش (بسته) ', ' تعداد در بسته ', ' فروش عددی  ', ' فروش ریالی مصرف کننده  ', '_c19']
Dataset[1400]:
-> Contains: 52445 Rows
-> Headers Len: 19
['نام شرکت تولید کننده', 'نام شرکت تامین کننده', 'نام صاحب برند', 'توزیع کننده', 'کشور

In [39]:
# NOTE(review): `schema` is not referenced anywhere else in the visible
# notebook, and only "brand_owner" and "atc_code" are selected below.
schema = ["atc_code", "brand_owner", "drug_generic_name"]


print("Normalizing...")

# Per-dataset mapping from the source column names to the common names.
column_renames = {
    "1398": {"صاحب پروانه": "brand_owner", "کد ATC": "atc_code"},
    "1399": {"صاحب پروانه": "brand_owner", "ATC Code": "atc_code"},
    "1400": {"نام صاحب برند": "brand_owner", "ATC Code": "atc_code"},
    "1401": {"صاحب پروانه": "brand_owner", "کد ATC": "atc_code"},
    "1402": {"صاحب پروانه": "brand_owner", "کد ATC": "atc_code"},
}
kept_columns = ["brand_owner", "atc_code"]

# Fetch every frame first, then rename and project each one.
raw_frames = {key: dataframes.get(key) for key in column_renames}
normalized_frames = {
    key: raw_frames[key].withColumnsRenamed(renames).select(kept_columns)
    for key, renames in column_renames.items()
}

df_1398 = normalized_frames["1398"]
df_1399 = normalized_frames["1399"]
df_1400 = normalized_frames["1400"]
df_1401 = normalized_frames["1401"]
df_1402 = normalized_frames["1402"]

# Replace the raw frames with their two-column normalised versions.
dataframes.update(normalized_frames)

Normalizing...


In [40]:
# Antibiotics reference table: keep only the "ATC" and "LABEL" columns and
# rename them to "atc_code" and "label".
antibiotics = (
    readData(format="csv", fromPath="datasets-private/antibiotics/ghotb-isfahan.csv")
    .select(["ATC", "LABEL"])
    .withColumnsRenamed(colsMap={"ATC": "atc_code", "LABEL": "label"})
)

Reading the dataset...
Completed


In [41]:
from pyspark.sql.functions import countDistinct

# Filled by countManufacturers(), keyed by the integer loop value it iterates over:
#   joined_dfs -> the frame returned by findManufacturers()
#   result_dfs -> distinct brand_owner count per atc_code
joined_dfs: dict[int, DataFrame] = dict()
result_dfs: dict[int, DataFrame] = dict()

def findManufacturers(data_frame: DataFrame):
    """Return the rows of ``data_frame`` whose ``atc_code`` is in ``antibiotics``.

    The frame is inner-joined with the module-level ``antibiotics`` frame on
    ``atc_code``, reduced to one row per (``brand_owner``, ``label``) pair and
    sorted by ``label``.
    """
    antibiotic_rows = data_frame.join(antibiotics, on="atc_code", how="inner")
    # NOTE(review): if several atc_code values share one label, only one of
    # them survives per brand_owner here — confirm that is intended.
    one_per_owner_and_label = antibiotic_rows.dropDuplicates(["brand_owner", "label"])
    return one_per_owner_and_label.sort("label")

def countManufacturers(dataframes: dict, start: int, stop: int):
    """Count distinct brand owners per ATC code for each key in ``range(start, stop)``.

    For every integer ``n`` in the range, the frame stored under ``str(n)`` in
    ``dataframes`` is passed through ``findManufacturers``. The result is stored
    in the module-level ``joined_dfs[n]``, and its per-``atc_code`` count of
    distinct ``brand_owner`` values (column ``manufacturers_<n>``) is stored in
    ``result_dfs[n]``. Row counts are printed along the way.

    Args:
        dataframes: Mapping from string keys to Spark DataFrames.
        start: First key (inclusive).
        stop: End of the key range (exclusive).
    """
    for year in range(start, stop):
        yearly_df: DataFrame = dataframes.get(str(year))
        print(f"Data frame {year} contains: {yearly_df.count()} rows")

        antibiotic_rows: DataFrame = findManufacturers(yearly_df)
        joined_dfs[year] = antibiotic_rows
        print(f"All Antibiotics rows count: {antibiotic_rows.count()}")

        owners_per_code = antibiotic_rows.groupBy("atc_code").agg(
            countDistinct("brand_owner").alias(f"manufacturers_{year}")
        )
        print(f"All manufacturers with their ATC codes: {owners_per_code.count()}")
        print("-=-=-=-=-=-=-=-")
        result_dfs[year] = owners_per_code

# Process the datasets keyed "1398" through "1402" (the stop value is exclusive).
countManufacturers(dataframes, 1398, 1403)

Data frame 1398 contains: 36999 rows
All Antibiotics rows count: 1387
All manufacturers with their ATC codes: 46
-=-=-=-=-=-=-=-
Data frame 1399 contains: 1048575 rows
All Antibiotics rows count: 1459
All manufacturers with their ATC codes: 42
-=-=-=-=-=-=-=-
Data frame 1400 contains: 52445 rows
All Antibiotics rows count: 1338
All manufacturers with their ATC codes: 40
-=-=-=-=-=-=-=-
Data frame 1401 contains: 125484 rows
All Antibiotics rows count: 1223
All manufacturers with their ATC codes: 42
-=-=-=-=-=-=-=-
Data frame 1402 contains: 107213 rows
All Antibiotics rows count: 1087
All manufacturers with their ATC codes: 42
-=-=-=-=-=-=-=-


In [42]:
# Build one wide table with a manufacturers_<year> column per dataset by
# outer-joining the per-year counts on atc_code.
final_df = result_dfs.get(1398)
for year in range(1399, 1403):
    final_df = final_df.join(result_dfs.get(year), on="atc_code", how="outer")
# Attach the antibiotic label to every ATC code.
final_df = final_df.join(antibiotics, on="atc_code", how="inner")

print("Saving final result...")
with pd.ExcelWriter("datasets-private/result.xlsx") as writer:
    print("result page ...")
    # to_excel() returns None. Its result must not be assigned to `pd`, or the
    # pandas module alias is overwritten and every later use of `pd` fails,
    # including a re-run of this cell.
    final_df.toPandas().to_excel(writer, sheet_name="result", index=True)

    print("joined data frames pages ...")
    for i in range(1398, 1403):
        print(f"page {i} ...")
        joined_dfs.get(i).toPandas().to_excel(writer, sheet_name=f"{i}", index=False)
print("Done")

Saving final result...
result page ...


                                                                                

joined data frames pages ...
page 1398 ...
page 1399 ...
page 1400 ...
page 1401 ...
page 1402 ...
Done
