In [4]:
%%HTML
<div>
    <P>
        <label>
            <strong>
            Introduction to PySpark
            </strong>
        </label>
        <ul>
            <li>Setup your working environment (Conda package management is recommended as it replace both pip and virtual environment)</li>
            <li>I used WSL 2 for setting up a linux subsystem on my windows 11 in order to use GPU acceleration power.</li>
            <li>You can install PySpark from provided Instruction. <a href="https://spark.apache.org/docs/latest/api/python/getting_started/install.html">here</a>
            <br> Don't forget to install Java SE 8 or above</li>
            <li>I highly recommend to install spark at first, 
                then add "SPARK_HOME" environment variable in your ~/.bashrc and reload using "source ~/.bashrc", then install pyspark with conda.</li>
            <li>If you want to open excel files, it's possible to open them with pandas with installed openpyxl as its dependency, then convert the file to csv, <br>
                or instead there is an excel extension for spark 3.5.0 at the moment i work with the files. if your spark and scala version supports the jar file, open the excel file directly with spark.</li>
        </ul>
        <div>
            <a style="font-weight: 500; color: red;">Note:</a>
            I used Spark version 3.5.1 (Scala 2.13 & CUDA 12.5) and PySpark worked with python 3.11.9 well, earlier python version encountered with some errors during some code executions.
        </div>
    </P>
</div>
<div>
    <P>
        <label>
            <strong>
            Dataset
            </strong>
        </label>
        <ul>
            <li>
                 Previously used: Medicare Part D Prescribers - by Provider and Drug |
                <a href="https://data.cms.gov/provider-summary-by-type-of-service/medicare-part-d-prescribers/medicare-part-d-prescribers-by-provider-and-drug"> Download </a>
            </li>
            <li>
                New dataset is available in datasets directory. which is real data of overall sales of medicines from different distributors and manufactures of Iran.
            </li>
        </ul>
    </P>

    <p>
        <label>
            <strong>
                CUDA
            </strong>
        </label>
        <div>
            For GPU Accelerated tasks: <br>
            Install or update CUDA toolkit if your nvidia graphic card supports.
            <br>
            Then download supported spark-rapids jar file and add it to spark plugins. <br>
            <a style="font-weight: 500; color: red;">Note:</a>
            Spark rapids built at the top of cuDF, by installing them you can leverage GPU acceleration for Pandas as well without any code change.
            <ul>
            <li><a href="https://developer.nvidia.com/cuda-gpus">Supported Graphic cards</a></li>
            <li><a href="https://developer.nvidia.com/cuda-toolkit">Download</a></li>
            </ul>
        </div>
    </p>
</div>

In [5]:
# CUDA toolkit Installation Instructions (Ubuntu):
# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
# sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
# wget https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb
# sudo dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb
# sudo cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/
# sudo apt-get update
# sudo apt-get -y install cuda-toolkit-12-4

%env SPARK_RAPIDS_PLUGIN_JAR=/opt/spark/jars/rapids-4-spark_2.13-24.04.1.jar

env: SPARK_RAPIDS_PLUGIN_JAR=/opt/spark/jars/rapids-4-spark_2.13-24.04.1.jar


In [7]:
from pyspark.sql import SparkSession
import os


spark_rapids:str = os.getenv("SPARK_RAPIDS_PLUGIN_JAR")

# SPARK SESSION CONFIGURATION
print("Running Spark session...")
session_builder = SparkSession.Builder() \
    .master("local[*]") \
    .appName("Learning Spark") \
    .config("spark.ui.enabled", True) \
    .config("spark.driver.bindAddress", "localhost") \
    .config("spark.ui.port", "8080") \
    .config("spark.driver.memory", "10g") \
    .config("spark.executor.memory", "10g") \
    .config("spark.driver.extraClassPath", spark_rapids) \
    .config("spark.plugins", "com.nvidia.spark.SQLPlugin") \
    .config("spark.rapids.memory.gpu.pooling.enabled", True) \
    .config("spark.rapids.sql.enabled", True) \
    .config("spark.rapids.sql.explain", "NONE")
spark = session_builder.getOrCreate()
print("Session started")

Running Spark session...
Session started


In [8]:
amarnamehFile = "datasets/amarnameh1400.csv"
amarnamehParquet = "datasets/amarnameh1400.parquet"

conversion = False

In [9]:
from pyspark.sql import DataFrame
import pandas as pd


def readData(format: str, fromPath: str):
    print("Reading the dataset...")
    if format == "csv":
        # inferSchema identifies the schema of provided file from it contents.
        df: DataFrame = spark.read.csv(header=True, path=fromPath, inferSchema=True)
        print("Completed")
        return df
    if format == "parquet":
        df = spark.read.parquet(fromPath)
        print("Completed")
        return df
    if format == "xlsx":
        print("Switching to Pandas, data frame reader")
        df = pd.read_excel(fromPath)
        return df
    return print("[ERROR] NOT SUPPORTED RIGHT NOW")


In [12]:
base_schema =   ["index", "manufacturer_name", "supplier_name",
                "brand_owner", "distributer_name", "origin_country",
                "drug_brand_fa", "drug_brand_en", "drug_generic_name",
                "active_ingredient", "total_box_sold", "pills_count_in_box",
                "total_sale_amount","total_sale_value_rial", "generic_code",
                "otc", "biologic", "atc_code"]

# Optimized columnar storage and efficient compression
def convertToParquet(df: DataFrame, write_path: str):
    df = df.write.parquet(path=write_path, mode="ignore")
    return df

# Renaming headers
def columns_mapper(key: list, value: list) -> dict:
    colsMap: dict = {}
    for index in range(len(key)):
        colsMap[key[index]] = value[index]
    return colsMap

# spark_df = readData("csv", amarnamehFile)

# df_cols = spark_df.columns
# colsMap = columns_mapper(df_cols, renamed_list)
# spark_df = spark_df.withColumnsRenamed(colsMap)

# if conversion:
#     convertToParquet(spark_df, "datasets/amarnameh1400.parquet")

spark_df = readData(format="parquet", fromPath=amarnamehParquet)
print(f'Number of rows: {spark_df.count()}')
spark_df.printSchema()

Reading the dataset...
Completed
Number of rows: 52445
root
 |-- index: integer (nullable = true)
 |-- manufacturer_name: string (nullable = true)
 |-- supplier_name: string (nullable = true)
 |-- brand_owner: string (nullable = true)
 |-- distributer_name: string (nullable = true)
 |-- origin_country: string (nullable = true)
 |-- drug_brand_fa: string (nullable = true)
 |-- drug_brand_en: string (nullable = true)
 |-- drug_generic_name: string (nullable = true)
 |-- active_ingredient: string (nullable = true)
 |-- total_box_sold: integer (nullable = true)
 |-- pills_count_in_box: integer (nullable = true)
 |-- total_sale_amount: integer (nullable = true)
 |-- total_sale_value_rial: double (nullable = true)
 |-- generic_code: integer (nullable = true)
 |-- otc: integer (nullable = true)
 |-- biologic: integer (nullable = true)
 |-- atc_code: string (nullable = true)



In [13]:
manufacturers = spark_df.groupby("manufacturer_name").count().sort("count", ascending=False)
# NOTE: Duplications are not yet handled, some manufactures presents with several different names
print(f"Number of drug manufacturers in Pharmaceutical market of Iran: {manufacturers.count()}")
manufacturers.toPandas().to_excel("datasets/sorted_manufacturers.xlsx", sheet_name="manufacturers", index=True)
manufacturers.show(n=10, truncate=False)

Number of drug manufacturers in Pharmaceutical market of Iran: 587
+-----------------------------------+-----+
|manufacturer_name                  |count|
+-----------------------------------+-----+
|Amin Pharmaceutical Co.            |2591 |
|Darou Pakhsh                       |2188 |
|Sobhandarou                        |2119 |
|Raha Pharmaceutical Isfahan Company|1891 |
|Aburaihan Pharmaceutical Co        |1782 |
|Tehran Chemie Pharmaceutical Co.   |1674 |
|Toliddaru Pharma. Co.              |1499 |
|Exir Pharmaceutical Company        |1473 |
|Caspian Tamin Co.                  |1418 |
|Alborzdarou                        |1210 |
+-----------------------------------+-----+
only showing top 10 rows



In [None]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf


words: tuple = ("a", "b", "c", "d", "e", "f", "g",
         "h", "i", "j", "k", "l", "m", "n",
         "o", "p", "q", "r", "s", "t", "u",
         "v", "w", "x", "y", "z")
numbers: list = [str(n) for n in range(10)]
numbers_tuple = tuple(numbers)

# Simple way to differentiate written language of companies name 
def detect_language(name: str):
    if name == None:
        return "null"
    if name.lower().startswith(words):
        return "en"
    elif name.lower().startswith(numbers_tuple):
        return "int"
    else:
        return "fa"

detect_lag = udf(detect_language, StringType())
manufacturers = manufacturers.withColumn("language", detect_lag("manufacturer_name"))
manufacturers.show(truncate=False)

+------------------------------------------+-----+--------+
|manufacturer_name                         |count|language|
+------------------------------------------+-----+--------+
|Amin Pharmaceutical Co.                   |2591 |en      |
|Darou Pakhsh                              |2188 |en      |
|Sobhandarou                               |2119 |en      |
|Raha Pharmaceutical Isfahan Company       |1891 |en      |
|Aburaihan Pharmaceutical Co               |1782 |en      |
|Tehran Chemie Pharmaceutical Co.          |1674 |en      |
|Toliddaru Pharma. Co.                     |1499 |en      |
|Exir Pharmaceutical Company               |1473 |en      |
|Caspian Tamin Co.                         |1418 |en      |
|Alborzdarou                               |1210 |en      |
|Hakim Pharmaceutical Co.                  |1149 |en      |
|Sina Darou Laboratories                   |1130 |en      |
|Pars Darou                                |1028 |en      |
|Daana Pharma                           

In [None]:
manufacturers.filter(manufacturers.language == "fa").show()
manufacturers.filter(manufacturers.language == "int").show()

                                                                                

+--------------------+-----+--------+
|   manufacturer_name|count|language|
+--------------------+-----+--------+
|            میم دارو|   18|      fa|
| نوشا فارمد ایرانیان|   10|      fa|
|     نویا ویژن آریان|    5|      fa|
|شرکت دارو سازی یا...|    5|      fa|
|           توسن دارو|    3|      fa|
|   نیواد فارمد سلامت|    2|      fa|
|    زیست فناوری کوثر|    2|      fa|
|فن آوری زیستی رزف...|    1|      fa|
|          پارس تک رخ|    1|      fa|
+--------------------+-----+--------+

+-----------------+-----+--------+
|manufacturer_name|count|language|
+-----------------+-----+--------+
|    3m Healthcare|    1|     int|
+-----------------+-----+--------+



In [None]:
from pyspark.sql.functions import col, soundex, levenshtein, lower
from pyspark.sql.types import FloatType, IntegerType
import torch
from transformers import AutoTokenizer, AutoModel
from fuzzywuzzy import fuzz


# Load pre-trained model and tokenizer from Hugging Face
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def fuzzy(company1, company2):
    return fuzz.ratio(company1, company2)

fuzzy_ratio = udf(fuzzy, IntegerType())

def compute_similarity(company1, company2):
    # Tokenize and encode the company names
    inputs = tokenizer([company1, company2], return_tensors='pt', padding=True, truncation=True)
    
    # Get the embeddings
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state
    
    # Compute the mean of the embeddings to get a single vector for each input
    company1_embedding = embeddings[0].mean(dim=0)
    company2_embedding = embeddings[1].mean(dim=0)
    
    # Compute cosine similarity
    cos_sim = torch.nn.functional.cosine_similarity(company1_embedding, company2_embedding, dim=0)
    
    return cos_sim.item()

# Register the UDF with Spark
similarity_udf = udf(compute_similarity, FloatType())

# Preparing for similarity checks
manufacturers = manufacturers.withColumn("manufacturer_name", lower("manufacturer_name")) \
    .drop("count", "language")

manufacturers_coupled = manufacturers.alias("df1").crossJoin(manufacturers.alias("df2")) \
    .withColumn("soundex1", soundex(col("df1.manufacturer_name"))) \
    .withColumn("soundex2", soundex(col("df2.manufacturer_name"))) \
    .withColumn("levenshtein_distance", levenshtein(col("df1.manufacturer_name"), col("df2.manufacturer_name"))) \
    .withColumn("simple_fuzzy_ratio", fuzzy_ratio(col("df1.manufacturer_name"), col("df2.manufacturer_name"))) \
    .withColumn("similarity_score", similarity_udf(col("df1.manufacturer_name"), col("df2.manufacturer_name"))) \

print(f"Counts: {manufacturers_coupled.count()}")
manufacturers_coupled.show(truncate=False, n=10)



Counts: 344569


[Stage 5012:>                                                       (0 + 1) / 1]

+----------------------------------+-----------------------------------+--------+--------+--------------------+------------------+----------------+
|manufacturer_name                 |manufacturer_name                  |soundex1|soundex2|levenshtein_distance|simple_fuzzy_ratio|similarity_score|
+----------------------------------+-----------------------------------+--------+--------+--------------------+------------------+----------------+
|novell pharmaceutical laboratories|novell pharmaceutical laboratories |N141    |N141    |0                   |100               |1.0             |
|novell pharmaceutical laboratories|eli lilly                          |N141    |E444    |28                  |28                |0.14914232      |
|novell pharmaceutical laboratories|doppel farmaceutici s.r.l          |N141    |D141    |19                  |54                |0.22473177      |
|novell pharmaceutical laboratories|pharmachemie                       |N141    |P652    |25                  |4

                                                                                

In [None]:
# full_matches = manufacturers_coupled.filter(col("simple_fuzzy_ratio") == i).show()
    # .filter((col("soundex1") == col("soundex2")) & (col("levenshtein_distance") > 0)) \
    # .select(col("df1.manufacturer_name").alias("manufacturer_name_1"),
    #         col("df2.manufacturer_name").alias("manufacturer_name_2"),
    #         col("soundex1").alias("soundex"),
    #         col("levenshtein_distance"),
    #         col("similarity_score"))
# df_spark = df_spark.dropDuplicates(["soundex", "levenshtein_distance"]) \
#     .orderBy(col("similarity_score").desc()) \
#     .filter(col("similarity_score") >= float(0.39))
# print(df_spark.count())
# df_spark.show(truncate=False)

# df_spark.toPandas().to_excel("datasets/sorted_manufacturer_lang.xlsx", sheet_name="manufacturers", index=True)

0
1
2
3
4
