In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=3006f6b193b3741418e648afc15f1fefbf0babe72881e180412fe75fe1248aee
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pandas as pd
import numpy as np
import time

# Build (or reuse) a Spark session running in local mode.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("k-prototype-global")
    .getOrCreate()
)

# SparkContext belonging to the session above.
sc = spark.sparkContext

In [None]:
# Load the retail transactions and discard every row containing a missing value.
df = pd.read_csv("Retail_Transaction_Dataset.csv").dropna()

# Number of rows that remain.
len(df)

58714

In [None]:
# Columns used for clustering: numerical columns first, categorical columns after.
labels = [
    "Quantity",
    "Price",
    "PaymentMethod",
    "ProductID",
]

# Position in `labels` at which the categorical columns begin.
categorical_labels_start_index = 2

# Display the categorical column names.
labels[categorical_labels_start_index:]

['PaymentMethod', 'ProductID']

In [None]:
# Keep only the clustering columns, in the order given by `labels`.
df = df.loc[:, labels]
df.head()

Unnamed: 0,Quantity,Price,PaymentMethod,ProductID
0,7,80.079844,Cash,C
1,4,75.195229,Cash,C
2,8,31.528816,Cash,A
3,5,98.880218,PayPal,D
4,7,93.188512,Cash,A


In [None]:
# Convert the pandas frame into a Spark DataFrame and preview the first 5 rows.
# NOTE(review): `df` is rebound from a pandas frame to a Spark frame here, so
# this cell is presumably not meant to be run twice in a row -- confirm.
df = spark.createDataFrame(df)
df.show(5)

+--------+-----------+-------------+---------+
|Quantity|      Price|PaymentMethod|ProductID|
+--------+-----------+-------------+---------+
|       7|80.07984415|         Cash|        C|
|       4|75.19522942|         Cash|        C|
|       8|31.52881648|         Cash|        A|
|       5|98.88021828|       PayPal|        D|
|       7|93.18851246|         Cash|        A|
+--------+-----------+-------------+---------+
only showing top 5 rows



In [None]:
# Print the column names and types of the Spark DataFrame.
df.printSchema()

root
 |-- Quantity: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- ProductID: string (nullable = true)



In [None]:
from pyspark.sql.types import LongType

# Replace Quantity with the same column cast to a long integer.
df = df.withColumn("Quantity", F.col("Quantity").cast(LongType()))

In [None]:
# Print the schema again to check the Quantity column after the cast.
df.printSchema()

root
 |-- Quantity: long (nullable = true)
 |-- Price: double (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- ProductID: string (nullable = true)



## Normalising the data

In [None]:
# Min-max normalisation of the numerical columns:
#   x_norm = (x - min) / (max - min)
numeric_cols = df.columns[:categorical_labels_start_index]

# Fetch every minimum and maximum in ONE aggregation instead of running two
# separate aggregations (and collects) per column.
stats = df.agg(
    *[F.min(name).alias(name + "_min") for name in numeric_cols],
    *[F.max(name).alias(name + "_max") for name in numeric_cols],
).first()

for col in numeric_cols:
    minimum = stats[col + "_min"]
    maximum = stats[col + "_max"]
    value_range = maximum - minimum

    if value_range == 0:
        # Constant column: (max - min) is zero, so the division below would
        # not give a usable value. Map the whole column to 0.0 instead.
        df = df.withColumn(col + '_norm', F.lit(0.0))
    else:
        df = df.withColumn(col + '_norm', (df[col] - minimum) / value_range)

In [None]:
# Preview the first 10 rows, including the new *_norm columns.
df.show(10)

+--------+-----------+-------------+---------+-------------+-------------------+
|Quantity|      Price|PaymentMethod|ProductID|Quantity_norm|         Price_norm|
+--------+-----------+-------------+---------+-------------+-------------------+
|       7|80.07984415|         Cash|        C|         0.75| 0.7786700737595417|
|       4|75.19522942|         Cash|        C|        0.375| 0.7243958855114194|
|       8|31.52881648|         Cash|        A|        0.875|0.23920734004228444|
|       5|98.88021828|       PayPal|        D|          0.5| 0.9875657801905688|
|       7|93.18851246|         Cash|        A|         0.75| 0.9243237989488644|
|       3|54.09315249|         Cash|        D|         0.25|0.48992537779848405|
|       7|13.12193739|       PayPal|        D|         0.75| 0.0346838564079694|
|       8|56.02516419|   Debit Card|        A|        0.875| 0.5113924478745256|
|       5|23.85798105|  Credit Card|        B|          0.5|0.15397474972329578|
|       4| 63.3427768|   Deb

In [None]:
# Drop the raw numerical columns; the *_norm versions and the categorical
# columns are what remains.
df_norm = df.drop("Quantity", "Price")
df_norm.show(5)

+-------------+---------+-------------+-------------------+
|PaymentMethod|ProductID|Quantity_norm|         Price_norm|
+-------------+---------+-------------+-------------------+
|         Cash|        C|         0.75| 0.7786700737595417|
|         Cash|        C|        0.375| 0.7243958855114194|
|         Cash|        A|        0.875|0.23920734004228444|
|       PayPal|        D|          0.5| 0.9875657801905688|
|         Cash|        A|         0.75| 0.9243237989488644|
+-------------+---------+-------------+-------------------+
only showing top 5 rows



## K prototype

In [None]:
# Redistribute the rows of the normalised frame over 100 partitions and mark
# the resulting RDD for caching.
rdd = df_norm.rdd.repartition(100)
rdd.cache()

MapPartitionsRDD[75] at coalesce at NativeMethodAccessorImpl.java:0

In [None]:
# Sanity check: show the Python type of `rdd`.
type(rdd)

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean, hamming

class KPrototypes:
    """
    K-Prototypes clustering for data mixing numerical and categorical columns.

    The distance between a data point and a centroid is the Euclidean distance
    over the numerical columns plus the Hamming distance (fraction of
    mismatching values) over the categorical columns. A centroid stores the
    per-cluster mean of every numerical column and the most frequent value of
    every other column.
    """

    def __init__(self, data: pd.DataFrame, numerical_cols: list, categorical_cols: list, k_clusters: int):
        """
        Store the configuration and draw the initial centroids.

        Args:
            data (pd.DataFrame): Input data containing both numerical and categorical columns.
            numerical_cols (list): Names of the numerical columns.
            categorical_cols (list): Names of the categorical columns.
            k_clusters (int): Number of clusters.
        """
        self.data = data
        self.numerical_cols = numerical_cols
        self.categorical_cols = categorical_cols
        self.k_clusters = k_clusters
        self.centroids = self._initialize_centroids()

    def _initialize_centroids(self) -> pd.DataFrame:
        """
        Pick ``k_clusters`` rows of the data as the starting centroids.

        The sample uses a fixed ``random_state`` (42), so the same data always
        yields the same starting centroids.

        Returns:
            pd.DataFrame: The sampled rows, re-indexed 0..k_clusters-1.
        """
        sampled = self.data.sample(n=self.k_clusters, random_state=42)
        return sampled.reset_index(drop=True)

    def _nearest_centroids(self, data: pd.DataFrame) -> list:
        """
        Find, for every row of ``data``, the label of the closest current centroid.

        Shared by ``_assign_clusters`` and ``predict`` so the distance rule is
        defined in exactly one place.

        Args:
            data (pd.DataFrame): Rows to assign; must contain the numerical
                and categorical columns the model was configured with.

        Returns:
            list: One centroid index label per row, in row order. An entry is
                None when no centroid has a distance below infinity (e.g. the
                distance evaluates to NaN).
        """
        # Convert once per pass instead of slicing pandas rows inside the loops.
        centroid_labels = list(self.centroids.index)
        centroid_num = self.centroids[self.numerical_cols].to_numpy(dtype=float)
        centroid_cat = self.centroids[self.categorical_cols].to_numpy(dtype=object)
        points_num = data[self.numerical_cols].to_numpy(dtype=float)
        points_cat = data[self.categorical_cols].to_numpy(dtype=object)

        nearest = []
        for point_num, point_cat in zip(points_num, points_cat):
            best_label = None
            best_distance = np.inf
            for label, cen_num, cen_cat in zip(centroid_labels, centroid_num, centroid_cat):
                distance = euclidean(point_num, cen_num) + hamming(point_cat, cen_cat)
                # Strict '<' keeps the first centroid on ties.
                if distance < best_distance:
                    best_distance = distance
                    best_label = label
            nearest.append(best_label)
        return nearest

    def _assign_clusters(self) -> list:
        """
        Assign each row of the training data to its nearest centroid.

        Returns:
            list: ``(row index label, cluster label)`` tuples, one per data row.
        """
        return list(zip(self.data.index, self._nearest_centroids(self.data)))

    def _update_centroids(self, clusters: list) -> None:
        """
        Recompute the centroids from the given cluster assignment.

        Numerical columns are averaged; every other column takes its most
        frequent value within the cluster. The result is indexed by cluster
        label. Only clusters that appear in ``clusters`` produce a centroid.

        Args:
            clusters (list): ``(row index label, cluster label)`` tuples as
                returned by ``_assign_clusters``.
        """
        def most_frequent(values):
            # mode() may return several values on a tie; keep the first one.
            return values.mode().iloc[0]

        assignments = pd.DataFrame(clusters, columns=['row_index', 'cluster'])
        merged = pd.merge(assignments, self.data, left_on='row_index', right_index=True)
        aggregations = {
            col: 'mean' if col in self.numerical_cols else most_frequent
            for col in self.data.columns
        }
        self.centroids = merged.groupby('cluster').agg(aggregations)

    def fit(self, max_steps: int = 20) -> pd.DataFrame:
        """
        Alternate assignment and centroid update until the centroids stop changing.

        Args:
            max_steps (int): Upper bound on the number of iterations, so the
                loop always terminates. Defaults to 20.

        Returns:
            pd.DataFrame: The fitted centroids (the same object as
                ``self.centroids``), indexed by cluster label.
        """
        for _ in range(max_steps):
            previous = self.centroids.copy()
            self._update_centroids(self._assign_clusters())
            if self.centroids.equals(previous):
                print('Convergence reached.')
                break
        return self.centroids

    def predict(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Label every row of ``data`` with its nearest centroid.

        Note: the 'cluster' column is written into ``data`` itself and that
        same object is returned; pass a copy to keep the input untouched.

        Args:
            data (pd.DataFrame): New data for prediction.

        Returns:
            pd.DataFrame: ``data`` with a 'cluster' column holding the
                predicted cluster label of each row.
        """
        data['cluster'] = self._nearest_centroids(data)
        return data




In [None]:
# Column names of the normalised Spark frame, in order.
all_columns = df_norm.columns

# Split the columns by Spark dtype: 'string' columns are categorical,
# everything else is treated as numerical.
cat_columns = [name for name, dtype in df_norm.dtypes if dtype == 'string']
num_columns = [name for name, dtype in df_norm.dtypes if dtype != 'string']

def build_model(partition_iter, k_clusters=3):
  """
  Fit a local K-Prototypes model on the rows of one RDD partition.

  Args:
      partition_iter: Iterator over the rows of a single partition; each row
          must provide one value per entry of `all_columns`.
      k_clusters (int): Number of clusters to fit per partition. Defaults to 3.

  Returns:
      list: A list holding one pandas DataFrame of centroids. The list is
          empty for an empty partition. A partition with fewer than
          `k_clusters` rows returns its rows unchanged, because that many
          initial centroids cannot be sampled from it.
  """
  partition_df = pd.DataFrame(list(partition_iter), columns=all_columns)

  if partition_df.empty:
    return []
  if len(partition_df) < k_clusters:
    # Too few rows to sample k starting centroids: pass the rows through.
    return [partition_df]

  kproto = KPrototypes(partition_df, k_clusters=k_clusters, numerical_cols=num_columns, categorical_cols=cat_columns)
  kproto.fit()

  # Read the centroids from the instance rather than relying on fit()'s return value.
  return [kproto.centroids]

# Run build_model once per partition and collect every partition's result
# into a Python list.
centroids = rdd.mapPartitions(build_model).collect()
centroids

[        PaymentMethod ProductID  Quantity_norm  Price_norm
 cluster                                                   
 0              PayPal         B       0.440990    0.601331
 1                Cash         A       0.399601    0.449394
 2          Debit Card         D       0.619231    0.476343,
         PaymentMethod ProductID  Quantity_norm  Price_norm
 cluster                                                   
 0          Debit Card         B       0.594212    0.524036
 1                Cash         A       0.411585    0.588133
 2              PayPal         D       0.473837    0.403789,
         PaymentMethod ProductID  Quantity_norm  Price_norm
 cluster                                                   
 0              PayPal         A       0.429768    0.403846
 1         Credit Card         C       0.653289    0.538795
 2                Cash         D       0.412628    0.584870,
         PaymentMethod ProductID  Quantity_norm  Price_norm
 cluster                             

In [None]:
# Stack the per-partition centroid frames into one pandas frame with a fresh
# 0..n-1 index. NOTE(review): this rebinds `df`, which earlier cells used for
# the transaction data.
df = pd.concat(centroids, ignore_index=True)

In [None]:
# Cluster the stacked per-partition centroids to obtain the global centroids.
k_proto = KPrototypes(df, numerical_cols=num_columns, categorical_cols=cat_columns, k_clusters=3)
k_proto.fit()

# Read the result from the instance rather than relying on fit()'s return value.
centroid = k_proto.centroids

        PaymentMethod ProductID  Quantity_norm  Price_norm
cluster                                                   
0                Cash         B       0.533090    0.499590
1          Debit Card         C       0.484253    0.526806
2         Credit Card         C       0.467075    0.470309
        PaymentMethod ProductID  Quantity_norm  Price_norm
cluster                                                   
0                Cash         B       0.539388    0.504580
1          Debit Card         C       0.484618    0.527185
2         Credit Card         C       0.467390    0.467219
        PaymentMethod ProductID  Quantity_norm  Price_norm
cluster                                                   
0                Cash         B       0.539401    0.506369
1          Debit Card         C       0.484618    0.527185
2         Credit Card         C       0.468815    0.465981
        PaymentMethod ProductID  Quantity_norm  Price_norm
cluster                                                 

In [None]:
# Write the final centroids to a CSV file in the working directory.
centroid.to_csv('centroids_local.csv')