In [29]:
import os
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DoubleType, TimestampType
from pyspark.sql.functions import col, to_date, concat, lit
os.environ["SPARK_HOME"] = "/home/mate/.local/lib/python3.10/site-packages/pyspark/"
os.environ["PYSPARK_DRIVER_PYTHON"] = "jupyter"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"] = "notebook"


In [30]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local") \
    .appName("My Dogs Application") \
    .getOrCreate()


In [31]:
from pyspark.sql.functions import col, dayofweek,to_date, month, count, avg, round
from pyspark.sql import Window
from pyspark.sql.functions import row_number,   sum, when


# Load the CSV file into a DataFrame
csv_file_path_dogs = "./Dog_Info_Modified_Sprint_v2.csv"  # Replace with the path to your CSV file

df_dogs = spark.read.option("delimiter", ",").option("header", "true").csv(csv_file_path_dogs)

In [32]:
df = df_dogs

In [33]:
df = df.withColumn("SPRINT_SPEED", round(df["SPRINT_SPEED"], 2))
df = df.withColumn("WEIGHT", round(df["WEIGHT"], 2))

df.show()
df.printSchema()

+-------+----------------+--------------+------------+------+
|   NAME|            RACE|WAS_VACCINATED|SPRINT_SPEED|WEIGHT|
+-------+----------------+--------------+------------+------+
|  Bella|        Labrador|           Yes|        22.7| 27.13|
|    Max|          Beagle|            No|        22.7|  9.31|
|Charlie|           Boxer|           Yes|       17.72| 30.24|
|   Luna|          Poodle|            No|        22.7| 27.62|
|   Lucy|         Bulldog|           Yes|       22.74| 22.83|
| Cooper|       Dachshund|            No|        29.0|  6.48|
|   Milo| German Shepherd|           Yes|       17.86|  41.9|
| Bailey|Golden Retriever|            No|       18.64| 33.84|
|  Daisy|       Chihuahua|           Yes|       30.23|  0.65|
|  Sadie|             Pug|            No|       23.97| 10.71|
|   Lola|        Labrador|           Yes|       23.93| 22.33|
| Tucker|          Beagle|            No|       29.23|  7.67|
|  Buddy|           Boxer|           Yes|       18.16| 28.21|
|  Molly

In [34]:
from pyspark.ml.feature import VectorAssembler, StandardScaler, OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline

# Indexing and encoding categorical columns
race_indexer = StringIndexer(inputCol="RACE", outputCol="RACE_Index")
race_encoder = OneHotEncoder(inputCol="RACE_Index", outputCol="RACE_Encoded")
vaccinated_indexer = StringIndexer(inputCol="WAS_VACCINATED", outputCol="VACCINATED_Index")

# Assembling features into a single vector
assembler = VectorAssembler(inputCols=["RACE_Encoded", "VACCINATED_Index", "SPRINT_SPEED", "WEIGHT"], outputCol="features")

# Scaling the features
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

# Pipeline
pipeline = Pipeline(stages=[race_indexer, race_encoder, vaccinated_indexer, assembler, scaler])
model = pipeline.fit(df)
transformed_df = model.transform(df)

In [35]:
transformed_df.show()

+-------+----------------+--------------+------------+------+----------+-------------+----------------+--------------------+--------------------+
|   NAME|            RACE|WAS_VACCINATED|SPRINT_SPEED|WEIGHT|RACE_Index| RACE_Encoded|VACCINATED_Index|            features|      scaledFeatures|
+-------+----------------+--------------+------------+------+----------+-------------+----------------+--------------------+--------------------+
|  Bella|        Labrador|           Yes|        22.7| 27.13|       7.0|(9,[7],[1.0])|             1.0|(12,[7,9,10,11],[...|(12,[7,9,10,11],[...|
|    Max|          Beagle|            No|        22.7|  9.31|       0.0|(9,[0],[1.0])|             0.0|(12,[0,10,11],[1....|(12,[0,10,11],[3....|
|Charlie|           Boxer|           Yes|       17.72| 30.24|       1.0|(9,[1],[1.0])|             1.0|(12,[1,9,10,11],[...|(12,[1,9,10,11],[...|
|   Luna|          Poodle|            No|        22.7| 27.62|       8.0|(9,[8],[1.0])|             0.0|(12,[8,10,11],[1....|

In [36]:
from pyspark.ml.clustering import KMeans
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType, FloatType
from pyspark.ml.linalg import VectorUDT, DenseVector
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import sqrt, pow, col


from pyspark.ml.clustering import KMeans
kmeans = KMeans(k=3)
model = kmeans.fit(transformed_df)
model.summary.predictions.show()


print(model.clusterCenters())


print(model.summary.clusterSizes)

+-------+----------------+--------------+------------+------+----------+-------------+----------------+--------------------+--------------------+----------+
|   NAME|            RACE|WAS_VACCINATED|SPRINT_SPEED|WEIGHT|RACE_Index| RACE_Encoded|VACCINATED_Index|            features|      scaledFeatures|prediction|
+-------+----------------+--------------+------------+------+----------+-------------+----------------+--------------------+--------------------+----------+
|  Bella|        Labrador|           Yes|        22.7| 27.13|       7.0|(9,[7],[1.0])|             1.0|(12,[7,9,10,11],[...|(12,[7,9,10,11],[...|         0|
|    Max|          Beagle|            No|        22.7|  9.31|       0.0|(9,[0],[1.0])|             0.0|(12,[0,10,11],[1....|(12,[0,10,11],[3....|         2|
|Charlie|           Boxer|           Yes|       17.72| 30.24|       1.0|(9,[1],[1.0])|             1.0|(12,[1,9,10,11],[...|(12,[1,9,10,11],[...|         0|
|   Luna|          Poodle|            No|        22.7| 27.