In [11]:
# A dog food company is trying to predict why some batches of their dog food are spoiling much quicker than intended!
# Unfortunately this dog food company hasn't upgraded to the latest machinery, meaning that the
# amounts of the five preservative chemicals they are using can vary a lot, but which is the chemical that has the
# strongest effect? The dog food company first mixes up a batch of preservative that contains 4 different preservative
# chemicals (A, B, C, D), which is then completed with a "filler" chemical. The food scientists believe one of the
# A, B, C, or D preservatives is causing the problem, but need help to figure out which one!

# A (Pres_A) : Percentage of preservative A in the mix
# B (Pres_B) : Percentage of preservative B in the mix
# C (Pres_C) : Percentage of preservative C in the mix
# D (Pres_D) : Percentage of preservative D in the mix
# Spoiled    : Label indicating whether or not the dog food batch was spoiled.


In [12]:
# Locate the Spark installation so that pyspark can be imported below.
import os
import findspark

# Prefer SPARK_HOME from the environment so the notebook is not tied to one
# machine; fall back to the path that was previously hard-coded here.
SPARK_HOME = os.environ.get('SPARK_HOME') or '/home/mina/python-spark/spark-3.4.0-bin-hadoop3/'
findspark.init(SPARK_HOME)

# Import the pyspark module and the SparkSession class
import pyspark
from pyspark.sql import SparkSession

# Create a Spark session named 'dog_food', or reuse one that is already running
spark = SparkSession.builder.appName('dog_food').getOrCreate()

In [13]:
# Load 'dog_food.csv' into a Spark DataFrame.
# header=True takes the column names from the first row of the file;
# inferSchema=True asks Spark to infer each column's data type.
csv_options = {'header': True, 'inferSchema': True}
data = spark.read.csv('dog_food.csv', **csv_options)

# Preview the first rows of the loaded data
data.show()

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [14]:
# Print the schema of the loaded dataset: each column's name, its inferred
# data type, and whether it is nullable
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [15]:
# Summary statistics (count, mean, stddev, min, max) for the dataset's columns
summary_stats = data.describe()
summary_stats.show()

+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|                 A|                 B|                 C|                 D|            Spoiled|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|               490|               490|               490|               490|                490|
|   mean|  5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|
| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|
|    min|                 1|                 1|               5.0|                 1|                0.0|
|    max|                10|                10|              14.0|                10|                1.0|
+-------+------------------+------------------+------------------+------------------+-------------------+



In [16]:
# VectorAssembler packs several input columns into a single vector column
from pyspark.ml.feature import VectorAssembler

# The four preservative percentages are the model inputs; their order here
# fixes the order of the entries in the assembled vector.
feature_columns = ['A', 'B', 'C', 'D']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='Features')

# Append the assembled 'Features' vector column to the original DataFrame
output_data = assembler.transform(data)

# Confirm that the new vector column is present in the schema
output_data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- Features: vector (nullable = true)



In [17]:
# Keep only the assembled feature vector and the 'Spoiled' label column
model_columns = ['Features', 'Spoiled']
final_data = output_data.select(*model_columns)

# Preview the reduced DataFrame
final_data.show()

+-------------------+-------+
|           Features|Spoiled|
+-------------------+-------+
| [4.0,2.0,12.0,3.0]|    1.0|
| [5.0,6.0,12.0,7.0]|    1.0|
| [6.0,2.0,13.0,6.0]|    1.0|
| [4.0,2.0,12.0,1.0]|    1.0|
| [4.0,2.0,12.0,3.0]|    1.0|
|[10.0,3.0,13.0,9.0]|    1.0|
| [8.0,5.0,14.0,5.0]|    1.0|
| [5.0,8.0,12.0,8.0]|    1.0|
| [6.0,5.0,12.0,9.0]|    1.0|
| [3.0,3.0,12.0,1.0]|    1.0|
| [9.0,8.0,11.0,3.0]|    1.0|
|[1.0,10.0,12.0,3.0]|    1.0|
|[1.0,5.0,13.0,10.0]|    1.0|
|[2.0,10.0,12.0,6.0]|    1.0|
|[1.0,10.0,11.0,4.0]|    1.0|
| [5.0,3.0,12.0,2.0]|    1.0|
| [4.0,9.0,11.0,8.0]|    1.0|
| [5.0,1.0,11.0,1.0]|    1.0|
|[4.0,9.0,12.0,10.0]|    1.0|
| [5.0,8.0,10.0,9.0]|    1.0|
+-------------------+-------+
only showing top 20 rows



In [20]:
# Import the random forest classifier
from pyspark.ml.classification import RandomForestClassifier

# Create a RandomForestClassifier that predicts 'Spoiled' from the 'Features' vector.
# Random forests are stochastic, so the seed is pinned: repeated runs on the same
# data are then expected to build the same forest and report the same importances.
rf = RandomForestClassifier(labelCol='Spoiled', featuresCol='Features', seed=42)

# Fit the model on the full dataset (no train/test split is made here; in the
# cells shown, the fitted model is only used for its feature importances)
fit_data = rf.fit(final_data)

# Show the first row as a reminder of the input layout: a feature vector plus the label
final_data.head(1)

[Row(Features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)]

In [21]:
# featureImportances contains information about the importance of each feature in making predictions.
# It holds one score per entry of the 'Features' vector; index i refers to the i-th column
# passed to the VectorAssembler, i.e. 0 -> A, 1 -> B, 2 -> C, 3 -> D.
fit_data.featureImportances

SparseVector(4, {0: 0.0168, 1: 0.0202, 2: 0.9361, 3: 0.0269})