In [1]:
#special thanks to:
#Matjaz Zwitter & Milan Soklic (physicians)
#Institute of Oncology
#University Medical Center
#Ljubljana, Yugoslavia

#Donors:

#Ming Tan and Jeff Schlimmer (Jeffrey.Schlimmer '@' a.gp.cs.cmu.edu)

In [2]:
# data set link  : https://archive.ics.uci.edu/ml/datasets/Breast+Cancer

In [3]:
# Locate the local Spark installation, then start (or reuse) a session.
import findspark
findspark.init()  # runs before the pyspark import below, as in the original cell order
from pyspark.sql import SparkSession
spark_session = (
    SparkSession.builder
    .appName("breast cancer")
    .getOrCreate()
)

In [4]:
# Load the header-less CSV and give the auto-named columns (_c0 .. _c9)
# their real names, in file order.
column_names = [
    "Class",
    "age",
    "menopause",
    "tumor_size",
    "inv_nodes",
    "node_caps",
    "deg_malig",
    "breast",
    "breast_quad",
    "irradiat",
]
data = spark_session.read.csv("breast-cancer.data", inferSchema=True, sep=",")
for position, new_name in enumerate(column_names):
    data = data.withColumnRenamed(f"_c{position}", new_name)
data.printSchema()

root
 |-- Class: string (nullable = true)
 |-- age: string (nullable = true)
 |-- menopause: string (nullable = true)
 |-- tumor_size: string (nullable = true)
 |-- inv_nodes: string (nullable = true)
 |-- node_caps: string (nullable = true)
 |-- deg_malig: integer (nullable = true)
 |-- breast: string (nullable = true)
 |-- breast_quad: string (nullable = true)
 |-- irradiat: string (nullable = true)



In [5]:
# Preview the first 50 rows of the renamed frame.
data.show(n=50)

+--------------------+-----+---------+----------+---------+---------+---------+------+-----------+--------+
|               Class|  age|menopause|tumor_size|inv_nodes|node_caps|deg_malig|breast|breast_quad|irradiat|
+--------------------+-----+---------+----------+---------+---------+---------+------+-----------+--------+
|no-recurrence-events|30-39|  premeno|     30-34|      0-2|       no|        3|  left|   left_low|      no|
|no-recurrence-events|40-49|  premeno|     20-24|      0-2|       no|        2| right|   right_up|      no|
|no-recurrence-events|40-49|  premeno|     20-24|      0-2|       no|        2|  left|   left_low|      no|
|no-recurrence-events|60-69|     ge40|     15-19|      0-2|       no|        2| right|    left_up|      no|
|no-recurrence-events|40-49|  premeno|       0-4|      0-2|       no|        2| right|  right_low|      no|
|no-recurrence-events|60-69|     ge40|     15-19|      0-2|       no|        2|  left|   left_low|      no|
|no-recurrence-events|50-59|

In [6]:
# Print the name of every column that contains the missing-value marker '?'.
# Iterating over data.dtypes (instead of a hard-coded range(10)) keeps the
# check correct if the number of columns ever changes.
MISSING_MARKER = "?"
for column_name, column_type in data.dtypes:
    # Only string columns can hold a literal '?'. Skipping the rest also
    # avoids comparing a numeric column against a non-numeric string.
    if column_type != "string":
        continue
    if data.filter(data[column_name] == MISSING_MARKER).count() > 0:
        print(column_name)

node_caps
breast_quad


In [7]:
# Drop every row whose node_caps or breast_quad is the '?' placeholder,
# printing the row count before and after.
print(data.count())
node_caps_known = data.node_caps != '?'
breast_quad_known = data.breast_quad != '?'
data = data.where(node_caps_known & breast_quad_known)
data.show()
print(data.count())

286
+--------------------+-----+---------+----------+---------+---------+---------+------+-----------+--------+
|               Class|  age|menopause|tumor_size|inv_nodes|node_caps|deg_malig|breast|breast_quad|irradiat|
+--------------------+-----+---------+----------+---------+---------+---------+------+-----------+--------+
|no-recurrence-events|30-39|  premeno|     30-34|      0-2|       no|        3|  left|   left_low|      no|
|no-recurrence-events|40-49|  premeno|     20-24|      0-2|       no|        2| right|   right_up|      no|
|no-recurrence-events|40-49|  premeno|     20-24|      0-2|       no|        2|  left|   left_low|      no|
|no-recurrence-events|60-69|     ge40|     15-19|      0-2|       no|        2| right|    left_up|      no|
|no-recurrence-events|40-49|  premeno|       0-4|      0-2|       no|        2| right|  right_low|      no|
|no-recurrence-events|60-69|     ge40|     15-19|      0-2|       no|        2|  left|   left_low|      no|
|no-recurrence-events|50

In [8]:
# Encode each categorical column as a numeric index column.
from pyspark.ml.feature import StringIndexer
# (input column, output column) pairs, in the order the indexers are applied.
# Position 6 (deg_malig) is skipped; the label column keeps its lower-case
# "class_indexed" output name.
index_plan = [("Class", "class_indexed")]
index_plan += [
    (data.columns[position], f"{data.columns[position]}_indexed")
    for position in [*range(1, 6), *range(7, 10)]
]
indexed = data
for source_name, target_name in index_plan:
    str_indexer = StringIndexer(inputCol=source_name, outputCol=target_name)
    indexed = str_indexer.fit(indexed).transform(indexed)
indexed.show()

+--------------------+-----+---------+----------+---------+---------+---------+------+-----------+--------+-------------+-----------+-----------------+------------------+-----------------+-----------------+--------------+-------------------+----------------+
|               Class|  age|menopause|tumor_size|inv_nodes|node_caps|deg_malig|breast|breast_quad|irradiat|class_indexed|age_indexed|menopause_indexed|tumor_size_indexed|inv_nodes_indexed|node_caps_indexed|breast_indexed|breast_quad_indexed|irradiat_indexed|
+--------------------+-----+---------+----------+---------+---------+---------+------+-----------+--------+-------------+-----------+-----------------+------------------+-----------------+-----------------+--------------+-------------------+----------------+
|no-recurrence-events|30-39|  premeno|     30-34|      0-2|       no|        3|  left|   left_low|      no|          0.0|        3.0|              0.0|               0.0|              0.0|              0.0|           0.0|  

In [9]:
# Column 6 of the frame plus every column appended after the original ten.
(indexed.columns[6], indexed.columns[10:])

('deg_malig',
 ['class_indexed',
  'age_indexed',
  'menopause_indexed',
  'tumor_size_indexed',
  'inv_nodes_indexed',
  'node_caps_indexed',
  'breast_indexed',
  'breast_quad_indexed',
  'irradiat_indexed'])

In [10]:
# Pack the numeric predictors into one "features" vector and keep only that
# vector together with the indexed class, renamed to "label".
from pyspark.ml.feature import VectorAssembler
feature_columns = [
    'deg_malig',
    'age_indexed',
    'menopause_indexed',
    'tumor_size_indexed',
    'inv_nodes_indexed',
    'node_caps_indexed',
    'breast_indexed',
    'breast_quad_indexed',
    'irradiat_indexed',
]
va = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled = va.transform(indexed)
finalized_data = (
    assembled
    .select("features", "class_indexed")
    .withColumnRenamed("class_indexed", "label")
)
finalized_data.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
| (9,[0,1],[3.0,3.0])|  0.0|
|[2.0,1.0,0.0,2.0,...|  0.0|
|(9,[0,1,3],[2.0,1...|  0.0|
|[2.0,2.0,1.0,3.0,...|  0.0|
|[2.0,1.0,0.0,7.0,...|  0.0|
|(9,[0,1,2,3],[2.0...|  0.0|
| (9,[0,3],[2.0,1.0])|  0.0|
|(9,[0,1,2,3],[1.0...|  0.0|
|(9,[0,1,3],[2.0,1...|  0.0|
|[2.0,1.0,0.0,2.0,...|  0.0|
|(9,[0,1,3,7],[3.0...|  0.0|
|(9,[0,2,3],[2.0,1...|  0.0|
|[1.0,2.0,2.0,4.0,...|  0.0|
|(9,[0,2,3,7],[3.0...|  0.0|
|(9,[0,1,7],[3.0,1...|  0.0|
|(9,[0,1,2],[1.0,2...|  0.0|
|(9,[0,1,3],[2.0,1...|  0.0|
|       (9,[0],[3.0])|  0.0|
|(9,[0,1,2],[3.0,2...|  0.0|
|(9,[0,2,6,7],[1.0...|  0.0|
+--------------------+-----+
only showing top 20 rows



In [11]:
# Split the assembled data into a test set (weight 0.3) and a training set
# (weight 0.7). The weights are positional, so the first frame is the test set.
# A fixed seed is passed so repeated runs use the same seed for the split;
# without it, every run trains and scores on different rows.
SPLIT_SEED = 42
test_data, train_data = finalized_data.randomSplit([0.3, 0.7], seed=SPLIT_SEED)

In [12]:
# Fit a linear SVM on the training split and score the held-out test split.
from pyspark.ml.classification import LinearSVC
svm = LinearSVC().fit(train_data)
evaluation_summary = svm.evaluate(test_data)
predictions = evaluation_summary.predictions
predictions.show()

+--------------------+-----+--------------------+----------+
|            features|label|       rawPrediction|prediction|
+--------------------+-----+--------------------+----------+
|       (9,[0],[3.0])|  0.0|[0.99150251803715...|       0.0|
| (9,[0,1],[1.0,1.0])|  0.0|[1.00365529617852...|       0.0|
| (9,[0,1],[3.0,3.0])|  0.0|[0.98950106143959...|       0.0|
|(9,[0,1,2,3],[1.0...|  0.0|[1.00863330452980...|       0.0|
|(9,[0,1,2,3],[2.0...|  0.0|[1.00139322288177...|       0.0|
|(9,[0,1,2,8],[2.0...|  0.0|[1.00067152214902...|       0.0|
|(9,[0,1,3],[1.0,1...|  1.0|[1.00780587856730...|       0.0|
|(9,[0,1,3],[1.0,3...|  0.0|[1.00481134121341...|       0.0|
|(9,[0,1,3],[2.0,1...|  0.0|[0.99890556396375...|       0.0|
|(9,[0,1,3],[2.0,1...|  0.0|[1.00056579691926...|       0.0|
|(9,[0,1,3],[3.0,3...|  1.0|[0.99448176030612...|       0.0|
|(9,[0,1,3,6],[1.0...|  0.0|[1.00821625843353...|       0.0|
|(9,[0,1,3,6],[2.0...|  0.0|[1.00097617678550...|       0.0|
|(9,[0,1,3,6],[2.0...|  

In [13]:
# Evaluating the model on the test-set predictions.
# NOTE: BinaryClassificationEvaluator reports the area under the ROC curve by
# default, not accuracy. The metric is named explicitly so the number shown
# below is not mistaken for an accuracy score.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
evaluator.evaluate(predictions)

0.7012743628185908