In [1]:
#special thanks to:
#Matjaz Zwitter & Milan Soklic (physicians)
#Institute of Oncology
#University Medical Center
#Ljubljana, Yugoslavia

#Donors:

#Ming Tan and Jeff Schlimmer (Jeffrey.Schlimmer '@' a.gp.cs.cmu.edu)

In [2]:
# data set link  : https://archive.ics.uci.edu/ml/datasets/Breast+Cancer

In [3]:
# setting up pyspark: findspark.init() runs before the pyspark import so the
# local Spark installation is importable, then a session is started (or reused)
import findspark
findspark.init()
from pyspark.sql import SparkSession
session_builder = SparkSession.builder.appName("breast cancer")
spark_session = session_builder.getOrCreate()

In [None]:
# reading the data set (the file has no header row, so Spark names the
# columns _c0 .. _c9) and giving every column its descriptive name
data = spark_session.read.csv("breast-cancer.data", inferSchema=True, sep=",")
column_names = [
    "Class",
    "age",
    "menopause",
    "tumor_size",
    "inv_nodes",
    "node_caps",
    "deg_malig",
    "breast",
    "breast_quad",
    "irradiat",
]
for position, column_name in enumerate(column_names):
    data = data.withColumnRenamed(f"_c{position}", column_name)
data.printSchema()

In [None]:
# previewing the first 50 rows of the renamed data set
data.show(n=50)

In [None]:
# printing the name of every column that contains the missing-value marker '?'
# - all columns are visited via data.columns instead of a hard-coded count of 10
# - each column is cast to string before the comparison, so numeric columns
#   are compared as text instead of forcing '?' through a numeric cast
for column_name in data.columns:
    is_missing = data[column_name].cast("string") == '?'
    if data.filter(is_missing).count() > 0:
        print(column_name)

In [None]:
# dropping every row whose node_caps or breast_quad value is the marker '?';
# the row count is printed before and after to show how many rows were removed
print(data.count())
has_node_caps = data["node_caps"] != '?'
has_breast_quad = data["breast_quad"] != '?'
data = data.where(has_node_caps & has_breast_quad)
data.show()
print(data.count())

In [None]:
# indexing the categorical columns: the label column "Class" first, then
# columns 1-5 and 7-9 (column 6, deg_malig, is skipped); each one gets a
# companion "<name>_indexed" column appended in that order
from pyspark.ml.feature import StringIndexer
categorical_columns = ["Class"] + data.columns[1:6] + data.columns[7:10]
indexed = data
for column_name in categorical_columns:
    str_indexer = StringIndexer(inputCol=column_name, outputCol=column_name + "_indexed")
    indexed = str_indexer.fit(indexed).transform(indexed)
indexed.show()

In [None]:
# column 6 of the indexed frame together with every column from position 10 on
# (the columns appended by the indexing step)
seventh_column = indexed.columns[6]
appended_columns = indexed.columns[10:]
seventh_column, appended_columns

In [None]:
# assembling the feature columns into a single "features" vector and keeping
# only that vector plus the indexed class, which is renamed to "label"
from pyspark.ml.feature import VectorAssembler
feature_columns = [
    'deg_malig',
    'age_indexed',
    'menopause_indexed',
    'tumor_size_indexed',
    'inv_nodes_indexed',
    'node_caps_indexed',
    'breast_indexed',
    'breast_quad_indexed',
    'irradiat_indexed',
]
va = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled = va.transform(indexed)
finalized_data = assembled.select(["features", "class_indexed"])
finalized_data = finalized_data.withColumnRenamed("class_indexed", "label")
finalized_data.show()

In [None]:
# splitting into roughly 30% test / 70% train rows; the explicit seed keeps the
# split (and therefore the reported metric) the same across notebook re-runs
test_data, train_data = finalized_data.randomSplit([0.3, 0.7], seed=42)

In [None]:
# fitting a linear SVM on the training rows, then scoring the held-out rows
from pyspark.ml.classification import LinearSVC
svm = LinearSVC().fit(train_data)
test_summary = svm.evaluate(test_data)
predictions = test_summary.predictions
predictions.show()

In [None]:
# evaluating the model on the test predictions
# note: BinaryClassificationEvaluator reports the area under the ROC curve by
# default, not accuracy, so the metric is named explicitly to avoid misreading
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
evaluator.evaluate(predictions)