# Cancer Prediction

## Dataset Information:

####Target Variable (y):

- Diagnosis (M = malignant, B = benign)

#### Ten features (X) are computed for each cell nucleus:

1. radius (mean of distances from center to points on the perimeter)
2. texture (standard deviation of gray-scale values)
3. perimeter
4. area
5. smoothness (local variation in radius lengths)
6. compactness (perimeter^2 / area - 1.0)
7. concavity (severity of concave portions of the contour)
8. concave points (number of concave portions of the contour)
9. symmetry
10. fractal dimension (coastline approximation - 1)

#### For each characteristic three measures are given:

   a. Mean

   b. Standard error

   c. Largest/ Worst

In [0]:
dbutils.fs.ls("dbfs:/FileStore/tables/")

[FileInfo(path='dbfs:/FileStore/tables/Admission_Chance.csv', name='Admission_Chance.csv', size=12905, modificationTime=1720190058000),
 FileInfo(path='dbfs:/FileStore/tables/Cancer.csv', name='Cancer.csv', size=125204, modificationTime=1720190099000),
 FileInfo(path='dbfs:/FileStore/tables/Credit_Default.csv', name='Credit_Default.csv', size=101152, modificationTime=1720190106000),
 FileInfo(path='dbfs:/FileStore/tables/Customer_Purchase.csv', name='Customer_Purchase.csv', size=1489, modificationTime=1720190113000),
 FileInfo(path='dbfs:/FileStore/tables/Fish.csv', name='Fish.csv', size=6349, modificationTime=1720190119000),
 FileInfo(path='dbfs:/FileStore/tables/Ice_Cream.csv', name='Ice_Cream.csv', size=4872, modificationTime=1720190124000),
 FileInfo(path='dbfs:/FileStore/tables/Test1.csv', name='Test1.csv', size=108, modificationTime=1720158698000),
 FileInfo(path='dbfs:/FileStore/tables/Test2.csv', name='Test2.csv', size=192, modificationTime=1720158698000),
 FileInfo(path='dbfs:

In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col

In [0]:
spark = SparkSession.builder.appName('Cancer Prediction').getOrCreate() 

In [0]:
spark

In [0]:
df_pyspark = spark.read.csv('dbfs:/FileStore/tables/Cancer.csv',header=True,inferSchema=True)

In [0]:
df_pyspark.printSchema()

root
 |-- id: integer (nullable = true)
 |-- diagnosis: string (nullable = true)
 |-- radius_mean: double (nullable = true)
 |-- texture_mean: double (nullable = true)
 |-- perimeter_mean: double (nullable = true)
 |-- area_mean: double (nullable = true)
 |-- smoothness_mean: double (nullable = true)
 |-- compactness_mean: double (nullable = true)
 |-- concavity_mean: double (nullable = true)
 |-- concave points_mean: double (nullable = true)
 |-- symmetry_mean: double (nullable = true)
 |-- fractal_dimension_mean: double (nullable = true)
 |-- radius_se: double (nullable = true)
 |-- texture_se: double (nullable = true)
 |-- perimeter_se: double (nullable = true)
 |-- area_se: double (nullable = true)
 |-- smoothness_se: double (nullable = true)
 |-- compactness_se: double (nullable = true)
 |-- concavity_se: double (nullable = true)
 |-- concave points_se: double (nullable = true)
 |-- symmetry_se: double (nullable = true)
 |-- fractal_dimension_se: double (nullable = true)
 |-- radi

In [0]:
df_pyspark

DataFrame[id: int, diagnosis: string, radius_mean: double, texture_mean: double, perimeter_mean: double, area_mean: double, smoothness_mean: double, compactness_mean: double, concavity_mean: double, concave points_mean: double, symmetry_mean: double, fractal_dimension_mean: double, radius_se: double, texture_se: double, perimeter_se: double, area_se: double, smoothness_se: double, compactness_se: double, concavity_se: double, concave points_se: double, symmetry_se: double, fractal_dimension_se: double, radius_worst: double, texture_worst: double, perimeter_worst: double, area_worst: double, smoothness_worst: double, compactness_worst: double, concavity_worst: double, concave points_worst: double, symmetry_worst: double, fractal_dimension_worst: double, _c32: string]

In [0]:
df_pyspark.show()

+--------+---------+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+---------+----------+------------+-------+-------------+--------------+------------+-----------------+-----------+--------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+----+
|      id|diagnosis|radius_mean|texture_mean|perimeter_mean|area_mean|smoothness_mean|compactness_mean|concavity_mean|concave points_mean|symmetry_mean|fractal_dimension_mean|radius_se|texture_se|perimeter_se|area_se|smoothness_se|compactness_se|concavity_se|concave points_se|symmetry_se|fractal_dimension_se|radius_worst|texture_worst|perimeter_worst|area_worst|smoothness_worst|compactness_worst|concavity_worst|concave points_worst|symmetry_worst|fractal_dimension_worst|_c32|
+--------+---------+-----------+------

# 1. Clean the DataFrame

In [0]:
# Convert diagnosis column to numeric (0 for benign, 1 for malignant)
indexer = StringIndexer(inputCol="diagnosis", outputCol="label")
indexed_data = indexer.fit(df_pyspark).transform(df_pyspark)
indexed_data = indexed_data.withColumn("label", indexed_data["label"].cast("integer"))

In [0]:
indexed_data.select("diagnosis","label").show()

+---------+-----+
|diagnosis|label|
+---------+-----+
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        M|    1|
|        B|    0|
+---------+-----+
only showing top 20 rows



In [0]:
# Clean dataframe (keep all columns except id and _c32)
columns_to_keep = [
    "label",
    "radius_mean",
    "texture_mean",
    "perimeter_mean",
    "area_mean",
    "smoothness_mean",
    "compactness_mean",
    "concavity_mean",
    "concave points_mean",
    "symmetry_mean",
    "fractal_dimension_mean",
    "radius_se",
    "texture_se",
    "perimeter_se",
    "area_se",
    "smoothness_se",
    "compactness_se",
    "concavity_se",
    "concave points_se",
    "symmetry_se",
    "fractal_dimension_se",
    "radius_worst",
    "texture_worst",
    "perimeter_worst",
    "area_worst",
    "smoothness_worst",
    "compactness_worst",
    "concavity_worst",
    "concave points_worst",
    "symmetry_worst",
    "fractal_dimension_worst"
]

df_cleaned = indexed_data.select(columns_to_keep)

In [0]:
df_cleaned.show()

+-----+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+---------+----------+------------+-------+-------------+--------------+------------+-----------------+-----------+--------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+
|label|radius_mean|texture_mean|perimeter_mean|area_mean|smoothness_mean|compactness_mean|concavity_mean|concave points_mean|symmetry_mean|fractal_dimension_mean|radius_se|texture_se|perimeter_se|area_se|smoothness_se|compactness_se|concavity_se|concave points_se|symmetry_se|fractal_dimension_se|radius_worst|texture_worst|perimeter_worst|area_worst|smoothness_worst|compactness_worst|concavity_worst|concave points_worst|symmetry_worst|fractal_dimension_worst|
+-----+-----------+------------+--------------+---------+---------------+-

In [0]:
# Drop rows with null values if necessary
df_cleaned = df_cleaned.dropna()

# Prepare the DataFrame

In [0]:
# Assemble features into a vector
feature_cols = df_cleaned.columns[1:]  # Excluding 'diagnosis'
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_assembled = assembler.transform(df_cleaned)

In [0]:
df_assembled.select("features","label").show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[17.99,10.38,122....|    1|
|[20.57,17.77,132....|    1|
|[19.69,21.25,130....|    1|
|[11.42,20.38,77.5...|    1|
|[20.29,14.34,135....|    1|
|[12.45,15.7,82.57...|    1|
|[18.25,19.98,119....|    1|
|[13.71,20.83,90.2...|    1|
|[13.0,21.82,87.5,...|    1|
|[12.46,24.04,83.9...|    1|
|[16.02,23.24,102....|    1|
|[15.78,17.89,103....|    1|
|[19.17,24.8,132.4...|    1|
|[15.85,23.95,103....|    1|
|[13.73,22.61,93.6...|    1|
|[14.54,27.54,96.7...|    1|
|[14.68,20.13,94.7...|    1|
|[16.13,20.68,108....|    1|
|[19.81,22.15,130....|    1|
|[13.54,14.36,87.4...|    0|
+--------------------+-----+
only showing top 20 rows



# 3. Split the DataFrame

In [0]:
# Split data into training and testing sets
train_data, test_data = df_assembled.randomSplit([0.8, 0.2], seed=42)

# 4. Train the Model

In [0]:
# Initialize logistic regression model
lr = LogisticRegression(featuresCol="features", labelCol="label")

# Fit the model
lr_model = lr.fit(train_data)

# 5. Evaluate the Model

In [0]:
# Make predictions
predictions = lr_model.transform(test_data)

# Evaluate the model using BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="label")
accuracy_binary = evaluator.evaluate(predictions)

# Compute confusion matrix
confusion_matrix = predictions.groupBy('label').pivot('prediction').count().na.fill(0).orderBy('label')
confusion_matrix.show()

# Compute classification report (precision, recall, f1-score)
tp = predictions.filter((col("label") == 1.0) & (col("prediction") == 1.0)).count()
tn = predictions.filter((col("label") == 0.0) & (col("prediction") == 0.0)).count()
fp = predictions.filter((col("label") == 0.0) & (col("prediction") == 1.0)).count()
fn = predictions.filter((col("label") == 1.0) & (col("prediction") == 0.0)).count()

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)

print(f"Binary Classification Accuracy: {accuracy_binary}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1_score}")

+-----+---+---+
|label|0.0|1.0|
+-----+---+---+
|    0| 49|  5|
|    1|  2| 30|
+-----+---+---+

Binary Classification Accuracy: 0.9895833333333334
Precision: 0.8571428571428571
Recall: 0.9375
F1-score: 0.8955223880597014


In [0]:
# Save the trained logistic regression model
model_path = "./Internship_Sem-6_models/Cancer_Prediction_model"
lr_model.save(model_path)

In [0]:
dbutils.fs.ls("dbfs:/Internship_Sem-6_models/Cancer_Prediction_model")

[FileInfo(path='dbfs:/Internship_Sem-6_models/Cancer_Prediction_model/data/', name='data/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/Internship_Sem-6_models/Cancer_Prediction_model/metadata/', name='metadata/', size=0, modificationTime=0)]