In [2]:
!apt-get update && apt-get install -y openjdk-8-jdk-headless
!wget -q https://archive.apache.org/dist/spark/spark-3.0.2/spark-3.0.2-bin-hadoop2.7.tgz  # Download Spark 3.0.2
!tar xf spark-3.0.2-bin-hadoop2.7.tgz
!echo 'export SPARK_HOME="/content/spark-3.0.2-bin-hadoop2.7"' >> ~/.bashrc
!source ~/.bashrc


0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
0% [Connecting to archive.ubuntu.com (185.125.190.36)] [1 InRelease 12.3 kB/110 kB 11%] [Connected t0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connected to cloud.r-project.org (52.85.151.                                                                                                    Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [Waiting for headers] [Connected to ppa.launchpadcontent.net (185.125.190.80)] [Waiting for heade                                                                                                    Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
0% [Waiting for headers] [Connected to ppa.launchpadcontent.net (185.125.190.80)] [3 InRelease 1,581                                                                                                    0%

In [3]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=ff85c1396e16c6a1e4eb023df08d3815bf48cd868eb0c64c07d46adc6a0ef009
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [4]:
# Import necessary libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [5]:
# Start the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("SmokingDrinkingPySpark")
    .getOrCreate()
)


In [15]:
# Load the dataset from CSV: the first line is the header, column types are inferred.
data = spark.read.csv(
    "/content/smoking_driking_dataset_Ver01.csv",
    header=True,
    inferSchema=True,
)


In [16]:
from pyspark.sql.functions import col, count, when

# Count the null values of every column in ONE aggregation instead of launching a
# separate filter().count() job per column.
# when(...) without otherwise() yields NULL for rows that do not match, and count()
# ignores NULLs, so each aggregate equals the number of rows where that column is null.
null_counts_row = data.select(
    [count(when(col(col_name).isNull(), True)).alias(col_name) for col_name in data.columns]
).first()

# Same shape as before: {column name: number of nulls}, in column order.
null_counts = null_counts_row.asDict()

# Display the result
null_counts

{'sex': 0,
 'age': 0,
 'height': 0,
 'weight': 0,
 'waistline': 0,
 'sight_left': 0,
 'sight_right': 0,
 'hear_left': 0,
 'hear_right': 0,
 'SBP': 0,
 'DBP': 0,
 'BLDS': 0,
 'tot_chole': 0,
 'HDL_chole': 0,
 'LDL_chole': 0,
 'triglyceride': 0,
 'hemoglobin': 0,
 'urine_protein': 0,
 'serum_creatinine': 0,
 'SGOT_AST': 0,
 'SGOT_ALT': 0,
 'gamma_GTP': 0,
 'SMK_stat_type_cd': 0,
 'DRK_YN': 0}

In [18]:
# Preview the first five rows, then print the column names and types.
data.show(n=5)
data.printSchema()

+----+---+------+------+---------+----------+-----------+---------+----------+-----+----+-----+---------+---------+---------+------------+----------+-------------+----------------+--------+--------+---------+----------------+------+
| sex|age|height|weight|waistline|sight_left|sight_right|hear_left|hear_right|  SBP| DBP| BLDS|tot_chole|HDL_chole|LDL_chole|triglyceride|hemoglobin|urine_protein|serum_creatinine|SGOT_AST|SGOT_ALT|gamma_GTP|SMK_stat_type_cd|DRK_YN|
+----+---+------+------+---------+----------+-----------+---------+----------+-----+----+-----+---------+---------+---------+------------+----------+-------------+----------------+--------+--------+---------+----------------+------+
|Male| 35|   170|    75|     90.0|       1.0|        1.0|      1.0|       1.0|120.0|80.0| 99.0|    193.0|     48.0|    126.0|        92.0|      17.1|          1.0|             1.0|    21.0|    35.0|     40.0|             1.0|     Y|
|Male| 30|   180|    80|     89.0|       0.9|        1.2|      1.0| 

In [19]:
# Count the rows of the loaded dataset (count() is a Spark action, so this runs a job)
total_rows = data.count()
print("Total number of rows :", total_rows)

Total number of rows in the DataFrame: 991346


In [20]:
# Encode the 'sex' column as a numeric index stored in 'sex_encoded'.
# This cell reassigns `data`, so on a second run 'sex_encoded' would already exist and
# StringIndexer would reject the duplicate output column. Dropping it first makes the
# cell safe to re-run; drop() ignores names that are not in the schema, so the first
# run behaves exactly as before.
sex_source = data.drop("sex_encoded")
indexer = StringIndexer(inputCol="sex", outputCol="sex_encoded")
data = indexer.fit(sex_source).transform(sex_source)

In [21]:
# Encode the 'DRK_YN' label column as a numeric index stored in 'DRK_YN_encoded'.
# stringOrderType="alphabetAsc" pins the mapping by label text (N -> 0.0, Y -> 1.0,
# assuming the column only holds 'N'/'Y' -- TODO confirm). The default ordering is by
# frequency, which could flip which class is "1" if the class balance shifts.
# Dropping a pre-existing 'DRK_YN_encoded' keeps the cell re-runnable even though it
# reassigns `data`; drop() ignores names that are not in the schema.
label_source = data.drop("DRK_YN_encoded")
indexer = StringIndexer(
    inputCol="DRK_YN",
    outputCol="DRK_YN_encoded",
    stringOrderType="alphabetAsc",
)
data = indexer.fit(label_source).transform(label_source)

In [None]:
# Print the schema again to check that the encoded columns were added
data.printSchema()

In [22]:
# Input columns, listed in the order they will occupy inside the assembled vector.
# (VectorAssembler itself is already imported in the imports cell.)
feature_columns = [
    "age", "sex_encoded", "height", "weight", "waistline",
    "sight_left", "sight_right", "hear_left", "hear_right",
    "SBP", "DBP", "BLDS",
    "tot_chole", "HDL_chole", "LDL_chole", "triglyceride",
    "hemoglobin", "urine_protein", "serum_creatinine",
    "SGOT_AST", "SGOT_ALT", "gamma_GTP",
    "SMK_stat_type_cd",
]

# Pack those columns into a single vector column called "features".
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
output = assembler.transform(data)


In [10]:
# Peek at the first three assembled feature vectors.
features_only = output.select("features")
features_only.show(n=3)

+--------------------+
|            features|
+--------------------+
|[35.0,0.0,170.0,7...|
|[30.0,0.0,180.0,8...|
|[40.0,0.0,165.0,7...|
+--------------------+
only showing top 3 rows



In [11]:
# Show three full rows, including the encoded columns and the feature vector.
output.show(n=3)

+----+---+------+------+---------+----------+-----------+---------+----------+-----+----+-----+---------+---------+---------+------------+----------+-------------+----------------+--------+--------+---------+----------------+------+-----------+--------------+--------------------+
| sex|age|height|weight|waistline|sight_left|sight_right|hear_left|hear_right|  SBP| DBP| BLDS|tot_chole|HDL_chole|LDL_chole|triglyceride|hemoglobin|urine_protein|serum_creatinine|SGOT_AST|SGOT_ALT|gamma_GTP|SMK_stat_type_cd|DRK_YN|sex_encoded|DRK_YN_encoded|            features|
+----+---+------+------+---------+----------+-----------+---------+----------+-----+----+-----+---------+---------+---------+------------+----------+-------------+----------------+--------+--------+---------+----------------+------+-----------+--------------+--------------------+
|Male| 35|   170|    75|     90.0|       1.0|        1.0|      1.0|       1.0|120.0|80.0| 99.0|    193.0|     48.0|    126.0|        92.0|      17.1|        

In [23]:
# Keep only what the models consume: the feature vector and the encoded label.
data_final = output.select(
    "features",
    "DRK_YN_encoded",
)

In [24]:
# Print the schema of the modelling frame (feature vector + encoded label)
data_final.printSchema()

root
 |-- features: vector (nullable = true)
 |-- DRK_YN_encoded: double (nullable = false)



In [14]:
# Preview the first three rows of the modelling frame.
data_final.show(n=3)

+--------------------+--------------+
|            features|DRK_YN_encoded|
+--------------------+--------------+
|[35.0,0.0,170.0,7...|           1.0|
|[30.0,0.0,180.0,8...|           0.0|
|[40.0,0.0,165.0,7...|           0.0|
+--------------------+--------------+
only showing top 3 rows



In [25]:
# Random 70/30 train/test split; the fixed seed makes the split repeatable.
split_weights = [0.7, 0.3]
train_data, test_data = data_final.randomSplit(split_weights, seed=42)

In [26]:
# Logistic regression estimator
# (LogisticRegression is already imported in the imports cell.)
# "features" is the estimator's default features column; it is spelled out for clarity.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="DRK_YN_encoded",
)


In [27]:
# Fit the logistic-regression estimator on the training split; the fitted model is lrModel
lrModel = lr.fit(train_data)

In [40]:
# Echo the fitted model (a bare last expression is displayed by the notebook)
lrModel

LogisticRegressionModel: uid=LogisticRegression_2befbee2f59d, numClasses=2, numFeatures=23

In [28]:
# Training summary of the fitted logistic-regression model.
lrModel_summary = lrModel.summary
# The original name was misspelled ("lrMdel"); keep it as an alias so that any cell
# still referring to it continues to work.
lrMdel_summary = lrModel_summary

In [30]:
# Preview the model's predictions on the training data (taken from the training summary).
training_predictions = lrModel.summary.predictions
training_predictions.show()

+--------------------+--------------+--------------------+--------------------+----------+
|            features|DRK_YN_encoded|       rawPrediction|         probability|prediction|
+--------------------+--------------+--------------------+--------------------+----------+
|[20.0,0.0,150.0,5...|           0.0|[-0.6492981535855...|[0.34314771423616...|       1.0|
|[20.0,0.0,150.0,5...|           1.0|[-1.2118971427757...|[0.22936554554420...|       1.0|
|[20.0,0.0,155.0,4...|           0.0|[-1.4646261309596...|[0.18776079005811...|       1.0|
|[20.0,0.0,155.0,4...|           0.0|[-0.6431523209590...|[0.34453430097636...|       1.0|
|[20.0,0.0,155.0,4...|           0.0|[-1.5035639198661...|[0.18189457929314...|       1.0|
|[20.0,0.0,155.0,4...|           0.0|[-0.6419866740325...|[0.34479758718960...|       1.0|
|[20.0,0.0,155.0,5...|           0.0|[-1.0770901719982...|[0.25405706945395...|       1.0|
|[20.0,0.0,155.0,5...|           1.0|[-1.4536852248344...|[0.18943505416737...|       1.0|

In [31]:
# Summary statistics (count, mean, stddev, min, max) of the training predictions.
training_prediction_stats = lrModel.summary.predictions.describe()
training_prediction_stats.show()

+-------+------------------+-------------------+
|summary|    DRK_YN_encoded|         prediction|
+-------+------------------+-------------------+
|  count|            693953|             693953|
|   mean|0.4993983742414832|0.49038191347252624|
| stddev|0.4999999983013892| 0.4999078440409893|
|    min|               0.0|                0.0|
|    max|               1.0|                1.0|
+-------+------------------+-------------------+



In [32]:
# Evaluate the fitted model on the held-out split; the result is a summary object
# whose .predictions frame is read in the following cells.
pred_labels = lrModel.evaluate(test_data)

In [35]:
# Pull the per-row predictions frame out of the test-set evaluation summary
predictions_df = pred_labels.predictions

In [36]:
# Preview the test-set predictions
predictions_df.show()

+--------------------+--------------+--------------------+--------------------+----------+
|            features|DRK_YN_encoded|       rawPrediction|         probability|prediction|
+--------------------+--------------+--------------------+--------------------+----------+
|[20.0,0.0,155.0,4...|           0.0|[-1.2766069483337...|[0.21812834953967...|       1.0|
|[20.0,0.0,155.0,4...|           0.0|[-0.8790742662819...|[0.29336965039948...|       1.0|
|[20.0,0.0,155.0,5...|           0.0|[-0.6709745896188...|[0.33827864823754...|       1.0|
|[20.0,0.0,155.0,5...|           1.0|[-0.8437427251873...|[0.30074710881964...|       1.0|
|[20.0,0.0,155.0,5...|           1.0|[-1.0704186641727...|[0.25532347397700...|       1.0|
|[20.0,0.0,155.0,5...|           0.0|[-1.3477603973051...|[0.20623676063234...|       1.0|
|[20.0,0.0,155.0,5...|           0.0|[-0.6175524611035...|[0.35033831039634...|       1.0|
|[20.0,0.0,155.0,5...|           1.0|[-1.8354683964431...|[0.13758811935500...|       1.0|

In [39]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out split with the fitted logistic-regression model.
prediction_lr = lrModel.transform(test_data)

# Evaluate the model: area under the ROC curve, computed from the raw prediction scores.
# NOTE: the result is stored in `accuracy`, but the value is an AUC, not an accuracy.
evaluator_lr = BinaryClassificationEvaluator(
    labelCol="DRK_YN_encoded",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
accuracy = evaluator_lr.evaluate(prediction_lr)
print("Area Under ROC Curve:", accuracy)

Area Under ROC Curve: 0.8028343733871144


In [40]:
# Support vector machine (linear SVM)
from pyspark.ml.classification import LinearSVC

# 70/30 split used for the SVM.
# NOTE(review): this split uses seed=123 while the logistic-regression split uses seed=42,
# so the two models are scored on different test rows -- confirm this is intended.
train_data2, test_data2 = data_final.randomSplit([0.7, 0.3], seed=123)

# Build the SVM estimator and train it on the training split.
svm = LinearSVC(featuresCol="features", labelCol="DRK_YN_encoded")
svm_model = svm.fit(train_data2)

# Score the test split.
predictions = svm_model.transform(test_data2)

# Preview the predictions through the evaluation summary.
pred_labels2 = svm_model.evaluate(test_data2)
predictions_df2 = pred_labels2.predictions
predictions_df2.show()

# Evaluate the model: area under the ROC curve from the raw prediction scores.
# BinaryClassificationEvaluator comes from the import in the logistic-regression
# evaluation cell. NOTE: `accuracy` holds an AUC here, and overwrites the LR value.
evaluator_svm = BinaryClassificationEvaluator(
    labelCol="DRK_YN_encoded",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
accuracy = evaluator_svm.evaluate(predictions)
print("Area Under ROC Curve:", accuracy)


+--------------------+--------------+--------------------+----------+
|            features|DRK_YN_encoded|       rawPrediction|prediction|
+--------------------+--------------+--------------------+----------+
|[20.0,0.0,155.0,4...|           0.0|[-1.1689731530687...|       1.0|
|[20.0,0.0,155.0,4...|           0.0|[-0.8026731961965...|       1.0|
|[20.0,0.0,155.0,5...|           1.0|[-0.7623124921561...|       1.0|
|[20.0,0.0,155.0,5...|           0.0|[-0.6661612364534...|       1.0|
|[20.0,0.0,155.0,5...|           1.0|[-0.9770198443034...|       1.0|
|[20.0,0.0,155.0,5...|           0.0|[-1.9240818755348...|       1.0|
|[20.0,0.0,155.0,5...|           1.0|[-0.8154504417101...|       1.0|
|[20.0,0.0,155.0,5...|           0.0|[-1.0699086225610...|       1.0|
|[20.0,0.0,155.0,5...|           1.0|[-1.8465081659891...|       1.0|
|[20.0,0.0,155.0,6...|           0.0|[-0.5230247496404...|       1.0|
|[20.0,0.0,155.0,6...|           0.0|[-0.0569971791943...|       1.0|
|[20.0,0.0,155.0,6..