# In [None]:
from pyspark import SparkFiles
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

# Train, evaluate and persist a logistic-regression classifier on a CSV
# dataset fetched from a remote URL.

# Remote location of the CSV. Spark's DataFrame reader cannot open http(s)
# URLs directly, so the file is first distributed with SparkContext.addFile
# and then read from its local copy.
DATA_URL = 'https://raw.githubusercontent.com/Kuna1Chauhan/EDA/main/indian_liver_patient.csv'
MODEL_PATH = 'model'

# Create a SparkSession
spark = SparkSession.builder.appName('ChemicalLabeling').getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# always released, even when loading, training or saving raises.
try:
    # Load the dataset
    spark.sparkContext.addFile(DATA_URL)
    local_csv = 'file://' + SparkFiles.get(DATA_URL.rsplit('/', 1)[-1])
    data = spark.read.csv(local_csv, header=True, inferSchema=True)

    # Data preprocessing
    data = data.dropna()  # Remove rows with missing values in any column
    data = data.withColumnRenamed('Dataset', 'label')  # Rename the target column to 'label'
    # NOTE(review): Spark's LogisticRegression expects class labels indexed
    # from 0. Confirm the values stored in 'Dataset'; if they start at 1 they
    # should be re-indexed before training.

    # Feature engineering
    feature_cols = ['Age', 'Total_Bilirubin', 'Albumin_and_Globulin_Ratio']
    assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
    # NOTE(review): the assembler runs outside the Pipeline, so the saved
    # model expects input that already has an assembled 'features' column.
    data = assembler.transform(data)

    # Model training
    train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)  # 80/20 train/test split
    lr = LogisticRegression(featuresCol='features', labelCol='label')
    pipeline = Pipeline(stages=[lr])
    model = pipeline.fit(train_data)

    # Model evaluation
    predictions = model.transform(test_data)
    total = predictions.count()  # transform() is row-wise, so this equals the test-set size
    if total:
        correct = predictions.filter(predictions.label == predictions.prediction).count()
        accuracy = correct / total
        print(f'Accuracy: {accuracy * 100:.2f}%')
    else:
        # Avoid a ZeroDivisionError when the random split leaves no test rows.
        print('Accuracy: n/a (empty test set)')

    # Save the model; overwrite so that re-running the script does not fail
    # because the output path already exists.
    model.write().overwrite().save(MODEL_PATH)
finally:
    # Close the SparkSession
    spark.stop()
