# In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# End-to-end logistic-regression example: build a tiny DataFrame, assemble and
# scale the features, train on one split and report area under ROC on the other.

# Fixed seed so the train/test split (and therefore the reported metric) is
# repeatable between runs instead of changing on every execution.
SPLIT_SEED = 42

# Start Spark session
spark = SparkSession.builder.appName("LogRegExample").getOrCreate()

# Create DataFrame
data = spark.createDataFrame(
    [
        (22, 20000, 0),
        (25, 25000, 0),
        (47, 47000, 1),
        (52, 52000, 1),
        (46, 46000, 1),
    ],
    ["age", "income", "label"],
)

# Assemble the numeric columns into a single vector column
assembler = VectorAssembler(inputCols=["age", "income"], outputCol="features_raw")
assembled = assembler.transform(data)

# Split BEFORE fitting the scaler: fitting it on the full dataset would let the
# held-out rows influence the scaling statistics (data leakage).
# NOTE: with only five rows the test split can be very small or contain a single
# class, in which case the metric below is not meaningful.
train_raw, test_raw = assembled.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Scale features using statistics learned from the training rows only
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
    withStd=True,
    withMean=False,
)
scaler_model = scaler.fit(train_raw)
train = scaler_model.transform(train_raw)
test = scaler_model.transform(test_raw)

# Keep `data` carrying both "features_raw" and "features", as it did before,
# for any later code that reads it.
data = scaler_model.transform(assembled)

# Train model
lr = LogisticRegression(featuresCol="features", labelCol="label")
model = lr.fit(train)

# Predict and evaluate; the metric is stated explicitly so the printed label
# and the computed value cannot drift apart.
predictions = model.transform(test)
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)
# Legacy alias: this value is area under ROC, not classification accuracy.
accuracy = area_under_roc
print("Area under ROC:", area_under_roc)
