In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, to_date, concat, lit, month, year, count, concat_ws,  date_trunc, when, trim, date_format, mean as _mean,  lag, sum as spark_sum, monotonically_increasing_id
from pyspark.ml.feature import StringIndexer, StandardScaler, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

from pyspark.sql.functions import when

# Initialize a Spark session
spark = SparkSession.builder.appName("BankMarketingClassification").getOrCreate()

# Load the CSV file
data = spark.read.csv('bank_marketing.csv', header=True, inferSchema=True)

# Show the first few rows of the DataFrame
data.show()


+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

Data Cleaning

In [2]:
# Remove duplicates
data = data.dropDuplicates()

In [3]:
# Handle missing values
# For simplicity, filling numeric columns with the mean and string columns with 'unknown'
numeric_cols = [col_name for col_name, dtype in data.dtypes if dtype in ['int', 'double']]
string_cols = [col_name for col_name, dtype in data.dtypes if dtype == 'string']

# Fill numeric columns with mean
for col_name in numeric_cols:
    mean_val = data.select(_mean(col(col_name))).collect()[0][0]
    data = data.fillna(mean_val, subset=[col_name])

In [4]:
# Fill string columns with 'unknown'
data = data.fillna('unknown', subset=string_cols)

# Convert data types (example: converting a column 'age' to integer)
data = data.withColumn("age", col("age").cast("integer"))

In [5]:
# Trim whitespace from string columns
for col_name in string_cols:
    data = data.withColumn(col_name, trim(col(col_name)))

In [6]:
# Filter outliers (example: removing rows where 'age' > 100)
data = data.filter(col("age") <= 100)

In [7]:
data.show(5)
# End of Data Cleaning

+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|        job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 28| unemployed|  single| tertiary|     no|      0|    yes|  no|unknown|  5|  may|     125|       2|   -1|       0| unknown| no|
| 32| technician| married|secondary|     no|      0|    yes|  no|unknown|  6|  may|     176|       2|   -1|       0| unknown| no|
| 56|blue-collar| married|secondary|     no|   1602|    yes|  no|unknown|  6|  may|     427|       1|   -1|       0| unknown| no|
| 30|blue-collar|divorced|secondary|     no|    251|    yes| yes|unknown|  7|  may|     120|       2|   -1|       0| unknown| no|
| 55|    retired| married|secondary|     no|    102|    yes|  no|unknown|  7|  may|      7

In [8]:
# Drop unnecessary columns
data = data.drop('Unnamed: 0', 'index', 'prev_month')

# Handle missing values by treating 'unknown' as a separate category
data = data.withColumn('job', when(data.job == 'unknown', 'other').otherwise(data.job))
data = data.withColumn('education', when(data.education == 'unknown', 'other').otherwise(data.education))
data = data.withColumn('contact', when(data.contact == 'unknown', 'other').otherwise(data.contact))
data = data.withColumn('poutcome', when(data.poutcome == 'unknown', 'other').otherwise(data.poutcome))

# Convert target variable 'y' to numeric
data = data.withColumn('y', when(data.y == 'yes', 1).otherwise(0))

# Index categorical columns
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index") for column in categorical_columns]

# Assemble all features into a single vector
assembler = VectorAssembler(
    inputCols=[column + "_index" for column in categorical_columns] + ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous'],
    outputCol="features")

# Scale the features
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

# Split the data into training and testing sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Create a pipeline
pipeline = Pipeline(stages=indexers + [assembler, scaler])

# Fit the pipeline to the training data
pipeline_model = pipeline.fit(train_data)
train_data = pipeline_model.transform(train_data)
test_data = pipeline_model.transform(test_data)

# Select the scaled features and target variable for training
train_data = train_data.select('scaled_features', 'y')
test_data = test_data.select('scaled_features', 'y')

# Train a Logistic Regression model
lr = LogisticRegression(featuresCol='scaled_features', labelCol='y')
lr_model = lr.fit(train_data)

# Make predictions on the test data
predictions = lr_model.transform(test_data)

# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol='y')
accuracy = evaluator.evaluate(predictions)

# Show some predictions
predictions = predictions.withColumn('subscription_status', when(predictions.prediction == 1, 'Subscribed').otherwise('Not Subscribed'))

print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.8725


In [9]:
predictions.select('scaled_features', 'y', 'prediction', 'probability', 'subscription_status').show(100)

+--------------------+---+----------+--------------------+-------------------+
|     scaled_features|  y|prediction|         probability|subscription_status|
+--------------------+---+----------+--------------------+-------------------+
|[3.74763539197637...|  1|       1.0|[0.34625190852921...|         Subscribed|
|[3.74763539197637...|  1|       0.0|[0.90548187824278...|     Not Subscribed|
|[3.74763539197637...|  1|       0.0|[0.83252796271879...|     Not Subscribed|
|(14,[0,1,6,8,9,10...|  0|       0.0|[0.98332427833181...|     Not Subscribed|
|[3.74763539197637...|  0|       0.0|[0.88742476411033...|     Not Subscribed|
|(14,[0,1,6,8,9,10...|  0|       0.0|[0.98350396691785...|     Not Subscribed|
|(14,[0,1,4,8,9,10...|  1|       0.0|[0.82352176380828...|     Not Subscribed|
|[3.74763539197637...|  0|       0.0|[0.97206014506715...|     Not Subscribed|
|[3.74763539197637...|  0|       0.0|[0.95104162417019...|     Not Subscribed|
|[3.74763539197637...|  0|       0.0|[0.712283117782