In [63]:
import pyspark

In [64]:
from pyspark.sql import SparkSession

In [65]:
spark = SparkSession.builder.appName('BankingApp').getOrCreate()

In [66]:
df_bank = spark.read.format("csv") \
    .option("sep", ";") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("/content/sample_data/bank-full.csv")


In [67]:
df_bank.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

In [78]:
# Which customer segments are most likely to subscribe to a term deposit?
df_bank = df_bank.withColumn("subscription_rate", when(df_bank.y == "yes", 1).otherwise(0))
df_bank.groupBy("job").agg({"subscription_rate": "avg"}).show()
df_bank.groupBy("marital").agg({"subscription_rate": "avg"}).show()
df_bank.groupBy("education").agg({"subscription_rate": "avg"}).show()
from pyspark.sql.functions import when

df_bank = df_bank.withColumn(
    "age_category",
    when(df_bank.age < 30, "Youth")
    .when((df_bank.age >= 30) & (df_bank.age <= 44), "Adult")
    .otherwise("Senior")
)
df_bank.groupBy("age_category").agg({"subscription_rate": "avg"}).show()

+-------------+----------------------+
|          job|avg(subscription_rate)|
+-------------+----------------------+
|   management|   0.13755550856417847|
|      retired|   0.22791519434628976|
|      unknown|   0.11805555555555555|
|self-employed|   0.11842938568714376|
|      student|    0.2867803837953092|
|  blue-collar|   0.07274969173859433|
| entrepreneur|   0.08271687962340282|
|       admin.|   0.12202668729452718|
|   technician|   0.11056996182703699|
|     services|   0.08883004333172845|
|    housemaid|   0.08790322580645162|
|   unemployed|   0.15502686108979277|
+-------------+----------------------+

+--------+----------------------+
| marital|avg(subscription_rate)|
+--------+----------------------+
|divorced|   0.11945458037257538|
| married|   0.10123465863158668|
|  single|    0.1494917904612979|
+--------+----------------------+

+---------+----------------------+
|education|avg(subscription_rate)|
+---------+----------------------+
|  unknown|   0.135702746365105

In [79]:
# How do previous campaign contacts affect the likelihood of success?
df_bank.filter(df_bank.previous==0).groupBy("previous").agg({"subscription_rate": "avg"}).show()
df_bank.filter(df_bank.previous!=0).groupBy("previous").agg({"subscription_rate": "avg"}).show()
df_bank.filter(df_bank.pdays==-1).groupBy("pdays","y").count().show()
df_bank.filter(df_bank.pdays!=-1).groupBy("pdays").agg({"subscription_rate": "avg"}).show()


+--------+----------------------+
|previous|avg(subscription_rate)|
+--------+----------------------+
|       0|   0.09157330735509012|
+--------+----------------------+

+--------+----------------------+
|previous|avg(subscription_rate)|
+--------+----------------------+
|      28|                   0.0|
|      26|                   0.5|
|      27|                   0.0|
|      12|   0.22727272727272727|
|      22|   0.16666666666666666|
|       1|   0.21031746031746032|
|      13|   0.23684210526315788|
|      16|                   0.0|
|       6|    0.2996389891696751|
|       3|    0.2574430823117338|
|      20|                 0.125|
|      40|                   0.0|
|       5|    0.2636165577342048|
|      19|   0.18181818181818182|
|      15|                  0.05|
|      37|                   0.0|
|       9|    0.2608695652173913|
|      17|                   0.2|
|       4|   0.23949579831932774|
|       8|    0.3023255813953488|
+--------+----------------------+
only showing 

In [70]:
#What is the relationship between call duration and campaign success?
from pyspark.sql.functions import avg
df_bank.groupBy("y").agg(avg("duration")).show()

+---+------------------+
|  y|     avg(duration)|
+---+------------------+
| no|221.18280647262162|
|yes| 537.2945736434109|
+---+------------------+



In [71]:
#Does the time of year (month) or day affect subscription rates?
df_bank.groupBy("month").agg({"subscription_rate": "avg"}).show()
df_bank = df_bank.withColumn("day_of_week", df_bank.day.substr(1, 3))
df_bank.groupBy("day_of_week").agg({"subscription_rate": "avg"}).show()

+-----+----------------------+
|month|avg(subscription_rate)|
+-----+----------------------+
|  jun|   0.10222804718217562|
|  aug|   0.11013286377461182|
|  may|   0.06719453726572715|
|  feb|    0.1664779161947905|
|  sep|   0.46459412780656306|
|  mar|     0.519916142557652|
|  oct|   0.43766937669376693|
|  jul|   0.09093546047860769|
|  nov|   0.10151133501259446|
|  apr|   0.19679399727148705|
|  dec|    0.4672897196261682|
|  jan|   0.10121168923734854|
+-----+----------------------+

+-----------+----------------------+
|day_of_week|avg(subscription_rate)|
+-----------+----------------------+
|          7|    0.0864061640066043|
|         15|   0.13975337639459776|
|         11|   0.12237998647734956|
|         29|   0.07392550143266476|
|          3|     0.164967562557924|
|         30|   0.17305236270753513|
|          8|   0.10912052117263844|
|         28|   0.07814207650273224|
|         22|   0.17016574585635358|
|         16|   0.13568904593639575|
|          5|     0.11

In [72]:
#What impact does the contact method (telephone, cellular vs. unknown) have on success?
df_bank.groupBy("contact").agg({"subscription_rate": "avg"}).show()

+---------+----------------------+
|  contact|avg(subscription_rate)|
+---------+----------------------+
|  unknown|  0.040706605222734255|
| cellular|   0.14918900460986853|
|telephone|   0.13420509291121818|
+---------+----------------------+



In [73]:
# Are customers with housing or personal loans more or less likely to subscribe?
df_bank.groupBy("housing").agg({"subscription_rate": "avg"}).show()
df_bank.groupBy("loan").agg({"subscription_rate": "avg"}).show()

+-------+----------------------+
|housing|avg(subscription_rate)|
+-------+----------------------+
|     no|    0.1670235546038544|
|    yes|   0.07699960206923995|
+-------+----------------------+

+----+----------------------+
|loan|avg(subscription_rate)|
+----+----------------------+
|  no|   0.12655727342165565|
| yes|   0.06681391496410823|
+----+----------------------+



In [74]:
# What campaign strategies resulted in the highest conversion rates historically?
df_bank.groupBy("contact", "poutcome").agg({"subscription_rate": "avg"}).show()
df_bank.groupBy("campaign").agg({"subscription_rate": "avg"}).show()


+---------+--------+----------------------+
|  contact|poutcome|avg(subscription_rate)|
+---------+--------+----------------------+
|  unknown| failure|   0.12903225806451613|
|  unknown| unknown|                  0.04|
|telephone| unknown|   0.11076923076923077|
|telephone|   other|                 0.125|
| cellular| unknown|   0.12036440599981596|
| cellular|   other|    0.1710445937690898|
|  unknown| success|                  0.25|
| cellular| failure|   0.12651799514241555|
|telephone| success|    0.6578947368421053|
|  unknown|   other|   0.18518518518518517|
|telephone| failure|   0.12023460410557185|
| cellular| success|    0.6498194945848376|
+---------+--------+----------------------+

+--------+----------------------+
|campaign|avg(subscription_rate)|
+--------+----------------------+
|      31|                   0.0|
|      34|                   0.0|
|      28|                   0.0|
|      26|                   0.0|
|      27|                   0.0|
|      44|             

In [76]:
#We’re calculating average subscription rate (y == 'yes') many times across different groups. How can we avoid repeating this calculation and optimize the whole process?
df_bank = df_bank.withColumn("subscription_rate", when(df_bank.y == "yes", 1).otherwise(0)).cache()