### 一、加载数据

In [8]:
import os
import pandas as pd
from pyspark.sql import SparkSession

In [4]:
# Create the Spark session for this analysis, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("BankMarketingAnalysis")
    .getOrCreate()
)

In [18]:
# Locate the dataset directory. The BANK_MARKETING_DATA_DIR environment variable
# overrides the location so the notebook can run on other machines; when it is
# unset, the original local Windows path is used unchanged.
root_path = os.environ.get(
    "BANK_MARKETING_DATA_DIR",
    "D:\\大三下\\Big_Data_Application_Spark\\datasets\\bank-additional",
)
full_path = os.path.join(root_path, "bank-additional-full.csv")
# The CSV is read with ";" as the field separator instead of pandas' default ",".
bankMarketing_df = pd.read_csv(full_path, sep=";")

In [19]:
# Preview the first five rows of the pandas DataFrame.
bankMarketing_df.head(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [20]:
# Convert the pandas DataFrame into a Spark DataFrame (no explicit schema is passed).
bankMarketing = spark.createDataFrame(data=bankMarketing_df)

In [30]:
# Show a few columns to confirm the Spark DataFrame was populated.
preview_columns = ["age", "job", "marital"]
bankMarketing.select(*preview_columns).show()

+---+-----------+--------+
|age|        job| marital|
+---+-----------+--------+
| 56|  housemaid| married|
| 57|   services| married|
| 37|   services| married|
| 40|     admin.| married|
| 56|   services| married|
| 45|   services| married|
| 59|     admin.| married|
| 41|blue-collar| married|
| 24| technician|  single|
| 25|   services|  single|
| 41|blue-collar| married|
| 25|   services|  single|
| 29|blue-collar|  single|
| 57|  housemaid|divorced|
| 35|blue-collar| married|
| 54|    retired| married|
| 35|blue-collar| married|
| 46|blue-collar| married|
| 50|blue-collar| married|
| 39| management|  single|
+---+-----------+--------+
only showing top 20 rows



In [29]:
# List the column names of the Spark DataFrame.
bankMarketing.columns

['age',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'emp.var.rate',
 'cons.price.idx',
 'cons.conf.idx',
 'euribor3m',
 'nr.employed',
 'y']

### 二、计算指标
#### 2.1 根据婚姻情况（marital指标）统计各类人群的数量和缺失值的数量

In [33]:
# Count the rows for each marital status.
# NOTE(review): the "unknown" group in the output presumably holds the missing values - confirm.
marital_counts = bankMarketing.groupBy("marital").count()
marital_counts.show()

+--------+-----+
| marital|count|
+--------+-----+
| unknown|   80|
|divorced| 4612|
| married|24928|
|  single|11568|
+--------+-----+



#### 2.2 根据职业（job指标）统计各类人群的数量和缺失值的数量

In [34]:
# Count the rows for each job category.
# NOTE(review): the "unknown" group in the output presumably holds the missing values - confirm.
job_counts = bankMarketing.groupBy("job").count()
job_counts.show()

+-------------+-----+
|          job|count|
+-------------+-----+
|   management| 2924|
|      retired| 1720|
|      unknown|  330|
|self-employed| 1421|
|      student|  875|
|  blue-collar| 9254|
| entrepreneur| 1456|
|       admin.|10422|
|   technician| 6743|
|     services| 3969|
|    housemaid| 1060|
|   unemployed| 1014|
+-------------+-----+



#### 2.3 根据教育情况（education）统计各类人群的数量和缺失值的数量

In [35]:
# Count the rows for each education level.
# NOTE(review): the "unknown" group in the output presumably holds the missing values - confirm.
education_counts = bankMarketing.groupBy("education").count()
education_counts.show()

+-------------------+-----+
|          education|count|
+-------------------+-----+
|        high.school| 9515|
|            unknown| 1731|
|           basic.6y| 2292|
|professional.course| 5243|
|  university.degree|12168|
|           basic.4y| 4176|
|           basic.9y| 6045|
|         illiterate|   18|
+-------------------+-----+



#### 2.4 选取数值型字段作为数据子集，进行描述性统计（包括频次统计、平均值、标准差、最小值、最大值）

In [39]:
# First check which fields are numeric by printing each column's name and type.
bankMarketing.printSchema()

root
 |-- age: long (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- pdays: long (nullable = true)
 |-- previous: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- emp.var.rate: double (nullable = true)
 |-- cons.price.idx: double (nullable = true)
 |-- cons.conf.idx: double (nullable = true)
 |-- euribor3m: double (nullable = true)
 |-- nr.employed: double (nullable = true)
 |-- y: string (nullable = true)



In [None]:
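# Question: how can columns whose names contain dots be selected (other than by renaming them)?
# Answer: wrap each dotted name in backticks, so that Spark reads it as a single column
# name instead of as access to a nested field:
# bmSubset = bankMarketing.select("age", "duration", "campaign", "pdays", "previous",
#                                 "`emp.var.rate`", "`cons.price.idx`", "`cons.conf.idx`",
#                                 "euribor3m", "`nr.employed`")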

In [54]:
# Numeric columns kept for the descriptive statistics (the dotted column names are left out here).
numeric_columns = ["age", "duration", "campaign", "pdays", "previous", "euribor3m"]

# Mark the subset for caching, then show count, mean, stddev, min and max for each column.
bmSubset = bankMarketing.select(*numeric_columns).cache()
bmSubset.describe().show()

+-------+------------------+-----------------+-----------------+-----------------+-------------------+------------------+
|summary|               age|         duration|         campaign|            pdays|           previous|         euribor3m|
+-------+------------------+-----------------+-----------------+-----------------+-------------------+------------------+
|  count|             41188|            41188|            41188|            41188|              41188|             41188|
|   mean| 40.02406040594348|258.2850101971448|2.567592502670681|962.4754540157328|0.17296299893172767|3.6212908128580406|
| stddev|10.421249980934045|259.2792488364649|2.770013542902329|186.9109073447416|0.49490107983929027|1.7344474048512684|
|    min|                17|                0|                1|                0|                  0|             0.634|
|    max|                98|             4918|               56|              999|                  7|             5.045|
+-------+---------------

#### 2.5 找出在所有受访人中出现频率超过 0.3 的学历背景
* 使用stat

In [66]:
# Ask Spark's stat helper for education values whose frequency exceeds 0.3.
# Caveat: freqItems is approximate and may return false positives. Per the exact
# counts in section 2.3 (largest group: 12168 of 41188 rows, about 0.295), no
# education level actually exceeds 0.3, so treat the returned items as candidates only.
# bankMarketing.stat.freqItems(["education"], 0.3).show()
education_freq = bankMarketing.stat.freqItems(["education"], 0.3)
education_freq.collect()[0]

Row(education_freqItems=['high.school', 'university.degree', 'professional.course'])

#### 2.6 根据定期存款意愿将客户分组，并统计各组客户的客户总数、此次访谈的电话联系的平均次数、最后一次电话联系的平均持续时间、早前访谈电话联系的平均次数

In [71]:
# NOTE: this import rebinds the built-in round() to Spark's column function
# in the notebook namespace.
from pyspark.sql.functions import avg, count, round

# Aggregates computed for each value of "y": number of customers, average contacts
# in the current campaign, average duration of the last contact, and average
# contacts in earlier campaigns (averages rounded to two decimals).
subscription_metrics = [
    count("y").name("Total customers"),
    round(avg("campaign"), 2).name("Avg calls(curr)"),
    round(avg("duration"), 2).name("Avg dur"),
    round(avg("previous"), 2).name("Avg calls(prev)"),
]
bankMarketing.groupBy("y").agg(*subscription_metrics).show()

+---+---------------+---------------+-------+---------------+
|  y|Total customers|Avg calls(curr)|Avg dur|Avg calls(prev)|
+---+---------------+---------------+-------+---------------+
| no|          36548|           2.63| 220.84|           0.13|
|yes|           4640|           2.05| 553.19|           0.49|
+---+---------------+---------------+-------+---------------+



#### 2.7 根据年龄将客户分组，并统计各组客户的客户总数、此次访谈的电话联系的平均次数、最后一次电话联系的平均持续时间、早前访谈电话联系的平均次数

In [69]:
# NOTE: this import rebinds the built-in round() to Spark's column function
# in the notebook namespace.
from pyspark.sql.functions import avg, count, round

# Aggregates computed for each distinct age: number of customers, average contacts
# in the current campaign, average duration of the last contact, and average
# contacts in earlier campaigns (averages rounded to two decimals).
age_metrics = [
    count("age").name("Total customers"),
    round(avg("campaign"), 2).name("Avg calls(curr)"),
    round(avg("duration"), 2).name("Avg dur"),
    round(avg("previous"), 2).name("Avg calls(prev)"),
]
age_summary = bankMarketing.groupBy("age").agg(*age_metrics)
# Sort by age so the groups are listed from youngest to oldest.
age_summary.orderBy("age").show()

+---+---------------+---------------+-------+---------------+
|age|Total customers|Avg calls(curr)|Avg dur|Avg calls(prev)|
+---+---------------+---------------+-------+---------------+
| 17|              5|            2.2|  420.0|            1.8|
| 18|             28|           1.32| 321.79|           0.75|
| 19|             42|           2.29|  271.5|           0.67|
| 20|             65|           2.35| 288.49|           0.63|
| 21|            102|           2.03| 264.25|           0.28|
| 22|            137|           2.29| 250.93|           0.29|
| 23|            226|            2.4| 281.27|           0.34|
| 24|            463|           2.42|  282.9|           0.32|
| 25|            598|           2.91| 259.98|           0.18|
| 26|            698|           2.48| 263.53|           0.22|
| 27|            851|           2.49| 269.47|           0.23|
| 28|           1001|           2.33| 270.02|            0.2|
| 29|           1453|           2.46| 258.24|            0.2|
| 30|   