In [16]:
!pip install pyspark



In [40]:
from pyspark.sql import SparkSession,SQLContext
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Banking Marketing")
    .getOrCreate()
)

In [41]:
# Load the bank marketing CSV: the first row supplies the column names and
# column types are inferred from the data.
# NOTE(review): the relative path presumably requires Google Drive to be
# mounted under the working directory - confirm before running.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("drive/MyDrive/Colab Notebooks/bank.csv")
)

In [42]:
# Preview the first rows of the freshly loaded data.
df.show()

+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|age|        job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|
+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
| 59|     admin.| married|secondary|     no|   2343|    yes|  no|unknown|  5|  may|    1042|       1|   -1|       0| unknown|    yes|
| 56|     admin.| married|secondary|     no|     45|     no|  no|unknown|  5|  may|    1467|       1|   -1|       0| unknown|    yes|
| 41| technician| married|secondary|     no|   1270|    yes|  no|unknown|  5|  may|    1389|       1|   -1|       0| unknown|    yes|
| 55|   services| married|secondary|     no|   2476|    yes|  no|unknown|  5|  may|     579|       1|   -1|       0| unknown|    yes|
| 54|     admin.| married| tertiary|     no|    184|     no|  

In [43]:
# Check the inferred column types: numeric columns are integers, the rest are strings.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [44]:
# String-typed columns that must be converted to numeric indices.
# The last entry, 'deposit', is the target label; later cells rely on it being last.
string_features = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
    'deposit',
]

In [45]:
# Convert each string column into a numeric index column named "<col>_indexer".
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

# Only index columns that are still present and not yet indexed. This makes the
# cell safe to re-run: a second execution is a no-op instead of failing because
# the output column already exists or the input column was dropped.
pending_columns = [
    name for name in string_features
    if name in df.columns and name + "_indexer" not in df.columns
]

if pending_columns:
    indexer_stages = [
        StringIndexer(inputCol=name, outputCol=name + "_indexer")
        for name in pending_columns
    ]
    # One pipeline fits and applies all indexers in sequence.
    df = Pipeline(stages=indexer_stages).fit(df).transform(df)

In [46]:
# Preview the data after indexing: each string column now has a matching "<col>_indexer" column.
df.show()

+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+-----------+---------------+-----------------+---------------+---------------+------------+---------------+-------------+----------------+---------------+
|age|        job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|job_indexer|marital_indexer|education_indexer|default_indexer|housing_indexer|loan_indexer|contact_indexer|month_indexer|poutcome_indexer|deposit_indexer|
+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+-----------+---------------+-----------------+---------------+---------------+------------+---------------+-------------+----------------+---------------+
| 59|     admin.| married|secondary|     no|   2343|    yes|  no|unknown|  5|  may|    1042|       1|   -1|       0| unknown|    yes|   

In [47]:
# Drop the raw string columns now that indexed copies exist. The slice leaves
# out the last entry of string_features ('deposit'), so the original label
# column is kept.
raw_categorical_columns = string_features[:-1]
df = df.drop(*raw_categorical_columns)

In [48]:
# Confirm the schema: only numeric columns, the original 'deposit' label and the indexed columns remain.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- balance: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- deposit: string (nullable = true)
 |-- job_indexer: double (nullable = false)
 |-- marital_indexer: double (nullable = false)
 |-- education_indexer: double (nullable = false)
 |-- default_indexer: double (nullable = false)
 |-- housing_indexer: double (nullable = false)
 |-- loan_indexer: double (nullable = false)
 |-- contact_indexer: double (nullable = false)
 |-- month_indexer: double (nullable = false)
 |-- poutcome_indexer: double (nullable = false)
 |-- deposit_indexer: double (nullable = false)



In [49]:
# Preview the data after dropping the raw string columns.
df.show()

+---+-------+---+--------+--------+-----+--------+-------+-----------+---------------+-----------------+---------------+---------------+------------+---------------+-------------+----------------+---------------+
|age|balance|day|duration|campaign|pdays|previous|deposit|job_indexer|marital_indexer|education_indexer|default_indexer|housing_indexer|loan_indexer|contact_indexer|month_indexer|poutcome_indexer|deposit_indexer|
+---+-------+---+--------+--------+-----+--------+-------+-----------+---------------+-----------------+---------------+---------------+------------+---------------+-------------+----------------+---------------+
| 59|   2343|  5|    1042|       1|   -1|       0|    yes|        3.0|            0.0|              0.0|            0.0|            1.0|         0.0|            1.0|          0.0|             0.0|            1.0|
| 56|     45|  5|    1467|       1|   -1|       0|    yes|        3.0|            0.0|              0.0|            0.0|            0.0|         0.0

In [50]:
# Candidate feature columns: everything except the indexed label.
# The label is excluded by name rather than by position, so the result does not
# depend on 'deposit_indexer' happening to be the last column.
feature_names = [name for name in df.columns if name != "deposit_indexer"]

In [52]:
# Exclude the raw string label from the features. Filtering, rather than
# list.remove, keeps this cell idempotent: re-running it does not raise
# ValueError once "deposit" is already gone.
feature_names = [name for name in feature_names if name != "deposit"]

In [53]:
# Display the final list of feature column names.
feature_names

['age',
 'balance',
 'day',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'job_indexer',
 'marital_indexer',
 'education_indexer',
 'default_indexer',
 'housing_indexer',
 'loan_indexer',
 'contact_indexer',
 'month_indexer',
 'poutcome_indexer']

In [54]:
from pyspark.ml.feature import VectorAssembler

# Pack all feature columns into a single vector column named "features".
assembler = (
    VectorAssembler()
    .setInputCols(feature_names)
    .setOutputCol("features")
)
transformed_data = assembler.transform(df)

In [55]:
# Preview the assembled feature vectors.
transformed_data.select("features").show()

+--------------------+
|            features|
+--------------------+
|(16,[0,1,2,3,4,5,...|
|(16,[0,1,2,3,4,5,...|
|(16,[0,1,2,3,4,5,...|
|(16,[0,1,2,3,4,5,...|
|(16,[0,1,2,3,4,5,...|
|[42.0,0.0,5.0,562...|
|[56.0,830.0,6.0,1...|
|[60.0,545.0,6.0,1...|
|(16,[0,1,2,3,4,5,...|
|[28.0,5090.0,6.0,...|
|[38.0,100.0,7.0,7...|
|(16,[0,1,2,3,4,5,...|
|[29.0,199.0,7.0,1...|
|[46.0,460.0,7.0,1...|
|[31.0,703.0,8.0,9...|
|[35.0,3837.0,8.0,...|
|[32.0,611.0,8.0,5...|
|(16,[0,1,2,3,4,5,...|
|(16,[0,1,2,3,4,5,...|
|[49.0,168.0,8.0,5...|
+--------------------+
only showing top 20 rows



In [56]:
# Split into training (80%) and test (20%) sets.
# A fixed seed keeps the split, and therefore the reported metrics, repeatable
# across runs on the same input.
training_data, test_data = transformed_data.randomSplit([0.8, 0.2], seed=42)

In [57]:
from pyspark.ml.classification import LogisticRegression
# Logistic regression on the assembled feature vector, predicting the indexed
# deposit label, capped at 30 iterations.
model = LogisticRegression(
    featuresCol='features',
    labelCol='deposit_indexer',
    maxIter=30,
)

In [58]:
# Fit the logistic regression model on the training split.
fit_model = model.fit(training_data)

In [59]:
# Generate predictions for the held-out test split.
y_pred = fit_model.transform(test_data)

In [60]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compute the accuracy of the predictions against the indexed deposit label.
multi_evaluator = MulticlassClassificationEvaluator(
    labelCol='deposit_indexer',
    metricName='accuracy',
)
accuracy = multi_evaluator.evaluate(y_pred)
print('Logistic Regression Accuracy:', accuracy)


Logistic Regression Accuracy: 0.8035055350553506


In [62]:
# Show the first 10 predictions next to two input columns, the true label and the model's raw outputs.
y_pred.select("age","balance","deposit","rawPrediction", "probability", "prediction").show(10)

+---+-------+-------+--------------------+--------------------+----------+
|age|balance|deposit|       rawPrediction|         probability|prediction|
+---+-------+-------+--------------------+--------------------+----------+
| 18|      5|     no|[1.24031836691664...|[0.77561942577722...|       0.0|
| 18|    108|    yes|[-0.8424151044749...|[0.30102637901402...|       1.0|
| 18|    108|    yes|[0.93865241633341...|[0.71882737144269...|       0.0|
| 18|    348|    yes|[-0.1202583591761...|[0.46997159090632...|       1.0|
| 19|     55|     no|[-0.8383004794600...|[0.30189284335200...|       1.0|
| 19|     60|     no|[0.53454555853161...|[0.63054266587903...|       0.0|
| 19|    103|    yes|[1.12953722001224...|[0.75575348457604...|       0.0|
| 19|    329|    yes|[-0.5368318763241...|[0.36892487585105...|       1.0|
| 19|    372|    yes|[0.15061621411956...|[0.53758303202370...|       0.0|
| 19|    394|    yes|[-0.6405631974937...|[0.34511923892910...|       1.0|
+---+-------+-------+----