In [None]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 70kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 43.1MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=4c3bcb6387819ea523878f864ebdba04ceee6e4298ea41c7962292c4fc677893
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [None]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: an existing session is reused,
# otherwise a new one is created under the application name 'ML project'.
spark = (
    SparkSession.builder
    .appName('ML project')
    .getOrCreate()
)

In [None]:
# Read the bank dataset from CSV.
# header=True takes column names from the first row; inferSchema=True lets
# Spark derive the column types instead of reading everything as strings.
df = spark.read.options(header=True, inferSchema=True).csv('bank.csv')
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [None]:
# Preview the raw data (20 rows, long cell values truncated: the defaults).
df.show(n=20, truncate=True)

+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|age|        job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|
+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
| 59|     admin.| married|secondary|     no|   2343|    yes|  no|unknown|  5|  may|    1042|       1|   -1|       0| unknown|    yes|
| 56|     admin.| married|secondary|     no|     45|     no|  no|unknown|  5|  may|    1467|       1|   -1|       0| unknown|    yes|
| 41| technician| married|secondary|     no|   1270|    yes|  no|unknown|  5|  may|    1389|       1|   -1|       0| unknown|    yes|
| 55|   services| married|secondary|     no|   2476|    yes|  no|unknown|  5|  may|     579|       1|   -1|       0| unknown|    yes|
| 54|     admin.| married| tertiary|     no|    184|     no|  

In [None]:
# Preview the first 10 rows through pandas, transposed so that every source
# column becomes a row (easier to read for a wide table).
import pandas as pd
pd.DataFrame(data=df.take(10), columns=df.columns).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
age,59,56,41,55,54,42,56,60,37,28
job,admin.,admin.,technician,services,admin.,management,management,retired,technician,services
marital,married,married,married,married,married,single,married,divorced,married,single
education,secondary,secondary,secondary,secondary,tertiary,tertiary,tertiary,secondary,secondary,secondary
default,no,no,no,no,no,no,no,no,no,no
balance,2343,45,1270,2476,184,0,830,545,1,5090
housing,yes,no,yes,yes,no,yes,yes,yes,yes,yes
loan,no,no,no,no,no,yes,yes,no,no,no
contact,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5,5,6,6,6,6


In [None]:
# Class distribution: the label is taken to be the last column of the frame,
# and the number of rows per label value is collected into pandas for display.
class_name = df.columns[-1]
df.groupBy(class_name).count().toPandas()

Unnamed: 0,deposit,count
0,no,5873
1,yes,5289


In [None]:
# Encode string columns as numeric indices.
from pyspark.ml.feature import StringIndexer

# Every string-typed column of the dataset, including the label ('deposit').
name_col = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
    'deposit',
]

# One indexer instance; its input/output columns are configured later,
# once per column.
indexer = StringIndexer()

In [None]:
# Append a numeric "<column>_index" column for every column in `name_col`.
# The single shared `indexer` is re-pointed at a new input/output pair on
# each iteration, then fitted and applied to the frame accumulated so far.
# `df` itself is left untouched; `df1` ends up holding all index columns.
df1 = df
for name in name_col:
    indexer.setInputCol(name).setOutputCol(name + "_index")
    # Chain the result straight back into df1 (no throwaway temporary).
    df1 = indexer.fit(df1).transform(df1)

In [None]:
# Preview the frame with the added "*_index" columns (default 20 rows, truncated).
df1.show(n=20, truncate=True)

+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+---------+-------------+---------------+-------------+-------------+----------+-------------+-----------+--------------+-------------+
|age|        job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|job_index|marital_index|education_index|default_index|housing_index|loan_index|contact_index|month_index|poutcome_index|deposit_index|
+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+---------+-------------+---------------+-------------+-------------+----------+-------------+-----------+--------------+-------------+
| 59|     admin.| married|secondary|     no|   2343|    yes|  no|unknown|  5|  may|    1042|       1|   -1|       0| unknown|    yes|      3.0|          0.0|            0.0|          0.0|         

In [None]:
# Label column: the indexed form of 'deposit'.
class_name = 'deposit_index'

# Feature columns = every indexed categorical column except the label,
# followed by a selection of the numeric columns.
# NOTE(review): 'age' and 'duration' are integer columns in the schema but are
# not part of this feature list; confirm that leaving them out is intentional.
name_col_feature = [f"{name}_index" for name in name_col]
name_col_feature.remove(class_name)
name_col_feature.extend(['balance', 'day', 'campaign', 'pdays', 'previous'])

print(name_col_feature)
print(class_name)

['job_index', 'marital_index', 'education_index', 'default_index', 'housing_index', 'loan_index', 'contact_index', 'month_index', 'poutcome_index', 'balance', 'day', 'campaign', 'pdays', 'previous']
deposit_index


In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack all feature columns into a single vector column named 'features',
# the input column the classifier is configured to read.
assembler = VectorAssembler(inputCols=name_col_feature, outputCol='features')

transformed_data = assembler.transform(df1)

In [None]:
# Preview the assembled frame, including the new 'features' column
# (default 20 rows, truncated).
transformed_data.show(n=20, truncate=True)

+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+---------+-------------+---------------+-------------+-------------+----------+-------------+-----------+--------------+-------------+--------------------+
|age|        job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|job_index|marital_index|education_index|default_index|housing_index|loan_index|contact_index|month_index|poutcome_index|deposit_index|            features|
+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+---------+-------------+---------------+-------------+-------------+----------+-------------+-----------+--------------+-------------+--------------------+
| 59|     admin.| married|secondary|     no|   2343|    yes|  no|unknown|  5|  may|    1042|       1|   -1|       0| unknown|    yes|

In [None]:
# Split the data: roughly 80% for training and 20% for testing.
# A fixed seed makes the random split repeatable, so the reported accuracy
# does not change from one run of the notebook to the next.
# NOTE(review): repeatability also assumes the input frame is read and
# partitioned the same way on every run; confirm for your environment.
(training_data, test_data) = transformed_data.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Preview the training partition (default 20 rows, truncated).
training_data.show(n=20, truncate=True)

+---+-----------+-------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+-------+---------+-------------+---------------+-------------+-------------+----------+-------------+-----------+--------------+-------------+--------------------+
|age|        job|marital|education|default|balance|housing|loan|  contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|job_index|marital_index|education_index|default_index|housing_index|loan_index|contact_index|month_index|poutcome_index|deposit_index|            features|
+---+-----------+-------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+-------+---------+-------------+---------------+-------------+-------------+----------+-------------+-----------+--------------+-------------+--------------------+
| 18|    student| single|  primary|     no|    608|     no|  no| cellular| 12|  aug|     267|       1|   -1|       0| unknown|    

In [None]:
from pyspark.ml.classification import LogisticRegression

# `model` is the unfitted estimator; `M` is the model fitted on the training split.
model = LogisticRegression(
    featuresCol='features',
    labelCol='deposit_index',
    maxIter=30,  # upper bound on optimiser iterations
)
M = model.fit(training_data)

In [None]:
# Apply the fitted model to the held-out test partition.
predictions = M.transform(dataset=test_data)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy on the test predictions: compares the evaluator's prediction column
# ('prediction', its default) with the 'deposit_index' label.
multi_evaluator = MulticlassClassificationEvaluator(
    labelCol='deposit_index',
    predictionCol='prediction',
    metricName='accuracy',
)
print(
    'Logistic Regression Accuracy:',
    multi_evaluator.evaluate(predictions),
)

Logistic Regression Accuracy: 0.6900844819919965
