# Bank Marketing Classifier

**Links de Referência**:

* https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.DataFrame.html
* https://sparkbyexamples.com/pyspark/pyspark-structtype-and-structfield/
* https://sparkbyexamples.com/pyspark/pyspark-map-transformation/

### Data Loading and Package Imports

In [1]:
from pyspark.sql import Row #Converte RDDs em objetos do tipo Row
from pyspark.ml.feature import StringIndexer #Converte strings em valores numéricos
from pyspark.ml.linalg import Vectors #Serve para criar um vetor denso
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
import sys
# Record the interpreter and Spark versions used for this run.
# NOTE(review): `sc` is not defined anywhere in this file — presumably the
# SparkContext injected by the PySpark shell/kernel; confirm.
print(f'System Version: {sys.version}')
print(f'Spark Context Version: {sc.version}')

System Version: 3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]
Spark Context Version: 3.0.3


In [3]:
# Spark Session - the entry point used when working with DataFrames in Spark.
# SparkSession is imported here explicitly because nothing else in this
# notebook imports it; without the import this cell only runs in kernels
# that happen to pre-load the name.
from pyspark.sql import SparkSession

spSession = (
    SparkSession.builder
    .master("local")
    .appName("DSA-SparkMLLib")
    .config("spark.some.config.option", "session")
    .getOrCreate()
)

In [4]:
# Load the CSV file as an RDD of raw text lines.
# The commented-out line keeps the path of the alternative dataset file.
#rdd = sc.textFile('data/bank-marketing-dataset.csv')
rdd = sc.textFile('data/test.csv')

### Overview

In [5]:
# Check which kind of object textFile returned.
type(rdd)

pyspark.rdd.RDD

In [6]:
# Number of lines in the file; the header line is still part of the RDD here.
rdd.count()

41189

In [7]:
# List the first 5 records (the first one is the CSV header line)
rdd.take(5)

['age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y',
 '56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no',
 '57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no',
 '37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no',
 '40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no']

In [8]:
# The first line of the file holds the column names.
header = rdd.first()

# Drop every line that contains the header text, then split the remaining
# lines on commas so each record becomes a list of field strings.
rdd_body = (
    rdd
    .filter(lambda line: header not in line)
    .map(lambda line: line.split(','))
)

# Column names derived from the header: dots become underscores, upper-cased.
list_columns = header.replace('.', '_').upper().split(',')
list_columns

['AGE',
 'JOB',
 'MARITAL',
 'EDUCATION',
 'DEFAULT',
 'HOUSING',
 'LOAN',
 'CONTACT',
 'MONTH',
 'DAY_OF_WEEK',
 'CAMPAIGN',
 'PDAYS',
 'PREVIOUS',
 'POUTCOME',
 'EMP_VAR_RATE',
 'CONS_PRICE_IDX',
 'CONS_CONF_IDX',
 'EURIBOR3M',
 'NR_EMPLOYED',
 'Y']

In [9]:
def createRow(p):
    """Build a Row whose field names are taken from ``list_columns``.

    The previous version did not work for two reasons: it enumerated
    ``header``, which is a single string, so the keys were individual
    characters instead of column names; and it passed the dict to ``Row``
    positionally, which produces a Row holding the dict as its only value
    rather than one named field per column.

    Args:
        p: Sequence of field values for one record, in column order.

    Returns:
        A Row with one named field per entry of ``list_columns``.

    Raises:
        IndexError: If ``p`` has fewer values than there are columns.
    """
    fields = {name: p[i] for i, name in enumerate(list_columns)}
    # Row takes its field names from keyword arguments, so unpack the dict.
    return Row(**fields)


# rdd_row1 = rdd_body.map(createRow)  # RDD with dynamically generated keys

In [10]:
def _fields_to_row(p):
    """Turn one list of field values into a Row with named fields.

    Values are taken by position; the last two fields are exposed as
    EMPLOYED and TARGET.
    """
    return Row(
        AGE=p[0],
        JOB=p[1],
        MARITAL=p[2],
        EDUCATION=p[3],
        DEFAULT=p[4],
        HOUSING=p[5],
        LOAN=p[6],
        CONTACT=p[7],
        MONTH=p[8],
        DAY_OF_WEEK=p[9],
        CAMPAIGN=p[10],
        PDAYS=p[11],
        PREVIOUS=p[12],
        POUTCOME=p[13],
        EMP_VAR_RATE=p[14],
        CONS_PRICE_IDX=p[15],
        CONS_CONF_IDX=p[16],
        EURIBOR3M=p[17],
        EMPLOYED=p[18],
        TARGET=p[19],
    )


# Convert every split record into a Row with named fields.
rdd_row = rdd_body.map(_fields_to_row)

In [11]:
# Create a DataFrame from the RDD of Rows
rdd_df = spSession.createDataFrame(rdd_row)
# Mark the DataFrame for caching; as the last expression of the cell this
# also displays the DataFrame's column names and types.
rdd_df.cache()

DataFrame[AGE: string, JOB: string, MARITAL: string, EDUCATION: string, DEFAULT: string, HOUSING: string, LOAN: string, CONTACT: string, MONTH: string, DAY_OF_WEEK: string, CAMPAIGN: string, PDAYS: string, PREVIOUS: string, POUTCOME: string, EMP_VAR_RATE: string, CONS_PRICE_IDX: string, CONS_CONF_IDX: string, EURIBOR3M: string, EMPLOYED: string, TARGET: string]

In [12]:
# Count, for every column, the values that look missing: text containing
# 'None' or 'NULL', an empty string, a SQL null, or NaN.
from pyspark.sql.functions import col,isnan,when,count


def _looks_missing(name):
    """Return the boolean Column expression flagging a missing value in *name*."""
    value = col(name)
    return (
        value.contains('None')
        | value.contains('NULL')
        | (value == '')
        | value.isNull()
        | isnan(name)
    )


rdd_na = rdd_df.select(
    [count(when(_looks_missing(c), c)).alias(c) for c in rdd_df.columns]
)
rdd_na.show()

+---+---+-------+---------+-------+-------+----+-------+-----+-----------+--------+-----+--------+--------+------------+--------------+-------------+---------+--------+------+
|AGE|JOB|MARITAL|EDUCATION|DEFAULT|HOUSING|LOAN|CONTACT|MONTH|DAY_OF_WEEK|CAMPAIGN|PDAYS|PREVIOUS|POUTCOME|EMP_VAR_RATE|CONS_PRICE_IDX|CONS_CONF_IDX|EURIBOR3M|EMPLOYED|TARGET|
+---+---+-------+---------+-------+-------+----+-------+-----+-----------+--------+-----+--------+--------+------------+--------------+-------------+---------+--------+------+
|  0|  0|      1|        0|      1|      1|   0|      0|    0|          1|       0|    0|       0|       2|           0|             1|            0|        0|       0|     0|
+---+---+-------+---------+-------+-------+----+-------+-----+-----------+--------+-----+--------+--------+------------+--------------+-------------+---------+--------+------+



In [13]:
# Print the number of distinct values found in each column.
list_columns = rdd_df.columns

for column in list_columns:
    # Deliberately not named `count`: that name is the aggregate function
    # imported from pyspark.sql.functions, and rebinding it to an int would
    # break any later use of the function in this session.
    distinct_count = rdd_df.select(column).distinct().count()
    print(f'Column: {column}\tCount: {distinct_count}')

Column: AGE	Count: 78
Column: JOB	Count: 12
Column: MARITAL	Count: 5
Column: EDUCATION	Count: 8
Column: DEFAULT	Count: 4
Column: HOUSING	Count: 4
Column: LOAN	Count: 3
Column: CONTACT	Count: 2
Column: MONTH	Count: 10
Column: DAY_OF_WEEK	Count: 6
Column: CAMPAIGN	Count: 42
Column: PDAYS	Count: 27
Column: PREVIOUS	Count: 8
Column: POUTCOME	Count: 4
Column: EMP_VAR_RATE	Count: 10
Column: CONS_PRICE_IDX	Count: 27
Column: CONS_CONF_IDX	Count: 26
Column: EURIBOR3M	Count: 316
Column: EMPLOYED	Count: 11
Column: TARGET	Count: 2


### Handling Missing Data

Columns with missing values:
* MARITAL
* DEFAULT
* HOUSING
* DAY_OF_WEEK
* POUTCOME
* CONS_PRICE_IDX

In [43]:
def verificarNA(c):
    """Return True when *c* is a missing-value marker.

    A value counts as missing when it is None, a float NaN, or one of the
    strings 'None', 'NULL' or '' (empty).

    The previous condition chained the comparisons with ``|``, which binds
    tighter than ``==`` and therefore raised TypeError on string input, and
    it called the Spark column function ``isnan`` on a plain Python value.

    Args:
        c: The plain Python value to test.

    Returns:
        bool: True if *c* is considered missing, False otherwise.
    """
    import math  # local import: only this helper needs it

    if c is None:
        return True
    if isinstance(c, float):
        return math.isnan(c)
    return c in ('None', 'NULL', '')

def mapNA(c, value):
    """Substitute *value* for *c* when verificarNA reports *c* as missing.

    Args:
        c: The value to inspect.
        value: The replacement returned for a missing *c*.

    Returns:
        *value* if ``verificarNA(c)`` is truthy, otherwise *c* unchanged.
    """
    return value if verificarNA(c) else c

**MARITAL**

In [29]:
# Count the rows for every (TARGET, MARITAL) pair.
# The aggregation already yields a DataFrame, so it is used directly instead
# of being collected to the driver and turned back into a DataFrame.
df_group = rdd_df.groupBy(['TARGET', 'MARITAL']).agg({'MARITAL': 'count'})

# Sort by MARITAL ascending, then by the count descending.
df_group = df_group.orderBy(['MARITAL', 'count(MARITAL)'], ascending=[1, 0])

df_group.collect()

[Row(TARGET='no', MARITAL='', count(MARITAL)=1),
 Row(TARGET='no', MARITAL='divorced', count(MARITAL)=4136),
 Row(TARGET='yes', MARITAL='divorced', count(MARITAL)=476),
 Row(TARGET='no', MARITAL='married', count(MARITAL)=22396),
 Row(TARGET='yes', MARITAL='married', count(MARITAL)=2532),
 Row(TARGET='no', MARITAL='single', count(MARITAL)=9947),
 Row(TARGET='yes', MARITAL='single', count(MARITAL)=1620),
 Row(TARGET='no', MARITAL='unknown', count(MARITAL)=68),
 Row(TARGET='yes', MARITAL='unknown', count(MARITAL)=12)]

In [61]:
def verificarNA(c):
    """Return True when *c* equals one of the marker strings.

    The markers are 'None', 'NULL', the empty string and 'married'.
    NOTE(review): 'married' looks like a leftover from experimenting with
    the check — confirm before reusing this version.
    """
    return c in ('None', 'NULL', '', 'married')

# Quick manual check with a value that is not one of the markers.
teste = '4'
verificarNA(teste)

False

In [68]:
def verificarNA(c):
    """Return True when *c* represents a missing value.

    None counts as missing. Any other value is upper-cased and compared
    with the markers 'NONE', 'NULL', '' (empty) and 'NAN', so the match is
    case-insensitive.

    Args:
        c: The value to test; expected to be a string or None.

    Returns:
        bool: True if *c* is considered missing, False otherwise.
    """
    # Handle None first: calling .upper() on it would raise AttributeError.
    if c is None:
        return True
    return c.upper() in ('NONE', 'NULL', '', 'NAN')

def func1(x):
    """Return ``(MARITAL, TARGET)`` for a row, with a missing MARITAL set to 'NULO'.

    Args:
        x: A row exposing ``MARITAL`` and ``TARGET`` attributes.

    Returns:
        tuple: The MARITAL value (or 'NULO' when verificarNA flags it)
        followed by the TARGET value.
    """
    marital = x.MARITAL
    target = x.TARGET

    # TODO: apply the same check to all the other columns
    if verificarNA(marital):
        marital = 'NULO'

    return (marital, target)

# Apply func1 to every row of the DataFrame's underlying RDD.
rdd2 = rdd_df.rdd.map(func1)

In [67]:
# Bring the mapped records back to the driver to inspect them.
rdd2.collect()

[('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('single', 'no'),
 ('single', 'no'),
 ('NULO', 'no'),
 ('single', 'no'),
 ('single', 'no'),
 ('divorced', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('single', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('single', 'no'),
 ('single', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('single', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('divorced', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('unknown', 'no'),
 ('NULO', 'no'),
 ('single', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('single', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('single', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no'),
 ('NULO', 'no')

In [None]:
# RDD.map() requires the function to apply; calling it with no argument
# raises TypeError. func1 is the NA-handling mapper defined in this notebook.
# NOTE(review): func1 only treats MARITAL and returns (MARITAL, TARGET)
# tuples — extend it before relying on this result for the other columns.
rdd_fillna = rdd_row.map(func1)

### Alterando o tipo de dado

In [None]:
from pyspark.sql.types import IntegerType,BooleanType,DateType, StringType

# Replace the AGE column with the same values cast to an integer type.
df = rdd_df.withColumn('AGE', col('AGE').cast(IntegerType()))

In [None]:
# Inspect the column names and types after the cast.
df.dtypes

In [None]:
# Count the rows for every (TARGET, MARITAL) pair and collect the result to the driver.
df_group = df.groupBy(['TARGET', 'MARITAL']).agg({'MARITAL': 'count'}).collect()

In [None]:
# Order the collected rows to make the listing easier to read.
sorted(df_group)

In [None]:
# Replace missing MARITAL values with 'unknown'.
# na.fill() only touches null/NaN values, but in this dataset a missing
# MARITAL appears as an empty string (the fields come from splitting text
# lines), so the empty string is replaced explicitly as well.
df_fillna = (
    df.na.fill({'MARITAL': 'unknown'})
    .replace('', 'unknown', subset=['MARITAL'])
)

In [None]:
# Recount the (TARGET, MARITAL) pairs on the filled DataFrame to check the result.
df_group = df_fillna.groupBy(['TARGET', 'MARITAL']).agg({'MARITAL': 'count'}).collect()
df_group

In [None]:
def _map_marital_na(batches):
    """Apply mapNA to the MARITAL column of each pandas batch.

    Args:
        batches: Iterator of pandas DataFrames, as supplied by mapInPandas.

    Yields:
        Each batch with missing MARITAL values replaced by 'MAPPED'.
    """
    for pdf in batches:
        yield pdf.assign(MARITAL=pdf['MARITAL'].map(lambda v: mapNA(v, 'MAPPED')))


# mapInPandas expects a function over an iterator of pandas DataFrames plus
# the output schema. The previous call passed a per-value mapper and
# `df.Select(...)`, which is not a DataFrame method and is not a schema.
df_fillna = df.mapInPandas(_map_marital_na, schema=df.schema)

### Normalização dos dados

In [None]:
# Encode the JOB strings as numeric indices in a new IDX_JOB column.
string_indexer = StringIndexer(inputCol = 'JOB', outputCol = 'IDX_JOB')
# Fit the indexer on the data to obtain the model used for the transformation.
si_model = string_indexer.fit(rdd_df)

In [None]:
# Apply the fitted indexer, producing a DataFrame that also carries IDX_JOB.
rdd_df_norm = si_model.transform(rdd_df)

In [None]:
# List each distinct JOB value together with the index assigned to it.
rdd_df_norm.select("JOB","IDX_JOB").distinct().collect()