In [30]:
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('algorithms').getOrCreate()

In [13]:
data = spark.read.load('Korea Income and Welfare Output.csv', format="csv", header=True, inferSchema=True)

data.printSchema()

print('Number of columns: ', len(data.columns))
print('Total number of rows: ', data.count())

root
 |-- year: integer (nullable = true)
 |-- region: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- year_born: integer (nullable = true)
 |-- education_tier: integer (nullable = true)
 |-- marriage_tier: integer (nullable = true)
 |-- income_tier: integer (nullable = true)
 |-- occupation_code: integer (nullable = true)

Number of columns:  8
Total number of rows:  52579


In [3]:
data.columns

['year',
 'region',
 'gender',
 'year_born',
 'education_tier',
 'marriage_tier',
 'income_tier',
 'occupation_code']

In [14]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [15]:
assembler = VectorAssembler(
  inputCols=['year',
             'region',
             'year_born',
             'education_tier',
             'marriage_tier',
             'income_tier',
             'occupation_code'],
              outputCol="features")

In [16]:
output = assembler.transform(data)

print(output)

DataFrame[year: int, region: int, gender: int, year_born: int, education_tier: int, marriage_tier: int, income_tier: int, occupation_code: int, features: vector]


In [17]:
final_data = output.select("features",'gender')

final_data.show(10, False)

+------------------------------------+------+
|features                            |gender|
+------------------------------------+------+
|[2005.0,1.0,1962.0,2.0,1.0,2.0,51.0]|1     |
|[2006.0,1.0,1962.0,2.0,1.0,2.0,51.0]|1     |
|[2007.0,1.0,1962.0,2.0,1.0,2.0,51.0]|1     |
|[2008.0,1.0,1962.0,2.0,1.0,2.0,53.0]|1     |
|[2009.0,1.0,1962.0,2.0,1.0,2.0,53.0]|1     |
|[2010.0,1.0,1962.0,2.0,1.0,2.0,53.0]|1     |
|[2011.0,1.0,1962.0,2.0,1.0,2.0,53.0]|1     |
|[2012.0,1.0,1962.0,2.0,1.0,2.0,53.0]|1     |
|[2013.0,1.0,1962.0,2.0,1.0,2.0,53.0]|1     |
|[2014.0,1.0,1962.0,2.0,1.0,2.0,53.0]|1     |
+------------------------------------+------+
only showing top 10 rows



In [18]:
(train_data,test_data) = final_data.randomSplit([0.8,0.2])

print(train_data.count())
print(test_data.count())

42150
10429


In [20]:
#from pyspark.ml import Pipeline
from pyspark.ml.classification import (DecisionTreeClassifier, RandomForestClassifier)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
acc_evaluator = MulticlassClassificationEvaluator(labelCol="gender", predictionCol="prediction", metricName="accuracy")

In [21]:
dtc = DecisionTreeClassifier(labelCol='gender',featuresCol='features')
dtc_model = dtc.fit(train_data)
dtc_predictions = dtc_model.transform(test_data)

dtc_acc = acc_evaluator.evaluate(dtc_predictions)
print(dtc_acc)

0.903538210758462


In [28]:
rfc = RandomForestClassifier(labelCol='gender',featuresCol='features',numTrees=10)
rfc_model = rfc.fit(train_data)
rfc_predictions = rfc_model.transform(test_data)
rfc_predictions.show(10)

rfc_acc = acc_evaluator.evaluate(rfc_predictions)
print(rfc_acc)

+--------------------+------+--------------------+--------------------+----------+
|            features|gender|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|[2005.0,1.0,1959....|     2|[0.0,1.3151212984...|[0.0,0.1315121298...|       2.0|
|[2005.0,1.0,1959....|     2|[0.0,1.8993791422...|[0.0,0.1899379142...|       2.0|
|[2005.0,1.0,1959....|     2|[0.0,1.3151212984...|[0.0,0.1315121298...|       2.0|
|[2005.0,1.0,1959....|     2|[0.0,1.3151212984...|[0.0,0.1315121298...|       2.0|
|[2005.0,1.0,1959....|     2|[0.0,1.3151212984...|[0.0,0.1315121298...|       2.0|
|[2005.0,1.0,1959....|     2|[0.0,1.3151212984...|[0.0,0.1315121298...|       2.0|
|[2005.0,1.0,1959....|     2|[0.0,1.3151212984...|[0.0,0.1315121298...|       2.0|
|[2005.0,1.0,1959....|     2|[0.0,1.3151212984...|[0.0,0.1315121298...|       2.0|
|[2005.0,1.0,1959....|     2|[0.0,1.0518714480...|[0.0,0.1051871448...|       2.0|
|[20

In [29]:
print(rfc_model.featureImportances) 

(7,[0,1,2,3,4,5,6],[0.004341824006876491,0.004200097064169931,0.021784287322713432,0.0016524617232221658,0.8583636081241375,0.062309620537806,0.04734810122107439])


In [27]:
print("Here are the results!")
print('-'*40)
print('A single decision tree has an accuracy of: {0:2.2f}%'.format(dtc_acc*100))
print('-'*40)
print('A random forest ensemble has an accuracy of: {0:2.2f}%'.format(rfc_acc*100))

Here are the results!
----------------------------------------
A single decision tree has an accuracy of: 90.35%
----------------------------------------
A random forest ensemble has an accuracy of: 88.88%
