In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets

# metadata
print(adult.metadata)

# variable information
print(adult.variables)


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG

In [3]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=a3fd2139952fa38cb5d8412b8302f4407c86a25594465e3feacaa628567089fb
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [4]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('adult').getOrCreate()

In [5]:
data = spark.read.csv('/content/adult.data', header=True, inferSchema=True)

In [6]:
data.show()

+---+-----------------+--------+-------------+----+--------------------+------------------+--------------+-------------------+-------+-------+---+----+--------------+------+
| 39|        State-gov|   77516|    Bachelors|  13|       Never-married|      Adm-clerical| Not-in-family|              White|   Male|   2174|  0|  40| United-States| <=50K|
+---+-----------------+--------+-------------+----+--------------------+------------------+--------------+-------------------+-------+-------+---+----+--------------+------+
| 50| Self-emp-not-inc| 83311.0|    Bachelors|13.0|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|    0.0|0.0|13.0| United-States| <=50K|
| 38|          Private|215646.0|      HS-grad| 9.0|            Divorced| Handlers-cleaners| Not-in-family|              White|   Male|    0.0|0.0|40.0| United-States| <=50K|
| 53|          Private|234721.0|         11th| 7.0|  Married-civ-spouse| Handlers-cleaners|       Husband|              Black|   M

In [7]:
labels = ['age','workclass','fnlwgt','education','numbers','marital','occupation','relation','race','gender','gain','loss','hourlypay','country','income']

In [8]:
df = data.toDF(*labels)
df.show()

+---+-----------------+--------+-------------+-------+--------------------+------------------+--------------+-------------------+-------+-------+----+---------+--------------+------+
|age|        workclass|  fnlwgt|    education|numbers|             marital|        occupation|      relation|               race| gender|   gain|loss|hourlypay|       country|income|
+---+-----------------+--------+-------------+-------+--------------------+------------------+--------------+-------------------+-------+-------+----+---------+--------------+------+
| 50| Self-emp-not-inc| 83311.0|    Bachelors|   13.0|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|    0.0| 0.0|     13.0| United-States| <=50K|
| 38|          Private|215646.0|      HS-grad|    9.0|            Divorced| Handlers-cleaners| Not-in-family|              White|   Male|    0.0| 0.0|     40.0| United-States| <=50K|
| 53|          Private|234721.0|         11th|    7.0|  Married-civ-spouse| Handlers-

In [9]:
# Checking for the corret datatype for each column
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- numbers: double (nullable = true)
 |-- marital: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relation: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- gain: double (nullable = true)
 |-- loss: double (nullable = true)
 |-- hourlypay: double (nullable = true)
 |-- country: string (nullable = true)
 |-- income: string (nullable = true)



In [10]:
# Convert the some given feature in correct datatype
from pyspark.sql.functions import col
for i in ['age','fnlwgt','numbers','gain','loss','hourlypay']:
  new_df = df.withColumn(i, col(i).cast('integer'))

In [11]:
new_df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- numbers: double (nullable = true)
 |-- marital: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relation: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- gain: double (nullable = true)
 |-- loss: double (nullable = true)
 |-- hourlypay: integer (nullable = true)
 |-- country: string (nullable = true)
 |-- income: string (nullable = true)



In [12]:
# Now Check for the NULL Values
from pyspark.sql.functions import *

new_df.select([count(when(col(c).isNull(), c)).alias(c) for c in new_df.columns]).show()

+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+
|age|workclass|fnlwgt|education|numbers|marital|occupation|relation|race|gender|gain|loss|hourlypay|country|income|
+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+
|  0|        0|     0|        0|      0|      0|         0|       0|   0|     0|   0|   0|        0|      0|     0|
+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+



In [13]:
# Check for different values of columns
df.select('workclass').distinct().show()

+-----------------+
|        workclass|
+-----------------+
|        State-gov|
|      Federal-gov|
| Self-emp-not-inc|
|        Local-gov|
|          Private|
|                ?|
|     Self-emp-inc|
|      Without-pay|
|     Never-worked|
+-----------------+



In [14]:
# replace the unwanted value  with None
df =new_df.replace("?", None)

In [15]:
# Now Check for the NULL Values
from pyspark.sql.functions import *

df.select([count(when(col(c).isNull(), c)).alias(c) for c in new_df.columns]).show()

+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+
|age|workclass|fnlwgt|education|numbers|marital|occupation|relation|race|gender|gain|loss|hourlypay|country|income|
+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+
|  0|        0|     0|        0|      0|      0|         0|       0|   0|     0|   0|   0|        0|      0|     0|
+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+



In [16]:
df.groupby("country").count().show()

+-------------------+-----+
|            country|count|
+-------------------+-----+
| Dominican-Republic|   70|
|            Ireland|   24|
|               Cuba|   95|
|          Guatemala|   64|
|               Iran|   43|
|             Taiwan|   51|
|        El-Salvador|  106|
|      United-States|29169|
|              South|   80|
|              Japan|   62|
|          Nicaragua|   34|
|             Canada|  121|
|           Cambodia|   19|
|               Laos|   18|
|            Germany|  137|
|    Trinadad&Tobago|   19|
|               Peru|   31|
|            Ecuador|   28|
|         Yugoslavia|   16|
|            Vietnam|   67|
+-------------------+-----+
only showing top 20 rows



In [17]:
df = df.fillna("United-States", subset = ['country'])


In [18]:
df = df.fillna("Private", subset = ['workclass'])

In [19]:
df = df.fillna("Prof-speciality", subset = ['occupation'])

In [20]:
df.select([count(when(col(c).isNull(), c)).alias(c) for c in new_df.columns]).show()

+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+
|age|workclass|fnlwgt|education|numbers|marital|occupation|relation|race|gender|gain|loss|hourlypay|country|income|
+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+
|  0|        0|     0|        0|      0|      0|         0|       0|   0|     0|   0|   0|        0|      0|     0|
+---+---------+------+---------+-------+-------+----------+--------+----+------+----+----+---------+-------+------+



In [21]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler


In [22]:
df.columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'numbers',
 'marital',
 'occupation',
 'relation',
 'race',
 'gender',
 'gain',
 'loss',
 'hourlypay',
 'country',
 'income']

In [23]:
categorical_cols = [
 'workclass',
 'education',
 'marital',
 'occupation',
 'relation',
 'race',
 'gender',
 'country',
]

numerical_cols = ['age','fnlwgt','numbers','gain','loss','hourlypay']
label = 'income'

In [24]:
indexer = [StringIndexer(inputCol = c, outputCol = f"{c}_index", handleInvalid = "keep") for c in categorical_cols]


In [25]:
label_indexer = StringIndexer(inputCol = "income", outputCol = "label", handleInvalid = "keep")



In [26]:
assembler = VectorAssembler(inputCols = [f"{c}_index" for c in categorical_cols] + numerical_cols, outputCol = "features")

In [27]:
lr = LogisticRegression(featuresCol = "features", labelCol = "label")

In [28]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = indexer + [label_indexer, assembler, lr])

In [29]:
train_data, test_data = df.randomSplit([0.8, 0.2])

In [30]:
model = pipeline.fit(train_data)

In [31]:
predictions = model.transform(test_data)

In [32]:
predictions.show()

+---+----------+--------+-------------+-------+--------------+------------------+----------+-------------------+-------+------+----+---------+--------------+------+---------------+---------------+-------------+----------------+--------------+----------+------------+-------------+-----+--------------------+--------------------+--------------------+----------+
|age| workclass|  fnlwgt|    education|numbers|       marital|        occupation|  relation|               race| gender|  gain|loss|hourlypay|       country|income|workclass_index|education_index|marital_index|occupation_index|relation_index|race_index|gender_index|country_index|label|            features|       rawPrediction|         probability|prediction|
+---+----------+--------+-------------+-------+--------------+------------------+----------+-------------------+-------+------+----+---------+--------------+------+---------------+---------------+-------------+----------------+--------------+----------+------------+------------

In [33]:
predictions.select("prediction", "label", "features").show()

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|[3.0,5.0,1.0,7.0,...|
|       0.0|  0.0|[3.0,7.0,1.0,7.0,...|
|       0.0|  0.0|[3.0,7.0,1.0,7.0,...|
|       0.0|  0.0|[3.0,1.0,1.0,7.0,...|
|       0.0|  0.0|[3.0,10.0,1.0,7.0...|
|       0.0|  0.0|[3.0,7.0,1.0,7.0,...|
|       0.0|  0.0|[3.0,5.0,1.0,7.0,...|
|       0.0|  0.0|[3.0,7.0,1.0,7.0,...|
|       0.0|  0.0|[3.0,10.0,1.0,7.0...|
|       0.0|  0.0|[3.0,5.0,1.0,7.0,...|
|       0.0|  0.0|[3.0,7.0,1.0,7.0,...|
|       0.0|  0.0|[3.0,7.0,1.0,7.0,...|
|       0.0|  0.0|[2.0,5.0,1.0,0.0,...|
|       0.0|  0.0|[2.0,5.0,1.0,1.0,...|
|       0.0|  0.0|[2.0,11.0,1.0,3.0...|
|       0.0|  0.0|[0.0,7.0,1.0,5.0,...|
|       0.0|  0.0|[0.0,7.0,1.0,4.0,...|
|       0.0|  0.0|[0.0,7.0,1.0,5.0,...|
|       0.0|  0.0|[0.0,5.0,1.0,4.0,...|
|       0.0|  0.0|(14,[1,2,3,4,8,9,...|
+----------+-----+--------------------+
only showing top 20 rows



In [34]:
predictions.select("prediction", "label").distinct().show()

+----------+-----+
|prediction|label|
+----------+-----+
|       1.0|  1.0|
|       0.0|  1.0|
|       1.0|  0.0|
|       0.0|  0.0|
+----------+-----+



In [35]:
predictions.groupBy("prediction", "label").count().show()

+----------+-----+-----+
|prediction|label|count|
+----------+-----+-----+
|       1.0|  1.0|  776|
|       0.0|  1.0|  780|
|       1.0|  0.0|  283|
|       0.0|  0.0| 4562|
+----------+-----+-----+



In [36]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction", labelCol = "label", metricName = "accuracy")


In [37]:
accuracy = evaluator.evaluate(predictions)
accuracy

0.8339321980940478

In [38]:
df.show()

+---+-----------------+--------+-------------+-------+--------------------+------------------+--------------+-------------------+-------+-------+----+---------+--------------+------+
|age|        workclass|  fnlwgt|    education|numbers|             marital|        occupation|      relation|               race| gender|   gain|loss|hourlypay|       country|income|
+---+-----------------+--------+-------------+-------+--------------------+------------------+--------------+-------------------+-------+-------+----+---------+--------------+------+
| 50| Self-emp-not-inc| 83311.0|    Bachelors|   13.0|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|    0.0| 0.0|       13| United-States| <=50K|
| 38|          Private|215646.0|      HS-grad|    9.0|            Divorced| Handlers-cleaners| Not-in-family|              White|   Male|    0.0| 0.0|       40| United-States| <=50K|
| 53|          Private|234721.0|         11th|    7.0|  Married-civ-spouse| Handlers-

## Now We are Going to Implementing Naive Bayes With Help of Pyspark Framework of Python

* Generally we are Going to take dataset as it is like above here as well

In [39]:
 categorical_cols = [
 'education',
 'marital',
 'occupation',
 'relation',
 'race',
 'gender',
 'country',

]

numerical_cols = ['age','fnlwgt','numbers','gain','loss','hourlypay']
label = 'income'

In [40]:
def indexer(df, cols):
  indexer = StringIndexer(inputCol = cols, outputCol = f"{cols}_index", handleInvalid = "keep")
  indexed = indexer.fit(df).transform(df)
  return indexed


In [41]:
indexer(df, 'workclass').show()


+---+-----------------+--------+-------------+-------+--------------------+------------------+--------------+-------------------+-------+-------+----+---------+--------------+------+---------------+
|age|        workclass|  fnlwgt|    education|numbers|             marital|        occupation|      relation|               race| gender|   gain|loss|hourlypay|       country|income|workclass_index|
+---+-----------------+--------+-------------+-------+--------------------+------------------+--------------+-------------------+-------+-------+----+---------+--------------+------+---------------+
| 50| Self-emp-not-inc| 83311.0|    Bachelors|   13.0|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|    0.0| 0.0|       13| United-States| <=50K|            1.0|
| 38|          Private|215646.0|      HS-grad|    9.0|            Divorced| Handlers-cleaners| Not-in-family|              White|   Male|    0.0| 0.0|       40| United-States| <=50K|            0.0|
| 53|

In [42]:
for cols in categorical_cols:
  index_df = indexer(new_df, cols)
  new_df = index_df


In [43]:
 new_df = index_df


In [44]:
new_df.show()

+---+-----------------+--------+-------------+-------+--------------------+------------------+--------------+-------------------+-------+-------+----+---------+--------------+------+---------------+-------------+----------------+--------------+----------+------------+-------------+
|age|        workclass|  fnlwgt|    education|numbers|             marital|        occupation|      relation|               race| gender|   gain|loss|hourlypay|       country|income|education_index|marital_index|occupation_index|relation_index|race_index|gender_index|country_index|
+---+-----------------+--------+-------------+-------+--------------------+------------------+--------------+-------------------+-------+-------+----+---------+--------------+------+---------------+-------------+----------------+--------------+----------+------------+-------------+
| 50| Self-emp-not-inc| 83311.0|    Bachelors|   13.0|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|    0.0| 0.0| 

In [45]:
assembler = VectorAssembler(inputCols = [f"{c}_index" for c in categorical_cols] + numerical_cols, outputCol = "features")

In [46]:
vector = assembler.transform(new_df)

In [47]:
vector.show()

+---+-----------------+--------+-------------+-------+--------------------+------------------+--------------+-------------------+-------+-------+----+---------+--------------+------+---------------+-------------+----------------+--------------+----------+------------+-------------+--------------------+
|age|        workclass|  fnlwgt|    education|numbers|             marital|        occupation|      relation|               race| gender|   gain|loss|hourlypay|       country|income|education_index|marital_index|occupation_index|relation_index|race_index|gender_index|country_index|            features|
+---+-----------------+--------+-------------+-------+--------------------+------------------+--------------+-------------------+-------+-------+----+---------+--------------+------+---------------+-------------+----------------+--------------+----------+------------+-------------+--------------------+
| 50| Self-emp-not-inc| 83311.0|    Bachelors|   13.0|  Married-civ-spouse|   Exec-manag

# Find The Correlation Using Correlation Matrix
* Pearson - r1
* Spearman - r2

In [48]:
from pyspark.ml.stat import Correlation

In [49]:
r1 = Correlation.corr(vector, 'features').head()

In [50]:
print("Pearson correlation matrix:\n" + str(r1[0]))

Pearson correlation matrix:
DenseMatrix([[ 1.00000000e+00, -5.43416782e-03, -1.64898072e-02,
              -1.40359718e-02,  3.20363191e-02, -4.02192490e-02,
               8.29989518e-02,  7.61126901e-02,  3.44797621e-02,
              -1.70080159e-01,  5.92854586e-02,  2.19370853e-02,
              -7.92968005e-03],
             [-5.43416782e-03,  1.00000000e+00,  4.46484310e-02,
               4.11141505e-01,  6.89452591e-02,  4.08306814e-01,
               2.15272445e-02,  3.05671551e-02,  5.45065475e-03,
              -1.04881636e-01, -5.77150459e-02, -5.33558913e-02,
              -1.46710822e-01],
             [-1.64898072e-02,  4.46484310e-02,  1.00000000e+00,
               5.26534903e-02,  3.65743398e-02, -3.45920157e-02,
              -3.15452062e-04, -5.94370103e-02,  1.55767708e-02,
              -3.16544139e-01, -7.14525414e-02, -4.84331521e-02,
              -7.72244909e-02],
             [-1.40359718e-02,  4.11141505e-01,  5.26534903e-02,
               1.00000000e+00, 

In [51]:
r2 = Correlation.corr(vector, "features", "spearman").head()

In [52]:
print("Spearman correlation matrix:\n" + str(r2[0]))

Spearman correlation matrix:
DenseMatrix([[ 1.        , -0.03810954, -0.12668817, -0.0365897 , -0.0013781 ,
              -0.02272837,  0.12222815,  0.04408941,  0.0131838 ,  0.29282209,
               0.05706276,  0.03123954,  0.01966969],
             [-0.03810954,  1.        ,  0.07998075,  0.66129197,  0.10053435,
               0.45472477, -0.00328774, -0.15117104,  0.01539424, -0.09460986,
              -0.11162906, -0.06780698, -0.20603001],
             [-0.12668817,  0.07998075,  1.        ,  0.09577575,  0.06332943,
               0.00218237,  0.01937029, -0.10199891,  0.01460209, -0.35488984,
              -0.07996246, -0.05485479, -0.13256778],
             [-0.0365897 ,  0.66129197,  0.09577575,  1.        ,  0.14297465,
               0.6029828 ,  0.03757128, -0.32093609,  0.01910783, -0.10718362,
              -0.10159153, -0.068143  , -0.29835   ],
             [-0.0013781 ,  0.10053435,  0.06332943,  0.14297465,  1.        ,
               0.0992671 ,  0.22341099, -0.0

In [53]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [80]:
indexer = StringIndexer(inputCol = "income", outputCol = "label", handleInvalid = "keep")
data = indexer.fit(vector).transform(vector)


In [81]:
nb = NaiveBayes(featuresCol="features",labelCol = 'label',smoothing=1.0, modelType="multinomial")

In [82]:
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]

In [67]:
nb = NaiveBayes(featuresCol="features",labelCol = 'label',smoothing=1.0, modelType="multinomial")

In [84]:
model = nb.fit(train)



In [95]:
# select example rows to display.
predictions = model.transform(test)
predictions.show()

+---+---------+--------+---------+-------+--------------+----------+---------------+------+-------+----+----+---------+--------------+------+---------------+-------------+----------------+--------------+----------+------------+-------------+--------------------+-----+--------------------+--------------------+----------+
|age|workclass|  fnlwgt|education|numbers|       marital|occupation|       relation|  race| gender|gain|loss|hourlypay|       country|income|education_index|marital_index|occupation_index|relation_index|race_index|gender_index|country_index|            features|label|       rawPrediction|         probability|prediction|
+---+---------+--------+---------+-------+--------------+----------+---------------+------+-------+----+----+---------+--------------+------+---------------+-------------+----------------+--------------+----------+------------+-------------+--------------------+-----+--------------------+--------------------+----------+
| 17|        ?| 34019.0|     10th|

In [86]:
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.7855888863445538


# Now We are Going to Implement Decision Tree Model

In [87]:
from pyspark.ml.classification import DecisionTreeClassifier

In [112]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxBins=45, maxDepth= 8,impurity= "Entropy")  # Set maxBins to at least 43
model = dt.fit(train)

In [113]:
predictions = model.transform(test)

In [114]:
# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|[7.0,1.0,7.0,2.0,...|
|       0.0|  0.0|[11.0,1.0,7.0,2.0...|
|       0.0|  0.0|[5.0,1.0,7.0,2.0,...|
|       0.0|  0.0|[7.0,1.0,7.0,2.0,...|
|       0.0|  0.0|[5.0,1.0,7.0,2.0,...|
+----------+-----+--------------------+
only showing top 5 rows



In [117]:
# Select (prediction, true label) and compute test error

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % (accuracy))
print("Test Error = %g " % (1.0 - accuracy))

Accuracy = 0.850011
Test Error = 0.149989 


In [116]:
print(model)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_42b42aa68456, depth=8, numNodes=175, numClasses=3, numFeatures=13
