In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=e7fc0aa749313a3ded6d38f4b216eb68ca46f2a8eee55ded74a7db99120c0a0a
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [2]:
import pyspark
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.master("local").appName("Practices").getOrCreate()

In [5]:
spark

In [6]:
df = spark.read.csv('input.csv', inferSchema = True, header = True)
df.printSchema()

root
 |-- MSSV: integer (nullable = true)
 |-- NMDT: double (nullable = true)
 |-- CTRR: double (nullable = true)
 |-- HTS: double (nullable = true)
 |-- KTLT: double (nullable = true)
 |-- CTDLGT: double (nullable = true)
 |-- KTMT: double (nullable = true)
 |-- LTHDT: double (nullable = true)
 |-- MHH: double (nullable = true)
 |-- HCSDL: double (nullable = true)
 |-- HDH: double (nullable = true)
 |-- NMTTNT: integer (nullable = true)
 |-- DHMT: integer (nullable = true)
 |-- MMANM: integer (nullable = true)



### Decision Tree Classifier - PySpark Implementation

In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
df.columns

['MSSV',
 'NMDT',
 'CTRR',
 'HTS',
 'KTLT',
 'CTDLGT',
 'KTMT',
 'LTHDT',
 'MHH',
 'HCSDL',
 'HDH',
 'NMTTNT',
 'DHMT',
 'MMANM']

After loading the data, we will perform some further data processing before we train our Decision Tree algorithm:
- Transformer — create a new “features” column that store all data features as an array
- Data Split — split the data into 80% training data and 20% testing data

In [9]:
assembler = VectorAssembler(inputCols = ['NMDT','CTRR','HTS','KTLT','CTDLGT','KTMT','LTHDT','MHH','HCSDL','HDH'],
                           outputCol = 'features')
output = assembler.transform(df)
output.show(truncate = False)

+-------+----+----+---+----+------+----+-----+----+-----+---+------+----+-----+-------------------------------------------+
|MSSV   |NMDT|CTRR|HTS|KTLT|CTDLGT|KTMT|LTHDT|MHH |HCSDL|HDH|NMTTNT|DHMT|MMANM|features                                   |
+-------+----+----+---+----+------+----+-----+----+-----+---+------+----+-----+-------------------------------------------+
|75321  |6.5 |6.5 |5.0|6.0 |5.5   |6.5 |7.5  |7.0 |8.0  |6.5|1     |0   |1    |[6.5,6.5,5.0,6.0,5.5,6.5,7.5,7.0,8.0,6.5]  |
|81657  |7.5 |7.0 |5.5|7.0 |7.0   |7.5 |7.5  |6.0 |7.5  |7.5|1     |0   |0    |[7.5,7.0,5.5,7.0,7.0,7.5,7.5,6.0,7.5,7.5]  |
|209312 |7.0 |6.5 |5.5|7.5 |4.5   |7.0 |0.0  |7.0 |6.0  |8.0|1     |1   |0    |[7.0,6.5,5.5,7.5,4.5,7.0,0.0,7.0,6.0,8.0]  |
|583833 |6.5 |7.0 |6.0|5.5 |6.5   |7.0 |8.0  |7.0 |6.0  |7.0|1     |1   |1    |[6.5,7.0,6.0,5.5,6.5,7.0,8.0,7.0,6.0,7.0]  |
|1509861|7.5 |5.0 |5.0|9.0 |3.5   |6.5 |6.0  |7.0 |7.5  |6.5|0     |0   |0    |[7.5,5.0,5.0,9.0,3.5,6.5,6.0,7.0,7.5,6.5]  |
|2047807

In [10]:
output.select('features','NMTTNT','DHMT','MMANM').show(truncate = False)

+-------------------------------------------+------+----+-----+
|features                                   |NMTTNT|DHMT|MMANM|
+-------------------------------------------+------+----+-----+
|[6.5,6.5,5.0,6.0,5.5,6.5,7.5,7.0,8.0,6.5]  |1     |0   |1    |
|[7.5,7.0,5.5,7.0,7.0,7.5,7.5,6.0,7.5,7.5]  |1     |0   |0    |
|[7.0,6.5,5.5,7.5,4.5,7.0,0.0,7.0,6.0,8.0]  |1     |1   |0    |
|[6.5,7.0,6.0,5.5,6.5,7.0,8.0,7.0,6.0,7.0]  |1     |1   |1    |
|[7.5,5.0,5.0,9.0,3.5,6.5,6.0,7.0,7.5,6.5]  |0     |0   |0    |
|[6.5,6.5,7.5,9.5,8.5,6.5,8.5,7.5,8.5,8.5]  |1     |1   |0    |
|[9.0,8.0,9.5,10.0,9.0,8.0,8.0,10.0,8.0,8.5]|0     |0   |0    |
|[8.5,7.0,7.0,7.0,6.0,8.0,7.0,6.5,7.5,8.0]  |0     |0   |0    |
|[7.5,8.5,8.5,10.0,8.5,8.0,8.0,8.5,0.0,9.0] |0     |0   |0    |
|[5.5,7.0,7.0,7.5,7.5,6.0,8.0,7.0,7.5,7.5]  |1     |1   |1    |
|[8.0,7.0,6.0,7.5,5.0,7.0,6.5,6.5,0.0,5.5]  |0     |0   |0    |
|[7.5,8.0,6.5,9.0,8.5,7.5,7.0,8.5,8.0,7.0]  |0     |1   |0    |
|[7.0,6.5,6.0,6.5,7.5,6.0,7.5,5.5,7.5,6.

In [27]:
model_df = output.select('features', 'NMTTNT', 'DHMT', 'MMANM')
train_df, test_df = model_df.randomSplit([0.8,0.2])
print(train_df.count(), test_df.count())

286 73


In [15]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

Once the data has been processed, we can train our Decision Tree classifier using the `pyspark.ml.classification` library on our training data. Then, we can use our testing data to make predictions, and `pyspark.ml.evaluation` library to get the accuracy and precision of our classifier.

In [28]:
def courseClassifier(course_name):
    # train the model (df_classifier) using training data
    df_classifier = DecisionTreeClassifier(labelCol=course_name).fit(train_df)
    
    # test the model and make predictions using test data
    df_predict = df_classifier.transform(test_df)
    df_predict.select(course_name, 'prediction').show()
    
    df_accuracy = MulticlassClassificationEvaluator(labelCol=course_name,
                                               metricName='accuracy').evaluate(df_predict)
    df_precision = MulticlassClassificationEvaluator(labelCol=course_name,
                                                metricName='weightedPrecision').evaluate(df_predict)
    
    print("Accuracy: ", df_accuracy)
    print("Precision: ", df_precision)
    return df_classifier

In [33]:
NMTTNT_classifier = courseClassifier('NMTTNT')
NMTTNT_classifier.featureImportances

+------+----------+
|NMTTNT|prediction|
+------+----------+
|     0|       0.0|
|     1|       1.0|
|     0|       0.0|
|     1|       1.0|
|     0|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       0.0|
|     0|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
|     1|       1.0|
+------+----------+
only showing top 20 rows

Accuracy:  0.8356164383561644
Precision:  0.8348540798094104


SparseVector(10, {0: 0.4723, 1: 0.1324, 2: 0.0964, 3: 0.0661, 4: 0.0938, 5: 0.0782, 6: 0.0319, 7: 0.0289})

In [34]:
DHMT_classifier = courseClassifier('DHMT')
DHMT_classifier.featureImportances

+----+----------+
|DHMT|prediction|
+----+----------+
|   0|       0.0|
|   1|       1.0|
|   0|       0.0|
|   0|       1.0|
|   0|       0.0|
|   1|       0.0|
|   0|       1.0|
|   1|       1.0|
|   0|       1.0|
|   1|       1.0|
|   1|       1.0|
|   0|       1.0|
|   0|       1.0|
|   1|       1.0|
|   1|       1.0|
|   1|       1.0|
|   1|       1.0|
|   1|       1.0|
|   0|       0.0|
|   1|       1.0|
+----+----------+
only showing top 20 rows

Accuracy:  0.8082191780821918
Precision:  0.8036262650538064


SparseVector(10, {0: 0.2865, 1: 0.0326, 2: 0.1655, 3: 0.0668, 4: 0.1058, 5: 0.0921, 6: 0.2036, 9: 0.0471})

In [35]:
MMAMM_classifier = courseClassifier('MMANM')
MMAMM_classifier.featureImportances

+-----+----------+
|MMANM|prediction|
+-----+----------+
|    0|       0.0|
|    1|       1.0|
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       1.0|
|    0|       1.0|
|    1|       0.0|
|    0|       1.0|
|    0|       1.0|
|    1|       0.0|
|    1|       0.0|
|    0|       0.0|
|    1|       1.0|
|    0|       1.0|
|    0|       1.0|
|    1|       0.0|
|    0|       1.0|
|    0|       0.0|
+-----+----------+
only showing top 20 rows

Accuracy:  0.7123287671232876
Precision:  0.7207701747985008


SparseVector(10, {0: 0.1483, 1: 0.1193, 3: 0.1918, 4: 0.0859, 5: 0.044, 6: 0.1305, 7: 0.0957, 8: 0.1845})

In [None]:
# The following code lines only help visualizing DTs.

In [37]:
MMAMM_classifier.toDebugString

'DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d046351cd097, depth=5, numNodes=39, numClasses=2, numFeatures=10\n  If (feature 3 <= 8.25)\n   If (feature 4 <= 6.75)\n    If (feature 0 <= 6.25)\n     Predict: 0.0\n    Else (feature 0 > 6.25)\n     If (feature 0 <= 7.25)\n      If (feature 3 <= 5.25)\n       Predict: 0.0\n      Else (feature 3 > 5.25)\n       Predict: 1.0\n     Else (feature 0 > 7.25)\n      Predict: 0.0\n   Else (feature 4 > 6.75)\n    If (feature 8 <= 8.25)\n     If (feature 6 <= 8.25)\n      If (feature 5 <= 5.75)\n       Predict: 1.0\n      Else (feature 5 > 5.75)\n       Predict: 0.0\n     Else (feature 6 > 8.25)\n      If (feature 1 <= 8.25)\n       Predict: 1.0\n      Else (feature 1 > 8.25)\n       Predict: 0.0\n    Else (feature 8 > 8.25)\n     If (feature 8 <= 8.75)\n      If (feature 3 <= 5.75)\n       Predict: 1.0\n      Else (feature 3 > 5.75)\n       Predict: 0.0\n     Else (feature 8 > 8.75)\n      If (feature 0 <= 7.75)\n       Predict: 0.0\

In [38]:
def parse_debug_string_lines(lines):
    block = []
    while lines:
        if lines[0].startswith('If'):
            bl = ' '.join(lines.pop(0).split()[1:]).replace('(', '').replace(')', '')
            block.append({'name': bl, 'children': parse_debug_string_lines(lines)})

            if lines[0].startswith('Else'):
                be = ' '.join(lines.pop(0).split()[1:]).replace('(', '').replace(')', '')
                block.append({'name': be, 'children': parse_debug_string_lines(lines)})
        elif not lines[0].startswith(('If', 'Else')):
            block2 = lines.pop(0)
            block.append({'name': block2})
        else:
            break
    return block


def debug_str_to_json(debug_string):
    data = []
    for line in debug_string.splitlines():
        if line.strip():
            line = line.strip()
            data.append(line)
        else:
            break
        if not line: break

    json = {'name': 'Root', 'children': parse_debug_string_lines(data[1:])}
    return json

In [40]:
f_type_to_flist_dict = output.schema['features'].metadata["ml_attr"]["attrs"]

f_index_to_name_dict = {}

for f_type, f_list in f_type_to_flist_dict.items():
  for f in f_list:
    f_index = f['idx']
    f_name = f['name']
    f_index_to_name_dict[f_index] = f_name




print(f_index_to_name_dict)

{0: 'NMDT', 1: 'CTRR', 2: 'HTS', 3: 'KTLT', 4: 'CTDLGT', 5: 'KTMT', 6: 'LTHDT', 7: 'MHH', 8: 'HCSDL', 9: 'HDH'}


In [41]:
import json

dict_tree_json = debug_str_to_json(NMTTNT_classifier.toDebugString)

print(json.dumps(dict_tree_json,indent = 1 ))

{
 "name": "Root",
 "children": [
  {
   "name": "feature 0 <= 7.75",
   "children": [
    {
     "name": "feature 0 <= 6.75",
     "children": [
      {
       "name": "feature 2 <= 4.25",
       "children": [
        {
         "name": "feature 0 <= 6.25",
         "children": [
          {
           "name": "Predict: 0.0"
          }
         ]
        },
        {
         "name": "feature 0 > 6.25",
         "children": [
          {
           "name": "Predict: 1.0"
          }
         ]
        }
       ]
      },
      {
       "name": "feature 2 > 4.25",
       "children": [
        {
         "name": "feature 7 <= 8.25",
         "children": [
          {
           "name": "Predict: 1.0"
          }
         ]
        },
        {
         "name": "feature 7 > 8.25",
         "children": [
          {
           "name": "feature 0 <= 6.25",
           "children": [
            {
             "name": "Predict: 0.0"
            }
           ]
          },
          {
       