# Problem 1
Apply your skills to classify protein fold types (the `foldType` column) with a Decision Tree Classifier.

## Imports

In [1]:
from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer                                 
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import mltoolkit

## Configure Spark Session

In [2]:
# Create a local Spark session, or reuse one that is already running.
# "local[*]" runs Spark in-process with one worker thread per logical core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("datasetClassifierProblemset")
    .getOrCreate()
)

## TODO-1: Read in data from parquet file

In [3]:
# Directory that holds the input feature table in parquet format.
parquetFile = './input_features/'

# Read the table, then mark it for caching because later cells reuse `data`.
data = spark.read.parquet(parquetFile)
data = data.cache()

## TODO-2: Select the alpha, beta, and alpha+beta fold types

In [4]:
# Keep only the rows whose foldType is one of the three classes of interest.
selected_fold_types = ['alpha', 'beta', 'alpha+beta']
data = data.filter(data.foldType.isin(selected_fold_types))

n_selected = data.count()
print(f"Total number of data: {n_selected}")

Total number of data: 14443


## TODO-3: Downsample data

In [5]:
# Name of the column that holds the class label; the classifier cells reuse it.
label = 'foldType'

# Downsample with the mmtfPyspark helper.
# NOTE(review): the third argument (1) is presumably the random seed — confirm.
data = datasetBalancer.downsample(data, label, 1)
balanced_size = data.count()
print(f"Dataset size (balanced)  : {balanced_size}")

# Show how many rows remain for each class after downsampling.
class_counts = data.groupby(label).count()
class_counts.show()

Dataset size (balanced)  : 3777
+----------+-----+
|  foldType|count|
+----------+-----+
|alpha+beta| 1290|
|      beta| 1253|
|     alpha| 1234|
+----------+-----+



## TODO-4: Decision Tree Classifier with PySpark

In [7]:
# sklearn also ships a class named DecisionTreeClassifier (used in the bonus
# cell), so import the Spark implementation under an unambiguous alias.
from pyspark.ml.classification import DecisionTreeClassifier as SparkDecisionTreeClassifier

# Wrap a default-configured Spark decision tree in the mmtfPyspark helper,
# with `label` as the target column.
dtc = SparkDecisionTreeClassifier()
mcc = SparkMultiClassClassifier(dtc, label)

# fit() returns a mapping of metric name -> value; print one metric per line.
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
alpha+beta	913	377
beta	871	382
alpha	874	360

Sample predictions: DecisionTreeClassifier
+----------------+----------+----------+----------+----------+--------------------+------------+-----------------+--------------------+----------+--------------+
|structureChainId|     alpha|      beta|      coil|  foldType|            features|indexedLabel|    rawPrediction|         probability|prediction|predictedLabel|
+----------------+----------+----------+----------+----------+--------------------+------------+-----------------+--------------------+----------+--------------+
|          1GVE.B|0.48504984|0.15614618|  0.358804|alpha+beta|[-0.0642171993628...|         0.0|[328.0,85.0,69.0]|[0.68049792531120...|       0.0|    alpha+beta|
|          1R4X.A|0.17153284|0.43430656| 0.3941606|alpha+beta|[-0.2385288135872...|         0.0|[328.0,85.0,69.0]|[0.68049792531120...|       0.0|    alpha+beta|
|          1T82.B|0.34013605| 0.3605442| 0.2993197|alpha+beta|[-0.1013272784789..

## BONUS: Decision Tree Classifier with sklearn

In [11]:
# The Spark ML class used in the previous section is also called
# DecisionTreeClassifier, so import sklearn's version under an explicit alias
# instead of rebinding the same name.
from sklearn.tree import DecisionTreeClassifier as SklearnDecisionTreeClassifier

# Collect the Spark DataFrame into a pandas DataFrame for the sklearn path.
df = data.toPandas()
dtc = SklearnDecisionTreeClassifier()
mcc = mltoolkit.MultiClassClassifier(dtc, label)

# fit() returns a mapping of metric name -> value; print one metric per line.
metrics = mcc.fit(df)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test

alpha+beta	892	398

beta	880	373

alpha	871	363

Total time taken: 0.37964391708374023

Methods	DecisionTreeClassifier
F Score	0.7048688682381032
Accuracy	0.7072310405643739
Precision	0.7052244843588513
Recall	0.7072310405643739
False Positive Rate	0.14684557411058605
True Positive Rate	0.7094922862329233
	
Confusion Matrix
['alpha+beta' 'beta' 'alpha']
[[240  83  75]
 [ 50 308  15]
 [ 80  29 254]]


In [8]:
# Stop the Spark session created earlier in the notebook to release its resources.
spark.stop()