# Machine learning - Features extraction

Runs binary and multi-class classifiers on a given dataset.
The dataset is read from a Parquet file. It must contain a feature vector column named "features" and a classification (label) column.

## Imports

In [1]:
from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer                                 
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression, MultilayerPerceptronClassifier, RandomForestClassifier

## Configure Spark Session

In [2]:
# Build the Spark session used by every cell below. The master URL "local[4]"
# runs Spark locally; getOrCreate() hands back an already-active session if
# there is one instead of starting a second.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("datasetClassifier")
    .getOrCreate()
)

## Read in data from parquet file

In [3]:
# Directory holding the Parquet dataset (relative to the notebook's working directory).
parquetFile = './input_features/'

# Cache the DataFrame: it is scanned repeatedly below (counts, previews, training).
data = spark.read.parquet(parquetFile).cache()

print(f"Total number of data: {data.count()}")

# Preview the first rows. Limiting on the Spark side means only five rows are
# collected to the driver, rather than converting the whole dataset to pandas
# and then throwing away everything but the head.
data.limit(5).toPandas()

Total number of data: 18491


Unnamed: 0,structureChainId,alpha,beta,coil,foldType,features
0,1LBU.A,0.361502,0.107981,0.530516,other,"[-0.0676212902400502, -0.19547940407580924, 0...."
1,1LC0.A,0.410345,0.275862,0.313793,alpha+beta,"[-0.0792541990609706, 0.040140426752662904, 0...."
2,1LC5.A,0.428169,0.157746,0.414084,alpha+beta,"[-0.019729205948549527, -0.04122427391714465, ..."
3,1LFP.A,0.427984,0.234568,0.337449,alpha+beta,"[-0.19374114630233136, 0.1957719982226169, 0.1..."
4,1LFW.A,0.32265,0.273504,0.403846,alpha+beta,"[-0.29538419953426287, 0.010069241060186297, 0..."


## Select only alpha and beta foldType

In [4]:
# Keep only the rows whose foldType is one of the selected classes.
# To include a further class (e.g. 'other'), add it to the list below.
data = data.where(data.foldType.isin(['alpha', 'beta']))

print(f"Total number of data: {data.count()}")

# Preview the first rows; limit() keeps the collection to the driver at five
# rows instead of materialising the entire filtered dataset in pandas.
data.limit(5).toPandas()

Total number of data: 4937


Unnamed: 0,structureChainId,alpha,beta,coil,foldType,features
0,1LGH.A,0.857143,0.0,0.142857,alpha,"[-0.26188569416038016, 0.30686419253999536, 0...."
1,1LGH.B,0.744186,0.0,0.255814,alpha,"[-0.10527046018449421, 0.1883829649199139, 0.2..."
2,1LGH.D,0.857143,0.0,0.142857,alpha,"[-0.26188569416038016, 0.30686419253999536, 0...."
3,1LGH.E,0.744186,0.0,0.255814,alpha,"[-0.10527046018449421, 0.1883829649199139, 0.2..."
4,1LGH.G,0.857143,0.0,0.142857,alpha,"[-0.26188569416038016, 0.30686419253999536, 0...."


## Basic dataset information and settings

In [5]:
# Settings shared by the classifier cells below; all three are passed to
# SparkMultiClassClassifier.
label = 'foldType'
testFraction = 0.1
seed = 123

# Length of the feature vector, read off the first row of the dataset.
vector = data.first()["features"]
featureCount = len(vector)
print(f"Feature count    : {featureCount}")

# Number of distinct values in the label column.
classCount = data.select(label).distinct().count()
print(f"Class count    : {classCount}")

# Class distribution before balancing.
print(f"Dataset size (unbalanced)    : {data.count()}")
data.groupby(label).count().show()

# Balance the classes by downsampling. `data` is rebound here, so re-running
# this cell downsamples the already-downsampled DataFrame again.
# NOTE(review): the third argument (1) is presumably the sampling seed and is
# independent of `seed` above -- confirm against datasetBalancer.downsample.
data = datasetBalancer.downsample(data, label, 1)

# Class distribution after balancing.
print(f"Dataset size (balanced)  : {data.count()}")
data.groupby(label).count().show()

Feature count    : 50
Class count    : 2
Dataset size (unbalanced)    : 4937
+--------+-----+
|foldType|count|
+--------+-----+
|    beta| 1253|
|   alpha| 3684|
+--------+-----+

Dataset size (balanced)  : 2487
+--------+-----+
|foldType|count|
+--------+-----+
|    beta| 1253|
|   alpha| 1234|
+--------+-----+



## Random Forest Classifier

In [6]:
# Random forest with default parameters, wrapped in the multi-class
# classifier helper together with the shared label/testFraction/seed settings.
rfc = RandomForestClassifier()
mcc = SparkMultiClassClassifier(rfc, label, testFraction, seed)

# fit() returns a mapping; print it one tab-separated entry per line.
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
beta	1129	124
alpha	1096	138

Sample predictions: RandomForestClassifier
+----------------+-----------+----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|structureChainId|      alpha|      beta|      coil|foldType|            features|indexedLabel|       rawPrediction|         probability|prediction|predictedLabel|
+----------------+-----------+----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|          3C5X.C|0.037037037|0.56790125|0.39506173|    beta|[-0.0963613044383...|         0.0|[12.0384237981223...|[0.60192118990611...|       0.0|          beta|
|          4D7C.A|0.044715445| 0.5406504|0.41463414|    beta|[-0.0572097956032...|         0.0|[12.3579385477575...|[0.61789692738787...|       0.0|          beta|
|          5LTG.B|        0.0| 0.5786164|0.42138365|    beta|[-0.1108294218264...|      

## Logistic Regression Classifier

In [7]:
# Logistic regression with default parameters, wrapped in the multi-class
# classifier helper together with the shared label/testFraction/seed settings.
lr = LogisticRegression()
mcc = SparkMultiClassClassifier(lr, label, testFraction, seed)

# fit() returns a mapping; print it one tab-separated entry per line.
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
beta	1129	124
alpha	1096	138

Sample predictions: LogisticRegression
+----------------+-----------+----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|structureChainId|      alpha|      beta|      coil|foldType|            features|indexedLabel|       rawPrediction|         probability|prediction|predictedLabel|
+----------------+-----------+----------+----------+--------+--------------------+------------+--------------------+--------------------+----------+--------------+
|          3C5X.C|0.037037037|0.56790125|0.39506173|    beta|[-0.0963613044383...|         0.0|[3.47993446955267...|[0.97011142018302...|       0.0|          beta|
|          4D7C.A|0.044715445| 0.5406504|0.41463414|    beta|[-0.0572097956032...|         0.0|[-0.0084331427036...|[0.49789172681872...|       1.0|         alpha|
|          5LTG.B|        0.0| 0.5786164|0.42138365|    beta|[-0.1108294218264...|         0

## Simple Multilayer Perceptron Classifier

In [8]:
# Layer sizes: the first entry is the feature-vector length, the last is the
# number of classes, with two layers of 32 in between.
layers = [featureCount, 32, 32, classCount]

# Same parameters as before, supplied as constructor keywords rather than
# through chained setters.
mpc = MultilayerPerceptronClassifier(
    layers=layers,
    blockSize=128,
    seed=1234,
    maxIter=100,
)
mcc = SparkMultiClassClassifier(mpc, label, testFraction, seed)

# fit() returns a mapping; print it one tab-separated entry per line.
metrics = mcc.fit(data)
for name, value in metrics.items():
    print(f"{name}\t{value}")


 Class	Train	Test
beta	1129	124
alpha	1096	138

Sample predictions: MultilayerPerceptronClassifier
+----------------+-----------+----------+----------+--------+--------------------+------------+----------+--------------+
|structureChainId|      alpha|      beta|      coil|foldType|            features|indexedLabel|prediction|predictedLabel|
+----------------+-----------+----------+----------+--------+--------------------+------------+----------+--------------+
|          3C5X.C|0.037037037|0.56790125|0.39506173|    beta|[-0.0963613044383...|         0.0|       0.0|          beta|
|          4D7C.A|0.044715445| 0.5406504|0.41463414|    beta|[-0.0572097956032...|         0.0|       0.0|          beta|
|          5LTG.B|        0.0| 0.5786164|0.42138365|    beta|[-0.1108294218264...|         0.0|       1.0|         alpha|
|          2B4H.B|0.018348623|0.63761467| 0.3440367|    beta|[-0.1803915034813...|         0.0|       0.0|          beta|
|          2QF4.A| 0.01764706| 0.5117647|0.470

## Terminate Spark

In [9]:
# Shut down the Spark session created in the "Configure Spark Session" cell.
spark.stop()