# Perform ranked SVM on the 80-20 split dataset

This notebook uses the optimal 2D SIS features as an example to train an SVM model. Models for the other SIS feature sets can be trained in the same way. Overall, the optimal 2D SIS features give the best accuracy on the validation set.

**Input data with 2D SIS features:** RawData/train_2D.csv, RawData/test_2D.csv   
**Output:** accuracy and F1 score of the SVM model

## Predefined functions

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, recall_score

def rankedTransform(feat_mat,targets):
    '''
    Transform the feature matrix and targets into pairwise ranking format.

    For every unordered pair of samples (i, j) with i < j, one output row is
    produced: the feature difference ``feat_mat[i] - feat_mat[j]`` and the
    sign of the target difference ``targets[i] - targets[j]``.

    Parameters
    ----------
    feat_mat : array-like, shape (n_samples, n_features)
        Per-sample feature vectors.
    targets : array-like, shape (n_samples,)
        Per-sample target values; must have one entry per row of feat_mat.

    Returns
    -------
    new_feat_mat : ndarray of float, shape (n_samples*(n_samples-1)/2, n_features)
        Pairwise feature differences, ordered like ``np.triu_indices(n, 1)``.
    new_targets : ndarray of int, shape (n_samples*(n_samples-1)/2,)
        Pairwise labels: 1, -1, or 0 when the two targets are equal.

    Raises
    ------
    ValueError
        If feat_mat and targets do not have the same number of samples.
    '''
    feat_mat = np.asarray(feat_mat)
    targets = np.asarray(targets)
    if feat_mat.shape[0] != targets.shape[0]:
        raise ValueError(
            'feat_mat and targets must have the same number of samples, got '
            f'{feat_mat.shape[0]} and {targets.shape[0]}'
        )

    # Index the (i, j), i < j pairs directly instead of materialising the full
    # N x N x D difference tensor and discarding everything but its upper
    # triangle; the resulting values and ordering are identical.
    rows, cols = np.triu_indices(feat_mat.shape[0], 1)
    new_feat_mat = feat_mat[rows] - feat_mat[cols]
    new_targets = np.sign(targets[rows] - targets[cols])

    return new_feat_mat.astype(float), new_targets.astype(int)

## Load data

In [2]:
# Read the training and testing tables that hold the 2D SIS features.
train_2D_df, test_2D_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../RawData/train_2D.csv', '../RawData/test_2D.csv')
)

In [3]:
# Preview the first rows of the training table to check its column layout.
train_2D_df.head()

Unnamed: 0,CompName,Ehull,(cbrt(NNaLst)+(AnionChgStdLst)^2),((EWaldSumLst)^2*(XWithNaLst*RDiffStdLst))
0,MgZr(SO4)3,-60.260567,0.0,0.0
1,MgTi(SO4)3,-27.986572,0.0,125.960243
2,MgSn(SO4)3,-30.640966,0.0,46.109451
3,Mg4Nb2(SO4)9,-42.783127,0.0,79.651306
4,ZrZn(SO4)3,-40.419098,0.0,24.253452


In [4]:
# Preview the first rows of the testing table to check its column layout.
test_2D_df.head()

Unnamed: 0,CompName,Ehull,(cbrt(NNaLst)+(AnionChgStdLst)^2),((EWaldSumLst)^2*(XWithNaLst*RDiffStdLst))
0,MgGe(SO4)3,-9.215537,0.0,306.012177
1,Mg4Ta2(SO4)9,-48.21566,0.0,74.183754
2,ZnSn(SO4)3,-21.229073,0.0,94.826508
3,ZnGe(SO4)3,-25.11343,0.0,414.33493
4,CaHf(SO4)3,-105.50385,0.0,137.997589


## Generate ±1 labels of synthetic ranking

In [5]:
# Pull the raw values once per table; column 1 is passed as the ranking target
# and columns 2 onward as the features.
train_values = train_2D_df.to_numpy()
test_values = test_2D_df.to_numpy()

trans_train_feat_mat, trans_train_targets = rankedTransform(train_values[:, 2:], train_values[:, 1])
trans_test_feat_mat, trans_test_targets = rankedTransform(test_values[:, 2:], test_values[:, 1])

# Report the size of each pairwise dataset.
for split_title, pair_feats, pair_labels in (
    ('Training data:', trans_train_feat_mat, trans_train_targets),
    ('Testing data:', trans_test_feat_mat, trans_test_targets),
):
    print(split_title)
    print('X:', pair_feats.shape, 'Y:', pair_labels.shape)

Training data:
X: (4815856, 2) Y: (4815856,)
Testing data:
X: (301476, 2) Y: (301476,)


In [6]:
print('Print a few examples:')

# Show up to ten pairwise samples: the rounded feature difference and its label.
n_shown = min(10, len(trans_train_feat_mat), len(trans_train_targets))
for k in range(n_shown):
    print(np.round(trans_train_feat_mat[k], 3), '\t==>', trans_train_targets[k])

Print a few examples:
[   0.   -125.96] 	==> -1
[  0.    -46.109] 	==> -1
[  0.    -79.651] 	==> -1
[  0.    -24.253] 	==> -1
[   0.    -197.342] 	==> -1
[   0.    -146.327] 	==> 1
[   0.   -139.96] 	==> 1
[   0.    -141.555] 	==> -1
[   0.    -292.881] 	==> -1
[   0.    -363.914] 	==> -1


## Train and validate SVM model

In [7]:
# Linear SVM classifier for the pairwise samples: L2 penalty with squared hinge
# loss, solved in the primal form (dual=False).
clf = LinearSVC(
    penalty='l2',
    loss='squared_hinge',
    dual=False,
    tol=0.0001,
    C=1.0,
    multi_class='ovr',
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    verbose=0,
    random_state=None,
    max_iter=1000,
)

In [8]:
# Labels are cast to int32 before being handed to scikit-learn.
train_labels = trans_train_targets.astype(np.int32)
test_labels = trans_test_targets.astype(np.int32)

# Fit on the training pairs, then predict the label of every testing pair.
clf.fit(trans_train_feat_mat, train_labels)
Y_test_predict = clf.predict(trans_test_feat_mat)

# Score the pairwise predictions against the true testing labels.
test_score = accuracy_score(test_labels, Y_test_predict)
test_F1 = f1_score(test_labels, Y_test_predict)
print(f'Accuracy = {test_score:.4f}, F1 score = {test_F1:.4f}')

Accuracy = 0.8471, F1 score = 0.7232
