### Perform ranked SVM on the 80-20 splitting dataset

In [15]:
import pandas as pd
from pandas import DataFrame
import os
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, recall_score
import numpy as np

prefix = os.getcwd()

In [2]:
train_2D_df = pd.read_csv('train_2D.csv')
test_2D_df = pd.read_csv('test_2D.csv')

In [3]:
train_2D_df.head()

Unnamed: 0,CompName,Ehull,(cbrt(NNaLst)+(AnionChgStdLst)^2),((EWaldSumLst)^2*(XWithNaLst*RDiffStdLst))
0,MgZr(SO4)3,-60.260567,0.0,0.0
1,MgTi(SO4)3,-27.986572,0.0,125.960243
2,MgSn(SO4)3,-30.640966,0.0,46.109451
3,Mg4Nb2(SO4)9,-42.783127,0.0,79.651306
4,ZrZn(SO4)3,-40.419098,0.0,24.253452


In [4]:
test_2D_df.head()

Unnamed: 0,CompName,Ehull,(cbrt(NNaLst)+(AnionChgStdLst)^2),((EWaldSumLst)^2*(XWithNaLst*RDiffStdLst))
0,MgGe(SO4)3,-9.215537,0.0,306.012177
1,Mg4Ta2(SO4)9,-48.21566,0.0,74.183754
2,ZnSn(SO4)3,-21.229073,0.0,94.826508
3,ZnGe(SO4)3,-25.11343,0.0,414.33493
4,CaHf(SO4)3,-105.50385,0.0,137.997589


In [5]:
def rankedTransform(feat_mat,targets):
    '''
    Transform the feature matrix and targets into pairwise ranking format
    '''
    feat_vecs = feat_mat[:,None,:] - feat_mat[None,:,:]
    # Only need the upper triangle
    IU3 = np.triu_indices(feat_vecs.shape[0],1)
    new_feat_mat = feat_vecs[IU3]
    
    target_vecs = targets[:,None] - targets[None,:]
    # Only need the upper triangle
    IU3 = np.triu_indices(target_vecs.shape[0],1)
    new_targets = target_vecs[IU3]
    new_targets = np.sign(new_targets)

    return new_feat_mat, new_targets

In [6]:
trans_train_feat_mat, trans_train_targets = rankedTransform(train_2D_df.to_numpy()[:,2:],train_2D_df.to_numpy()[:,1])
trans_test_feat_mat, trans_test_targets = rankedTransform(test_2D_df.to_numpy()[:,2:],test_2D_df.to_numpy()[:,1])
print(trans_train_feat_mat.shape,trans_train_targets.shape)
print(trans_test_feat_mat.shape,trans_test_targets.shape)

(4815856, 2) (4815856,)
(301476, 2) (301476,)


In [7]:
print(trans_train_feat_mat[:10], trans_train_targets[:10])

[[0.0 -125.96024322509766]
 [0.0 -46.10945129394531]
 [0.0 -79.65130615234375]
 [0.0 -24.25345230102539]
 [0.0 -197.34197998046875]
 [0.0 -146.327392578125]
 [0.0 -139.96044921875]
 [0.0 -141.5548858642578]
 [0.0 -292.8807067871094]
 [0.0 -363.9143371582031]] [-1 -1 -1 -1 -1 1 1 -1 -1 -1]


In [8]:
clf = LinearSVC(penalty='l2',loss='squared_hinge',dual=False,tol=0.0001,C=1.0,multi_class='ovr',\
              fit_intercept=True,intercept_scaling=1,class_weight=None,verbose=0,\
              random_state=None,max_iter=1000)

In [9]:
clf.fit(trans_train_feat_mat, np.int32(trans_train_targets))
Y_test_predict = clf.predict(trans_test_feat_mat)
test_score = accuracy_score(np.int32(trans_test_targets),Y_test_predict)
test_F1 = f1_score(np.int32(trans_test_targets),Y_test_predict);

In [10]:
print(f'Accuracy = {test_score}, F1 score = {test_F1}')

Accuracy = 0.8470923058551925, F1 score = 0.7232116052021662


In [16]:
all_feat_mat = np.concatenate((trans_train_feat_mat,trans_test_feat_mat))
all_targets = np.concatenate((trans_train_targets,trans_test_targets))
clf.fit(all_feat_mat, np.int32(all_targets))
Y_all_predict = clf.predict(all_feat_mat)
test_score = accuracy_score(np.int32(all_targets), Y_all_predict)
test_F1 = f1_score(np.int32(all_targets),Y_all_predict,average='weighted')
test_recall = recall_score(np.int32(all_targets),Y_all_predict,average='weighted')
print(f'Accuracy = {test_score}, F1 score = {test_F1}, test_recall = {test_recall}')

Accuracy = 0.8419983303799714, F1 score = 0.8383727489306032, test_recall = 0.8419983303799714


In [21]:
clf = LinearSVC(penalty='l2',loss='squared_hinge',dual=False,tol=0.0001,C=1.0,multi_class='ovr',\
              fit_intercept=True,intercept_scaling=1,class_weight=None,verbose=0,\
              random_state=None,max_iter=1000)
clf.fit(trans_train_feat_mat, np.int32(trans_train_targets))
Y_train_predict = clf.predict(trans_train_feat_mat)
print(Y_train_predict.shape)
test_score = accuracy_score(np.int32(trans_train_targets), Y_train_predict)
test_F1 = f1_score(np.int32(trans_train_targets), Y_train_predict,average='weighted')
test_recall = recall_score(np.int32(trans_train_targets), Y_train_predict,average='weighted')
print(f'Accuracy = {test_score}, F1 score = {test_F1}, test_recall = {test_recall}')

(4815856,)
Accuracy = 0.8416887880368517, F1 score = 0.8380910818841897, test_recall = 0.8416887880368517
