In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

In [2]:
NUM_FEATURES = 5000
NUM_CLASSES = 3993

In [3]:
#Get true predictions
y_val = pd.read_csv("../data/expanded/dev_labels.csv", names=range(NUM_CLASSES))

In [4]:
#Tree
y_rf = pd.read_csv("../public_data/saved_rf_probabilities.csv", names=range(NUM_CLASSES),header= 0)

In [5]:
score_rf = label_ranking_average_precision_score(y_val,y_rf)
score_rf

0.5604580391775857

In [6]:
#KNN
y_knn = pd.read_csv("../public_data/knn.csv", names=range(NUM_CLASSES),header = 0)

In [7]:
score_knn = label_ranking_average_precision_score(y_val,y_knn)
score_knn

0.2912844932011161

In [20]:
#SVM
y_svm = pd.read_csv("../public_data/svm.csv", names=range(NUM_CLASSES),header=0)

In [22]:
score_svm = label_ranking_average_precision_score(y_val,y_svm)
score_svm

0.3391564618701288

In [12]:
#NN
#from numpy import load
#y_nn = np.load('../public_data/nn_predictions.npz')
#y_nn = y_nn['predictions']
y_nn = pd.read_csv("../public_data/nn_ensemble.csv", names=range(NUM_CLASSES))

In [29]:
y_nn.shape

(1314, 3993)

In [13]:
score_nn = label_ranking_average_precision_score(y_val,y_nn)
score_nn

0.6377158697574089

# Convex combination

In [None]:
#Do a convex combination based upon validation scores
#Note convex combination does not seem to work the best.
total = score_rf + score_knn + score_svm + score_mlp

rf_w = score_rf / total
knn_w = score_knn / total
svm_w = score_svm/total
mlp_w = score_nn/total

In [14]:
#Can use convex combination or set manually.
#ensemble = 0.83*y_nn.values + 0.14*y_rf.values + 0.02*y_svm.values + 0.01*y_knn.values
ensemble = 0.90*y_nn.values + 0.055*y_svm.values + 0.045*y_knn.values

In [15]:
score_ensemble = label_ranking_average_precision_score(y_val,ensemble)

In [16]:
#Small improvement over sole rf
score_ensemble

0.639161008641929

# Ensemble weight tuning 

# All 4 : NN, RF, SVM and KNN

In [217]:
import random

In [298]:
i = 0
best_score = 0
while i< 200:
    
    if i%10 ==0:
        print(i)
        
    y_test_nn = np.random.uniform(0.7,1)
    y_test_rf = np.random.uniform(0,1-y_test_nn)
    y_test_svm = np.random.uniform(0,1-y_test_nn-y_test_rf)
    y_test_knn = 1-y_test_nn-y_test_rf-y_test_svm
    
    
    ensemble_score = y_test_nn*y_nn + y_test_rf*y_rf.values + y_test_svm*y_svm.values +y_test_knn*y_knn.values
    temp_score = label_ranking_average_precision_score(y_val,ensemble_score)
    
    if temp_score > best_score:
        best_score = temp_score
        
        best_weights = []
        best_weights.append(y_test_nn)
        best_weights.append(y_test_rf)
        best_weights.append(y_test_svm)
        best_weights.append(y_test_knn)
    i+=1

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190


In [300]:
best_weights

[0.8278064911389091,
 0.13922899875924638,
 0.020779083303470794,
 0.012185426798373686]

In [299]:
best_score

0.6416695436767239

# NN and RF

In [301]:
i = 0
best_score = 0
while i< 100:
    
    if i%10 ==0:
        print(i)
        
    y_test_nn = np.random.uniform(0.4,1)
    y_test_rf = 1-y_test_nn
    
    
    ensemble_score = y_test_nn*y_nn + y_test_rf*y_rf.values 
    temp_score = label_ranking_average_precision_score(y_val,ensemble_score)
    
    if temp_score > best_score:
        best_score = temp_score
        
        best_weights = []
        best_weights.append(y_test_nn)
        best_weights.append(y_test_rf)
    i+=1

0
10
20
30
40
50
60
70
80
90


In [302]:
best_score

0.6389017438947554

# NN, RF and SVM (best combination)

In [201]:
i = 0
best_score = 0
while i< 500:
    
    if i%10 ==0:
        print(i)
        
    y_test_nn = np.random.uniform(0.4,1)
    y_test_rf = np.random.uniform(0,1-y_test_nn)
    y_test_svm = 1-y_test_nn-y_test_rf    
    
    ensemble_score = y_test_nn*y_nn + y_test_rf*y_rf.values + y_test_svm*y_svm.values
    temp_score = label_ranking_average_precision_score(y_val,ensemble_score)
    
    if temp_score > best_score:
        best_score = temp_score
        
        best_weights = []
        best_weights.append(y_test_nn)
        best_weights.append(y_test_rf)
        best_weights.append(y_test_svm)
    i+=1

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


In [202]:
best_score

0.6345056739829807

In [203]:
best_weights

[0.8437156989129617, 0.13313503152459621, 0.023149269562442065]

# NN, KNN and SVM

In [26]:
i = 0
best_score = 0
while i< 300:
    
    if i%10 ==0:
        print(i)
        
    y_test_nn = np.random.uniform(0.7,1)
    y_test_svm = np.random.uniform(0,1-y_test_nn)
    y_test_knn= 1-y_test_nn-y_test_svm
    
    ensemble_score = y_test_nn*y_nn.values + y_test_svm*y_svm.values +y_test_knn*y_knn.values
    temp_score = label_ranking_average_precision_score(y_val,ensemble_score)
    
    if temp_score > best_score:
        best_score = temp_score
        
        best_weights = []
        best_weights.append(y_test_nn)
        best_weights.append(y_test_svm)
        best_weights.append(y_test_knn)
    i+=1

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290


In [27]:
best_score

0.6395300891188415

In [28]:
best_weights

[0.8649735188584167, 0.06884659525043907, 0.06617988589114426]

# 3 NN

In [38]:
nn1 = pd.read_csv("../public_data/nn_predictions_1.csv", names=range(NUM_CLASSES))
nn2 = pd.read_csv("../public_data/nn_predictions_2.csv", names=range(NUM_CLASSES))
nn4 = pd.read_csv("../public_data/nn_predictions_4.csv", names=range(NUM_CLASSES))

In [42]:
i = 0
best_score = 0
while i< 200:
    
    if i%10 ==0:
        print(i)
        
    y_test_nn1 = np.random.uniform(0.3,0.4)
    y_test_nn2 = np.random.uniform(0.3,0.4)
    y_test_nn4 = np.random.uniform(0.3,0.4)
    y_test_svm = np.random.uniform(0,1-y_test_nn1-y_test_nn2-y_test_nn4)
    y_test_knn = 1-y_test_nn1-y_test_nn2-y_test_nn4-y_test_svm
    
    ensemble_score = y_test_nn1*nn1.values+y_test_nn2*nn2.values+y_test_nn4*nn4.values+y_test_svm*y_svm.values+y_test_knn*y_knn.values 
    temp_score = label_ranking_average_precision_score(y_val,ensemble_score)
    
    if temp_score > best_score:
        best_score = temp_score
        
        best_weights = []
        best_weights.append(y_test_nn1)
        best_weights.append(y_test_nn2)
        best_weights.append(y_test_nn4)
        best_weights.append(y_test_svm)
        best_weights.append(y_test_knn)
    i+=1

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190


In [43]:
best_score

0.6393641591973998

In [44]:
best_weights

[0.3061031233068452,
 0.3135977603025091,
 0.3379173997838836,
 0.02118637788834763,
 0.021195338718414523]