In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score as LRAP

In [2]:
NUM_FEATURES = 5000
NUM_CLASSES = 3993

In [3]:
#Get true predictions
y_val = np.array(pd.read_csv("../data/expanded/dev_labels.csv", names=range(NUM_CLASSES)))

In [4]:
y_val.shape

(1314, 3993)

In [5]:
#Tree
y_rf = np.array(pd.read_csv("../public_data/saved_rf_probabilities.csv", names=range(NUM_CLASSES),header= 0))

In [6]:
y_rf.shape

(1314, 3993)

In [7]:
score_rf = LRAP(y_val,y_rf)
score_rf

0.5604580391775857

In [8]:
#KNN
y_knn = np.array(pd.read_csv("../public_data/knn.csv", names=range(NUM_CLASSES),header = 0))

In [9]:
score_knn = LRAP(y_val,y_knn)
score_knn

0.2912844932011161

In [10]:
#SVM
y_svm = np.array(pd.read_csv("../public_data/svm.csv", names=range(NUM_CLASSES), header = 0))

In [11]:
score_svm = LRAP(y_val,y_svm)
score_svm

0.3391564618701288

In [12]:
#NN
from numpy import load
y_nn = np.array(pd.read_csv('../public_data/nn_ensemble.csv', names=range(NUM_CLASSES)))

In [13]:
y_nn.shape

(1314, 3993)

In [14]:
score_nn = LRAP(y_val,y_nn)
score_nn

0.6377158697574089

# Linear Regression

In [29]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression(fit_intercept = False)

In [30]:
# Create the new feature space:
flat_y = y_val.flatten()
flat_x = pd.DataFrame({'rf': y_rf.flatten(), 'knn': y_knn.flatten(), 
                       'svm': y_svm.flatten(), 'nn': y_nn.flatten()})

In [31]:
lr.fit(flat_x, flat_y)
best_coefs = lr.coef_
best_coefs

array([0.4162748 , 0.06372309, 0.12348627, 0.55210131])

In [32]:
ensemble = best_coefs[0]*y_rf + best_coefs[1]*y_knn + best_coefs[2]*y_svm + best_coefs[3]*y_nn

In [33]:
LRAP(y_val, ensemble)

0.6355328731282031

# Ridge regression

In [20]:
from sklearn.linear_model import Ridge

ridge = Ridge()

In [21]:
ridge.fit(flat_x, flat_y)
best_coefs_ridge = ridge.coef_
best_coefs_ridge

array([0.41527248, 0.0640503 , 0.12357985, 0.55201876])

In [22]:
ensemble2 = best_coefs_ridge[0]*y_rf + best_coefs_ridge[1]*y_knn + best_coefs_ridge[2]*y_svm + best_coefs_ridge[3]*y_nn

In [23]:
LRAP(y_val, ensemble2)

0.6356017283381509

# Logistic regression

In [39]:
from sklearn.linear_model import LogisticRegression

logr = LogisticRegression()

In [40]:
logr.fit(flat_x, flat_y)

array([[13.07276334,  0.43036206,  3.52409978,  3.09973243]])

In [45]:
best_coefs_logr = logr.coef_
best_coefs_logr = best_coefs_logr[0]

In [46]:
ensemble3 = best_coefs_logr[0]*y_rf + best_coefs_logr[1]*y_knn + best_coefs_logr[2]*y_svm + best_coefs_logr[3]*y_nn

In [47]:
LRAP(y_val, ensemble3)

0.6229818811855752

# Best found ensemble

In [36]:
ensemble3 = 0.83*y_nn + 0.14*y_rf + 0.02*y_svm + 0.01*y_knn

In [37]:
LRAP(y_val, ensemble3)

0.6399142787929145

Let's try balancing the classes first and then doing the regression. We can also try bootstrapping a regression.

In [26]:
print("number of 1s:",np.sum(flat_y))
print("number of 0s:",len(flat_y))
print("%:", np.sum(flat_y)/len(flat_y) * 100)

number of 1s: 6969.0
number of 0s: 5246802
%: 0.13282376579104757


Let's try upweighting. I'm going to add more 1s until they're 10% of the data

In [53]:
len(positive_indices)

5246802

In [73]:
def double_the_positive_class(x, y, times = 1):
    positive_indices = (y == 1)
    y_addition = y[positive_indices]
    x_addition = x.loc[positive_indices,:]
    
    x = pd.concat([x, x_addition])
    y = np.concatenate([y, y_addition])
    
    if times == 1:
        return x,y
    else:
        return double_the_positive_class(x, y, times - 1)

In [105]:
flat_x_upweight, flat_y_upweight = double_the_positive_class(flat_x, flat_y, times = 11)

In [106]:
print("resulting x:", flat_x_upweight.shape)
print("resulting y:", flat_y_upweight.shape)

resulting x: (19512345, 4)
resulting y: (19512345,)


In [107]:
print("number of 1s:",np.sum(flat_y_upweight))
print("number of 0s:",len(flat_y_upweight))
print("%:", np.sum(flat_y_upweight)/len(flat_y_upweight) * 100)

number of 1s: 14272512.0
number of 0s: 19512345
%: 73.14606214681014


Now we try linear regression again

In [108]:
lr2 = LinearRegression(fit_intercept = False)

In [109]:
lr2.fit(flat_x_upweight, flat_y_upweight)
best_coefs_upweight = lr.coef_
best_coefs_upweight

array([0.4162748 , 0.06372309, 0.12348627, 0.55210131])

In [110]:
flat_x.columns

Index(['rf', 'knn', 'svm', 'nn'], dtype='object')

In [111]:
ensemble_upweight = best_coefs_upweight[0]*y_rf + best_coefs_upweight[1]*y_knn + \
best_coefs_upweight[2]*y_svm + best_coefs_upweight[3]*y_nn

In [112]:
LRAP(y_val, ensemble_upweight)

0.6355328731282031