In [113]:
import pandas as pd
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score as LRAP

In [114]:
NUM_FEATURES = 5000
NUM_CLASSES = 3993

In [115]:
#Get true predictions
y_val = np.array(pd.read_csv("../data/expanded/dev_labels.csv", names=range(NUM_CLASSES)))

In [116]:
y_val.shape

(1314, 3993)

In [117]:
#Tree
y_rf = np.array(pd.read_csv("../public_data/saved_rf_probabilities.csv", names=range(NUM_CLASSES),header= 0))

In [118]:
y_rf.shape

(1314, 3993)

In [119]:
score_rf = LRAP(y_val,y_rf)
score_rf

0.5604580391775857

In [120]:
#KNN
y_knn = np.array(pd.read_csv("../public_data/knn.csv", names=range(NUM_CLASSES),header = 0))

In [121]:
score_knn = LRAP(y_val,y_knn)
score_knn

0.2912844932011161

In [122]:
#SVM
y_svm = np.array(pd.read_csv("../public_data/svm.csv", names=range(NUM_CLASSES), header = 0))

In [123]:
score_svm = LRAP(y_val,y_svm)
score_svm

0.3391564618701288

In [124]:
#NN
from numpy import load
y_nn = np.array(pd.read_csv('../public_data/nn_ensemble.csv', names=range(NUM_CLASSES)))

In [125]:
y_nn.shape

(1314, 3993)

In [126]:
score_nn = LRAP(y_val,y_nn)
score_nn

0.6377158697574089

In [136]:
def lrap_ensemble(best_coefs):
    ensemble = best_coefs[0]*y_rf + best_coefs[1]*y_knn + best_coefs[2]*y_svm + best_coefs[3]*y_nn
    metric = LRAP(y_val, ensemble)
    
    if metric > 0.6399142787929145:
        print("Better than best found")
    
    return LRAP(y_val, ensemble)

In [186]:
def compute_sqr_loss(y_pred, y_actual):
    return sum((y_pred - y_actual)**2)/len(y_pred)

# Linear Regression

In [178]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression(fit_intercept = False)

In [179]:
# Create the new feature space:
flat_y = y_val.flatten()
flat_x = pd.DataFrame({'rf': y_rf.flatten(), 'knn': y_knn.flatten(), 
                       'svm': y_svm.flatten(), 'nn': y_nn.flatten()})

In [180]:
lr.fit(flat_x, flat_y)
best_coefs = lr.coef_
best_coefs

array([0.4162748 , 0.06372309, 0.12348627, 0.55210131])

In [187]:
compute_sqr_loss(lr.predict(flat_x), flat_y)

0.0007824619900270735

In [188]:
lrap_ensemble(best_coefs)

0.6355328731282031

# Ridge regression

In [20]:
from sklearn.linear_model import Ridge

ridge = Ridge()

In [21]:
ridge.fit(flat_x, flat_y)
best_coefs_ridge = ridge.coef_
best_coefs_ridge

array([0.41527248, 0.0640503 , 0.12357985, 0.55201876])

In [195]:
lrap_ensemble(best_coefs_ridge)

0.6356017283381509

# Logistic regression

In [39]:
from sklearn.linear_model import LogisticRegression

logr = LogisticRegression()

In [40]:
logr.fit(flat_x, flat_y)

array([[13.07276334,  0.43036206,  3.52409978,  3.09973243]])

In [45]:
best_coefs_logr = logr.coef_
best_coefs_logr = best_coefs_logr[0]

In [135]:
lrap_ensemble(best_coefs_logr)

0.6229818811855752

# Best found ensemble

In [36]:
ensemble3 = 0.83*y_nn + 0.14*y_rf + 0.02*y_svm + 0.01*y_knn

In [37]:
LRAP(y_val, ensemble3)

0.6399142787929145

Let's try balancing the classes first and then doing the regression. We can also try bootstrapping a regression.

In [26]:
print("number of 1s:",np.sum(flat_y))
print("number of 0s:",len(flat_y))
print("%:", np.sum(flat_y)/len(flat_y) * 100)

number of 1s: 6969.0
number of 0s: 5246802
%: 0.13282376579104757


Let's try upweighting. I'm going to add more 1s until they're 10% of the data

In [53]:
len(positive_indices)

5246802

In [73]:
def double_the_positive_class(x, y, times = 1):
    positive_indices = (y == 1)
    y_addition = y[positive_indices]
    x_addition = x.loc[positive_indices,:]
    
    x = pd.concat([x, x_addition])
    y = np.concatenate([y, y_addition])
    
    if times == 1:
        return x,y
    else:
        return double_the_positive_class(x, y, times - 1)

In [105]:
flat_x_upweight, flat_y_upweight = double_the_positive_class(flat_x, flat_y, times = 11)

In [106]:
print("resulting x:", flat_x_upweight.shape)
print("resulting y:", flat_y_upweight.shape)

resulting x: (19512345, 4)
resulting y: (19512345,)


In [107]:
print("number of 1s:",np.sum(flat_y_upweight))
print("number of 0s:",len(flat_y_upweight))
print("%:", np.sum(flat_y_upweight)/len(flat_y_upweight) * 100)

number of 1s: 14272512.0
number of 0s: 19512345
%: 73.14606214681014


Now we try linear regression again

In [164]:
lr2 = LinearRegression(fit_intercept = False)

In [165]:
lr2.fit(flat_x_upweight, flat_y_upweight)
best_coefs_upweight = lr2.coef_
best_coefs_upweight

array([0.79295166, 0.03310764, 0.4477252 , 0.25271007])

In [166]:
flat_x.columns

Index(['rf', 'knn', 'svm', 'nn'], dtype='object')

In [168]:
lrap_ensemble(best_coefs_upweight)

0.621022538578565

# Feature engineering

In [128]:
flat_x.head(5)

Unnamed: 0,rf,knn,svm,nn
0,0.0,0.0,0.0,0.000177
1,0.0,0.0,0.0,0.001591
2,0.0,0.0,0.0,0.000203
3,0.23,0.0,0.0,0.0985
4,0.0,0.0,0.0,0.00012


In [158]:
flat_x_expanded = flat_x.copy()

In [196]:
flat_x_expanded['rf_nn'] = flat_x['rf'] * flat_x['nn']
flat_x_expanded['rf_svm'] = flat_x['rf'] * flat_x['svm']
flat_x_expanded['nn_knn'] = flat_x['nn'] * flat_x['knn']

In [191]:
lr3 = LinearRegression(fit_intercept = False)

In [192]:
lr3.fit(flat_x_expanded, flat_y)
best_coefs_interaction = lr3.coef_
best_coefs_interaction

array([ 0.64664329,  0.07529995,  0.10413383,  0.77085496, -0.77381678,
        0.08353425])

In [193]:
ensemble = best_coefs_interaction[0]*y_rf + \
best_coefs_interaction[1]*y_knn + \
best_coefs_interaction[2]*y_svm + \
best_coefs_interaction[3]*y_nn + \
best_coefs_interaction[4]*np.multiply(y_nn, y_rf) + \
best_coefs_interaction[5]*np.multiply(y_svm, y_rf) + \
best_coefs_interaction[6]*np.multiply(y_nn, y_knn)

In [194]:
LRAP(y_val, ensemble)

0.6337451520733466