# Build prediction for AICROWD

In [1]:
import numpy as np

from utils_predictions_manipulation import*
from utils_nans_manipulation import*
from utils_data_loading import*
from utils_features_manipulation import*
from logistic_regression import*

## Load data

In [2]:
# Load the training CSV; the second value returned by load_data is discarded.
traindata,_ = load_data('Data/train.csv')

In [3]:
# Load the test CSV; the second value returned by load_data is discarded.
testdata,_ = load_data('Data/test.csv')

In [4]:
# Turn the loaded training data into X_train / Y_train
# (presumably the feature matrix and the target vector — confirm in structure_data).
X_train, Y_train = structure_data(traindata)

In [5]:
# Same structuring for the test data; only the first returned value is kept.
X_test,_ = structure_data(testdata)

## Data preparation

In [6]:
# Swap the -999 placeholder for np.nan, first in the train features, then in the test features
X_nans, X_nans_test = [
    replace_bad_data_with_nans(features, -999) for features in (X_train, X_test)
]

In [7]:
# Clean the NaN-marked training features. Judging by the names, NaNs are replaced
# with a per-feature median and del_indexes lists the columns that were dropped
# — confirm in the helper's definition.
# NOTE(review): the previous comment here said "remove columns with more than 50%
# nans", but the threshold argument passed is 1 — confirm what this value means.
X_clean,del_indexes = replace_nans_with_median(X_nans, 1)

In [8]:
# Apply the del_indexes obtained from the training set to the NaN-marked test features
# (presumably dropping the same columns — confirm in delete_nans_indexes).
X_test_del = delete_nans_indexes(X_nans_test, del_indexes)

In [None]:
# Replace remaining nans with corresponding feature median.
# The cleaned training features (X_clean) are passed alongside the test features;
# presumably the medians are taken from them — confirm in replace_test_nans_with_median.
X_test_clean = replace_test_nans_with_median(X_test_del, X_clean)

## Split data by Jet_num

In [None]:
# Splitting thresholds, each given as [feature column index, cut value].
# Column 19 is the jet_num column (it is deleted as such in the per-group cells).
thresh = [[19, 0.5],
          [19, 1.5],
          [19, 2.5]]

# split_data_set returns the set of all possible combinations of the splits defined in thresh.
# The last entry of X_sets / Y_sets contains the data split using all thresholds.
X_sets, Y_sets, thresholds = split_data_set(X_clean, Y_train, thresh)

# Passing the row positions in place of the targets yields the original row indices of
# each group. np.arange builds the index vector directly instead of going through a
# Python range object.
# NOTE(review): despite its name, Test_ind holds row indices of the TRAINING set.
_, Test_ind, _ = split_data_set(X_clean, np.arange(len(Y_train)), thresh)
print("Train split sizes:", [sets.shape[0] for sets in X_sets[-1]])

# Split the test data the same way. The row positions are passed as the second argument
# and come back grouped in Train_ind, which is used later to re-merge the predictions.
# NOTE(review): despite its name, Train_ind holds row indices of the TEST set; the name
# is kept because later cells refer to it. `thresholds` is overwritten here.
X_sets_t, Train_ind, thresholds = split_data_set(X_test_clean, np.arange(X_test_clean.shape[0]), thresh)
print("Test split sizes:", [sets.shape[0] for sets in X_sets_t[-1]])

## Creating predictions for each split group
The data is split by Jet_num and each group is processed separately. All four groups go through the same pipeline, except for Group 4, where column 22 is additionally removed (all NaNs).

Each group has a unique polynomial feature expansion. These specific expansions were obtained using the test_for_gamma_all/stop functions: for each group, an increasing number of polynomial degrees is tested (accounting for adjustments to gamma). See "Models_testing_split_data_logit.ipynb".

#### Group 1

In [None]:
# Exponents used to expand each feature of this group
degree_test = [1, 1/2, 2, 1/3, 3, 1/4, 4, 1/5, 5, 1/6, 6, 1/7]

# ---- Training data ----
# Take group 0 of the full split and drop the jet_num column (index 19)
X_pass = np.delete(X_sets[-1][0], 19, axis=1)
Y_pass = Y_sets[-1][0]  # matching output vector

# One entry per column, all pointing at the same exponent list
deg_v = [degree_test] * X_pass.shape[1]
# Columns to expand: every column except the first
ind_v = [*range(1, X_pass.shape[1])]

# Build the expanded array, then standardize it; mean/std are reused on the test data
X_pass, mean, std = standardize(build_poly_index(X_pass, ind_v, deg_v))

# ---- Model ----
# Every weight starts at -0.01
initial_w = -0.01 * np.ones(X_pass.shape[1])
ws, loss = logistic_regression(
    Y_pass, X_pass, initial_w, max_iters=10000, gamma=0.2, print_=True
)

# ---- Test data ----
# Same column removal and expansion as for training, standardized with the training mean/std
Xt_pass = np.delete(X_sets_t[-1][0], 19, axis=1)
Xt_pass, _, _ = standardize_test(build_poly_index(Xt_pass, ind_v, deg_v), mean, std)

# ---- Predictions ----
Y_test = sigmoid(Xt_pass.dot(ws))
Y_pred_G1 = probability_to_prediction(Y_test)

#### Group 2

In [None]:
# Exponents used to expand each feature of this group
degree_test = [1, 1/2, 2, 1/3]

# ---- Training data ----
# Take group 1 of the full split and drop the jet_num column (index 19)
X_pass = np.delete(X_sets[-1][1], 19, axis=1)
Y_pass = Y_sets[-1][1]  # matching output vector

# One entry per column, all pointing at the same exponent list
deg_v = [degree_test] * X_pass.shape[1]
# Columns to expand: every column except the first
ind_v = [*range(1, X_pass.shape[1])]

# Build the expanded array, then standardize it; mean/std are reused on the test data
X_pass, mean, std = standardize(build_poly_index(X_pass, ind_v, deg_v))

# ---- Model ----
# Every weight starts at -0.01
initial_w = -0.01 * np.ones(X_pass.shape[1])
ws, loss = logistic_regression(
    Y_pass, X_pass, initial_w, max_iters=10000, gamma=0.5, print_=True
)

# ---- Test data ----
# Same column removal and expansion as for training, standardized with the training mean/std
Xt_pass = np.delete(X_sets_t[-1][1], 19, axis=1)
Xt_pass, _, _ = standardize_test(build_poly_index(Xt_pass, ind_v, deg_v), mean, std)

# ---- Predictions ----
Y_test = sigmoid(Xt_pass.dot(ws))
Y_pred_G2 = probability_to_prediction(Y_test)

#### Group 3

In [None]:
# Exponents used to expand each feature of this group
degree_test = [1, 2, 3, 4, 5, 6, 7, 1/2, 1/3, 1/4]

# ---- Training data ----
# Take group 2 of the full split and drop the jet_num column (index 19)
X_pass = np.delete(X_sets[-1][2], 19, axis=1)
Y_pass = Y_sets[-1][2]  # matching output vector

# One entry per column, all pointing at the same exponent list
deg_v = [degree_test] * X_pass.shape[1]
# Columns to expand: every column except the first
ind_v = [*range(1, X_pass.shape[1])]

# Build the expanded array, then standardize it; mean/std are reused on the test data
X_pass, mean, std = standardize(build_poly_index(X_pass, ind_v, deg_v))

# ---- Model ----
# Every weight starts at -0.01
initial_w = -0.01 * np.ones(X_pass.shape[1])
ws, loss = logistic_regression(
    Y_pass, X_pass, initial_w, max_iters=10000, gamma=0.5, print_=True
)

# ---- Test data ----
# Same column removal and expansion as for training, standardized with the training mean/std
Xt_pass = np.delete(X_sets_t[-1][2], 19, axis=1)
Xt_pass, _, _ = standardize_test(build_poly_index(Xt_pass, ind_v, deg_v), mean, std)

# ---- Predictions ----
Y_test = sigmoid(Xt_pass.dot(ws))
Y_pred_G3 = probability_to_prediction(Y_test)

#### Group 4 

In [None]:
# Exponents used to expand each feature of this group
degree_test = [1, 1/2, 2, 1/3, 3, 1/4, 4]

# ---- Training data ----
# Take group 3 of the full split and drop the jet_num column (index 19)
X_pass = np.delete(X_sets[-1][3], 19, axis=1)
# Then drop column 22, counted AFTER the jet_num removal above.
# Per the original note, this column is deleted because it contains only nans.
X_pass = np.delete(X_pass, 22, axis=1)
Y_pass = Y_sets[-1][3]  # matching output vector

# One entry per column, all pointing at the same exponent list
deg_v = [degree_test] * X_pass.shape[1]
# Columns to expand: every column except the first
ind_v = [*range(1, X_pass.shape[1])]

# Build the expanded array, then standardize it; mean/std are reused on the test data
X_pass, mean, std = standardize(build_poly_index(X_pass, ind_v, deg_v))

# ---- Model ----
# Every weight starts at -0.01
initial_w = -0.01 * np.ones(X_pass.shape[1])
ws, loss = logistic_regression(
    Y_pass, X_pass, initial_w, max_iters=10000, gamma=1, print_=True
)

# ---- Test data ----
# Same two column removals and expansion as for training, standardized with the training mean/std
Xt_pass = np.delete(X_sets_t[-1][3], 19, axis=1)
Xt_pass = np.delete(Xt_pass, 22, axis=1)
Xt_pass, _, _ = standardize_test(build_poly_index(Xt_pass, ind_v, deg_v), mean, std)

# ---- Predictions ----
Y_test = sigmoid(Xt_pass.dot(ws))
Y_pred_G4 = probability_to_prediction(Y_test)

In [None]:
# Sanity check: every test group should have exactly as many predictions as rows
print("Test split sizes:", [group.shape[0] for group in X_sets_t[-1]])
print(
    "Test pred. sizes:",
    [len(group_pred) for group_pred in (Y_pred_G1, Y_pred_G2, Y_pred_G3, Y_pred_G4)],
)

In [None]:
# Use the stored indices to re-merge the per-group predictions in the original row order.
Y_pred = np.zeros(X_test.shape[0])

# NOTE: despite its name, Train_ind holds the row indices of the TEST samples
# (it was built from X_test_clean); Train_ind[-1][k] holds the rows of group k.
group_predictions = (Y_pred_G1, Y_pred_G2, Y_pred_G3, Y_pred_G4)
for group, group_pred in enumerate(group_predictions):
    # One vectorized assignment per group replaces the element-by-element Python loop.
    # np.ravel flattens both sides so column-shaped arrays are handled as well.
    # If the number of indices and predictions differ, NumPy raises here instead of
    # silently truncating to the shorter one, as zip() did.
    Y_pred[np.ravel(Train_ind[-1][group])] = np.ravel(group_pred)

# Verify all predictions are defined: any entry still equal to the initial 0 was never
# written. This assumes 0 is not a valid prediction value — confirm against
# probability_to_prediction. bool() keeps the displayed result a plain True/False.
bool(np.any(Y_pred == 0))

### Storing Output

In [None]:
# Select the 'Id' column of the raw test data
ids = testdata[['Id']]

# Two-column table: the ids cast to 64-bit integers, next to the merged predictions
Y_final = np.column_stack((np.array(ids, dtype=np.int64), Y_pred))

# Write the submission file: comma-separated, integer-formatted, with a plain header line
np.savetxt(
    "submission.csv",
    Y_final,
    fmt='%d',
    delimiter=',',
    header="Id,Prediction",
    comments="",
)