In [1]:
import csv
import numpy as np
from matplotlib import pyplot as plt 
import itertools

In [2]:
data_path = "Data/train.csv"

dtypes = "i8,S5,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,i8,f8,f8,f8,f8,f8,f8,f8"
with open(data_path, 'r') as f:
    reader = csv.reader(f, delimiter=',')
    headers = next(reader)
data = np.genfromtxt(data_path, delimiter=",",names=True, dtype=dtypes)

# REMOVING -999.0 values, turining to np.nan
data2 = data.copy()
nan_ind = []
for col in headers[2:]:
    for i in range(len(data2[col])):
        if data2[col][i] == -999.0:
            data2[col][i] = np.nan
            nan_ind.append(i)

nan_ind.sort()
nan_ind = list(set(nan_ind))

In [3]:
def split_by_thresholds(data2):
    #Identifiers
    split_d = {}

    split_d["PRI_jet_num"] = ["J0","J1","J2","J3"]
    split_d["DER_met_phi_centrality"] = ["p+","p-"]
    split_d["DER_lep_eta_centrality"] = ["e+","e-"] 

    # Expand jet_num to include combinations
    comb = list(itertools.combinations(split_d["PRI_jet_num"],1)) 
    comb = list(itertools.combinations(split_d["PRI_jet_num"],2)) + comb
    comb = list(itertools.combinations(split_d["PRI_jet_num"],3)) + comb
    split_d["PRI_jet_num"] = comb

    comb = split_d["PRI_jet_num"] + split_d["DER_met_phi_centrality"] + split_d["DER_lep_eta_centrality"]

    # Prepare combinations of 2 sets of features
    comb2 = list(itertools.product(*list(split_d.values())[0:2]))
    comb2 = list(itertools.product(*list(split_d.values())[1:3])) + comb2
    comb2 = list(itertools.product(*[list(split_d.values())[0],list(split_d.values())[-1]])) + comb2

    # Prepare combinations of 3 sets of features
    comb3 = list(itertools.product(*list(split_d.values())[0:3]))

    combination_names = comb+comb2+comb3
    
    #ACTUALLY GENERATING TRUTH TABLES
    # Select which header to split and how
    split_d = {}

    split_d["PRI_jet_num"] = [data2["PRI_jet_num"] == 0,
                              data2["PRI_jet_num"] == 1,
                              data2["PRI_jet_num"] == 2,
                              data2["PRI_jet_num"] == 3]

    split_d["DER_met_phi_centrality"] = [data2["DER_met_phi_centrality"] > 0,
                                         data2["DER_met_phi_centrality"] < 0]

    split_d["DER_lep_eta_centrality"] = [data2["DER_lep_eta_centrality"] > 0.5,
                                         data2["DER_lep_eta_centrality"] < 0.5]   


    # Expand jet_num to include combinations
    comb = list(itertools.combinations(split_d["PRI_jet_num"],1)) 
    comb = list(itertools.combinations(split_d["PRI_jet_num"],2)) + comb
    comb = list(itertools.combinations(split_d["PRI_jet_num"],3)) + comb
    comb = [np.logical_or.reduce(tup) for tup in comb]
    split_d["PRI_jet_num"] =  comb

    # Combinations of 1 set of features
    comb = split_d["PRI_jet_num"] + split_d["DER_met_phi_centrality"] + split_d["DER_lep_eta_centrality"]

    # Combinations of 2 sets of features
    comb2 = list(itertools.product(*list(split_d.values())[0:2]))
    comb2 = list(itertools.product(*list(split_d.values())[1:3])) + comb2
    comb2 = list(itertools.product(*[list(split_d.values())[0],list(split_d.values())[-1]])) + comb2
    comb2 = [np.logical_or.reduce(tup) for tup in comb2]

    # Combinations of 3 sets of features
    comb3 = list(itertools.product(*list(split_d.values())[0:3]))
    comb3 = [np.logical_or.reduce(tup) for tup in comb3]

    # All outputs
    combination = comb+comb2+comb3
    return combination, combination_names

In [4]:
def get_as_list_of_lists_training(xinput, extra_params=False):
    xlist=[]
    for i in xinput:
        sublist=[]
        for x in i:
            sublist.append(x)
        if extra_params:
            sublist=sublist[2:]       # Should be 1: if prediction is removed from the input data
        xlist.append(sublist)
        
    return xlist

In [5]:
def add_fake_feature(listx):
    for x in listx:
        x.insert(0,1)

In [6]:
def split_data(x, y, ratio, seed=1):
    """split the dataset based on the split ratio."""
    # set seed
    np.random.seed(seed)

    # split the data based on the given ratio   
    p = np.random.permutation(len(y))
    y = y[p]
    x = x[p]
    
    limit = int(len(y)*ratio)
        
    return x[:limit],x[limit:],y[:limit],y[limit:]

In [7]:
def least_squares(y, tx):
    """calculate the least squares."""
    XX = np.dot(tx.transpose(),tx)
    B = np.dot(tx.transpose(),y)
    if len(XX.shape)==0:
        w_s = B/XX
    else:
        w_s = np.linalg.solve(XX,B)    
    
    loss = compute_loss(y, tx, w_s)
    
    return w_s, loss 

In [8]:
def compute_loss(y, tx, w):
    """Calculate the loss using MSE."""
    e = y - tx.dot(w)
    lv = 0.5*np.mean(np.square(e))
    return lv

In [9]:
# defining a function that executes the model

def execute_LPM(xtrainInput, ytrainInput, xtestInput, ytestInput):

    xlist = get_as_list_of_lists_training(xtestInput)
    add_fake_feature(xlist)

    trainxlist = get_as_list_of_lists_training(xtrainInput)
    add_fake_feature(trainxlist)

    # formatting y to array of 0,1
    #y=(ytrainInput==b's').astype(int)
    Ytrain=np.array(ytrainInput)
    Xtrain=np.array(trainxlist)
        
    weights,MSE=least_squares(Ytrain, Xtrain)
    
    #applying predictions
    xtest=np.array(xlist)

    if (len(weights.shape) == 0):
        return 0,0
    
    Ypred=np.dot(xtest,weights)
    Ypredclass=(Ypred>0.5).astype(int)
    
    for i in range(Ypredclass.size):
        if Ypredclass[i]==0:
            Ypredclass[i]=-1
            
    Ytest=np.array(ytestInput)
    n=len(Ytest)

    #test accuracy
    acctest=(Ypredclass==Ytest).sum()/n
    #train accuracy
    nt=Ytrain.size
    ypredtrain=np.dot(Xtrain,weights)
    ypredtrainclass=(ypredtrain>0.5).astype(int)
    acctrain=(Ytrain==ypredtrainclass).sum()/nt

    '''
    idcolumn=xtrainInput[['Id']]
    idlist=[i[0] for i in idcolumn]

    a=np.append(np.array(idlist).reshape(-1,1).astype(int),Ypredclass.reshape(-1,1).astype(int),axis=1)

    np.savetxt("submission.csv", a, delimiter=',', header="Id,Prediction", comments="", fmt='%d')
    '''
    
    return acctrain,acctest

In [10]:
comb, comb_n = split_by_thresholds(data2)

  split_d["DER_lep_eta_centrality"] = [data2["DER_lep_eta_centrality"] > 0.5,
  data2["DER_lep_eta_centrality"] < 0.5]


In [None]:
ratio = 0.8 # Ratio of train data

for c_tr,c_n in zip(comb,comb_n):
    data_s = data2[c_tr]
    ylist = (data_s['Prediction']==b'b').astype(int)
    xlist = get_as_list_of_lists_training(data_s, True)
    
    Xtrain, Xtest, Ytrain, Ytest = split_data(np.array(xlist), ylist, ratio, seed=1)
    
    nans = {}
    
    for point in Xtrain:
        for i,feat in enumerate(point):
            if np.isnan(feat):
                if i in nans:
                    nans[i] += 1
                else:
                    nans[i] = 1
                
    print(c_n, nans)

('J0', 'J1', 'J2') {4: 141899, 5: 141899, 6: 141899, 12: 141899, 26: 141899, 27: 141899, 28: 141899, 23: 79966, 24: 79966, 25: 79966, 0: 29349}
('J0', 'J1', 'J3') {4: 141900, 5: 141900, 6: 141900, 12: 141900, 26: 141900, 27: 141900, 28: 141900, 23: 79826, 24: 79826, 25: 79826, 0: 28140}
('J0', 'J2', 'J3') {0: 24400, 4: 79889, 5: 79889, 6: 79889, 12: 79889, 23: 79889, 24: 79889, 25: 79889, 26: 79889, 27: 79889, 28: 79889}
('J1', 'J2', 'J3') {4: 61979, 5: 61979, 6: 61979, 12: 61979, 26: 61979, 27: 61979, 28: 61979, 0: 9591}
('J0', 'J1') {4: 141965, 5: 141965, 6: 141965, 12: 141965, 26: 141965, 27: 141965, 28: 141965, 0: 26815, 23: 79938, 24: 79938, 25: 79938}
('J0', 'J2') {0: 23189, 4: 79892, 5: 79892, 6: 79892, 12: 79892, 23: 79892, 24: 79892, 25: 79892, 26: 79892, 27: 79892, 28: 79892}
('J0', 'J3') {4: 79957, 5: 79957, 6: 79957, 12: 79957, 23: 79957, 24: 79957, 25: 79957, 26: 79957, 27: 79957, 28: 79957, 0: 22119}
('J1', 'J2') {4: 62053, 5: 62053, 6: 62053, 12: 62053, 26: 62053, 27: 62

In [17]:
# Deletes all points that contain NaNs
# Issue: causes matrix to shrink considerably, making it singular

def delete_nan_points(x,y):
    ret_x, ret_y = zip(*[(x_point,y_point) for x_point,y_point in zip(x,y) if not np.isnan(x_point).any()])
    return ret_x, ret_y

In [25]:
ratio = 0.8 # Ratio of train data

for c_tr,c_n in zip(comb,comb_n):
    data_s = data2[c_tr]
    ylist = (data_s['Prediction']==b'b').astype(int)
    xlist = get_as_list_of_lists_training(data_s, True)
    
    Xtrain, Xtest, Ytrain, Ytest = split_data(np.array(xlist), ylist, ratio, seed=1)
    
    Xtest, Ytest = delete_nan_points(Xtest,Ytest)
    Xtrain, Ytrain = delete_nan_points(Xtrain,Ytrain)
                
    if len(Xtrain) == 0 or len(Xtest) == 0:
        print(c_n, "No operation can be performed, as all points have NaNs")
        continue
    
    acctrain,acctest = execute_LPM(Xtrain, Ytrain, Xtest, Ytest)
    
    print(c_n, acctest, acctrain)

182268 182268
45568 45568
30


LinAlgError: Singular matrix

In [None]:
data2[comb[0]]

In [None]:
data2[comb[0]][2000]

In [None]:
Xtrain.shape

In [None]:
Ytrain.shape

In [None]:
Ytest.shape