In [1]:
import csv
import numpy as np
from matplotlib import pyplot as plt 
import itertools

In [2]:
data_path = "Data/train.csv"

dtypes = "i8,S5,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,f8,i8,f8,f8,f8,f8,f8,f8,f8"
with open(data_path, 'r') as f:
    reader = csv.reader(f, delimiter=',')
    headers = next(reader)
data = np.genfromtxt(data_path, delimiter=",",names=True, dtype=dtypes)

# REMOVING -999.0 values, turining to np.nan
data2 = data.copy()
nan_ind = []
for col in headers[2:]:
    for i in range(len(data2[col])):
        if data2[col][i] == -999.0:
            data2[col][i] = np.nan
            nan_ind.append(i)

nan_ind.sort()
nan_ind = list(set(nan_ind))

In [3]:
def split_by_thresholds(data2):
    #Identifiers
    split_d = {}

    split_d["PRI_jet_num"] = ["J0","J1","J2","J3"]
    split_d["DER_met_phi_centrality"] = ["p+","p-"]
    split_d["DER_lep_eta_centrality"] = ["e+","e-"] 

    # Expand jet_num to include combinations
    comb = list(itertools.combinations(split_d["PRI_jet_num"],1)) 
    comb = list(itertools.combinations(split_d["PRI_jet_num"],2)) + comb
    comb = list(itertools.combinations(split_d["PRI_jet_num"],3)) + comb
    split_d["PRI_jet_num"] = comb

    comb = split_d["PRI_jet_num"] + split_d["DER_met_phi_centrality"] + split_d["DER_lep_eta_centrality"]

    # Prepare combinations of 2 sets of features
    comb2 = list(itertools.product(*list(split_d.values())[0:2]))
    comb2 = list(itertools.product(*list(split_d.values())[1:3])) + comb2
    comb2 = list(itertools.product(*[list(split_d.values())[0],list(split_d.values())[-1]])) + comb2

    # Prepare combinations of 3 sets of features
    comb3 = list(itertools.product(*list(split_d.values())[0:3]))

    combination_names = comb+comb2+comb3
    
    #ACTUALLY GENERATING TRUTH TABLES
    # Select which header to split and how
    split_d = {}

    split_d["PRI_jet_num"] = [data2["PRI_jet_num"] == 0,
                              data2["PRI_jet_num"] == 1,
                              data2["PRI_jet_num"] == 2,
                              data2["PRI_jet_num"] == 3]

    split_d["DER_met_phi_centrality"] = [data2["DER_met_phi_centrality"] > 0,
                                         data2["DER_met_phi_centrality"] < 0]

    split_d["DER_lep_eta_centrality"] = [data2["DER_lep_eta_centrality"] > 0.5,
                                         data2["DER_lep_eta_centrality"] < 0.5]   


    # Expand jet_num to include combinations
    comb = list(itertools.combinations(split_d["PRI_jet_num"],1)) 
    comb = list(itertools.combinations(split_d["PRI_jet_num"],2)) + comb
    comb = list(itertools.combinations(split_d["PRI_jet_num"],3)) + comb
    comb = [np.logical_or.reduce(tup) for tup in comb]
    split_d["PRI_jet_num"] =  comb

    # Combinations of 1 set of features
    comb = split_d["PRI_jet_num"] + split_d["DER_met_phi_centrality"] + split_d["DER_lep_eta_centrality"]

    # Combinations of 2 sets of features
    comb2 = list(itertools.product(*list(split_d.values())[0:2]))
    comb2 = list(itertools.product(*list(split_d.values())[1:3])) + comb2
    comb2 = list(itertools.product(*[list(split_d.values())[0],list(split_d.values())[-1]])) + comb2
    comb2 = [np.logical_or.reduce(tup) for tup in comb2]

    # Combinations of 3 sets of features
    comb3 = list(itertools.product(*list(split_d.values())[0:3]))
    comb3 = [np.logical_or.reduce(tup) for tup in comb3]

    # All outputs
    combination = comb+comb2+comb3
    return combination, combination_names

In [4]:
def get_as_list_of_lists_training(xinput, extra_params=False):
    xlist=[]
    for i in xinput:
        sublist=[]
        for x in i:
            sublist.append(x)
        if extra_params:
            sublist=sublist[2:]       # Should be 1: if prediction is removed from the input data
        xlist.append(sublist)
        
    return xlist

In [5]:
def add_fake_feature(listx):
    for x in listx:
        x.insert(0,1)

In [6]:
def split_data(x, y, ratio, seed=1):
    """split the dataset based on the split ratio."""
    # set seed
    np.random.seed(seed)

    # split the data based on the given ratio: TODO    
    p = np.random.permutation(len(y))
    y = y[p]
    x = x[p]
    
    limit = int(len(y)*ratio)
        
    return x[:limit],x[limit:],y[:limit],y[limit:]

In [7]:
def least_squares(y, tx):
    """calculate the least squares solution."""
    XX = np.dot(tx.transpose(),tx)
    B = np.dot(tx.transpose(),y)
    if len(XX.shape)==0:
        w_s = B/XX
    else:
        w_s = np.linalg.solve(XX,B)    
    
    loss = compute_loss(y, tx, w_s)
    
    return w_s, loss 

In [8]:
def compute_loss(y, tx, w):
    """Calculate the loss using MSE."""
    e = y - tx.dot(w)
    lv = 0.5*np.mean(np.square(e))
    return lv

In [9]:
# defining a function that executes the model

def execute_LPM(xtrainInput, ytrainInput, xtestInput, ytestInput):

    xlist = get_as_list_of_lists_training(xtestInput)
    add_fake_feature(xlist)

    trainxlist = get_as_list_of_lists_training(xtrainInput)
    add_fake_feature(trainxlist)

    # formatting y to array of 0,1
    #y=(ytrainInput==b's').astype(int)
    Ytrain=np.array(ytrainInput)
    Xtrain=np.array(trainxlist)
        
    MSE,weights=least_squares(Ytrain, Xtrain)

    #applying predictions
    xtest=np.array(xlist)

    if (len(weights.shape) == 0):
        return 0,0
    
    Ypred=np.dot(xtest,weights)
    Ypredclass=(Ypred>0.5).astype(int)
    
    for i in range(Ypredclass.size):
        if Ypredclass[i]==0:
            Ypredclass[i]=-1
            
    n=len(Ytest)
    Ytest=np.array(ytestInput)

    #test accuracy
    acctest=(Ypredclass==Ytest).sum()/n
    #train accuracy
    nt=Ytrain.size
    ypredtrain=np.dot(Xtrain,weights)
    ypredtrainclass=(ypredtrain>0.5).astype(int)
    acctrain=(Ytrain==ypredtrainclass).sum()/nt

    '''
    idcolumn=xtrainInput[['Id']]
    idlist=[i[0] for i in idcolumn]

    a=np.append(np.array(idlist).reshape(-1,1).astype(int),Ypredclass.reshape(-1,1).astype(int),axis=1)

    np.savetxt("submission.csv", a, delimiter=',', header="Id,Prediction", comments="", fmt='%d')
    '''
    
    return acctrain,acctest

In [10]:
## 

In [11]:
comb, comb_n = split_by_thresholds(data2)

In [12]:
ratio = 0.2
#THESE RATIO FIX??? CAREFUL represents percentage of test data

for c_tr,c_n in zip(comb,comb_n):
    data_s = data2[c_tr]
    ylist = (data_s['Prediction']==b'b').astype(int)
    xlist = get_as_list_of_lists_training(data_s, True)
    
    Xtest, Xtrain, Ytest, Ytrain= split_data(np.array(xlist), ylist, ratio, seed=1)
    
    acctrain,acctest = execute_LPM(Xtrain, Ytrain, Xtest, Ytest)
    
    print(c_n, acctest, acctrain)

('J0', 'J1', 'J2') 0 0
('J0', 'J1', 'J3') 0 0
('J0', 'J2', 'J3') 0 0
('J1', 'J2', 'J3') 0 0
('J0', 'J1') 0 0
('J0', 'J2') 0 0
('J0', 'J3') 0 0
('J1', 'J2') 0 0
('J1', 'J3') 0 0
('J2', 'J3') 0 0
('J0',) 0 0
('J1',) 0 0
('J2',) 0 0
('J3',) 0 0
p+ 0 0
p- 0 0
e+ 0 0
e- 0 0
(('J0', 'J1', 'J2'), 'e+') 0 0
(('J0', 'J1', 'J2'), 'e-') 0 0
(('J0', 'J1', 'J3'), 'e+') 0 0
(('J0', 'J1', 'J3'), 'e-') 0 0
(('J0', 'J2', 'J3'), 'e+') 0 0
(('J0', 'J2', 'J3'), 'e-') 0 0


KeyboardInterrupt: 

In [13]:
data2[comb[0]]

array([(100000, b's', 138.47 ,  51.655,  97.827, 27.98 , 0.91, 124.711, 2.666, 3.064, 41.928, 197.76 , 1.582,  1.396, 0.2, 32.638,  1.017,  0.381,  51.626,  2.273, -2.414, 16.824, -0.277, 258.733, 2, 67.435, 2.15 ,  0.444, 46.062, 1.24, -2.475, 113.497),
       (100001, b'b', 160.937,  68.768, 103.235, 48.146,  nan,     nan,   nan, 3.473,  2.078, 125.157, 0.879,  1.414, nan, 42.014,  2.039, -3.011,  36.918,  0.501,  0.103, 44.704, -1.916, 164.546, 1, 46.226, 0.725,  1.158,    nan,  nan,    nan,  46.226),
       (100002, b'b',     nan, 162.172, 125.953, 35.635,  nan,     nan,   nan, 3.148,  9.336, 197.814, 3.776,  1.414, nan, 32.154, -0.705, -2.093, 121.409, -0.953,  1.052, 54.283, -2.186, 260.414, 1, 44.251, 2.053, -2.028,    nan,  nan,    nan,  44.251),
       ...,
       (349997, b's', 105.457,  60.526,  75.839, 39.757,  nan,     nan,   nan, 2.39 , 22.183, 120.462, 1.202,  0.529, nan, 35.636, -0.266, -3.132,  42.834,  0.381,  0.851, 23.419, -2.89 , 198.907, 1, 41.992, 1.8  , -0.166, 

In [24]:
data2[comb[0]][2000]

(102208, b'b', 106.903, 8.881, 71.607, 16.087, nan, nan, nan, 2.925, 16.087, 72.017, 0.683, 1.39, nan, 42.786, -0.955, 1.944, 29.231, -1.35, -1.44, 13.072, -1.899, 239.301, 0, nan, nan, nan, nan, nan, nan, 0.)

In [None]:
Xtrain.shape

In [None]:
Ytrain.shape

In [None]:
Ytest.shape