In [1]:
import h5py
import pandas
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

def load_data(name):
    with h5py.File(f'{name}.h5', 'r') as f:
        filename = name.split('/')[-1]
        return pandas.DataFrame(f[filename][:], dtype=np.float64)

train = load_data('C:/Users/Chris Bhysicisd/Desktop/Applied machine learning/Project 1/train')
test  = load_data('C:/Users/Chris Bhysicisd/Desktop/Applied machine learning/Project 1/test')

In [2]:
print (f'Shape of training data set: {train.shape}')
print (f'Shape of test data set: {test.shape}')

Shape of training data set: (162500, 166)
Shape of test data set: (160651, 164)


In [3]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.datasets import load_iris, load_wine
from sklearn.metrics import accuracy_score,log_loss
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

In [4]:
all_variables = ['actualInteractionsPerCrossing', 'averageInteractionsPerCrossing', 'correctedActualMu', 'correctedAverageMu', 'correctedScaledActualMu', 'correctedScaledAverageMu', 'NvtxReco', 'p_nTracks', 'p_pt_track', 'p_eta', 'p_phi', 'p_charge', 'p_qOverP', 'p_z0', 'p_d0', 'p_sigmad0', 'p_d0Sig', 'p_EptRatio', 'p_dPOverP', 'p_z0theta', 'p_etaCluster', 'p_phiCluster', 'p_eCluster', 'p_rawEtaCluster', 'p_rawPhiCluster', 'p_rawECluster', 'p_eClusterLr0', 'p_eClusterLr1', 'p_eClusterLr2', 'p_eClusterLr3', 'p_etaClusterLr1', 'p_etaClusterLr2', 'p_phiClusterLr2', 'p_eAccCluster', 'p_f0Cluster', 'p_etaCalo', 'p_phiCalo', 'p_eTileGap3Cluster', 'p_cellIndexCluster', 'p_phiModCalo', 'p_etaModCalo', 'p_dPhiTH3', 'p_R12', 'p_fTG3', 'p_weta2', 'p_Reta', 'p_Rphi', 'p_Eratio', 'p_f1', 'p_f3', 'p_Rhad', 'p_Rhad1', 'p_deltaEta1', 'p_deltaPhiRescaled2', 'p_TRTPID', 'p_TRTTrackOccupancy', 'p_numberOfInnermostPixelHits', 'p_numberOfPixelHits', 'p_numberOfSCTHits', 'p_numberOfTRTHits', 'p_numberOfTRTXenonHits', 'p_chi2', 'p_ndof', 'p_SharedMuonTrack', 'p_E7x7_Lr2', 'p_E7x7_Lr3', 'p_E_Lr0_HiG', 'p_E_Lr0_LowG', 'p_E_Lr0_MedG', 'p_E_Lr1_HiG', 'p_E_Lr1_LowG', 'p_E_Lr1_MedG', 'p_E_Lr2_HiG', 'p_E_Lr2_LowG', 'p_E_Lr2_MedG', 'p_E_Lr3_HiG', 'p_E_Lr3_LowG', 'p_E_Lr3_MedG', 'p_ambiguityType', 'p_asy1', 'p_author', 'p_barys1', 'p_core57cellsEnergyCorrection', 'p_deltaEta0', 'p_deltaEta2', 'p_deltaEta3', 'p_deltaPhi0', 'p_deltaPhi1', 'p_deltaPhi2', 'p_deltaPhi3', 'p_deltaPhiFromLastMeasurement', 'p_deltaPhiRescaled0', 'p_deltaPhiRescaled1', 'p_deltaPhiRescaled3', 'p_e1152', 'p_e132', 'p_e235', 'p_e255', 'p_e2ts1', 'p_ecore', 'p_emins1', 'p_etconeCorrBitset', 'p_ethad', 'p_ethad1', 'p_f1core', 'p_f3core', 'p_maxEcell_energy', 'p_maxEcell_gain', 'p_maxEcell_time', 'p_maxEcell_x', 'p_maxEcell_y', 'p_maxEcell_z', 'p_nCells_Lr0_HiG', 'p_nCells_Lr0_LowG', 'p_nCells_Lr0_MedG', 'p_nCells_Lr1_HiG', 'p_nCells_Lr1_LowG', 'p_nCells_Lr1_MedG', 'p_nCells_Lr2_HiG', 'p_nCells_Lr2_LowG', 'p_nCells_Lr2_MedG', 'p_nCells_Lr3_HiG', 'p_nCells_Lr3_LowG', 'p_nCells_Lr3_MedG', 'p_pos', 'p_pos7', 'p_poscs1', 'p_poscs2', 'p_ptconeCorrBitset', 'p_ptconecoreTrackPtrCorrection', 'p_r33over37allcalo', 'p_topoetconeCorrBitset', 'p_topoetconecoreConeEnergyCorrection', 'p_topoetconecoreConeSCEnergyCorrection', 'p_weta1', 'p_widths1', 'p_widths2', 'p_wtots1', 'p_e233', 'p_e237', 'p_e277', 'p_e2tsts1', 'p_ehad1', 'p_emaxs1', 'p_fracs1', 'p_DeltaE', 'p_E3x5_Lr0', 'p_E3x5_Lr1', 'p_E3x5_Lr2', 'p_E3x5_Lr3', 'p_E5x7_Lr0', 'p_E5x7_Lr1', 'p_E5x7_Lr2', 'p_E5x7_Lr3', 'p_E7x11_Lr0', 'p_E7x11_Lr1', 'p_E7x11_Lr2', 'p_E7x11_Lr3', 'p_E7x7_Lr0', 'p_E7x7_Lr1' ]


In [5]:
X = train[all_variables]
y = train['Truth']
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
'''
15-best feature selection
'''
KBest = SelectKBest(mutual_info_classif, k=15).fit(X, y)
short_feature_names = KBest.get_support(1)
X_new = X[X.columns[short_feature_names]] # final features

In [7]:

'''
make a list of 15 features used
'''
lista = X_new.columns.tolist()
X_test=test[lista]
lista

['p_Reta',
 'p_Rphi',
 'p_Eratio',
 'p_Rhad',
 'p_Rhad1',
 'p_deltaEta1',
 'p_deltaPhiRescaled2',
 'p_E7x7_Lr3',
 'p_deltaEta2',
 'p_ethad',
 'p_ethad1',
 'p_f3core',
 'p_ehad1',
 'p_E5x7_Lr3',
 'p_E7x11_Lr3']

In [8]:
X_new

Unnamed: 0,p_Reta,p_Rphi,p_Eratio,p_Rhad,p_Rhad1,p_deltaEta1,p_deltaPhiRescaled2,p_E7x7_Lr3,p_deltaEta2,p_ethad,p_ethad1,p_f3core,p_ehad1,p_E5x7_Lr3,p_E7x11_Lr3
0,0.948979,0.959359,0.966417,-0.046782,-0.028112,-0.000695,0.000808,201.940689,-0.000545,-1870.002930,-1123.725952,0.003414,-3632.921143,337.980713,470.177124
1,0.932377,0.840511,0.936768,-0.012263,-0.007378,0.001981,-0.001013,412.321869,0.001674,-574.843201,-345.829071,0.003056,-876.445007,412.321869,460.203613
2,0.919559,0.932751,0.976664,0.254060,0.241843,-0.017685,0.011909,3492.513672,-0.007667,7618.711914,7252.364746,0.045345,18070.835938,3492.513672,3333.052734
3,0.950418,0.951195,0.983606,0.018691,0.014779,0.001504,0.000747,755.622925,0.000201,684.548950,541.277222,0.004143,2071.589844,921.178040,1127.115356
4,0.933916,0.868344,0.950665,0.039488,0.016250,-0.002194,0.004280,-99.527588,0.000071,1379.716919,567.770386,0.000045,2306.320557,-75.167221,-188.182098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162495,0.927466,0.954334,0.942983,0.003995,-0.001940,0.000122,-0.000205,867.511475,0.000795,164.084869,-79.678810,0.005298,-256.751862,867.511475,748.347656
162496,0.941625,0.879206,0.942396,-0.013569,-0.000841,0.000140,0.011858,534.195374,-0.000785,-462.635895,-28.658590,0.003244,-104.133392,352.093262,665.589417
162497,0.951824,0.979545,0.956721,0.014000,0.020649,-0.000838,-0.005443,193.382263,0.000914,511.668976,754.674866,0.004215,2010.042236,287.444580,-13.175649
162498,0.916091,0.951640,0.962126,0.002730,-0.000733,-0.075802,-0.057194,1475.285034,-0.035139,105.687500,-28.383762,0.005570,-141.039429,1426.328613,1512.928101


In [9]:
X = X_new
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
import tensorflow as tf



X_test=test[lista]

input_dim = len(X_train.columns)

neurons = 64
epochs = 100
model = Sequential()

model.add(Dense(neurons, input_dim=input_dim, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=epochs, verbose=1, validation_split=0.2)
predictions = model.predict(X_val)



scores = model.evaluate(X_val, y_val, verbose=1)
print(scores)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [16]:
#Get the number of parameters
num_parameters = model.count_params()

# Print the number of parameters
print("Number of parameters:", num_parameters)

Number of parameters: 1089


In [11]:
y_pred =  model.predict(X_test)



In [12]:
y_pred[:100]

array([[9.0165353e-01],
       [8.8911432e-01],
       [8.9325309e-01],
       [9.0165353e-01],
       [1.6850144e-01],
       [9.0165353e-01],
       [9.0165353e-01],
       [9.0165353e-01],
       [9.0165353e-01],
       [9.0165353e-01],
       [4.9696597e-03],
       [9.0165353e-01],
       [9.0165353e-01],
       [1.9000391e-03],
       [9.0165353e-01],
       [9.0165353e-01],
       [5.5013103e-08],
       [9.0165353e-01],
       [9.0165353e-01],
       [9.0165353e-01],
       [6.3989537e-06],
       [9.0165353e-01],
       [8.8982046e-01],
       [9.0165353e-01],
       [9.0165353e-01],
       [8.1302917e-01],
       [9.0165353e-01],
       [9.0165353e-01],
       [7.6311415e-05],
       [8.2286257e-01],
       [9.0165353e-01],
       [9.0165353e-01],
       [5.1455587e-15],
       [9.0165353e-01],
       [3.0222135e-02],
       [5.4036741e-07],
       [1.9761922e-14],
       [9.0165353e-01],
       [9.0165353e-01],
       [9.0165353e-01],
       [9.0165353e-01],
       [9.016535

In [15]:
import numpy as np


# Generate indices for the index column
indices = np.arange(len(y_pred))




# Convert indices to a one-dimensional array
indices = np.ravel(indices)

# Round y_pred to 8 significant digits
y_pred_rounded = np.around(y_pred, decimals=8)

# Concatenate indices and cluster labels horizontally
data_with_index = np.column_stack((indices, y_pred))

# Save the data with index to a text file
np.savetxt("y_pred_NN.txt", data_with_index, fmt=["%d"] + ["%.8f"], delimiter=",", header="", comments="")
