# Higgs Dataset

## What happens when you smash things together near the speed of light?

Physicists searched for the Higgs boson for decades. Can machine learning help us tell apart the collision events in this dataset by their class label, and in doing so learn anything more about it?

In [22]:
import pandas as pd
import keras
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
%matplotlib inline
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(0)

In [23]:
# Column names for the header-less HIGGS CSV: the class label comes first,
# followed by the 28 feature columns in file order.
# Fix: 'jet 3 b-tag' previously carried a stray leading space (' jet 3 b-tag'),
# which made it inconsistent with the other b-tag columns and broke lookups
# by the expected name.
column_names = [
    'Class Label',
    'lepton pT', 'lepton eta', 'lepton phi',
    'missing energy magnitude', 'missing energy phi',
    'jet 1 pt', 'jet 1 eta', 'jet 1 phi', 'jet 1 b-tag',
    'jet 2 pt', 'jet 2 eta', 'jet 2 phi', 'jet 2 b-tag',
    'jet 3 pt', 'jet 3 eta', 'jet 3 phi', 'jet 3 b-tag',
    'jet 4 pt', 'jet 4 eta', 'jet 4 phi', 'jet 4 b-tag',
    'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb',
]

In [24]:
# Load the raw CSV; it has no header row, so the names are supplied explicitly.
df = pd.read_csv(
    'datasets/HIGGS.csv',
    header=None,
    names=column_names,
)

In [25]:
# Preview the first rows to check that the columns were parsed and named as expected.
df.head()

Unnamed: 0,Class Label,lepton pT,lepton eta,lepton phi,missing energy magnitude,missing energy phi,jet 1 pt,jet 1 eta,jet 1 phi,jet 1 b-tag,...,jet 4 eta,jet 4 phi,jet 4 b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


In [26]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,Class Label,lepton pT,lepton eta,lepton phi,missing energy magnitude,missing energy phi,jet 1 pt,jet 1 eta,jet 1 phi,jet 1 b-tag,...,jet 4 eta,jet 4 phi,jet 4 b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
count,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,...,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0,11000000.0
mean,0.5299203,0.9914658,-8.297618e-06,-1.327225e-05,0.9985364,2.613459e-05,0.9909152,-2.02752e-05,7.716199e-06,0.9999687,...,-5.756954e-06,1.744903e-05,1.0,1.03429,1.024805,1.050554,1.009742,0.9729596,1.033036,0.959812
std,0.499104,0.5653777,1.008827,1.006346,0.6000185,1.006326,0.4749747,1.009303,1.005901,1.027808,...,1.007694,1.006366,1.400209,0.6746354,0.3808074,0.1645763,0.3974453,0.5254063,0.3652556,0.3133378
min,0.0,0.2746966,-2.434976,-1.742508,0.0002370088,-1.743944,0.1375024,-2.969725,-1.741237,0.0,...,-2.497265,-1.742691,0.0,0.07507046,0.1986757,0.08304866,0.1320062,0.04786215,0.2951122,0.3307214
25%,0.0,0.5907533,-0.7383225,-0.8719308,0.5768156,-0.8712081,0.6789927,-0.687245,-0.8680962,0.0,...,-0.7141902,-0.8714789,0.0,0.7906095,0.8462266,0.9857525,0.7675732,0.6738168,0.8193964,0.7703901
50%,1.0,0.8533714,-5.415563e-05,-0.0002410638,0.8916277,0.0002125454,0.8948193,-2.543566e-05,5.813991e-05,1.086538,...,0.000372133,-0.0002642369,0.0,0.8949304,0.9506853,0.9897798,0.916511,0.8733798,0.9473447,0.8719701
75%,1.0,1.236226,0.7382142,0.870994,1.293056,0.8714708,1.17074,0.6871941,0.8683126,2.173076,...,0.7141017,0.8716055,3.101961,1.02473,1.083493,1.020528,1.142226,1.138439,1.140458,1.059248
max,1.0,12.09891,2.434868,1.743236,15.39682,1.743257,9.940391,2.969674,1.741454,2.173076,...,2.498009,1.743372,3.101961,40.19237,20.37278,7.992739,14.26244,17.76285,11.49652,8.374498


In [27]:
# True if any cell in the frame is missing; False means there is nothing to impute.
df.isnull().values.any()

False

In [28]:
# Separate the target column from the features.
#
# The guard makes this cell safe to re-run: the previous unconditional
# in-place drop raised a KeyError on a second execution because
# "Class Label" had already been removed from `df`.
# (A bare `labels.head()` in the middle of a cell displays nothing, so it
# has been dropped.)
if "Class Label" in df.columns:
    labels = df["Class Label"]
    # Rebind rather than mutate in place.
    df = df.drop(columns=["Class Label"])

# Last expression of the cell: preview the feature-only frame.
df.head()

Unnamed: 0,lepton pT,lepton eta,lepton phi,missing energy magnitude,missing energy phi,jet 1 pt,jet 1 eta,jet 1 phi,jet 1 b-tag,jet 2 pt,...,jet 4 eta,jet 4 phi,jet 4 b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,1.374992,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,0.812581,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,0.851737,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,2.423265,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,0.800872,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


In [29]:
# Split the rows 50/50 into train and test sets, then standardise the features.
#
# The scaler is fitted on the training rows only, so test-set statistics
# never influence the preprocessing (fitting on the full frame before the
# split leaks them). `random_state` pins the split so that re-running this
# cell reproduces it regardless of the global RNG state.
row_ids = np.arange(len(df))
train_idx, test_idx = train_test_split(row_ids, test_size=0.5, random_state=0)

scaler = StandardScaler()
scaler.fit(df.iloc[train_idx])
# All rows, scaled with the train-only mean/std.
scaled_X_vals = scaler.transform(df)

X_train, X_test = scaled_X_vals[train_idx], scaled_X_vals[test_idx]
y_train, y_test = labels.iloc[train_idx], labels.iloc[test_idx]


In [30]:
# Baseline: a decision tree with default settings, scored by F1 on the test split.
clf = DecisionTreeClassifier()
fitted = clf.fit(X_train, y_train)
y_pred = fitted.predict(X_test)  # fit() returns the estimator itself

f1 = f1_score(y_true=y_test, y_pred=y_pred)
print("f1 score: {}".format(f1))

f1 score: 0.6783970129475747


In [31]:
# Check the training matrix dimensions as (rows, features).
X_train.shape

(5500000, 28)

In [32]:
from keras.layers import Dense
from keras.models import Sequential

# Fully connected network with sigmoid activations throughout:
# 28 inputs -> 28 -> 16 -> 4 -> 1 output unit.
seq_model = Sequential([
    Dense(28, activation='sigmoid', input_shape=(28,)),
    Dense(16, activation='sigmoid'),
    Dense(4, activation='sigmoid'),
    Dense(1, activation='sigmoid'),
])
seq_model.compile(
    optimizer='SGD',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
seq_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_17 (Dense)             (None, 28)                812       
_________________________________________________________________
dense_18 (Dense)             (None, 16)                464       
_________________________________________________________________
dense_19 (Dense)             (None, 4)                 68        
_________________________________________________________________
dense_20 (Dense)             (None, 1)                 5         
Total params: 1,349
Trainable params: 1,349
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Train the sigmoid network for 10 epochs, evaluating on the test split after each epoch.
seq_model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=64,
    verbose=1,
)

Train on 5500000 samples, validate on 5500000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a3652efd0>

In [34]:
# Same architecture as the sigmoid network, but with tanh hidden activations:
# 28 inputs -> 28 -> 16 -> 4 -> 1 output unit.
tanh_model = Sequential()
tanh_model.add(Dense(28, activation='tanh', input_shape=(28,)))
tanh_model.add(Dense(16, activation='tanh'))
tanh_model.add(Dense(4, activation='tanh'))
# The output unit must be sigmoid, not tanh: binary_crossentropy expects a
# probability in [0, 1], while tanh emits values in [-1, 1], for which the
# loss is not meaningful.
tanh_model.add(Dense(1, activation='sigmoid'))
tanh_model.compile(loss='binary_crossentropy', optimizer='SGD', metrics=['accuracy'])
tanh_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_21 (Dense)             (None, 28)                812       
_________________________________________________________________
dense_22 (Dense)             (None, 16)                464       
_________________________________________________________________
dense_23 (Dense)             (None, 4)                 68        
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 5         
Total params: 1,349
Trainable params: 1,349
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Train the tanh network for 10 epochs, evaluating on the test split after each epoch.
tanh_model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=64,
    verbose=1,
)

Train on 5500000 samples, validate on 5500000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a3657eda0>