# 1. Preparing Data

## 1.1 Load and Check Data

In [19]:
# import data
import pandas as pd
df = pd.read_csv("HeteroticOrbifoldMSSMs.csv")

In [6]:
# check if data is loaded correctly
print(df.shape)
df.head()

(124941, 15)


Unnamed: 0.1,Unnamed: 0,Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Y9,Y10,Y11,Y12,Y13
0,0,Z2xZ2,7,3,238,47,1,1,9,6,1,6,0,12,42
1,1,Z2xZ2,7,3,214,47,1,1,5,2,1,6,0,12,50
2,2,Z2xZ2,7,3,250,51,1,1,7,4,1,8,0,16,48
3,3,Z2xZ2,7,3,250,35,1,1,7,4,1,10,0,12,46
4,4,Z2xZ2,7,3,234,51,1,1,7,4,1,6,0,20,42


In [7]:
# remove first column
df = df.drop("Unnamed: 0", axis = 1)

In [8]:
# check if data is now correct
print(df.shape)
df.head()

(124941, 14)


Unnamed: 0,Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Y9,Y10,Y11,Y12,Y13
0,Z2xZ2,7,3,238,47,1,1,9,6,1,6,0,12,42
1,Z2xZ2,7,3,214,47,1,1,5,2,1,6,0,12,50
2,Z2xZ2,7,3,250,51,1,1,7,4,1,8,0,16,48
3,Z2xZ2,7,3,250,35,1,1,7,4,1,10,0,12,46
4,Z2xZ2,7,3,234,51,1,1,7,4,1,6,0,20,42


## 1.2 Split in Training and Test Set, One Hot Encode Geometry and other features

In [9]:
# one hot encode geometry
# pd.get_dummies() just One Hot Encodes categorical data
df = pd.get_dummies(df)
# check data
df.head()

Unnamed: 0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Y9,Y10,...,Y0_Z2xZ6-I,Y0_Z3xZ3,Y0_Z3xZ6,Y0_Z4,Y0_Z4xZ4,Y0_Z6-I,Y0_Z6-II,Y0_Z6xZ6,Y0_Z8-I,Y0_Z8-II
0,7,3,238,47,1,1,9,6,1,6,...,0,0,0,0,0,0,0,0,0,0
1,7,3,214,47,1,1,5,2,1,6,...,0,0,0,0,0,0,0,0,0,0
2,7,3,250,51,1,1,7,4,1,8,...,0,0,0,0,0,0,0,0,0,0
3,7,3,250,35,1,1,7,4,1,10,...,0,0,0,0,0,0,0,0,0,0
4,7,3,234,51,1,1,7,4,1,6,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# save column names in list label
label = list(df)
# check label
print(label)

['Y1', 'Y2', 'Y3', 'Y4', 'Y5', 'Y6', 'Y7', 'Y8', 'Y9', 'Y10', 'Y11', 'Y12', 'Y13', 'Y0_Z12-I', 'Y0_Z12-II', 'Y0_Z2xZ2', 'Y0_Z2xZ4', 'Y0_Z2xZ6-I', 'Y0_Z3xZ3', 'Y0_Z3xZ6', 'Y0_Z4', 'Y0_Z4xZ4', 'Y0_Z6-I', 'Y0_Z6-II', 'Y0_Z6xZ6', 'Y0_Z8-I', 'Y0_Z8-II']


In [11]:
# split in input and output
# output: orbifold geometry, which we like to predict
# input: remaining features, which we like to use for predictions
X = df[label[0:13]]
y = df[label[13:]]

In [12]:
# check if split is made correct
print(X.head())
print(y.head())

   Y1  Y2   Y3  Y4  Y5  Y6  Y7  Y8  Y9  Y10  Y11  Y12  Y13
0   7   3  238  47   1   1   9   6   1    6    0   12   42
1   7   3  214  47   1   1   5   2   1    6    0   12   50
2   7   3  250  51   1   1   7   4   1    8    0   16   48
3   7   3  250  35   1   1   7   4   1   10    0   12   46
4   7   3  234  51   1   1   7   4   1    6    0   20   42
   Y0_Z12-I  Y0_Z12-II  Y0_Z2xZ2  Y0_Z2xZ4  Y0_Z2xZ6-I  Y0_Z3xZ3  Y0_Z3xZ6  \
0         0          0         1         0           0         0         0   
1         0          0         1         0           0         0         0   
2         0          0         1         0           0         0         0   
3         0          0         1         0           0         0         0   
4         0          0         1         0           0         0         0   

   Y0_Z4  Y0_Z4xZ4  Y0_Z6-I  Y0_Z6-II  Y0_Z6xZ6  Y0_Z8-I  Y0_Z8-II  
0      0         0        0         0         0        0         0  
1      0         0        0         0  

In [13]:
# One hot encode also input data, since this works better
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
X_ohe = ohe.fit_transform(X)
# Check format
X_ohe.toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
# make same (train test split with 20 percent test data) as above
from sklearn.model_selection import train_test_split
X_train_ohe, X_test_ohe, y_train, y_test = train_test_split(X_ohe, y, test_size = 0.2, random_state = 1)

In [15]:
print(X_train_ohe.shape)
print(X_test_ohe.shape)

(99952, 704)
(24989, 704)


# 2. Set up Neural Network

In [16]:
# setup model
from keras import models
from keras import layers
from keras import optimizers
from keras import regularizers

model = models.Sequential()
model.add(layers.Dense(600, activation = 'relu',
                       input_shape = (704,)))
model.add(layers.Dense(14, activation = 'softmax'))

model.compile(optimizer = optimizers.RMSprop(lr = 0.001),
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

# show summary of model
model.summary()

Using TensorFlow backend.
W0711 15:31:18.983999 139745266288448 deprecation_wrapper.py:119] From /space/ga97hil/anaconda3/envs/keras/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0711 15:31:19.005706 139745266288448 deprecation_wrapper.py:119] From /space/ga97hil/anaconda3/envs/keras/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0711 15:31:19.008459 139745266288448 deprecation_wrapper.py:119] From /space/ga97hil/anaconda3/envs/keras/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0711 15:31:19.042044 139745266288448 deprecation_wrapper.py:119] From /space/ga97hil/anaconda3/envs/keras/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optim

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 600)               423000    
_________________________________________________________________
dense_2 (Dense)              (None, 14)                8414      
Total params: 431,414
Trainable params: 431,414
Non-trainable params: 0
_________________________________________________________________


In [17]:
# fit model
history = model.fit(X_train_ohe, y_train,
                    validation_split = 0.2,
                    batch_size = 256,
                    epochs = 30)

W0711 15:31:19.184976 139745266288448 deprecation.py:323] From /space/ga97hil/anaconda3/envs/keras/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0711 15:31:19.290042 139745266288448 deprecation_wrapper.py:119] From /space/ga97hil/anaconda3/envs/keras/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 79961 samples, validate on 19991 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [18]:
# Plot model history
import matplotlib.pyplot as plt

loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) +1)

plt.plot(epochs, loss, 'bo', label = 'loss training')
plt.plot(epochs, val_loss, 'b', label = 'loss validation')
plt.title('Estimate of Loss Funcion training/validation')
plt.xlabel('epochs')
plt.ylabel('estimate of loss function')
plt.legend()
plt.show()

acc = history.history['acc']
val_acc = history.history['val_acc']

plt.plot(epochs, acc, 'bo', label = 'training')
plt.plot(epochs, val_acc, 'b', label = 'validataion')
plt.title('Accuracy of training/validation')
plt.xlabel('epochs')
plt.ylabel('estimate of accuracy')
plt.legend()
plt.show()

print('max Accuracy:', max(val_acc))
print('Epochs to max Accuracy:', val_acc.index(max(val_acc)))
print('min Loss:', min(val_loss))
print('Epochs to min Loss;', val_loss.index(min(val_loss)))

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

max Accuracy: 0.8936021209663558
Epochs to max Accuracy: 28
min Loss: 0.35158313018024834
Epochs to min Loss; 14


## Example of model with some regularization

# 3. Predictions with Neural Network

## Warning: only do this if you have already adjusted your hyperparameters.

## If you touch the hyperparameters again after testing, this leads to Data Leakage and you cannot trust the predictions on the test set anymore

In [60]:
# setup model
from keras import models
from keras import layers
from keras import optimizers
from keras import regularizers

model = models.Sequential()
model.add(layers.Dense(600, activation = 'relu',
                       input_shape = (704,)))
model.add(layers.Dense(14, activation = 'softmax'))

model.compile(optimizer = optimizers.RMSprop(lr = 0.001),
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

# show summary of model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_21 (Dense)             (None, 600)               423000    
_________________________________________________________________
dense_22 (Dense)             (None, 14)                8414      
Total params: 431,414
Trainable params: 431,414
Non-trainable params: 0
_________________________________________________________________


In [61]:
# train again without validation split
model.fit(X_train_ohe, y_train, 
          batch_size = 256, 
          epochs = 13)

Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13


<keras.callbacks.History at 0x7f87eccae9b0>

In [62]:
# make predictions
pred = model.predict(X_test_ohe)
y_hat = pred.argmax(axis=1)                  # take max argument of vector as prediction
y_true = y_test.values.argmax(axis=1)        # take max argument of vector as prediction

In [67]:
print(y_hat[0:20])
print(y_true[0:20])

[ 8  8 13  3  0  8  3  3  3  8  8  3  3  5  8  8  3  3  8  3]
[ 8  8 13  3  0  8  3  3  3  8  8  3  3  5  8 11  3  3  8  3]


In [64]:
# confusion matrix
import numpy as np
np.set_printoptions(linewidth=200)             # just for nicer printing

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_hat, y_true))

[[  151     0     0     7     0     1     2     0    11     0     3     0     0     0]
 [    0    37     0     4     3     0     1     0     1     0     0     0     0     7]
 [    0     1   272    20     0     0     0     2     0     0     1     0     0     0]
 [   16    32    57 10758   120     5    27     7   364     0    70    12    18   103]
 [    0     1     0    10    38     0     3     0     3     0     0     4     0     3]
 [    2     0     0     1     0   612    23     0    10     0     1     1     0     0]
 [    8     0     1     9     3    11   622     0   176     2     5    13     0     0]
 [    0     1     0     1     0     0     0    33     0     0     0     0     0     0]
 [   34     3     0   188    37    20   329     0  9038     1     8   415    16    12]
 [    0     0     0     0     0     0     0     0     0     9     0     0     0     0]
 [    6     4     5    21     4     1     2     0     5     0   161     0     2    17]
 [    0     0     0     5     2     0    16

In [70]:
# column names of Y
list(y)

['Y0_Z12-I',
 'Y0_Z12-II',
 'Y0_Z2xZ2',
 'Y0_Z2xZ4',
 'Y0_Z2xZ6-I',
 'Y0_Z3xZ3',
 'Y0_Z3xZ6',
 'Y0_Z4',
 'Y0_Z4xZ4',
 'Y0_Z6-I',
 'Y0_Z6-II',
 'Y0_Z6xZ6',
 'Y0_Z8-I',
 'Y0_Z8-II']

In [71]:
# calculate accuracy of our model on test set
sum(y_hat == y_true)/len(y_hat)

0.8993156989075193

Hence, we are able to predict around 89.9 pecent of the time the right orbifold geometry