# 1. Prepare Data

In [16]:
# load data
import pandas as pd
df = pd.read_csv("HeteroticOrbifoldMSSMs.csv")
df = df.drop("Unnamed: 0", axis = 1)
df.head()

Unnamed: 0,Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Y9,Y10,Y11,Y12,Y13
0,Z2xZ2,7,3,238,47,1,1,9,6,1,6,0,12,42
1,Z2xZ2,7,3,214,47,1,1,5,2,1,6,0,12,50
2,Z2xZ2,7,3,250,51,1,1,7,4,1,8,0,16,48
3,Z2xZ2,7,3,250,35,1,1,7,4,1,10,0,12,46
4,Z2xZ2,7,3,234,51,1,1,7,4,1,6,0,20,42


In [8]:
# transform Y0 to int and save as y
from sklearn.preprocessing import LabelEncoder
# initialize LabelEncoder
le = LabelEncoder()
# fit and transform Y0
y = le.fit_transform(df.Y0)
# show how labels are transformed
trafo = {i:k for i, k in zip(le.classes_, le.transform(le.classes_))}
trafo

{'Z12-I': 0,
 'Z12-II': 1,
 'Z2xZ2': 2,
 'Z2xZ4': 3,
 'Z2xZ6-I': 4,
 'Z3xZ3': 5,
 'Z3xZ6': 6,
 'Z4': 7,
 'Z4xZ4': 8,
 'Z6-I': 9,
 'Z6-II': 10,
 'Z6xZ6': 11,
 'Z8-I': 12,
 'Z8-II': 13}

In [9]:
# save Y1, Y2, ..., Y13 in X
X = df.iloc[ : ,1:13]
# check format
X.head()

Unnamed: 0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Y9,Y10,Y11,Y12
0,7,3,238,47,1,1,9,6,1,6,0,12
1,7,3,214,47,1,1,5,2,1,6,0,12
2,7,3,250,51,1,1,7,4,1,8,0,16
3,7,3,250,35,1,1,7,4,1,10,0,12
4,7,3,234,51,1,1,7,4,1,6,0,20


In [10]:
# split in training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
# shapes
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(99952, 12)
(24989, 12)
(99952,)
(24989,)


# 2. Fit Decision Tree

In [11]:
# fit tree on training set with max depth = 4
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth = 2, random_state = 15)
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=15, splitter='best')

In [12]:
# visualize tree
from pydotplus import graph_from_dot_data
from sklearn.tree import export_graphviz
dot_data = export_graphviz(tree,
                           filled = True,
                           rounded = True,
                           class_names = ['Z12-I', 'Z12-II', 'Z2xZ2', 'Z2xZ4',
                                           'Z2xZ6-I', 'Z3xZ3', 'Z3xZ6', 'Z4',
                                           'Z4xZ4', 'Z6-I', 'Z6-II', 'Z6xZ6',
                                           'Z8-I', 'Z8-II'],
                           feature_names = list(X),
                           out_file = None)
graph = graph_from_dot_data(dot_data)
# graph.write_png('Plots/tree.png')

True

# 3. Prediction on test set

In [13]:
# make prediction
pred = tree.predict(X_test)

In [14]:
# increase linewidth of printing command for nicer printing
import numpy as np
np.set_printoptions(linewidth = 100)

# Plot confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(pred, y_test)

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [ 199,   78,  326, 9718,  144,  559,  573,   43, 4854,   11,  237,  178,  190,  471],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [  20,    4,    9, 1326,   66,   91,  454,    0, 4882,    1,   22,  521,    3,    9],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   

In [15]:
# claculate accuracy
sum(pred == y_test) / len(y_test)

0.5842570731121693

Around 58.4 percent of predictions are right with a Decision Tree with just 2 splits (better than 44.1 percent of null value)