Random Forest classifier

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 12})

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

from sklearn.ensemble import RandomForestClassifier

Load the dataset and split it into training, test and validation data

In [None]:
# Load the dataset.
# NOTE(review): unpickling can execute arbitrary code -- only load
# 'dataset.pkl' when it comes from a trusted source.
data = pd.read_pickle('dataset.pkl')

# Hold out the validation data: all rows with run >= 97. The 'run' column is
# only used for this selection and is dropped from both parts afterwards.
val_data = data.loc[data.run >= 97].drop('run', axis=1)
data = data.loc[data.run <= 96].drop('run', axis=1)

# Split the remaining rows, in their stored order (no shuffling), into
# training and test data. test_size is the fraction of rows that goes into
# the test set; the split position is derived from it in a single place.
test_size = 0.25
split_index = int(len(data) * (1 - test_size))
train_complete = data[:split_index]
test_complete = data[split_index:]

def get_data(factor=1):
    """Return the training, test and validation data, optionally reduced in size.

    Each of the module-level frames train_complete, test_complete and val_data
    is cut down to its first len(frame) // factor rows. With the default
    factor of 1 the complete frames are returned. No columns are removed.

    Parameters:
        factor: divisor applied to the number of rows of every subset.

    Returns:
        A tuple (train, test, val) holding the three, possibly shortened, subsets.
    """
    subsets = []
    for frame in (train_complete, test_complete, val_data):
        n_rows = len(frame) // factor
        subsets.append(frame[:n_rows])
    train, test, val = subsets

    # Report the resulting sizes so that a reduced run is visible in the output
    print('Length trainings data:', len(train), 'Length test data:', len(test), 'Length validation data:', len(val), '\n')
    return train, test, val

def get_columns():
    """Return the column labels of the validation data.

    The labels are read from the module-level val_data frame, i.e. after the
    'run' column has been removed; the target column 'y' is still included.
    """
    columns = val_data.columns
    return columns

# Fetch the complete (unreduced) training, test and validation subsets
train, test, val = get_data(factor=1)

# Show the first ten rows of the training/test part of the dataset
data.head(n=10)

Load a model

In [None]:
import joblib

# Load a previously trained classifier from disk.
# The path uses a forward slash: it works on every platform (including
# Windows) and avoids the invalid '\m' escape sequence that a backslash
# would put into the string literal.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary
# code -- only load model files from a trusted source.
forest_clf = joblib.load('models/model_0043')

# Fetch the complete (unreduced) data subsets for evaluating the loaded model
train, test, val = get_data(factor=1)

Creating and training the model

In [None]:
# Get the training, test and validation data
factor = 1
train, test, val = get_data(factor=factor)

# Initialize the random forest classifier.
# max_features='sqrt' is what 'auto' selected for classifiers; 'auto' itself
# was deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
# NOTE(review): no random_state is set, so the trained forest (and every
# score derived from it) differs from run to run.
forest_clf = RandomForestClassifier(
    n_estimators=40,
    criterion='entropy',
    max_features='sqrt',
    n_jobs=-1,
)

# Train the classifier on the training data: every column except the target
# 'y' is used as a feature.
forest_clf.fit(train.drop('y', axis=1), train.y)

Compute the accuracy, F1-score and precision

In [None]:
# Print accuracy, F1-score and precision for the test and the validation data.
# Both subsets go through the same code path so that every printed label is
# guaranteed to name the subset the score was computed on.
for subset_name, subset in (('test', test), ('validation', val)):
    # Predict hard class labels from all columns except the target 'y'
    y_pred = forest_clf.predict(subset.drop('y', axis=1))
    print('\nAccuracy on %s data:' % subset_name, accuracy_score(subset.y, y_pred))
    print('F1-score on %s data:' % subset_name, f1_score(subset.y, y_pred))
    print('Precision on %s data:' % subset_name, precision_score(subset.y, y_pred))

Save the model

In [None]:
# joblib is imported as the standalone package: the copy that used to be
# bundled as sklearn.externals.joblib was removed in scikit-learn 0.23.
import joblib

# Persist the trained classifier. The forward slash keeps the path portable
# and avoids the invalid '\m' escape sequence a backslash would create.
joblib.dump(forest_clf, 'models/model_0099')

Compute the precision and recall for different thresholds and plot the precision-recall curve

In [None]:
# Predicted probability of the second class (column 1 of predict_proba) for
# every validation sample -- presumably the positive label; verify against
# forest_clf.classes_.
y_pred = forest_clf.predict_proba(val.drop('y', axis=1))[:, 1]

# Precision and recall for every decision threshold on that probability
precision, recall, thresholds = precision_recall_curve(val.y, y_pred)

# Area under the precision-recall curve (recall on the x-axis)
auc_score = auc(recall, precision)
print('Area under curve:', auc_score)

# Draw the precision-recall curve on fixed [0, 1] x [0, 1] axes
plt.axis([0, 1, 0, 1])
plt.plot(recall, precision, linestyle='-', label='Random Forest')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(fontsize=10)
plt.tight_layout()

# Write the figure to disk before showing it
plt.savefig('forest_precision_recall.pdf', format='pdf')
plt.show()

Analyse and plot the feature importance

In [None]:
# Retrieve the feature importances from the random forest and print them
# feature by feature. The names are taken from the feature columns only
# (target 'y' removed), so that position i of the importances and of the
# names refers to the same feature wherever 'y' sits in the frame.
importance = forest_clf.feature_importances_
feature_names = val.drop('y', axis=1).columns
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v), '-', feature_names[i])

# Shorten the feature names for use as tick labels
ticks = [w.replace('cosAngleBetweenMomentumAndVertexVector', 'cosMomentumVertex') for w in feature_names]
ticks = [w.replace('_1', '') for w in ticks]

# Features at even positions are plotted as 'First photon', those at odd
# positions as 'Second photon'; each pair shares the label of its even entry.
# NOTE(review): this assumes an even number of features that alternate
# between the two photons -- confirm against the dataset's column layout.
first_photon = importance[0::2]
second_photon = importance[1::2]
labels = ticks[0::2]

# Order the pairs by descending first-photon importance. The sort is stable,
# so pairs with equal importance keep their original column order.
order = np.argsort(-first_photon, kind='stable')
first_photon = first_photon[order]
second_photon = second_photon[order]
labels = [labels[k] for k in order]

# Two bars per feature pair, placed side by side; pairs are 2 units apart
bar_width = 0.8
index = np.arange(0, len(first_photon) * 2, 2)
opacity = 0.8

plt.bar(index, first_photon, bar_width,
    alpha=opacity,
    color='b',
    label='First photon')

plt.bar(index + bar_width, second_photon, bar_width,
    alpha=opacity,
    color='g',
    label='Second photon')

# Centre each tick between the two bars of its pair
plt.xticks(index + bar_width / 2, labels, rotation=310, ha='left')

plt.xlabel('Feature')
plt.ylabel('Feature importance / a.u.')
plt.legend(fontsize=10)

# Write the figure to disk before showing it
plt.savefig('forest_feature_importance.pdf', format='pdf', bbox_inches = "tight")
plt.show()

In [None]:
# Collect the predicted probabilities of the second class (column 1 of
# predict_proba) for the test and the validation data; the third slot is
# left empty (None).
# NOTE(review): this rebinds the name `data`, which earlier cells use for the
# dataset frame -- re-run the loading cell before using `data` as a frame again.
# NOTE(review): the file name refers to model_0043 regardless of which model
# is currently held in forest_clf -- confirm that this is intended.
data = [
    forest_clf.predict_proba(test.drop('y', axis=1))[:, 1],
    forest_clf.predict_proba(val.drop('y', axis=1))[:, 1],
    None,
]

# The entries have different lengths/types, so they are stored in an explicit
# 1-D object array: handing the ragged list straight to np.save raises a
# ValueError on NumPy >= 1.24. Read the file back with
# np.load(..., allow_pickle=True).
payload = np.empty(len(data), dtype=object)
for slot, entry in enumerate(data):
    payload[slot] = entry
np.save('models/model_0043_data.npy', payload)