In [None]:
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier # conda install -c conda-forge xgboost

# Based on https://stackabuse.com/gradient-boosting-classifiers-in-python-with-scikit-learn/

In [None]:
def unison_shuffled_copies(a, b):
    """Return copies of `a` and `b` reordered by one shared random permutation.

    Based on
    https://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison

    Both inputs are indexed along their first axis with the same permutation,
    so entry ``k`` of the first result still belongs to entry ``k`` of the
    second result. The permutation is drawn from NumPy's global random state
    (``np.random``).

    Parameters
    ----------
    a, b : array-like
        Two sequences of equal length. NumPy arrays are used as they are;
        lists/tuples are converted to arrays first.

    Returns
    -------
    tuple
        ``(a_shuffled, b_shuffled)`` as new arrays; the inputs are not
        modified.

    Raises
    ------
    AssertionError
        If `a` and `b` differ in length.
    """
    # Indexing with a permutation array needs ndarrays; asanyarray leaves
    # existing arrays untouched, so array callers see no change.
    a = np.asanyarray(a)
    b = np.asanyarray(b)
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [None]:
# Load both result tables. ndmin=2 keeps a single-row file two-dimensional,
# so the column slice below works regardless of the number of rows.
wildtype_data = np.loadtxt ("result0_wt.txt", ndmin=2)
wildtype_data = wildtype_data [:,1:] # delete first column which is frame number
wildtype_label = np.zeros(len(wildtype_data)) # set wildtype labels to 0

mutant_1_data = np.loadtxt ("result1_D132-H.txt", ndmin=2)
mutant_1_data = mutant_1_data [:,1:] # delete first column which is frame number
mutant_1_label = np.ones(len(mutant_1_data)) # set mutant labels to 1

print('Wildtype Training Data Shape:', wildtype_data.shape)
print('Wildtype Label Data Shape:   ', wildtype_label.shape)
print('D132-H   Training Data Shape:', mutant_1_data.shape)
print('D132-H   Label Data Shape:   ', mutant_1_label.shape)

# Overlay 1000 randomly chosen rows of each class, drawn almost transparent
# (alpha = 0.002), and save the figure.
# Each array gets its own random row index: the two files need not contain the
# same number of rows, and reusing the wildtype index on the mutant array
# would raise IndexError whenever the mutant file is the shorter one.
for j in range(1000): # print out examples of random data sets
    i = np.random.randint(0, len(wildtype_data)) # random wildtype row
    k = np.random.randint(0, len(mutant_1_data)) # random mutant row
    plt.plot(wildtype_data[i], color = "blue", alpha = 0.002)
    plt.plot(mutant_1_data[k], color = "red", alpha = 0.002)
plt.savefig("input.png", dpi=300)

In [None]:
# Build a single feature matrix and a single label vector covering both classes
# (wildtype rows first, mutant rows after them).
lcp_data = np.concatenate((wildtype_data, mutant_1_data), axis=0)
print ("Combined input_data.shape:", lcp_data.shape)

# Labels are joined in the same order as the rows above.
label_data = np.concatenate((wildtype_label, mutant_1_label))
print ("Combined label_data.shape:", label_data.shape, "\n")

# One shared permutation is applied to rows and labels, so every row keeps
# its own label after shuffling.
lcp_data, label_data = unison_shuffled_copies(lcp_data, label_data)

In [None]:
# Scale the shuffled data and split it 80/20 into a training and a test set.
# NOTE: lcp_data is rebound to its scaled version below, so running this cell
# again without re-running the previous one divides by 100 a second time.
upper_limit          = len (lcp_data)          # total number of combined rows
upper_training_limit = int(upper_limit * 0.8)  # the first 80% of the rows are used for training
print ("Total number of combined data points:\t\t\t",upper_limit, "\nTotal number of data points selected for training:\t", upper_training_limit, "\n")

# Fixed scale factor; presumably the raw values lie on a 0-100 scale -- TODO confirm.
lcp_data = lcp_data/100

# Rows before the cut-off go to training, the remaining 20% are held out for testing.
train_data, test_data = lcp_data [:upper_training_limit], lcp_data [upper_training_limit:]

# The labels are cut at the same position so they stay aligned with their rows.
train_label, test_label = label_data [:upper_training_limit], label_data [upper_training_limit:]

print (train_data.shape, test_data.shape)
print (train_label.shape, test_label.shape)

In [None]:
# Fit an XGBoost classifier with its default settings on the training split.
xgb_clf = XGBClassifier()
xgb_clf.fit(X=train_data, y=train_label)

# Evaluate on the held-out split; the value is reported below as the classification accuracy.
score = xgb_clf.score(X=test_data, y=test_label)
print('\n\n Classification Accuracy:\t', score)