In [None]:
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as TTSplit
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RepeatedKFold
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import KFold
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.svm import SVC

# One-letter residue codes a site can hold. The position of a letter in this
# list is the column it occupies in the per-site one-hot vector built by transform().
mutations = list(
    'RHKDE'
    'STNQ'
    'CUGP'
    'AILMFWYV'
)

# Load the training table from disk.
train_data = pd.read_csv('train.csv')
# Separate the model inputs ('Sequence' column) from the targets ('Active' column).
mutation_data, label_data = train_data['Sequence'], train_data['Active']

def transform(s):
  """One-hot encode a residue string against the module-level `mutations` list.

  Every character of `s` contributes one segment of len(mutations) entries that
  is all 0 except for a single 1 at that character's index in `mutations`. The
  segments are laid out back to back in sequence order, so the result has
  len(s) * len(mutations) entries (84 for a 4-letter sequence and 21 residues).

  Args:
    s: iterable of one-letter residue codes (in this notebook a 4-character string).

  Returns:
    A flat list of ints (0 or 1).

  Raises:
    ValueError: if a character of `s` is not in `mutations`; the message names
      the offending character, its position and the full sequence.
  """
  width = len(mutations)
  # Allocate the whole vector once instead of copying and concatenating per site.
  encoded = [0] * (width * len(s))
  for site, residue in enumerate(s):
    try:
      column = mutations.index(residue)
    except ValueError:
      # list.index only reports "'X' is not in list"; add enough context to
      # locate the bad row in the input file.
      raise ValueError(
          f"unknown residue {residue!r} at position {site} of sequence {s!r}; "
          f"expected one of {''.join(mutations)}"
      ) from None
    encoded[site * width + column] = 1
  return encoded

# One-hot encode every training sequence.
# Series.map is used instead of DataFrame.applymap, which is deprecated since
# pandas 2.1; to_frame() keeps mutation_nn_old a one-column ('Sequence') DataFrame
# holding one encoded list per row, exactly as before.
mutation_nn_old = mutation_data.map(transform).to_frame()
# Expand each encoded list into its own row of 0/1 columns, keeping the original row index.
mutation_nn_new = pd.DataFrame(mutation_nn_old['Sequence'].tolist(), index=mutation_nn_old.index)
#mutation_nn_new.to_csv('mutations.csv')

#################
#The idea behind transform is the following:
#For each site, there are 21 amino acids and only 1 of them could occupy that site. Hence 21 possibilities for each site.
#The idea was to transform the input data (mutation_data) to account for that.
#That is, for a given 4 letter sequence, for each site we first make a list of zero-entries of length 21 (for 21 possible amino acids) 
#and mark only one entry with a 1, which corresponds to the amino acid occupying that site by using "mutations", with which we map the letter to the index of "mutations".
#Then for each site, there is exactly one 1 entry, which marks which amino acid is in that site.
#Then we can concatenate these 4 lists to get a list of 84 entries, all of them 0 except for 4, which corresponds exactly to the given 4-letter-sequence.
#We do that for all 4-letter sequences that we have in "mutation_data".
#In that way we transform the input data as a pre-processing step
#################

In [None]:
# Number of rows labelled active (label == 1). The comparison is vectorised
# instead of looping over the Series in Python; int() keeps the displayed
# value a plain Python integer.
int((label_data == 1).sum())

4213

In [None]:
# Display the one-hot encoded training features (pandas abbreviates large frames).
mutation_nn_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111995,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
111996,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
111997,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
111998,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Default path: train the MLP classifier on the complete training set.
x_features, y_labels = mutation_nn_new.to_numpy(), label_data.to_numpy()

# Alternative path: to check the validation score of the classifier, use the
# line below instead. It splits the data into a training set (85%) and a
# held-out validation set (15%).
#x_features, x_validation, y_labels, y_validation = TTSplit(mutation_nn_new.to_numpy(), label_data.to_numpy(), train_size=0.85, test_size=0.15)

In [None]:
x_features  # sanity check: inspect the encoded feature matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
ls

[0m[01;34msample_data[0m/  submission.csv  test.csv  train.csv


In [None]:
y_labels  # sanity check: inspect the label vector

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
#################
#The code below shows how we searched for a good classifier that is relatively quick to train and has a good F1 score.
#The idea was to train each model (e.g. different classifier "classes" [e.g. MLPClassifier, SVC, etc.] or the same class with varied hyperparameters) and check which ones perform well enough on the validation set (a score at least as high as the public score baseline was considered "good enough").
#One additional criterion, which was not relevant for performance, was time. When a model took too long to train, it was not considered any further, because other models could potentially perform at least as well and require less time to train.
#After finding some classifiers that were good enough, we trained each of them from scratch again, but this time on the entire training set.
#Then we compared their respective public scores to get a feeling for which classifiers were better than their rivals.
#It turns out that some MLP classifiers performed well on both the validation set and the public scoreboard.
#Those MLP classifiers were picked, and the best of them (here MLPC2 or MLPC8) were considered the "best" of all the explored candidates.
#################

#################
#Note that by training on the data set we mean, in this source file, training on the UNBALANCED data set.
#In another, separate source file, we actually took care of the UNBALANCED data set and made it balanced by subsampling,
#such that every 100th epoch the PyTorch neural net was trained on a different but balanced data set containing as many active labels as inactive ones.
#It turns out that when using neural nets, the performance on both the validation set and the public score is nearly identical, and both versions
#of the neural net pass the baseline by a relatively big margin.
#################


#DTC = DecisionTreeClassifier(random_state=0)
#RFC = RandomForestClassifier(n_estimators=400, random_state=573853, warm_start=True)

#MLPC1 = MLPClassifier(random_state=403829, max_iter=10000, solver='adam', warm_start=True,activation='tanh',learning_rate='adaptive',shuffle=True, tol=0.00001)
# MLPC2: Adam solver with logistic activations.
# NOTE(review): scikit-learn documents warm_start=True as reusing the previous
# fit() solution as initialization, so repeated fit() calls on this object
# continue training rather than restart it.
# NOTE(review): scikit-learn documents learning_rate as only used when
# solver='sgd', so 'adaptive' presumably has no effect here - confirm.
MLPC2 = MLPClassifier(
    solver='adam',
    activation='logistic',
    learning_rate='adaptive',
    warm_start=True,
    shuffle=True,
    max_iter=10000,
    tol=1e-5,
    random_state=403829,
)
#MLPC3 = MLPClassifier(random_state=403829, max_iter=10000, solver='adam', warm_start=True,activation='relu',learning_rate='adaptive',shuffle=True, tol=0.00001)
#MLPC4 = MLPClassifier(random_state=403829, max_iter=10000, solver='sgd', warm_start=True,activation='tanh',learning_rate='adaptive',shuffle=True, tol=0.00001)
#MLPC5 = MLPClassifier(random_state=403829, max_iter=10000, solver='sgd', warm_start=True,activation='logistic',learning_rate='adaptive',shuffle=True, tol=0.00001)
#MLPC6 = MLPClassifier(random_state=403829, max_iter=10000, solver='sgd', warm_start=True,activation='relu',learning_rate='adaptive',shuffle=True, tol=0.00001)
#MLPC7 = MLPClassifier(random_state=403829, max_iter=10000, solver='adam', warm_start=True,activation='tanh',learning_rate='constant',shuffle=True, tol=0.00001)
#MLPC8 = MLPClassifier(random_state=403829, max_iter=10000, solver='adam', warm_start=True,activation='logistic',learning_rate='constant',shuffle=True, tol=0.00001)
#MLPC9 = MLPClassifier(random_state=403829, max_iter=10000, solver='adam', warm_start=True,activation='relu',learning_rate='constant',shuffle=True, tol=0.00001)

# SVC1 = make_pipeline(StandardScaler(), SVC(kernel='rbf', gamma='auto', tol=0.00001,random_state=403829))
# SVC2 = make_pipeline(StandardScaler(), SVC(kernel='sigmoid', gamma='auto', tol=0.00001,random_state=403829))
# SVC_list_classifier = [SVC1, SVC2]
# for deg in range(3,10):
#   SVC_list_classifier.append(make_pipeline(StandardScaler(), SVC(kernel='poly', degree=deg, gamma='auto', tol=0.00001,random_state=403829)))

In [None]:
# Select the model that the training cell below fits.
classifier = MLPC2

In [None]:
#classifier_list = [MLPC2, MLPC3, MLPC6, MLPC8, MLPC9] #55 minutes
# for classifier in classifier_list:
#   rkf = RepeatedKFold(n_splits=250, n_repeats=1, random_state=47283)
#   for ind_train, ind_test in rkf.split(x_features):
#     train_x = x_features[ind_train]
#     train_y = y_labels[ind_train]
#     test_x = x_features[ind_test]
#     test_y = y_labels[ind_test]
#     classifier.fit(train_x, train_y)

In [None]:
# This loop is a leftover from the validation-score evaluation: the held-out fold
# (test_x, test_y) is sliced on every iteration but never scored here.
# Each of the 250 iterations calls fit() on the rows outside one fold, i.e. on
# roughly 249/250 of the training data.
# NOTE(review): MLPC2 is created with warm_start=True, which scikit-learn documents
# as reusing the previous fit's solution as initialization. These therefore look
# like 250 successive rounds of continued training on one model, not 250
# independent fits - confirm that this is intended. A single
# classifier.fit(x_features, y_labels) would train on all rows once.
rkf = RepeatedKFold(n_splits=250, n_repeats=1, random_state=47283)
for ind_train, ind_test in rkf.split(x_features):
  train_x = x_features[ind_train]
  train_y = y_labels[ind_train]
  test_x = x_features[ind_test]
  test_y = y_labels[ind_test]
  classifier.fit(train_x, train_y)

Validation score

In [None]:
# Use these three lines when we split the training data set

#y_predicted_verify = classifier.predict(x_validation)
#verification_score = f1_score(y_validation, y_predicted_verify)
#verification_score

In [None]:
#This demonstrates how we compared each neural net with each other, using the validation score as measure.

#classifier_list = [MLPC2, MLPC3, MLPC6, MLPC8, MLPC9]
# verification_score_list = []
# for classifier in classifier_list:
#   verification_score_list.append(f1_score(y_validation, classifier.predict(x_validation)))

# verification_score_list

Use classifier to predict labels of test data

In [None]:
# Load the unlabeled test sequences.
test_data = pd.read_csv('test.csv')
test_mutation_data = test_data['Sequence']

# Apply the same one-hot encoding as for the training data.
# Series.map is used instead of DataFrame.applymap, which is deprecated since
# pandas 2.1; to_frame() keeps test_mutation_nn_old a one-column ('Sequence') DataFrame.
test_mutation_nn_old = test_mutation_data.map(transform).to_frame()
# Expand each encoded list into a row of 0/1 columns, then convert to the
# NumPy array that predict() is called with.
test_mutation_nn_new = pd.DataFrame(
    test_mutation_nn_old['Sequence'].tolist(), index=test_mutation_nn_old.index
).to_numpy()

In [None]:
# Predict with `classifier`, the object that the training cell actually fitted,
# instead of a hard-coded MLPC2. The two are the same object today, but this way
# changing the model selection cannot silently desynchronise training and prediction.
predicted = classifier.predict(test_mutation_nn_new)
predicted

array([0, 0, 0, ..., 0, 0, 0])

Save prediction into csv file

In [None]:
# Write one predicted label per line; fmt='% 4d' pads each integer to width 4.
np.savetxt(
    fname=r'submission.csv',
    X=predicted,
    fmt='% 4d',
    delimiter=',',
)