In [1]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from pyfm import pylibfm
import pandas as pd
import numpy as np

def getKmers(sequence, size=7):
    """Return every overlapping window of ``size`` characters, lower-cased.

    Windows are taken left to right with a stride of one. When ``sequence``
    is shorter than ``size`` the result is an empty list.
    """
    kmers = []
    window_count = len(sequence) - size + 1
    for start in range(window_count):
        stop = start + size
        kmers.append(sequence[start:stop].lower())
    return kmers

-----

### Data for verification

In [3]:
# Stack the three training parts in file order.
# pd.concat replaces the chained DataFrame.append calls: append was deprecated
# in pandas 1.4 and removed in pandas 2.0; concat builds the same stacked frame.
x_df = pd.concat([pd.read_csv('data/Xtr0.csv'),
                  pd.read_csv('data/Xtr1.csv'),
                  pd.read_csv('data/Xtr2.csv')])

y_df = pd.concat([pd.read_csv('data/Ytr0.csv'),
                  pd.read_csv('data/Ytr1.csv'),
                  pd.read_csv('data/Ytr2.csv')])

# Line sequences up with their labels through the shared 'Id' column.
xy_df = x_df.set_index('Id').join(y_df.set_index('Id'))
# Swap each raw sequence for its list of overlapping k-mers.
xy_df['words'] = xy_df['seq'].apply(getKmers)
xy_df = xy_df.drop('seq', axis=1)

from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer consumes strings, so join each k-mer list into one
# space-separated "sentence".
readable_words = [' '.join(kmers) for kmers in xy_df['words']]
# Features are counts of runs of 4 consecutive k-mers (word 4-grams).
cv = CountVectorizer(ngram_range=(4,4))
X = cv.fit_transform(readable_words)
# First remaining column after dropping 'seq' -- presumably the label column
# that came from the Ytr files (TODO confirm the X files carry only Id/seq).
y = xy_df.iloc[:, 0].values

-----

1) Linear kernel

In [112]:
# Unshuffled 5-fold CV. The seed is left out on purpose: KFold only uses
# random_state when shuffle=True, and scikit-learn >= 0.24 raises a ValueError
# if a seed is passed with shuffle=False. Fold membership is unchanged.
# NOTE(review): X stacks three source files in order, so folds follow file
# order; consider KFold(n_splits=5, shuffle=True, random_state=111) if mixed
# folds are wanted (scores will then differ from earlier runs).
kf = KFold(n_splits=5)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    linearclassifier = SVC(kernel='linear')
    linearclassifier.fit(X_train, y_train)
    y_pred = linearclassifier.predict(X_test).astype(int)

    # Accuracy on the held-out fold.
    print(accuracy_score(y_test, y_pred))

0.5425
0.585
0.635833333333
0.6625
0.63


2) RBF kernel

In [113]:
# Unshuffled 5-fold CV. The seed is left out on purpose: KFold only uses
# random_state when shuffle=True, and scikit-learn >= 0.24 raises a ValueError
# if a seed is passed with shuffle=False. Fold membership is unchanged.
# NOTE(review): pass shuffle=True together with a seed if mixed folds are wanted.
kf = KFold(n_splits=5)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    rbfclassifier = SVC(kernel='rbf')
    rbfclassifier.fit(X_train, y_train)
    y_pred = rbfclassifier.predict(X_test).astype(int)

    # Accuracy on the held-out fold.
    print(accuracy_score(y_test, y_pred))

0.4775
0.498333333333
0.49
0.5
0.510833333333


3) Sigmoid kernel

In [114]:
# Unshuffled 5-fold CV. The seed is left out on purpose: KFold only uses
# random_state when shuffle=True, and scikit-learn >= 0.24 raises a ValueError
# if a seed is passed with shuffle=False. Fold membership is unchanged.
# NOTE(review): pass shuffle=True together with a seed if mixed folds are wanted.
kf = KFold(n_splits=5)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    sigmoidclassifier = SVC(kernel='sigmoid')
    sigmoidclassifier.fit(X_train, y_train)
    y_pred = sigmoidclassifier.predict(X_test).astype(int)

    # Accuracy on the held-out fold.
    print(accuracy_score(y_test, y_pred))

0.4775
0.498333333333
0.49
0.5
0.510833333333


4) Polynomial kernel

In [115]:
# Unshuffled 5-fold CV. The seed is left out on purpose: KFold only uses
# random_state when shuffle=True, and scikit-learn >= 0.24 raises a ValueError
# if a seed is passed with shuffle=False. Fold membership is unchanged.
# NOTE(review): pass shuffle=True together with a seed if mixed folds are wanted.
kf = KFold(n_splits=5)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    polyclassifier = SVC(kernel='poly')
    polyclassifier.fit(X_train, y_train)
    y_pred = polyclassifier.predict(X_test).astype(int)

    # Accuracy on the held-out fold.
    print(accuracy_score(y_test, y_pred))

0.4775
0.498333333333
0.49
0.5
0.510833333333


5) KNN

In [116]:
# Unshuffled 5-fold CV. The seed is left out on purpose: KFold only uses
# random_state when shuffle=True, and scikit-learn >= 0.24 raises a ValueError
# if a seed is passed with shuffle=False. Fold membership is unchanged.
# NOTE(review): pass shuffle=True together with a seed if mixed folds are wanted.
kf = KFold(n_splits=5)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Majority vote among the 10 nearest training rows.
    knnclassifier = KNeighborsClassifier(n_neighbors=10)
    knnclassifier.fit(X_train, y_train)
    y_pred = knnclassifier.predict(X_test).astype(int)

    # Accuracy on the held-out fold.
    print(accuracy_score(y_test, y_pred))

0.558333333333
0.5475
0.5775
0.614166666667
0.591666666667


6) Spectrum kernel

In [None]:
from sklearn.preprocessing import normalize

def normK(X, Y):
    """Gram matrix of dot products between L2-normalised rows of X and Y.

    Each row of ``X`` and ``Y`` is scaled to unit L2 norm, so entry ``(i, j)``
    of the result is the cosine similarity between row ``i`` of ``X`` and
    row ``j`` of ``Y``.

    Parameters
    ----------
    X, Y : array-like or scipy sparse matrix with the same number of columns.

    Returns
    -------
    numpy.ndarray of shape ``(X.shape[0], Y.shape[0])``.
    """
    # Local import keeps this cell self-contained.
    from sklearn.utils.extmath import safe_sparse_dot

    X_n = normalize(X, norm='l2', axis=1)
    Y_n = normalize(Y, norm='l2', axis=1)
    # np.dot is not sparse-aware and only works on scipy matrices by accident;
    # safe_sparse_dot multiplies dense and sparse inputs correctly, and
    # dense_output=True hands the caller a plain ndarray.
    return safe_sparse_dot(X_n, Y_n.T, dense_output=True)

# Unshuffled 10-fold CV. The seed is left out on purpose: KFold only uses
# random_state when shuffle=True, and scikit-learn >= 0.24 raises a ValueError
# if a seed is passed with shuffle=False. Fold membership is unchanged.
# NOTE(review): pass shuffle=True together with a seed if mixed folds are wanted.
kf = KFold(n_splits=10)

# Sweep the SVM regularisation strength C; one mean accuracy is printed per
# value, in the order listed here.
for i in [0.1,0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.5,2.0]:
    res = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # SVC accepts a callable kernel and calls it to build the Gram matrix.
        normlassifier = SVC(C=i, kernel=normK)
        normlassifier.fit(X_train, y_train)
        y_pred = normlassifier.predict(X_test).astype(int)

        res.append(accuracy_score(y_test, y_pred))
    # Mean accuracy over the 10 folds for this C.
    print(np.mean(res))

7) Factorization machine

In [None]:
# Unshuffled 5-fold CV. The seed is left out on purpose: KFold only uses
# random_state when shuffle=True, and scikit-learn >= 0.24 raises a ValueError
# if a seed is passed with shuffle=False. Fold membership is unchanged.
# NOTE(review): pass shuffle=True together with a seed if mixed folds are wanted.
kf = KFold(n_splits=5)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    fm = pylibfm.FM(num_factors=2000, num_iter=5,
                    verbose=True, task="classification",
                    initial_learning_rate=0.0001,
                    learning_rate_schedule="optimal")
    fm.fit(X_train.astype(float), y_train)

    # Predict on the same float dtype the model was fitted with; the count
    # matrix is otherwise passed through un-cast.
    y_pred__ = fm.predict(X_test.astype(float))
    # Turn scores into 0/1 labels with a 0.5 cut-off. The earlier code ran the
    # scores through the private fm._prepare_y helper, which is meant for
    # training labels and sends every value that is not exactly 1 to class 0.
    # NOTE(review): assumes predict() returns class-1 probabilities for
    # task="classification", as the pyFM README example does -- confirm.
    y_pred = (np.asarray(y_pred__) >= 0.5).astype(int)

    # Accuracy on the held-out fold.
    print(accuracy_score(y_test, y_pred))

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training log loss: 11.50837
-- Epoch 2
Training log loss: 4.55398
-- Epoch 3
Training log loss: 1.53290
-- Epoch 4
Training log loss: 0.42346
-- Epoch 5


------

### Data for submission

In [2]:
# Training and test sequences, each stacked from three CSV parts in file order.
# pd.concat replaces the chained DataFrame.append calls: append was deprecated
# in pandas 1.4 and removed in pandas 2.0; concat builds the same stacked frame.
x_df_train = pd.concat([pd.read_csv('data/Xtr0.csv'),
                        pd.read_csv('data/Xtr1.csv'),
                        pd.read_csv('data/Xtr2.csv')])
x_df_test = pd.concat([pd.read_csv('data/Xte0.csv'),
                       pd.read_csv('data/Xte1.csv'),
                       pd.read_csv('data/Xte2.csv')])
# Offset test Ids by 6000 -- presumably so they do not collide with the
# training Ids once both sets share one index (TODO confirm Id ranges).
x_df_test['Id'] = x_df_test['Id'] + 6000
x_df = pd.concat([x_df_train, x_df_test])

y_df_train = pd.concat([pd.read_csv('data/Ytr0.csv'),
                        pd.read_csv('data/Ytr1.csv'),
                        pd.read_csv('data/Ytr2.csv')])
# Optional extra labels for the test rows (disabled):
# y_df_test = pd.read_csv('data/better.csv')
# y_df_test['Id'] = y_df_test['Id'].apply(lambda x:x + 6000)
# y_df_test['Bound'] = y_df_test['Bound'].astype(float)
y_df = y_df_train  # to include the extra labels: pd.concat([y_df_train, y_df_test])

# Left join on Id: rows without a label get NaN in the label column.
xy_df = x_df.set_index('Id').join(y_df.set_index('Id'))

sequences = xy_df['seq'].values.astype(str)

# Swap each raw sequence for its list of overlapping k-mers.
xy_df['words'] = xy_df['seq'].apply(getKmers)
xy_df = xy_df.drop('seq', axis=1)
# Ids of the rows that carry a label.
indices_train = xy_df.dropna().index.tolist()

# Restrict training to the first 6000 labelled rows. Comment this line out to
# also train on the additional labelled rows when y_df_test is enabled above.
indices_train = indices_train[0:6000]

# NOTE(review): indices_train holds Id labels but is used below as positional
# indices; this is only correct while Ids equal row positions -- confirm.
sequences_train = np.array(sequences[indices_train].tolist())
sequences_test = np.array(sequences[6000:9000].tolist())

from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer consumes strings, so join each k-mer list into one
# space-separated "sentence".
readable_words = [' '.join(kmers) for kmers in xy_df['words']]
# The vocabulary is fitted on training and test rows together; features are
# counts of runs of 4 consecutive k-mers (word 4-grams).
cv = CountVectorizer(ngram_range=(4,4))
X = cv.fit_transform(readable_words)

X_train = X[indices_train]
X_test = X[6000:9000]
# First remaining column after dropping 'seq' -- presumably the label column.
y_train = xy_df.iloc[:, 0].values[indices_train].astype(int)
# y_test = y_df_test.iloc[:, 0].values.astype(int)

-----------

1) Linear kernel

In [23]:
# Train a linear-kernel SVM on the training matrix, then label the test rows.
linearclassifier = SVC(kernel='linear').fit(X_train, y_train)
y_pred = linearclassifier.predict(X_test)
y_pred = y_pred.astype(int)

In [106]:
# Submission table: one row per test sequence, indexed by Id 0..2999.
dataset = pd.DataFrame({'Id' : range(3000), 'Bound' : y_pred})
dataset = dataset.set_index('Id')
dataset.to_csv('res/pred_kmer_linear.csv', index=True)

2) KNN

In [60]:
# Train a 10-nearest-neighbour classifier, then label the test rows.
knnclassifier = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
y_pred = knnclassifier.predict(X_test)
y_pred = y_pred.astype(int)

In [106]:
# Submission table: one row per test sequence, indexed by Id 0..2999.
dataset = pd.DataFrame({'Id' : range(3000), 'Bound' : y_pred})
dataset = dataset.set_index('Id')
dataset.to_csv('res/pred_kmer_knn.csv', index=True)

3) Spectrum kernel

In [None]:
from sklearn.preprocessing import normalize

def normK(X, Y):
    """Gram matrix of dot products between L2-normalised rows of X and Y.

    Each row of ``X`` and ``Y`` is scaled to unit L2 norm, so entry ``(i, j)``
    of the result is the cosine similarity between row ``i`` of ``X`` and
    row ``j`` of ``Y``.

    Parameters
    ----------
    X, Y : array-like or scipy sparse matrix with the same number of columns.

    Returns
    -------
    numpy.ndarray of shape ``(X.shape[0], Y.shape[0])``.
    """
    # Local import keeps this cell self-contained.
    from sklearn.utils.extmath import safe_sparse_dot

    X_n = normalize(X, norm='l2', axis=1)
    Y_n = normalize(Y, norm='l2', axis=1)
    # np.dot is not sparse-aware and only works on scipy matrices by accident;
    # safe_sparse_dot multiplies dense and sparse inputs correctly, and
    # dense_output=True hands the caller a plain ndarray.
    return safe_sparse_dot(X_n, Y_n.T, dense_output=True)

# Train an SVM that uses normK as its callable kernel (C=0.9), then label the
# test rows.
normlassifier = SVC(C=0.9, kernel=normK).fit(X_train, y_train)
y_pred = normlassifier.predict(X_test)
y_pred = y_pred.astype(int)

In [None]:
# Submission table: one row per test sequence, indexed by Id 0..2999.
dataset = pd.DataFrame({'Id' : range(3000), 'Bound' : y_pred})
dataset = dataset.set_index('Id')
dataset.to_csv('res/pred_kmer_spectrum.csv', index=True)

4) Factorization machine

In [None]:
fm = pylibfm.FM(num_factors=2000, num_iter=5, verbose=True,
                task="classification", initial_learning_rate=0.0001,
                learning_rate_schedule="optimal")
fm.fit(X_train.astype(float), y_train)

# Predict on the same float dtype the model was fitted with; the count matrix
# is otherwise passed through un-cast.
y_pred__ = fm.predict(X_test.astype(float))
# Turn scores into 0/1 labels with a 0.5 cut-off. The earlier code ran the
# scores through the private fm._prepare_y helper, which is meant for training
# labels and sends every value that is not exactly 1 to class 0.
# NOTE(review): assumes predict() returns class-1 probabilities for
# task="classification", as the pyFM README example does -- confirm.
y_pred = (np.asarray(y_pred__) >= 0.5).astype(int)

# No accuracy is reported here: the submission data cell never defines y_test
# (its assignment is commented out), so scoring would either raise a NameError
# or compare against a stale cross-validation fold of a different length.

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1


In [None]:
# Submission table: one row per test sequence, indexed by Id 0..2999.
dataset = pd.DataFrame({'Id' : range(3000), 'Bound' : y_pred})
dataset = dataset.set_index('Id')
dataset.to_csv('res/pred_kmer_factor.csv', index=True)