In [1]:
!pip install optuna -q

In [2]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

import optuna

In [3]:
X_test_mat100 = pd.read_csv('../data/Xte_mat100.csv',sep=' ',header=None).values
X_train_mat100 = pd.read_csv('../data/Xtr_mat100.csv',sep=' ',header=None).values

X_test_ = pd.read_csv('../data/Xte.csv',sep=',',index_col=0)
X_train_ = pd.read_csv('../data/Xtr.csv',sep=',',index_col=0)

y = pd.read_csv('../data/Ytr.csv',sep=',',index_col=0)

train_data = pd.concat([X_train_ , y],axis=1)

In [4]:
print('x_train: {} y_train {}'.format(X_train_mat100.shape,y.shape))
print('x_test: {}'.format(X_test_mat100.shape))


x_train: (2000, 100) y_train (2000, 1)
x_test: (1000, 100)


In [5]:
def get_train_test(X,y,p):
    X = scale(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=p, random_state=42)
    print(X_train.shape,X_test.shape,y_train.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

### Model Test

In [6]:
class logisticregression():
    def __init__(self,train_data,train_labels,lamda=0.2,lr=0.01,decay=10,batch_size=None,epoch=10,print_every = 10):
        dummy_once = np.ones((len(train_data),1))
        self.train_data = np.hstack((dummy_once,train_data))
        self.train_labels = train_labels
        
        self.params = np.zeros((len(self.train_data[0]),1))
        
        self.lr = lr
        self.epoch = epoch
        self.batch_size = batch_size
        self.print_every = print_every
        self._lambda = lamda
        self.decay = decay
        
    def sigmoid(self,x):
        return 1/(1+np.exp(-x))
    
    def cost(self,y,y_pred):
        return -np.mean(y*np.log(y_pred)+(1-y)*np.log(1-y_pred))
    
    def gradient(self,y,y_pred,x):
        hassien = np.dot(y_pred.T,(1-y_pred))*np.linalg.pinv(np.dot(x.T,x))
        return np.dot(hassien,np.dot(x.T,(y_pred-y)))+(2*(self._lambda/len(y_pred))*self.params)
#         hassien = np.dot(y_pred.T,(1-y_pred))*np.linalg.inv(np.dot(x.T,x))
#         return np.dot(x.T,(y_pred-y))+(2*(self._lambda/len(y_pred))*self.params)
    
    def train(self):
        for i in range(self.epoch):
            y_pred = self.sigmoid(np.dot(self.train_data,self.params))
            loss = self.cost(self.train_labels,y_pred)
            
            gra = self.gradient(self.train_labels,y_pred,self.train_data)
            self.params -= self.lr*gra
            
            self.lr *= (1. / (1. + self.decay * i))
            
            if self.print_every:
                if i%self.print_every == 0 or i == self.epoch-1:
                    print('Epoch : {}  Loss: {}'.format(i,loss))
    def predict(self,test_data):
        result = self.sigmoid(np.dot(test_data,self.params[1:])+self.params[0])
        result[result > 0.5 ] = 1
        result[result <= 0.5 ] = 0
        return result
    
    def evaluate(self,test_data,labels):
        accuracy = accuracy_score(self.predict(test_data),labels)
        return accuracy

In [7]:
def cross_validate(x_data,y_data,lr,lamda=0.2,epoch=10,k=5,decay=10):
    if len(x_data)%k != 0:
        print('cant vsplit',len(x_data),' by ',k)
        return
    
    x_data_splitted = np.vsplit(x_data,k)
    y_data_splitted = np.vsplit(y_data,k)
    
    aggrigate_result = []
    for i in range(len(x_data_splitted)):
        train = []
        test = []
        items = [j for j in range(len(x_data_splitted)) if j !=i ]
        x_test = x_data_splitted[i]
        y_test = y_data_splitted[i]
        for item in items:
            if len(train) == 0:
                x_train = x_data_splitted[item]
                y_train = y_data_splitted[item]
            else:
                x_train = np.concatenate((x_train,x_data_splitted[item]), axis=0)
                y_train = np.concatenate((y_train,y_data_splitted[item]), axis=0)
        
        logistic = logisticregression(x_train,y_train,lamda=lamda,lr=lr,decay=decay,epoch=epoch,print_every=None)
        logistic.train()
        
        result = logistic.evaluate(x_test,y_test)
        aggrigate_result.append(result)
        
        value = sum(aggrigate_result)/len(aggrigate_result)
    return value

# Optimize

In [8]:
def getKmers(sequence, size=6):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]

train_data['words'] = train_data.seq.apply(lambda x: ' '.join(getKmers(x)))
X_test_['words'] = X_test_.seq.apply(lambda x: ' '.join(getKmers(x)))
train_data.head(2)

Unnamed: 0_level_0,seq,Bound,words
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,GAGGGGCTGGGGAGGGGGCTGGCCCAGAGGCACCAGACTCTGCAGA...,1,gagggg aggggc ggggct gggctg ggctgg gctggg ctgg...
1,CGGCCTGGGGGCCACATGTGAGTGCTTACCTGTGTGGGGATGAGGG...,0,cggcct ggcctg gcctgg cctggg ctgggg tggggg gggg...


In [9]:
from sklearn.feature_extraction.text import CountVectorizer


data = pd.DataFrame(pd.concat([train_data.words,X_test_.words],axis=0))

train_text = data.words.values

cv = CountVectorizer(ngram_range=(4,4),max_features=400)
X = cv.fit_transform(train_text)
X = X.todense()


X.shape

(3000, 400)

In [10]:
cross_validate(np.array(X)[:20,:],y[:20],k=5,lr=0.001,lamda=0.003,epoch=10)

0.4

In [None]:
def objective(trial):
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
    lamda = trial.suggest_loguniform('lamda', 1e-5, 10)
    k =  trial.suggest_categorical('k', [4,5,8,10])
    epoch =  trial.suggest_int('epoch', 100, 300)
    decay = trial.suggest_int('decay', 3, 10)
    return cross_validate(np.array(X)[:2000,:], y,lr=lr,lamda=lamda,k=k,epoch=epoch,decay=decay)
# cross_validate(X_preprocess, y.reshape(-1,1),lr=0.001,epoch=200)

import optuna

sampler = optuna.samplers.TPESampler()
study = optuna.create_study(sampler=sampler, direction='maximize')
study.optimize(func=objective, n_trials=100,show_progress_bar=True)



HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[I 2020-05-27 23:16:58,214] Finished trial#0 with value: 0.5745 with parameters: {'lr': 7.93379749271999e-05, 'lamda': 1.8136353467385963e-05, 'k': 8, 'epoch': 189, 'decay': 5}. Best is trial#0 with value: 0.5745.
[I 2020-05-27 23:18:59,189] Finished trial#1 with value: 0.5825 with parameters: {'lr': 5.8699167086354994e-05, 'lamda': 0.0039173605824079655, 'k': 10, 'epoch': 228, 'decay': 6}. Best is trial#1 with value: 0.5825.
[I 2020-05-27 23:19:56,018] Finished trial#2 with value: 0.5505 with parameters: {'lr': 0.0005422218357238577, 'lamda': 1.1851852136491932, 'k': 5, 'epoch': 189, 'decay': 6}. Best is trial#1 with value: 0.5825.
[I 2020-05-27 23:21:06,645] Finished trial#3 with value: 0.576 with parameters: {'lr': 0.08736701493309758, 'lamda': 0.5245448779317791, 'k': 8, 'epoch': 162, 'decay': 7}. Best is trial#1 with value: 0.5825.
[I 2020-05-27 23:22:01,638] Finished trial#4 with value: 0.5820000000000001 with parameters: {'lr': 0.00654685336266567, 'lamda': 0.30469155792283187, 

In [90]:
df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)
df.sort_values(by=['value'])

Unnamed: 0,number,value,duration,params_decay,params_epoch,params_k,params_lamda,params_lr
8,8,0.5000,00:00:01.543956,10,805,8,0.000134,0.094504
0,0,0.5505,00:00:00.687026,10,950,4,0.060581,0.025797
30,30,0.5980,00:00:01.211547,8,675,8,0.000034,0.001577
86,86,0.6010,00:00:01.273208,4,514,8,0.000736,0.001310
4,4,0.6090,00:00:00.670439,6,536,4,0.002069,0.017855
...,...,...,...,...,...,...,...,...
42,42,0.7005,00:00:00.880267,5,610,5,0.000116,0.001898
87,87,0.7015,00:00:00.574556,6,630,5,0.000028,0.001995
61,61,0.7015,00:00:00.858273,5,607,5,0.000043,0.001978
41,41,0.7020,00:00:00.897417,5,606,5,0.000138,0.001953


In [50]:
lamda=0.000168,lr=0.029734,decay=4,epoch=700,print_every=None


list_in = np.array(list(X_train_.flatten())+list(X_test_.flatten()))
list_in.astype(type(X_train_))
list_in = list_in.reshape(-1,1)
list_in.shape

(3000, 1)

In [48]:
#DNA sequence as a “language”, known as k-mer counting
def getKmers(sequence, size):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]
def get_n_grams(data1,n):
    X_train = []
    X_test = []

    cv = CountVectorizer(analyzer='char',ngram_range=(n,n))
    for i in data1:
        sentence = ' '.join(getKmers(i[0], size=n))
        X_train.append(sentence)
        
    X_cocat = X_train
    X = cv.fit_transform(X_cocat).toarray()
    return X

X_preprocess = get_n_grams(X_train_,7)

# def objective(trial):
#     lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
#     lamda = trial.suggest_loguniform('lamda', 0.01, 0.5)
#     k =  trial.suggest_categorical('k', [4,5,8,10])
#     epoch =  trial.suggest_int('epoch', 10, 20)
#     decay = trial.suggest_int('decay', 3, 10)
#     return cross_validate(X_preprocess[:2000,:], y,lr=lr,lamda=lamda,k=k,epoch=epoch,decay=decay)

cross_validate(X_preprocess[:2000], y,0.001,20,k=5,epoch=10,decay=10)

# import optuna

# sampler = optuna.samplers.TPESampler()
# study = optuna.create_study(sampler=sampler, direction='maximize')
# study.optimize(func=objective, n_trials=100,show_progress_bar=True)

0.924

In [13]:
# df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)
# df.sort_values(by=['value'])

In [91]:
# # Count Vectorizer 
# def get_count_grams(data1,n):
#     cv = CountVectorizer(analyzer='char',ngram_range=(n,n))
#     X = cv.fit_transform(data1).toarray()
#     return X

# X_preprocess = get_n_grams(X_train_.flatten(),8)

# # def objective(trial):
# #     lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
# #     lamda = trial.suggest_loguniform('lamda', 0.01, 0.5)
# #     k =  trial.suggest_categorical('k', [4,5,8,10])
# #     epoch =  trial.suggest_int('epoch', 10, 20)
# #     decay = trial.suggest_int('decay', 3, 10)
# #     return cross_validate(X_preprocess, y,lr=lr,lamda=lamda,k=k,epoch=epoch,decay=decay)

# cross_validate(X_preprocess[:2000], y,0.0001,20,k=5,epoch=10,decay=10)

# # import optuna

# # sampler = optuna.samplers.TPESampler()
# # study = optuna.create_study(sampler=sampler, direction='maximize')
# # study.optimize(func=objective, n_trials=200,show_progress_bar=True)

In [201]:
# df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)
# df.sort_values(by=['value'])

In [202]:
# cross_validate(X_preprocess, y,lr=0.004433,lamda=0.432127,k=4,epoch=16,decay=4)

In [66]:
# Count Vectorizer 
def get_tf_idf_grams(data1,n):
    cv = TfidfVectorizer(analyzer='char',ngram_range=(n,n))
    X = cv.fit_transform(data1).toarray()
    return X

X_preprocess = get_tf_idf_grams(X_train_.flatten(),8)

# def objective(trial):
#     lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
#     lamda = trial.suggest_loguniform('lamda', 0.01, 0.5)
#     k =  trial.suggest_categorical('k', [4,5,8,10])
#     epoch =  trial.suggest_int('epoch', 10, 20)
#     decay = trial.suggest_int('decay', 3, 10)
#     return cross_validate(X_preprocess, y,lr=lr,lamda=lamda,k=k,epoch=epoch,decay=decay)

cross_validate(X_preprocess[:2000], y,0.001,20,k=5,epoch=10,decay=10)

# import optuna

# sampler = optuna.samplers.TPESampler()
# study = optuna.create_study(sampler=sampler, direction='maximize')
# study.optimize(func=objective, n_trials=100,show_progress_bar=True)

0.5

In [213]:
# df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)
# df.sort_values(by=['value'])

# After testing all possible dataset preprocessing type now lets stick to one

In [None]:
X_train, X_test, y_train, y_test = get_train_test(X_preprocess[:2000],y,0.3)


In [112]:
# X_preprocess = get_n_grams(X_train_,8)
# X_preprocess.shape

# print(cross_validate(X_preprocess[:2000,:], y,lr=0.001,lamda=15,k=4,epoch=16,decay=10))

X_train, X_test, y_train, y_test = get_train_test(XXX,y,0.01)

# y,0.001,15,k=5,epoch=10,decay=10)

logistic = logisticregression(X_train,y_train,lamda=0.000168,lr=0.029734,decay=4,epoch=700,print_every=None)
logistic.train()
        
print(logistic.evaluate(X_train,y_train))
print(logistic.evaluate(X_test,y_test))
cross_validate(XXX, y,lamda=0.000168,lr=0.029734,decay=4,epoch=700)

(1980, 50) (20, 50) (1980, 1) (20, 1)




0.5818181818181818
0.65


[0.7075, 0.7075, 0.7075, 0.7075, 0.69]

In [235]:
X_preprocess = get_count_grams(np.vstack((X_train_,X_test_)).flatten(),6)
X_preprocess.shape

cross_validate(X_preprocess[:2000,:], y,lr=0.004433,lamda=0.432127,k=4,epoch=16,decay=4)

C_count_6 = X_preprocess[2000:,:]

X_train, X_test, y_train, y_test = get_train_test(X_preprocess[:2000,:],y,0.3)


logistic_count6 = logisticregression(X_train,y_train,lamda=0.455265,epoch=10,print_every=1,lr=0.000407,decay=11)
logistic_count6.train()
        
print(logistic_count6.evaluate(X_train,y_train))
print(logistic_count6.evaluate(X_test,y_test))

(1400, 4096) (600, 4096) (1400, 1) (600, 1)
Epoch : 0  Loss: 0.6931471805599454
Epoch : 1  Loss: 0.369619629360289
Epoch : 2  Loss: 0.24093457515641262
Epoch : 3  Loss: 0.23515736618853034
Epoch : 4  Loss: 0.2349297795537864
Epoch : 5  Loss: 0.23492310904273742
Epoch : 6  Loss: 0.23492296082415293
Epoch : 7  Loss: 0.23492295817739842
Epoch : 8  Loss: 0.23492295813789463
Epoch : 9  Loss: 0.2349229581373882
0.9664285714285714
0.6433333333333333


In [236]:
X_preprocess = get_count_grams(np.vstack((X_train_,X_test_)).flatten(),4)
X_preprocess.shape

print(cross_validate(X_preprocess[:2000,:], y,lr=0.004433,lamda=0.432127,k=4,epoch=16,decay=7))

C_count_4 = X_preprocess[2000:,:]

X_train, X_test, y_train, y_test = get_train_test(X_preprocess[:2000,:],y,0.3)


logistic_count4 = logisticregression(X_train,y_train,lamda=0.455265,epoch=10,print_every=1,lr=0.000407,decay=11)
logistic_count4.train()
        
print(logistic_count4.evaluate(X_train,y_train))
print(logistic_count4.evaluate(X_test,y_test))

(1400, 256) (600, 256) (1400, 1) (600, 1)
Epoch : 0  Loss: 0.6931471805599454
Epoch : 1  Loss: 0.6286425932017687
Epoch : 2  Loss: 0.6042153578172307
Epoch : 3  Loss: 0.6027152714606171
Epoch : 4  Loss: 0.6026532220481413
Epoch : 5  Loss: 0.6026514005159469
Epoch : 6  Loss: 0.6026513600396904
Epoch : 7  Loss: 0.6026513593169011
Epoch : 8  Loss: 0.6026513593061132
Epoch : 9  Loss: 0.6026513593059748
0.6907142857142857
0.59



divide by zero encountered in log


invalid value encountered in multiply



In [237]:
X_preprocess = get_tf_idf_grams(np.vstack((X_train_,X_test_)).flatten(),4)
X_preprocess.shape

cross_validate(X_preprocess[:2000,:], y,lr=0.004433,lamda=0.432127,k=4,epoch=16,decay=4)

C_tf_4 = X_preprocess[2000:,:]


X_train, X_test, y_train, y_test = get_train_test(X_preprocess[:2000,:],y,0.3)


logistic_tf4 = logisticregression(X_train,y_train,lamda=0.455265,epoch=10,print_every=1,lr=0.000407,decay=11)
logistic_tf4.train()
        
print(logistic_tf4.evaluate(X_train,y_train))
print(logistic_tf4.evaluate(X_test,y_test))

(1400, 256) (600, 256) (1400, 1) (600, 1)
Epoch : 0  Loss: 0.6931471805599454
Epoch : 1  Loss: 0.6260173199159424
Epoch : 2  Loss: 0.6002614260404124
Epoch : 3  Loss: 0.598726394674755
Epoch : 4  Loss: 0.5986622185693936
Epoch : 5  Loss: 0.5986603339307318
Epoch : 6  Loss: 0.5986602920517511
Epoch : 7  Loss: 0.598660291303913
Epoch : 8  Loss: 0.5986602912927511
Epoch : 9  Loss: 0.5986602912926081
0.7028571428571428
0.5966666666666667


In [238]:
X_preprocess = get_tf_idf_grams(np.vstack((X_train_,X_test_)).flatten(),6)
X_preprocess.shape

cross_validate(X_preprocess[:2000,:], y,lr=0.004433,lamda=0.432127,k=4,epoch=16,decay=7)

C_tf_6 = X_preprocess[2000:,:]

X_train, X_test, y_train, y_test = get_train_test(X_preprocess[:2000,:],y,0.3)


logistic_tf6 = logisticregression(X_train,y_train,lamda=0.455265,epoch=10,print_every=1,lr=0.000407,decay=11)
logistic_tf6.train()
        
print(logistic_tf6.evaluate(X_train,y_train))
print(logistic_tf6.evaluate(X_test,y_test))

(1400, 4096) (600, 4096) (1400, 1) (600, 1)
Epoch : 0  Loss: 0.6931471805599454
Epoch : 1  Loss: 0.3746954683721624
Epoch : 2  Loss: 0.23994925474338094
Epoch : 3  Loss: 0.23422424550847226
Epoch : 4  Loss: 0.23399751150543982
Epoch : 5  Loss: 0.23399086485544413
Epoch : 6  Loss: 0.23399071716632397
Epoch : 7  Loss: 0.2339907145290239
Epoch : 8  Loss: 0.2339907144896612
Epoch : 9  Loss: 0.23399071448915654
0.9635714285714285
0.6433333333333333


In [15]:
X_test_final = scale(X_preprocess[2000:])

In [17]:
from collections import Counter


sumbission = []
for i in range(len(X_test_final)):
    r1 = logistic.predict(X_test_final[i])
#     r2 = logistic_count6.predict(C_count_6[i])
#     r3 = logistic_tf4.predict(C_tf_4[i])
#     r4 = logistic_tf6.predict(C_tf_6[i])
    
    
#     votes = [r1[0],r2[0],r3[0],r4[0]]
    
#     print(Counter(votes))
#     print(Counter(votes).most_common(1)[0][0])
    
#     break
#     sumbission.append([i,int(Counter(votes).most_common(1)[0][0])])
    
    sumbission.append([i,int(r1)])

In [20]:
# sumbission
df = pd.DataFrame(sumbission)
df.columns = ['Id','Bound']
df.to_csv('cv_64.9.csv',index=False)

In [19]:
df.head(50)

Unnamed: 0,Id,Bound
0,0,1
1,1,1
2,2,0
3,3,0
4,4,1
5,5,1
6,6,1
7,7,1
8,8,1
9,9,1
