# Imports

In [135]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import string
import re
from Bio.SeqUtils.ProtParam import ProteinAnalysis

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Loading files

In [136]:
X_train=pd.read_csv('./data/Xtr.csv', sep=',') #we use this dataset to train our model
Y_train=pd.read_csv('./data/Ytr.csv', sep=',') #we use this dataset to train our model
X_test=pd.read_csv('./data/Xte.csv', sep=',') #we will use this data set later to validate our model

X_train_mat=pd.read_csv('./data/Xtr_mat100.csv', sep=',') #we use this dataset to train our model
X_test_mat=pd.read_csv('./data/Xte_mat100.csv', sep=',') #we will use this data set later to validate our model

In [137]:
X_train['seq'][0]

'GAGGGGCTGGGGAGGGGGCTGGCCCAGAGGCACCAGACTCTGCAGAACCACCCAGGCATTGTGGGGCTGCCCTGCCACCTGCTGGCCGCTCCTGGTGGCAG'

### Loading preprocessed data
Since data preprocessing takes time we have done some preprocessing and store the preprocessed data.

Those preprocessing are:
- Characters to ord numbers
- Bio sequency parameters (molecular_weight,	gravity,	iso_electric_point,	instability_index,	molar_extinction_coefficient,	secondary_structure_fraction)

In [138]:
X_train_preprocessed=pd.read_csv('./data/train_data_preprocessing1.csv', sep=',') #we use this dataset to train our model
Y_train=pd.read_csv('./data/Ytr.csv', sep=',') #we use this dataset to train our model
X_test_preprocessed=pd.read_csv('./data/test_data_preprocessing1.csv', sep=',')

In [139]:
print('The shape of the X_train dataset is:',X_train.shape)
print('The shape of the Y_train dataset is:',Y_train.shape)

The shape of the X_train dataset is: (2000, 2)
The shape of the Y_train dataset is: (2000, 2)


# Data preprocessing

### 1. Converting characters to numbers

In [140]:
Alphabet_dict = dict(zip(string.ascii_uppercase, range(1,27)))
for i in range(101):
    X_train['seq_'+str(i)] = X_train.seq.apply(lambda x :Alphabet_dict[x[i]])
    X_test['seq_'+str(i)] = X_test.seq.apply(lambda x :Alphabet_dict[x[i]])

### 2. Getting DNA parameters by using bio-sequence

In [141]:
def get_DNA_parameters(data):
    """
    This function takes a dataframe that containes a column(seq) of DNA sequences
    It computer parameters related to all sequence and append those features to the input datafram
    @ input : DataFrame
    @ output : DataFrame
    """
    
    data = data
    cols = ['molecular_weight','gravity','iso_electric_point',
            'instability_index','molar_extinction_coefficient',
           'secondary_structure_fraction']

    for name in cols:
        data[name] = None

    for ind in range(len(data)):
        seq = data.iloc[ind]['seq']
        seq = ProteinAnalysis(seq)
        data[cols[0]][ind] = seq.molecular_weight()
        data[cols[1]][ind] = seq.gravy()
        data[cols[2]][ind] = seq.isoelectric_point()
        data[cols[3]][ind] = seq.instability_index()
        data[cols[4]][ind] = np.mean(seq.molar_extinction_coefficient())
        data[cols[5]][ind] = np.mean(seq.secondary_structure_fraction())
    
    return data

In [142]:
# X_test_ = get_DNA_parameters(X_test)
# X_train_ = get_DNA_parameters(X_train)

### 3. function to convert a DNA sequence string to a numpy array

In [143]:
# converts to lower case, changes any non 'acgt' characters to 'n'
def string_to_array(my_string):
    my_string = my_string.lower()
    my_string = re.sub('[^acgt]', 'z', my_string)
    my_array = np.array(list(my_string))
    return my_array

### 4. DNA sequence string as an ordinal vector

In [144]:
label_encoder = LabelEncoder()
label_encoder.fit(np.array(['a','c','g','t','z']))
def ordinal_encoder(my_array):
    integer_encoded = label_encoder.transform(my_array)
    float_encoded = integer_encoded.astype(float)
    float_encoded[float_encoded == 0] = 0.25 # A
    float_encoded[float_encoded == 1] = 0.50 # C
    float_encoded[float_encoded == 2] = 0.75 # G
    float_encoded[float_encoded == 3] = 1.00 # T
    float_encoded[float_encoded == 4] = 0.00 # anything else, z
    return float_encoded

### 5. Function to one-hot encode a DNA sequence string

In [145]:
def one_hot_encoder(my_array):
    integer_encoded = label_encoder.transform(my_array)
    onehot_encoder = OneHotEncoder(sparse=False, dtype=int, n_values=5)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    onehot_encoded = np.delete(onehot_encoded, -1, 1)
    return onehot_encoded

### 6. Afunction to give required representations

In [163]:
def DNA_represent(data,mode = 'ordinal'):
    data = data
    cols = ['seq'+str(i) for i in range(101)]
    X = []
    for ind in range(len(data)):
        seq = data.iloc[ind]['seq']
        seq = string_to_array(seq)
        if mode == 'ordinal':
            seq_arr = ordinal_encoder(seq)
       
        if mode == 'onehot':
            seq_arr = one_hot_encoder(seq)
        X.append(seq_arr)
    X = np.array(X)
    X = pd.DataFrame(data=X,columns=cols)
    X = pd.concat([data,X],axis=1,sort=False)
    return X
        

In [164]:
#X_test_ = DNA_represent(X_test)

In [165]:
#X_test_.to_csv('./data/test_data_preprocessing2.csv',index=False)

In [166]:
#X_train_ = DNA_represent(X_train)

In [167]:
#X_train_.to_csv('./data/train_data_preprocessing2.csv',index=False)

In [172]:
X_train_ = DNA_represent(X_train,mode='ordinal')
X_test_ = DNA_represent(X_test)

## What about counting number of different caracters 

In [76]:
#X = X_train.drop(['seq', 'Id'], axis=1)
#X_t = X_test.drop(['seq', 'Id'], axis=1)
y = Y_train.Bound

## Models

In [77]:
# Ridge Regression (RR)

class solveRR():
    def __init__(self, X, y, lam=0.1):
        self.beta = None
        self.X = X
        self.y = y
        self.lam = lam
            
    def fit(self):
        
        X = self.X
        y = self.y
        lam = self.lam 
        
        n, p = X.shape
        assert (len(y) == n)

        A = (X.T.dot(X)) + np.eye(p)*lam*n
        b = X.T.dot(y)
        
        self.beta = np.linalg.solve(A, b)
        
        return self.beta
    
        
    def predict(self, X, threshold=.5):
        return np.where(X.dot(self.beta) >= threshold, 1, 0)
        
          
    def Accuracy_check(self,X, y, threshold=.5):
        return np.mean(self.predict(X, threshold)==y)
    



# Weighted Ridge Regression (WRR)
class solveWRR():
    def __init__(self, X, y, w, lam=0.1):
        self.beta = None
        self.X = X
        self.y = y
        self.lam = lam
        self.w = w
    
    def fit(self):
        
        X = self.X
        y = self.y
        lam = self.lam 
        w = self.w
        
        n, p = X.shape
        assert (len(y) == len(w) == n)

        y1 = np.sqrt(w) * y
        X1 = (np.sqrt(w) * X.T).T
        
        # Hint:
        # Find y1 and X1 such that:
        
        self.beta = solveRR(X1, y1, lam).fit()
                
        return self.beta
    
        
    def predict(self, X, threshold):
        return np.where(X.dot(self.beta) >= threshold, 1, 0)
        
          
    def Accuracy_check(self,X, y, threshold=.5):
        return np.mean(self.predict(X, threshold)==y)
    

# Logistic Ridge Regression (LRR)
class solveLRR():
    def __init__(self, X, y, lam=0.1):
        self.beta = None
        self.X = X
        self.y = y
        self.lam = lam
    
    def fit(self):
        
        X = self.X
        y = self.y
        
        n, p = X.shape
        assert (len(y) == n)
    
        lam = self.lam 
        max_iter = 50
        eps = 1e-3
        sigmoid = lambda a: 1/(1 + np.exp(-a))
        
        
        
        # Initialize
        self.beta = np.zeros(p)

        # Hint: Use IRLS
        for i in range(max_iter):
            beta_old = self.beta
            f = X.dot(beta_old)
            w = sigmoid(f) * sigmoid(-f)
            z = f + y / sigmoid(y*f)
            self.beta = solveWRR(X, z, w, 2*lam).fit()
            # Break condition (achieved convergence)
            #if np.sum((beta-beta_old)**2) < eps:
            #    break                
        return self.beta
    
        
    def predict(self, X, threshold):
        return np.where(X.dot(self.beta) >= threshold, 1, 0)
        
          
    def Accuracy_check(self,X, y, threshold=.5):
        return np.mean(self.predict(X, threshold)==y)


## Kernel

In [78]:
X_train.std().mean()

0.2666935983266983

In [79]:
import ipdb

In [80]:
class ksolveRR():
    def __init__(self, X, y, lam= 0.0001):
        self.beta = None
        self.X = X
        self.y = y
        self.lam = lam
            
    
    def K(self, x, x_prime):
        return (x.T.dot(x_prime))**2
    
    def fit(self):
        
        X = self.X
        y = self.y
        lam = self.lam 
        
        n, p = X.shape
        assert (len(y) == n)
        
#         ipdb.set_trace()
#         A = X.T.dot(X) + np.eye(p)*lam*n
#         A = (X.T.dot((X.dot(X.T) + 1 )**2) + np.eye(n)*lam*n

#         b = y
        
        K = np.exp(-(1/(2*(6.995879868253351)))*np.linalg.norm(X-X)**2)
        
#         K = (X.dot(X.T)+1)**(300)
        
        self.beta = (X.T.dot(\
                             np.linalg.inv(K + np.eye(n)*lam*n))\
                             .dot(y))
        
        return self.beta
    
        
    def predict(self, X, threshold=.5):
        return np.where(X.dot(self.beta) >= threshold, 1, 0)
        
          
    def Accuracy_check(self,X, y, threshold=.5):
        return np.mean(self.predict(X, threshold)==y)
    

# Cross Validation

In [81]:
from sklearn.model_selection import KFold 
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


In [82]:
kfold=KFold(n_splits=5)
onehot_encoder = OneHotEncoder(sparse=False, categories='auto')

# X_cross = X.values
# X_t = X_t.values

y_cross = y.values

# vectorizer = TfidfVectorizer()
# X_cross = vectorizer.fit_transform(X)

X_cross = X_train# = onehot_encoder.fit_transform(X_train)
X_t_enc = X_test#onehot_encoder.fit_transform(X_test)


# scaler = StandardScaler()#MinMaxScaler() # StandardScaler()
# scaler.fit(X_cross)

# X_cross = scaler.transform(X_cross)

In [83]:
X_cross

array([[0.75, 0.25, 0.75, ..., 0.5 , 0.25, 0.75],
       [0.5 , 0.75, 0.75, ..., 1.  , 0.75, 0.75],
       [0.75, 0.25, 0.5 , ..., 0.5 , 0.75, 1.  ],
       ...,
       [1.  , 0.75, 0.75, ..., 0.75, 0.75, 0.5 ],
       [0.5 , 0.5 , 0.5 , ..., 0.75, 0.5 , 0.5 ],
       [1.  , 0.75, 0.5 , ..., 0.75, 1.  , 1.  ]])

In [84]:
from sklearn import linear_model as lm
from sklearn.metrics import accuracy_score, roc_auc_score

# from np.line

In [85]:
models = {solveRR: 'Ridge Regression (RR)', solveWRR:'Weighted Ridge Regression (WRR)', \
          solveLRR : 'Logistic Ridge Regression (LRR)', ksolveRR : 'Kernal Ridge Regression'}

# models = {ksolveRR : 'Kernal Ridge Regression'}
for model in models:
    accuracy = []
    for i, (train_index, validate_index) in enumerate(kfold.split(X_train)):

        X_train, y_train = X_cross[train_index], y_cross[train_index]
        X_valid, y_valid = X_cross[validate_index], y_cross[validate_index]

        if model ==solveWRR:
            w = np.random.rand(len(y_train))
            model_curr = solveWRR(X_train, y_train, w, lam=0.0001)
        else:
            model_curr = model(X_train, y_train, lam=0.0001)
            
        model_curr.fit()

        accuracy.append(model_curr.Accuracy_check(X_valid, y_valid, threshold=0.5))
        print(f'accurracy fold {i}: {accuracy[i]}')
    
    print(f'\nAverage accuracy {models[model]} is : {np.mean(accuracy)}\n')

accurracy fold 0: 0.565
accurracy fold 1: 0.6025
accurracy fold 2: 0.57
accurracy fold 3: 0.57
accurracy fold 4: 0.57

Average accuracy Ridge Regression (RR) is : 0.5754999999999999

accurracy fold 0: 0.5375
accurracy fold 1: 0.559375
accurracy fold 2: 0.584375
accurracy fold 3: 0.55
accurracy fold 4: 0.54375

Average accuracy Weighted Ridge Regression (WRR) is : 0.555

accurracy fold 0: 0.51953125
accurracy fold 1: 0.5
accurracy fold 2: 0.546875
accurracy fold 3: 0.41796875
accurracy fold 4: 0.5078125

Average accuracy Logistic Ridge Regression (LRR) is : 0.4984375

accurracy fold 0: 0.4292682926829268
accurracy fold 1: 0.5170731707317073
accurracy fold 2: 0.5219512195121951
accurracy fold 3: 0.5609756097560976
accurracy fold 4: 0.5882352941176471

Average accuracy Kernal Ridge Regression is : 0.5235007173601148



In [47]:
models = {solveRR: 'Ridge Regression (RR)', solveWRR:'Weighted Ridge Regression (WRR)', \
          solveLRR : 'Logistic Ridge Regression (LRR)', ksolveRR : 'Kernal Ridge Regression'}

# models = {ksolveRR : 'Kernal Ridge Regression'}
for model in models:
    accuracy = []
    for i, (train_index, validate_index) in enumerate(kfold.split(X_train)):

        X_train, y_train = X_cross[train_index], y_cross[train_index]
        X_valid, y_valid = X_cross[validate_index], y_cross[validate_index]

        if model ==solveWRR:
            w = np.random.rand(len(y_train))
            model_curr = solveWRR(X_train, y_train, w, lam=0.0001)
        else:
            model_curr = model(X_train, y_train, lam=0.0001)
            
        model_curr.fit()

        accuracy.append(model_curr.Accuracy_check(X_valid, y_valid, threshold=0.5))
        print(f'accurracy fold {i}: {accuracy[i]}')
    
    print(f'\nAverage accuracy {models[model]} is : {np.mean(accuracy)}\n')

accurracy fold 0: 0.5775
accurracy fold 1: 0.645
accurracy fold 2: 0.6375
accurracy fold 3: 0.6225
accurracy fold 4: 0.6375

Average accuracy Ridge Regression (RR) is : 0.624

accurracy fold 0: 0.603125
accurracy fold 1: 0.590625
accurracy fold 2: 0.61875
accurracy fold 3: 0.609375
accurracy fold 4: 0.5875

Average accuracy Weighted Ridge Regression (WRR) is : 0.6018749999999999

accurracy fold 0: 0.53515625
accurracy fold 1: 0.50390625
accurracy fold 2: 0.55859375
accurracy fold 3: 0.44140625
accurracy fold 4: 0.515625

Average accuracy Logistic Ridge Regression (LRR) is : 0.5109375

accurracy fold 0: 0.6195121951219512
accurracy fold 1: 0.6195121951219512
accurracy fold 2: 0.6195121951219512
accurracy fold 3: 0.6585365853658537
accurracy fold 4: 0.5784313725490197

Average accuracy Kernal Ridge Regression is : 0.6191009086561454



In [48]:
# Cehckinf full model
model = ksolveRR(X_cross, y_cross, lam=0.0001)
model.fit()

model.Accuracy_check(X_cross, y_cross, threshold=0.5)

0.682

# Predictions

In [49]:
model = ksolveRR(X_cross, y_cross, lam=0.0001)
model.fit()
y_pred = model.predict(X_t_enc, 0.5)

In [50]:
X = np.arange(1000).reshape(-1, 1)
sample = pd.DataFrame(data=X, columns=['Id'])
sample.head()

Unnamed: 0,Id
0,0
1,1
2,2
3,3
4,4


In [51]:
sample['Bound'] = y_pred

In [596]:
sample.tail()

Unnamed: 0,Id,Bound
995,995,0
996,996,0
997,997,0
998,998,1
999,999,1


In [53]:
#sample.tail()

In [54]:
sample.to_csv('./ksolveRR_63_cv_ord.csv', index=False)