# Bioinformatics Project

### Feature (Gene) Selection with SVM

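In this project, gene-expression datasets are min-max scaled, reduced to an initial set of features with a chi-squared test, and then classified with a linear SVM, an RBF-kernel SVM, k-NN and a neural network, each evaluated with leave-one-out cross-validation.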

#### Implementation

1. Import libraries.

In [34]:
import sys
import time
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix 
from sklearn.feature_selection import SelectKBest, chi2

2. Get user inputs (name of the dataset and the objectives to be considered).

In [35]:
# Dataset short names: gordon = lung, shipp = lymphoma, singh = prostate, tian = myeloma.
KNOWN_DATASETS = ('gordon', 'shipp', 'singh', 'tian')
DEFAULT_DATASET = 'shipp'

# Read the positional arguments defensively. A plain script run may supply no
# arguments at all, and inside a Jupyter kernel sys.argv typically holds the
# kernel's own launch arguments rather than user input.
cli_args = sys.argv[1:]

# Use the first argument only when it names a known dataset; anything else
# (including kernel flags) falls back to the default dataset.
if cli_args and cli_args[0] in KNOWN_DATASETS:
    dataset_name = cli_args[0]
else:
    dataset_name = DEFAULT_DATASET

# Second argument: the objectives to consider; None when it was not supplied.
objectives = cli_args[1] if len(cli_args) > 1 else None

3. Read dataset.

In [36]:
# Load the feature table and the label table of the chosen dataset.
# Neither CSV has a header row, so every line is read as data.
samples, labels = (
    pd.read_csv('Datasets/' + dataset_name + file_suffix, header=None)
    for file_suffix in ('_inputs.csv', '_outputs.csv')
)


4. Pre-process dataset.

In [37]:
# Missing values become 0; from here on `samples` is a plain ndarray.
samples = np.asarray(samples.fillna(0).values)

# Flatten the label table to one dimension, shift every label down by one
# and cast the result to integers.
labels = (labels.values.ravel() - 1).astype(int)

# Rescale every feature column with a min-max scaler (default feature range).
# NOTE(review): the scaler is fitted on all samples, including the ones that
# are later held out during leave-one-out evaluation.
feature_scaler = preprocessing.MinMaxScaler()
samples = feature_scaler.fit_transform(samples)

5. Apply initial feature selection method.

In [38]:
# Keep only the features with the highest chi-squared score against the labels.
number_of_initial_features = 100

initial_selector = SelectKBest(score_func=chi2, k=number_of_initial_features)
samples = initial_selector.fit_transform(samples, labels)

5. Be ready! Last preparations.

In [39]:
# Shared set-up for the classifier cells that follow.
loo     = LeaveOneOut()        # cross-validation splitter used by every classifier cell
indexes = np.array([0, 1])     # columns of `samples` that are passed to the classifiers
scores  = []                   # per-fold scores are appended to this list

# Largest label value plus one.
number_of_classes = 1 + np.max(labels)

batch_size = 1
epochs     = 5

6. Apply Linear SVM!

In [40]:
# Leave-one-out evaluation of a linear SVM on the feature columns in `indexes`.

# `scores` is shared by all classifier cells. Start from an empty list so that
# results of earlier runs (or of other classifiers) are not averaged into this one.
scores = []

start_time = time.time()

# Column selection and the float32 cast are the same for every fold: do them once.
selected_samples = samples[:, indexes].astype('float32')

for train_index, test_index in loo.split(samples):
    X_train, X_test = selected_samples[train_index], selected_samples[test_index]
    Y_train, Y_test = labels[train_index], labels[test_index]

    clf = LinearSVC(random_state=0)
    clf.fit(X_train, Y_train)

    # Mean accuracy on the held-out fold.
    score = clf.score(X_test, Y_test)
    scores.append(score)
end_time = time.time()

7. Apply RBF (Gaussian) kernel SVM!

In [None]:
# Leave-one-out evaluation of an RBF-kernel SVM on the feature columns in `indexes`.

# `scores` is shared by all classifier cells. Start from an empty list so that
# results of earlier runs (or of other classifiers) are not averaged into this one.
scores = []

start_time = time.time()

# Column selection and the float32 cast are the same for every fold: do them once.
selected_samples = samples[:, indexes].astype('float32')

for train_index, test_index in loo.split(samples):
    X_train, X_test = selected_samples[train_index], selected_samples[test_index]
    Y_train, Y_test = labels[train_index], labels[test_index]

    clf = SVC(kernel='rbf', random_state=0)  # rbf == gaussian
    clf.fit(X_train, Y_train)

    # Mean accuracy on the held-out fold.
    score = clf.score(X_test, Y_test)
    scores.append(score)
end_time = time.time()

8. Apply k-NN!

In [19]:
# Leave-one-out evaluation of a 3-nearest-neighbours classifier on the
# feature columns in `indexes`.

# `scores` is shared by all classifier cells. Start from an empty list so that
# results of earlier runs (or of other classifiers) are not averaged into this one.
scores = []

start_time = time.time()

# Column selection and the float32 cast are the same for every fold: do them once.
selected_samples = samples[:, indexes].astype('float32')

for train_index, test_index in loo.split(samples):
    X_train, X_test = selected_samples[train_index], selected_samples[test_index]
    Y_train, Y_test = labels[train_index], labels[test_index]

    clf = KNeighborsClassifier(n_neighbors=3)
    clf.fit(X_train, Y_train)

    # Mean accuracy on the held-out fold.
    score = clf.score(X_test, Y_test)
    scores.append(score)
end_time = time.time()

9. Apply NN!

In [42]:
# Leave-one-out evaluation of a small multi-layer perceptron (two hidden layers
# of 10 units) on the feature columns in `indexes`.

# `scores` is shared by all classifier cells. Start from an empty list so that
# results of earlier runs (or of other classifiers) are not averaged into this one.
scores = []

start_time = time.time()

# Column selection and the float32 cast are the same for every fold: do them once.
selected_samples = samples[:, indexes].astype('float32')

for train_index, test_index in loo.split(samples):
    X_train, X_test = selected_samples[train_index], selected_samples[test_index]
    Y_train, Y_test = labels[train_index], labels[test_index]

    # A fixed random_state makes the result reproducible between runs,
    # in line with the seeded SVM cells.
    clf = MLPClassifier(hidden_layer_sizes=(10, 10), activation='relu',
                        solver='adam', max_iter=500, random_state=0)
    clf.fit(X_train, Y_train)

    # Mean accuracy on the held-out fold.
    score = clf.score(X_test, Y_test)
    scores.append(score)
end_time = time.time()



10. Print results

In [41]:
# Report the mean of the collected fold scores and the wall-clock time
# of the most recently executed classifier cell.
mean_score = np.average(scores)
elapsed_seconds = end_time - start_time

print('Score: ' + str(mean_score))
print('Time: ' + str(elapsed_seconds))

Score: 0.8831168831168831
Time: 0.09003233909606934


# Comments

Author:
    GOSHGAR ISMAYILOV
    SERHAT İŞCAN

Project Description:

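    In this project, gene-expression datasets are classified with SVM, k-NN and neural-network classifiers after an initial chi-squared feature (gene) selection step.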

Notes: 
1. You are not allowed to use NumPy.
2. You are not allowed to use any libraries to find the Needleman Wunsch and Smith Waterman scores.
3. You can only use standard libraries apart from the given codes.
4. Please submit your assignment using Moodle. Upload a single zip file named as YourNameSurname.zip. Your zip file should include your report, your source code, and the corresponding read.me file. You can use any programming language of your choice. But, your read.me file should clearly explain how to run your program.
5. For any questions, e-mail me at selen.parlar@boun.edu.tr