# Classifier

In [1]:
# required functions from nn module
from nn.io import read_text_file, read_fasta_file
from nn.nn import NeuralNetwork
from nn.preprocess import one_hot_encode_seqs,  sample_seqs

import numpy as np

# for splitting data into training and testing sets
from sklearn.model_selection import train_test_split

## Load data

In [2]:
# Positive Rap1 motif examples, stored one per line in a plain-text file
# (137 of them in the recorded run below).
rap1_pos = read_text_file("data/rap1-lieb-positives.txt")

# Negative examples, read from the yeast upstream FASTA file.
rap1_neg = read_fasta_file("data/yeast-upstream-1k-negative.fa")

# Report the class sizes so the imbalance is visible before any resampling.
print(f"Number of Rap1 positives: {len(rap1_pos)}")
print(f"Number of Rap1 negatives: {len(rap1_neg)}")


Number of Rap1 positives: 137
Number of Rap1 negatives: 3163


## Balance your classes using your sample_seqs function and explain why you chose the sampling scheme you did.

In [3]:
# Pool every sequence with its label: 1 = Rap1 positive, 0 = negative.
rap1 = rap1_pos + rap1_neg
labels = [1] * len(rap1_pos) + [0] * len(rap1_neg)

# TODO: some sampling to account for length of sequences.
# NOTE(review): nothing in this cell brings the sequences to a common length.
# If positives and negatives differ in length, the one-hot encoded rows will
# differ in width too -- confirm before training on them.

# Seed NumPy's global RNG so the shuffle below is identical on every
# Restart & Run All. 42 matches the random_state used by the
# train/validation split later in the notebook.
# NOTE(review): this also pins sample_seqs only if that helper draws from
# NumPy's global RNG -- confirm against its implementation.
np.random.seed(42)

# Up sample the positive class by sampling with replacement
rap1, labels = sample_seqs(rap1, labels)

# Convert to NumPy arrays so both can be reordered with one index array.
rap1 = np.array(rap1)
labels = np.array(labels)

# Draw a single permutation and apply it to sequences and labels alike,
# which keeps every sequence aligned with its label.
shuffle_indices = np.random.permutation(len(rap1))

rap1_shuffled = rap1[shuffle_indices]
labels_shuffled = labels[shuffle_indices]

## One-hot encode the data using your one_hot_encode_seqs function.

In [9]:
# Encode the shuffled sequences; the labels keep the same order, so
# X[i] and y[i] describe the same example.
X = one_hot_encode_seqs(rap1_shuffled)
y = labels_shuffled

# Sanity check: compare the number of examples and the length of the first
# example before and after encoding.
print(f"One hot encoding changed number of sequences feature matrix shape from: {len(rap1_shuffled)} to {len(X)}")
print(f"For first sequence, one-hot-encoding changed the length from: {len(rap1_shuffled[0])} to {len(X[0])}")

One hot encoding changed number of sequences feature matrix shape from: 6326 to 6326
For first sequence, one-hot-encoding changed the length from: 1000 to 4000


## Split the data into training and validation sets.

In [10]:
# Hold out 10% of the encoded examples for validation; the fixed random_state
# keeps the split reproducible. The labels passed are `y`, the vector bound
# together with `X` in the encoding cell, so features and labels always come
# from the same shuffle even if the shuffling cell is re-run on its own.
# NOTE(review): the positives were upsampled with replacement before this
# split, so copies of one positive sequence can land on both sides of it;
# validation scores may therefore be optimistic.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

## Generate an instance of your NeuralNetwork class with an appropriate architecture.
## Train your neural network on the training data.

## Plot your training and validation loss by epoch.
### Report the accuracy of your classifier on your validation dataset.

## Explain your choice of loss function and hyperparameters.