# In [None]:
import sys
import numpy as np
import sklearn
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from IPython.core.display import HTML as Center
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# UCI Molecular Biology (Promoter Gene Sequences) Data Set
link = "https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data"
tags = ["Class", "id", "Sequence"]

# Column names are passed explicitly; the three columns are read as
# "Class", "id" and "Sequence".
data = pd.read_csv(link, names=tags)
classes = data.loc[:, "Class"]

# Generate the list of DNA sequences.
sequences = list(data.loc[:, "Sequence"])
dataset = {}

# Loop over the sequences and split each one into individual nucleotides.
for i, seq in enumerate(sequences):
    # Split into single characters and drop the tab characters.
    nucleotides = [x for x in seq if x != '\t']

    # Append the class assignment as the last element of the row.
    nucleotides.append(classes[i])

    # Add the finished row to the dataset, keyed by sample index.
    dataset[i] = nucleotides

# Every dict key becomes a column, so transpose to get one row per sample.
data = pd.DataFrame(dataset)
df = data.transpose()

# Column 57 holds the appended class label when each sequence contributes
# 57 nucleotides (assumed from this rename; confirm against the data file).
df.rename(columns={57: "Class"}, inplace=True)

def screning(plot, feature):
    """Annotate every bar of *plot* with its share of ``len(feature)``.

    Args:
        plot: Axes-like object exposing ``patches`` (the bars) and
            ``annotate``.
        feature: Sized collection whose length is used as the 100% total.

    Each bar's height is converted to a percentage of the total and written
    just above the bar, roughly centred horizontally.
    """
    total = len(feature)
    for p in plot.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height() / total)
        # Horizontal centre of the bar, nudged left by a small fixed offset.
        x = p.get_x() + p.get_width() / 2 - 0.05
        # Top edge of the bar.
        y = p.get_y() + p.get_height()
        # Annotate the axes that were passed in, not a global ``ax``.
        plot.annotate(percentage, (x, y), size=12)

# Count how often each value occurs in every column of df.
series = [df[name].value_counts() for name in df.columns]

# One row per original column, then transposed so each original column
# becomes a column of counts.
info = pd.DataFrame(series)
details = info.T

# One-hot encode every column of df.
numerical_df = pd.get_dummies(df)
numerical_df.head()

# Of the two class indicator columns keep only 'Class_+', renamed to 'Class';
# 'Class_-' is dropped.
df = numerical_df.drop(columns=['Class_-']).rename(columns={'Class_+': 'Class'})

from sklearn import model_selection
# Split the dataset into a training set and a test set.
# Create the X (features) and y (labels) arrays for training.
# `columns=` is used because pandas 2.0 no longer accepts `axis` as a
# positional argument to DataFrame.drop.
X = np.array(df.drop(columns=['Class']))
y = np.array(df['Class'])

# define seed for reproducibility
seed = 1

# split data into training and testing datasets (25% held out for testing)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=seed
)

# scoring metric name
scoring = 'accuracy'

# Define the models to train; names[i] is the label for classifiers[i].
names = ["Nearest Neighbors", "Gaussian Process", "Decision Tree", "Random Forest",
         "Neural Net", "AdaBoost", "Naive Bayes", "SVM Linear", "SVM RBF", "SVM Sigmoid"]
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid'),
]
# A list rather than a bare zip iterator, so the pairs can be iterated
# more than once.
models = list(zip(names, classifiers))

# Evaluate each model in turn with 10-fold cross-validation on the training data.
# `names` is left intact so it still lines up with `results` afterwards.
results = []
accuracy = []  # not populated in this section

# shuffle=True is needed for random_state to apply; scikit-learn raises
# ValueError when random_state is set while shuffle is False.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
for name, model in models:
    cv_results = model_selection.cross_val_score(
        model, X_train, y_train, cv=kfold, scoring=scoring
    )
    results.append(cv_results)
    # Report mean and standard deviation of the fold scores.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Builds an HTML object from a whitespace-only string and discards it
# (presumably a leftover notebook display cell).
Center("""  """)

# Pictorial representation of the results produced by each algorithm:
# one horizontal box per entry of `results`, labelled with `names`.
fig = plt.figure(figsize=(10, 12), dpi=80)
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results, vert=False)
ax.set_yticklabels(names)

# Save before showing: once plt.show() returns the figure may already be
# closed, in which case a later savefig writes an empty image.
fig.savefig("algorithm_comparison.png")
plt.show()
