#**Importing the necessary packages.**

In [14]:
import sys
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn

plt.rcParams['figure.figsize'] = (8, 8)


#**Data Preparation**

In [None]:
# linking up with the UCI Machine Learning Repository (promoter gene sequences dataset)
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'
names = ['Class', 'id', 'Sequence']
data = pd.read_csv(url, names=names)
data.head()

# **Data Preprocessing**

In [25]:
#Removing tab characters within the sequences
data['Sequence'] = data['Sequence'].apply(lambda x: ''.join(x.split()))

#Splitting each sequence into individual nucleotides
nucleotides_df = data['Sequence'].apply(lambda x: pd.Series(list(x)))

# Adding the class column
nucleotides_df['Class'] = data['Class']
nucleotides_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
0,t,a,c,t,a,g,c,a,a,t,...,g,c,t,t,g,t,c,g,t,+
1,t,g,c,t,a,t,c,c,t,g,...,c,a,t,c,g,c,c,a,a,+
2,g,t,a,c,t,a,g,a,g,a,...,c,a,c,c,c,g,g,c,g,+
3,a,a,t,t,g,t,g,a,t,g,...,a,a,c,a,a,a,c,t,c,+
4,t,c,g,a,t,a,a,t,t,a,...,c,c,g,t,g,g,t,a,g,+


In [26]:
# Descriptive Analysis
nucleotides_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
count,106,106,106,106,106,106,106,106,106,106,...,106,106,106,106,106,106,106,106,106,106
unique,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,2
top,t,a,a,c,a,a,a,a,a,a,...,c,c,c,t,t,c,c,c,t,+
freq,38,34,30,30,36,42,38,34,33,36,...,36,42,31,33,35,32,29,29,34,53


#**Import Machine Learning Libraries**

In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

#**Encode Labels**
The label encoder is used to convert the nucleotide sequences and class labels into numerical values suitable for machine learning algorithms

In [24]:
# creating the instance of Label Encoder
label_encoder = LabelEncoder()

# Determine the unique values in the columns and assign unique integer to them
nucleotides_df = nucleotides_df.apply(label_encoder.fit_transform)

nucleotides_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
0,3,0,1,3,0,2,1,0,0,3,...,2,1,3,3,2,3,1,2,3,0
1,3,2,1,3,0,3,1,1,3,2,...,1,0,3,1,2,1,1,0,0,0
2,2,3,0,1,3,0,2,0,2,0,...,1,0,1,1,1,2,2,1,2,0
3,0,0,3,3,2,3,2,0,3,2,...,0,0,1,0,0,0,1,3,1,0
4,3,1,2,0,3,0,0,3,3,0,...,1,1,2,3,2,2,3,0,2,0


#**Preparing Train/Test Data**

In [5]:
#separating features ('X') and labels ('y')
X = nucleotides_df.drop('Class', axis=1)
y = nucleotides_df['Class']

# splitting them into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)


#**Defining the different Models**

In [27]:
# We define a dictionary of the different machine learning models to be evaluates
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "AdaBoost": AdaBoostClassifier(),
    "K Nearest Neighbors": KNeighborsClassifier(n_neighbors=3),
    "Naive Bayes": GaussianNB(),
    "SVM Linear": SVC(kernel='linear'),
    "SVM RBF": SVC(kernel='rbf'),
    "SVM Sigmoid": SVC(kernel='sigmoid')
}


#**Cross Validation**

In [8]:
results = {}
scoring = 'accuracy'
kfold = KFold(n_splits=10, shuffle=True)

for name, model in models.items():
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results[name] = cv_results
    print(f"{name}: {cv_results.mean():.4f} ({cv_results.std():.4f})")


Random Forest: 0.9214 (0.0908)
AdaBoost: 0.8982 (0.0758)
K Nearest Neighbors: 0.8214 (0.1032)
Naive Bayes: 0.8625 (0.1625)
SVM Linear: 0.7839 (0.1142)
SVM RBF: 0.8482 (0.1656)
SVM Sigmoid: 0.4679 (0.1350)


# **Training and Evaluation**

In [9]:
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"\n{name}")
    print(f"Accuracy: {accuracy_score(y_test, predictions):.4f}")
    print(classification_report(y_test, predictions))



Random Forest
Accuracy: 0.8519
              precision    recall  f1-score   support

           0       0.82      0.93      0.87        15
           1       0.90      0.75      0.82        12

    accuracy                           0.85        27
   macro avg       0.86      0.84      0.85        27
weighted avg       0.86      0.85      0.85        27


AdaBoost
Accuracy: 0.8889
              precision    recall  f1-score   support

           0       0.83      1.00      0.91        15
           1       1.00      0.75      0.86        12

    accuracy                           0.89        27
   macro avg       0.92      0.88      0.88        27
weighted avg       0.91      0.89      0.89        27


K Nearest Neighbors
Accuracy: 0.8519
              precision    recall  f1-score   support

           0       0.82      0.93      0.87        15
           1       0.90      0.75      0.82        12

    accuracy                           0.85        27
   macro avg       0.86      0.