### Analysis of IL6-inducing peptides dataset

In this notebook, we:
1. Load the dataset.
2. Split the dataset into training and testing sets.
3. Define multiple classifiers.
4. Perform 10-fold cross-validation.
5. Plot the mean accuracy and standard deviation for each model.
6. Extract and display feature importances.
7. Plot feature importances for Decision Tree and Random Forest.

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Read the dataset from disk. The label is taken from the 'class' column and
# every remaining column is used as a feature.
DATA_PATH = '/mnt/data/Data.csv'
TARGET_COLUMN = 'class'

data = pd.read_csv(DATA_PATH)
y = data[TARGET_COLUMN]
X = data.drop(columns=[TARGET_COLUMN])

# Hold out 20% of the rows as a test split; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate models, keyed by the label that later appears in the results
# dictionary and on the plots. The two tree-based models and logistic
# regression are seeded with 42; the remaining estimators use their defaults.
classifiers = {
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    # Linear-kernel SVM. probability=True is requested here even though the
    # cross-validation in this cell scores accuracy only.
    'SVM': SVC(kernel='linear', probability=True),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
}

# Perform 10-fold cross-validation for each classifier.
# The folds are drawn from the training split only: scoring on the full X / y
# would let the held-out rows (X_test / y_test) influence model comparison,
# and it would be inconsistent with the feature-importance fit, which also
# uses X_train / y_train.
cv_results = {}
for name, clf in classifiers.items():
    # cross_val_score fits clones of `clf`, so the instances stored in
    # `classifiers` are not fitted by this loop.
    scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
    cv_results[name] = {
        "mean_accuracy": scores.mean(),
        "std_accuracy": scores.std()  # NumPy default: population std (ddof=0) over the folds
    }

# Extract and display feature importances for Decision Tree and Random Forest.
# These are the two models in `classifiers` from which `feature_importances_`
# is read.
feature_importance_data = {}
for name in ["Decision Tree", "Random Forest"]:
    clf = classifiers[name]
    # Fits the shared instance held in `classifiers` in place, on the training split.
    clf.fit(X_train, y_train)
    feature_importances = clf.feature_importances_
    feature_importance_data[name] = feature_importances

# One row per feature column of X, one importance column per model.
features_df = pd.DataFrame({
    'Feature': X.columns,
    'Decision Tree': feature_importance_data["Decision Tree"],
    'Random Forest': feature_importance_data["Random Forest"]
})

# Actually display the table: the cell ends with plt.show(), so the frame
# would otherwise never be rendered. sort_values returns a new frame, so
# `features_df` keeps its original row order for the plots.
print(
    features_df
    .sort_values('Random Forest', ascending=False)
    .to_string(index=False)
)

# Bar chart of the cross-validation results: bar height is the mean fold
# accuracy and the error bar is the standard deviation across folds.
names = list(cv_results)
mean_accuracies = [stats["mean_accuracy"] for stats in cv_results.values()]
std_accuracies = [stats["std_accuracy"] for stats in cv_results.values()]

bar_colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black']

fig, ax = plt.subplots(figsize=(14, 8))
ax.bar(names, mean_accuracies, yerr=std_accuracies, color=bar_colors)
ax.set_ylabel('Accuracy')
ax.set_title('Mean Accuracy with 10-fold Cross Validation')
# Tilt the model names so the longer labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y')
plt.show()

# One horizontal bar chart per model column of `features_df`. Both figures
# share the same layout and differ only in the column plotted and the colour.
for model_name, bar_color in (('Decision Tree', 'cyan'), ('Random Forest', 'magenta')):
    fig, ax = plt.subplots(figsize=(14, 8))
    ax.barh(features_df['Feature'], features_df[model_name], color=bar_color)
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    ax.set_title(f'Feature Importance from {model_name}')
    # Flip the axis so the first row of the table is drawn at the top.
    ax.invert_yaxis()
    plt.show()
