In [13]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np

In [14]:
# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/Machine learning Lab/drugbank_clean_no_outliers_IQR.csv')

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1163 entries, 0 to 1162
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               1163 non-null   int64  
 1   state              1163 non-null   int64  
 2   groups             1163 non-null   int64  
 3   drug-interactions  1163 non-null   int64  
 4   enzymes            1163 non-null   int64  
 5   targets            1163 non-null   int64  
 6   average-mass       1163 non-null   float64
 7   monoisotopic-mass  1163 non-null   float64
dtypes: float64(2), int64(6)
memory usage: 72.8 KB


In [16]:
non_zero_variance_features = df.loc[:, df.var() > 0]
X = non_zero_variance_features.drop(columns=['targets'])
y = df['targets']

In [30]:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_train_score = rf_model.score(X_train, y_train)
rf_test_score = rf_model.score(X_test, y_test)
rf_cv_score = cross_val_score(rf_model, X, y, cv=5).mean()
print("Random Forest:")
print("Training Accuracy:", rf_train_score)
print("Testing Accuracy:", rf_test_score)
print("Cross-Validation Accuracy:", rf_cv_score)

# K-Nearest Neighbors
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
knn_train_score = knn_model.score(X_train, y_train)
knn_test_score = knn_model.score(X_test, y_test)
knn_cv_score = cross_val_score(knn_model, X, y, cv=5).mean()
print("\nK-Nearest Neighbors:")
print("Training Accuracy:", knn_train_score)
print("Testing Accuracy:", knn_test_score)
print("Cross-Validation Accuracy:", knn_cv_score)

# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_train_score = lr_model.score(X_train, y_train)
lr_test_score = lr_model.score(X_test, y_test)
lr_cv_score = cross_val_score(lr_model, X, y, cv=5).mean()
print("\nLogistic Regression:")
print("Training Accuracy:", lr_train_score)
print("Testing Accuracy:", lr_test_score)
print("Cross-Validation Accuracy:", lr_cv_score)

# Support Vector Machine
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_train_score = svm_model.score(X_train, y_train)
svm_test_score = svm_model.score(X_test, y_test)
svm_cv_score = cross_val_score(svm_model, X, y, cv=5).mean()
print("\nSupport Vector Machine:")
print("Training Accuracy:", svm_train_score)
print("Testing Accuracy:", svm_test_score)
print("Cross-Validation Accuracy:", svm_cv_score)

# prompt: select best model from above
rf_avg_acc = (rf_cv_score+rf_train_score+rf_test_score)/3
knn_avg_acc = (knn_cv_score+knn_train_score+knn_test_score)/3
lr_avg_acc = (lr_cv_score+lr_train_score+lr_test_score)/3
svm_avg_acc = (svm_cv_score+svm_train_score+svm_test_score)/3


# Create a dictionary to store the model names and their cross-validation scores
model_scores = {
    "Random Forest": rf_avg_acc,
    "K-Nearest Neighbors": knn_avg_acc,
    "Logistic Regression": lr_avg_acc,
    "Support Vector Machine": svm_avg_acc
}

# Find the model with the highest cross-validation score
best_model = max(model_scores, key=model_scores.get)
best_score = model_scores[best_model]

print("\nBest Model:", best_model)
print("Best Average Accuracy:", best_score)


Random Forest:
Training Accuracy: 1.0
Testing Accuracy: 0.944206008583691
Cross-Validation Accuracy: 0.9174670711854374

K-Nearest Neighbors:
Training Accuracy: 0.9204301075268817
Testing Accuracy: 0.9356223175965666
Cross-Validation Accuracy: 0.9157318336539884

Logistic Regression:
Training Accuracy: 0.9096774193548387
Testing Accuracy: 0.944206008583691
Cross-Validation Accuracy: 0.9165976024863106

Support Vector Machine:
Training Accuracy: 0.9096774193548387
Testing Accuracy: 0.944206008583691
Cross-Validation Accuracy: 0.9165976024863106

Best Model: Random Forest
Best Cross-Validation Accuracy: 0.9538910265897095


In [32]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create individual models with default parameters
rf_model = RandomForestClassifier()
knn_model = KNeighborsClassifier()
lr_model = LogisticRegression()
svm_model = SVC(probability=True)

# Hyperparameter tuning for hard voting and soft voting
hard_voting_params = {'voting': ['hard'], 'weights': [[1, 1, 1, 1], [2, 1, 1, 1], [1, 2, 1, 1], [1, 1, 2, 1], [1, 1, 1, 2]]}
hard_voting_model = GridSearchCV(VotingClassifier(estimators=[('rf', rf_model), ('knn', knn_model), ('lr', lr_model), ('svm', svm_model)]), hard_voting_params, cv=5)
hard_voting_model.fit(X_train, y_train)

soft_voting_params = {'voting': ['soft'], 'weights': [[1, 1, 1, 1], [2, 1, 1, 1], [1, 2, 1, 1], [1, 1, 2, 1], [1, 1, 1, 2]]}
soft_voting_model = GridSearchCV(VotingClassifier(estimators=[('rf', rf_model), ('knn', knn_model), ('lr', lr_model), ('svm', svm_model)]), soft_voting_params, cv=5)
soft_voting_model.fit(X_train, y_train)

# Evaluate the models
hard_voting_cv_score = cross_val_score(hard_voting_model.best_estimator_, X, y, cv=5).mean()
soft_voting_cv_score = cross_val_score(soft_voting_model.best_estimator_, X, y, cv=5).mean()

print("Hard Voting Cross-Validation Accuracy:", hard_voting_cv_score)
print("Soft Voting Cross-Validation Accuracy:", soft_voting_cv_score)

# Assign the best performing model to clf
if soft_voting_cv_score > hard_voting_cv_score:
    clf = soft_voting_model.best_estimator_
else:
    clf = hard_voting_model.best_estimator_

print("\nBest performing model:", type(clf).__name__)

# K Fold Cross Validation Accuracy
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Cross-Validation Scores: ", cv_scores)

# Average Accuracy of Cross Validation
average_accuracy = np.mean(cv_scores)
print("Average Accuracy of Cross Validation: ", average_accuracy)

Hard Voting Cross-Validation Accuracy: 0.9165976024863106
Soft Voting Cross-Validation Accuracy: 0.9174559715850229

Best performing model: VotingClassifier
Cross-Validation Scores:  [0.91397849 0.90860215 0.90860215 0.90860215 0.90860215]
Average Accuracy of Cross Validation:  0.9096774193548386


In [34]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X_non_negative = np.maximum(X, 0)

# Apply feature selection using chi-square
select_k_best = SelectKBest(score_func=chi2, k=5)
X_top5 = select_k_best.fit_transform(X_non_negative, y)


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_top5, y, test_size=0.2)

# Create individual models with default parameters
rf_model = RandomForestClassifier()
knn_model = KNeighborsClassifier()
lr_model = LogisticRegression()
svm_model = SVC(probability=True)

# Hard Voting
hard_voting_model = VotingClassifier(estimators=[('rf', rf_model), ('knn', knn_model), ('lr', lr_model), ('svm', svm_model)], voting='hard')
hard_voting_cv_scores = cross_val_score(hard_voting_model, X_top5, y, cv=5)
# print("Hard Voting Cross-Validation Scores:", hard_voting_cv_scores)
hard_voting_model.fit(X_train, y_train)
hard_voting_test_score = hard_voting_model.score(X_test, y_test)
print("Hard Voting Test Accuracy:", hard_voting_test_score)

# Calculate average cross-validation scores
avg_hard_voting_cv_score = np.mean(hard_voting_cv_scores)
print("Average Hard Voting Cross-Validation Score:", avg_hard_voting_cv_score)

# Soft Voting
soft_voting_model = VotingClassifier(estimators=[('rf', rf_model), ('knn', knn_model), ('lr', lr_model), ('svm', svm_model)], voting='soft')
soft_voting_cv_scores = cross_val_score(soft_voting_model, X_top5, y, cv=5)
# print("Soft Voting Cross-Validation Scores:", soft_voting_cv_scores)
soft_voting_model.fit(X_train, y_train)
soft_voting_test_score = soft_voting_model.score(X_test, y_test)
print("Soft Voting Test Accuracy:", soft_voting_test_score)
avg_soft_voting_cv_score = np.mean(soft_voting_cv_scores)
print("Average Soft Voting Cross-Validation Score:", avg_soft_voting_cv_score)

Hard Voting Test Accuracy: 0.9184549356223176
Average Hard Voting Cross-Validation Score: 0.9174596714518277
Soft Voting Test Accuracy: 0.9184549356223176
Average Soft Voting Cross-Validation Score: 0.9165976024863106


In [35]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np



# Apply feature selection using Random Forest Classifier
rfc = RandomForestClassifier()
sfm = SelectFromModel(rfc, threshold=-np.inf, max_features=5)
X_top5 = sfm.fit_transform(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_top5, y, test_size=0.2)

# Create individual models with default parameters
rf_model = RandomForestClassifier()
knn_model = KNeighborsClassifier()
lr_model = LogisticRegression()
svm_model = SVC(probability=True)

# Hard Voting
hard_voting_model = VotingClassifier(estimators=[('rf', rf_model), ('knn', knn_model), ('lr', lr_model), ('svm', svm_model)], voting='hard')
hard_voting_cv_scores = cross_val_score(hard_voting_model, X_top5, y, cv=5)
# print("Hard Voting Cross-Validation Scores:", hard_voting_cv_scores)
hard_voting_model.fit(X_train, y_train)
hard_voting_test_score = hard_voting_model.score(X_test, y_test)
print("Hard Voting Test Accuracy:", hard_voting_test_score)
# Calculate average cross-validation scores
avg_hard_voting_cv_score = np.mean(hard_voting_cv_scores)
print("Average Hard Voting Cross-Validation Score:", avg_hard_voting_cv_score)

# Soft Voting
soft_voting_model = VotingClassifier(estimators=[('rf', rf_model), ('knn', knn_model), ('lr', lr_model), ('svm', svm_model)], voting='soft')
soft_voting_cv_scores = cross_val_score(soft_voting_model, X_top5, y, cv=5)
# print("Soft Voting Cross-Validation Scores:", soft_voting_cv_scores)
soft_voting_model.fit(X_train, y_train)
soft_voting_test_score = soft_voting_model.score(X_test, y_test)
print("Soft Voting Test Accuracy:", soft_voting_test_score)
avg_soft_voting_cv_score = np.mean(soft_voting_cv_scores)
print("Average Soft Voting Cross-Validation Score:", avg_soft_voting_cv_score)



Hard Voting Test Accuracy: 0.927038626609442
Average Hard Voting Cross-Validation Score: 0.9165976024863106
Soft Voting Test Accuracy: 0.9313304721030042
Average Soft Voting Cross-Validation Score: 0.9183143406837354
