In [None]:
!pip install torch transformers pandas numpy



In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch

# Load ProtBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert', do_lower_case=False)  # Ensures sequences are not lowercased
model = BertModel.from_pretrained('Rostlab/prot_bert')

# Function to get embeddings for a protein sequence
def get_protbert_embeddings(sequence):
    # Ensure the sequence is properly formatted (uppercase amino acids with no spaces)
    sequence = " ".join(list(sequence))  # Insert spaces between each amino acid
    inputs = tokenizer(sequence, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling to get fixed-size embeddings for the whole sequence
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Load your CSV file that contains the amino acid sequences
# Assume the column containing sequences is named 'Sequence'
data = pd.read_csv('DD-train.csv')  # Replace with your actual file path

# Prepare to store embeddings
embeddings = []

# Iterate over all sequences and generate embeddings
for seq in data[' Sequence']:
    embedding = get_protbert_embeddings(seq)
    embeddings.append(embedding[0])  # Append the first (and only) embedding

# Convert the embeddings list to a DataFrame
embeddings_df = pd.DataFrame(embeddings)

# Save the embeddings to a CSV file
embeddings_df.to_csv('DD-train_embedding.csv', index=False)  # Replace with desired output path

print(f"Protein embeddings saved to 'protein_embeddings.csv'.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Protein embeddings saved to 'protein_embeddings.csv'.


In [None]:
pip install xgboost




In [None]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

# Load your protein dataset as a pandas DataFrame
data = pd.read_csv('DD_dataset.csv')  # Replace with your actual dataset

# Prepare features and labels
X = data.iloc[:, :-1].values   # Features (all columns except the last)
y = data.iloc[:, -1].values     # Labels (last column)

# Check for NaN and infinite values
print("NaN values in features:", np.isnan(X).any())
print("NaN values in labels:", np.isnan(y).any())
print("Infinite values in features:", np.isinf(X).any())
print("Infinite values in labels:", np.isinf(y).any())

# Check data types
print("Data types of features:", X.dtype)
print("Data types of labels:", y.dtype)

# Check class distribution
unique, counts = np.unique(y, return_counts=True)
class_distribution = dict(zip(unique, counts))
print("Class distribution:", class_distribution)

# Initialize MLPClassifier
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted', zero_division=1),
    'recall': make_scorer(recall_score, average='weighted', zero_division=1),
    'f1_score': make_scorer(f1_score, average='weighted', zero_division=1)
}

# Perform cross-validation
cv_results = cross_validate(mlp_clf, X, y, cv=cv, scoring=scoring, return_train_score=False)

# Extract the scores for each metric
accuracy_scores = cv_results['test_accuracy']
precision_scores = cv_results['test_precision']
recall_scores = cv_results['test_recall']
f1_scores = cv_results['test_f1_score']

# Print out the cross-validation results
print("Cross-validation Accuracy Scores:", accuracy_scores)
print("Mean Accuracy:", np.mean(accuracy_scores))
print("Cross-validation Precision Scores:", precision_scores)
print("Mean Precision:", np.mean(precision_scores))
print("Cross-validation Recall Scores:", recall_scores)
print("Mean Recall:", np.mean(recall_scores))
print("Cross-validation F1 Scores:", f1_scores)
print("Mean F1 Score:", np.mean(f1_scores))

# Train the model on the full dataset
mlp_clf.fit(X, y)

# Make predictions (for demonstration, you'd typically do this on a separate test set)
# predictions = mlp_clf.predict(X_test)  # Assuming you have a separate test set


NaN values in features: False
NaN values in labels: False
Infinite values in features: False
Infinite values in labels: False
Data types of features: float64
Data types of labels: int64
Class distribution: {1: 19, 2: 16, 3: 32, 4: 15, 5: 18, 6: 15, 7: 74, 8: 21, 9: 29, 10: 13, 11: 16, 12: 32, 13: 12, 14: 13, 15: 16, 16: 77, 17: 23, 18: 24, 19: 40, 20: 22, 21: 17, 22: 22, 23: 18, 24: 15, 25: 15, 26: 40, 27: 40}




Cross-validation Accuracy Scores: [0.75539568 0.76258993 0.79136691 0.74820144 0.77536232]
Mean Accuracy: 0.7665832551350223
Cross-validation Precision Scores: [0.76338929 0.79180687 0.81896581 0.76542766 0.78263301]
Mean Precision: 0.7844445268178456
Cross-validation Recall Scores: [0.75539568 0.76258993 0.79136691 0.74820144 0.77536232]
Mean Recall: 0.7665832551350223
Cross-validation F1 Scores: [0.7472697  0.76093601 0.78708495 0.7401044  0.77028295]
Mean F1 Score: 0.7611356015794843


In [None]:
!pip install tensorflow




In [None]:
!pip install --upgrade tensorflow




In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier  # Import KerasClassifier correctly

# Sample data generation (replace with your dataset)
X = np.random.rand(1000, 20)  # 1000 samples, 20 features
y = np.random.randint(2, size=1000)  # Binary labels

# Function to create the Keras model
def create_model():
    model = Sequential()
    model.add(Dense(64, activation='relu', input_dim=X.shape[1]))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap the model with KerasClassifier
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=32, verbose=0)

# Cross-validation
cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

# Print cross-validation results
print(f'Cross-validation Accuracy: {cross_val_scores.mean()}')

# Fit the model on the whole dataset after cross-validation
model.fit(X, y)

# Evaluate the model on the test set
y_pred = (model.predict(X) > 0.5).astype("int32")

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

# Print evaluation results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


ModuleNotFoundError: No module named 'keras.wrappers'

In [None]:
pip install scikeras[tensorflow]


Collecting scikeras[tensorflow]
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scikeras.wrappers import KerasClassifier  # Import KerasClassifier from SciKeras
from tensorflow import keras
from tensorflow.keras import layers

# Sample data generation (replace with your dataset)
X = np.random.rand(1000, 20)  # 1000 samples, 20 features
y = np.random.randint(2, size=1000)  # Binary labels

# Function to create the Keras model
def create_model():
    model = keras.Sequential()
    model.add(layers.Dense(64, activation='relu', input_dim=X.shape[1]))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap the model with KerasClassifier
model = KerasClassifier(model=create_model, epochs=100, batch_size=32, verbose=0)

# Cross-validation
cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

# Print cross-validation results
print(f'Cross-validation Accuracy: {cross_val_scores.mean()}')

# Fit the model on the whole dataset after cross-validation
model.fit(X, y)

# Evaluate the model on the test set
y_pred = (model.predict(X) > 0.5).astype("int32")

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

# Print evaluation results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Cross-validation Accuracy: 0.485


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Accuracy: 0.706
Precision: 0.7480314960629921
Recall: 0.5900621118012422
F1 Score: 0.6597222222222222


In [None]:
import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Sample data generation (replace with your dataset)
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the XGBoost model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Cross-validation
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

# Print cross-validation results
print(f'Cross-validation Accuracy: {cross_val_scores.mean()}')

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-validation Accuracy: 0.9


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.895
Precision: 0.9479166666666666
Recall: 0.8504672897196262
F1 Score: 0.896551724137931


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset from a CSV file
# Replace 'your_file.csv' with the path to your dataset file
data = pd.read_csv('EDD_embedding.csv')

# Assume the last column is the target variable, adjust accordingly
X = data.iloc[:, :-1]  # Features (all columns except the last one)
y = data.iloc[:, -1]   # Target (last column)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Decision Tree model
model = DecisionTreeClassifier(random_state=42)

# Cross-validation
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

# Print cross-validation results
print(f'Cross-validation Accuracy: {cross_val_scores.mean()}')

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics with multiclass averaging
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')  # Change to 'macro', 'micro', or 'weighted'
recall = recall_score(y_test, y_pred, average='macro')        # Change to 'macro', 'micro', or 'weighted'
f1 = f1_score(y_test, y_pred, average='macro')                # Change to 'macro', 'micro', or 'weighted'

# Print evaluation results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Cross-validation Accuracy: 0.3793097213572533
Accuracy: 0.40350877192982454
Precision: 0.32037205196358165
Recall: 0.34742916062217466
F1 Score: 0.32583750807709105


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset from a CSV file
# Replace 'your_file.csv' with the path to your dataset file
data = pd.read_csv('EDD_embedding.csv')

# Assume the last column is the target variable, adjust accordingly
X = data.iloc[:, :-1]  # Features (all columns except the last one)
y = data.iloc[:, -1]   # Target (last column)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Naive Bayes model
model = GaussianNB()

# Cross-validation
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

# Print cross-validation results
print(f'Cross-validation Accuracy: {cross_val_scores.mean()}')

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics with multiclass averaging
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')  # Change to 'macro', 'micro', or 'weighted'
recall = recall_score(y_test, y_pred, average='macro')        # Change to 'macro', 'micro', or 'weighted'
f1 = f1_score(y_test, y_pred, average='macro')                # Change to 'macro', 'micro', or 'weighted'

# Print evaluation results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Cross-validation Accuracy: 0.47804139796827183
Accuracy: 0.5160818713450293
Precision: 0.5302810315655803
Recall: 0.5780192302901368
F1 Score: 0.5083481942353422


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset from a CSV file
# Replace 'your_file.csv' with the path to your dataset file
data = pd.read_csv('EDD_embedding.csv')

# Assume the last column is the target variable, adjust accordingly
X = data.iloc[:, :-1]  # Features (all columns except the last one)
y = data.iloc[:, -1]   # Target (last column)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross-validation
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

# Print cross-validation results
print(f'Cross-validation Accuracy: {cross_val_scores.mean()}')

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics with multiclass averaging
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')  # Change to 'macro', 'micro', or 'weighted'
recall = recall_score(y_test, y_pred, average='macro')        # Change to 'macro', 'micro', or 'weighted'
f1 = f1_score(y_test, y_pred, average='macro')                # Change to 'macro', 'micro', or 'weighted'

# Print evaluation results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Cross-validation Accuracy: 0.6426482110211544
Accuracy: 0.6578947368421053
Precision: 0.726896146994161
Recall: 0.5351877141732597
F1 Score: 0.5691038446993678


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset from a CSV file
# Replace 'your_file.csv' with the path to your dataset file
data = pd.read_csv('EDD_embedding.csv')

# Assume the last column is the target variable, adjust accordingly
X = data.iloc[:, :-1]  # Features (all columns except the last one)
y = data.iloc[:, -1]   # Target (last column)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the KNN model
k = 5  # You can change this value to find the optimal number of neighbors
model = KNeighborsClassifier(n_neighbors=k)

# Cross-validation
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

# Print cross-validation results
print(f'Cross-validation Accuracy: {cross_val_scores.mean()}')

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics with multiclass averaging
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')  # Change to 'macro', 'micro', or 'weighted'
recall = recall_score(y_test, y_pred, average='macro')        # Change to 'macro', 'micro', or 'weighted'
f1 = f1_score(y_test, y_pred, average='macro')                # Change to 'macro', 'micro', or 'weighted'

# Print evaluation results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Cross-validation Accuracy: 0.5720634027763827
Accuracy: 0.6111111111111112
Precision: 0.6036642006219928
Recall: 0.5728219016186312
F1 Score: 0.5598676249849365


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset from a CSV file
# Replace 'your_file.csv' with the path to your dataset file
data = pd.read_csv('EDD_embedding.csv')

# Assume the last column is the target variable, adjust accordingly
X = data.iloc[:, :-1]  # Features (all columns except the last one)
y = data.iloc[:, -1]   # Target (last column)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the SVM model
model = SVC(kernel='linear')  # You can change the kernel to 'rbf', 'poly', etc.

# Cross-validation
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

# Print cross-validation results
print(f'Cross-validation Accuracy: {cross_val_scores.mean()}')

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics with multiclass averaging
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')  # Change to 'macro', 'micro', or 'weighted'
recall = recall_score(y_test, y_pred, average='macro')        # Change to 'macro', 'micro', or 'weighted'
f1 = f1_score(y_test, y_pred, average='macro')                # Change to 'macro', 'micro', or 'weighted'

# Print evaluation results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Cross-validation Accuracy: 0.8430814767195024
Accuracy: 0.8611111111111112
Precision: 0.8938246576080972
Recall: 0.8361925384057578
F1 Score: 0.8541117604316433


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset from a CSV file
# Replace 'your_file.csv' with the path to your dataset file
data = pd.read_csv('EDD_embedding.csv')

# Assume the last column is the target variable, adjust accordingly
X = data.iloc[:, :-1]  # Features (all columns except the last one)
y = data.iloc[:, -1]   # Target (last column)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the MLP model
model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)

# Cross-validation
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

# Print cross-validation results
print(f'Cross-validation Accuracy: {cross_val_scores.mean()}')

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics with multiclass averaging
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')  # Change to 'macro', 'micro', or 'weighted'
recall = recall_score(y_test, y_pred, average='macro')        # Change to 'macro', 'micro', or 'weighted'
f1 = f1_score(y_test, y_pred, average='macro')                # Change to 'macro', 'micro', or 'weighted'

# Print evaluation results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')




Cross-validation Accuracy: 0.882584326094381
Accuracy: 0.8874269005847953
Precision: 0.8990241688970362
Recall: 0.8912529954510445
F1 Score: 0.8903117725748092




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset from a CSV file
# Replace 'your_file.csv' with the path to your dataset file
data = pd.read_csv('EDD_embedding.csv')

# Assume the last column is the target variable, adjust accordingly
X = data.iloc[:, :-1]  # Features (all columns except the last one)
y = data.iloc[:, -1]   # Target (last column)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Logistic Regression model
model = LogisticRegression(max_iter=300, multi_class='multinomial', solver='lbfgs')

# Cross-validation
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

# Print cross-validation results
print(f'Cross-validation Accuracy: {cross_val_scores.mean()}')

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics with multiclass averaging
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')  # Change to 'macro', 'micro', or 'weighted'
recall = recall_score(y_test, y_pred, average='macro')        # Change to 'macro', 'micro', or 'weighted'
f1 = f1_score(y_test, y_pred, average='macro')                # Change to 'macro', 'micro', or 'weighted'

# Print evaluation results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Cross-validation Accuracy: 0.7732312781672928




Accuracy: 0.7894736842105263
Precision: 0.810127974118224
Recall: 0.7082046912263358
F1 Score: 0.7404078743621137


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
pip install scikeras[tensorflow]

Collecting scikeras[tensorflow]
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scikeras.wrappers import KerasClassifier
from tensorflow import keras
from tensorflow.keras import layers

# Load dataset from a CSV file
# Replace 'your_file.csv' with the path to your dataset file
data = pd.read_csv('EDD_embedding.csv')

# Assume the last column is the target variable, adjust accordingly
X = data.iloc[:, :-1].values  # Features (all columns except the last one)
y = data.iloc[:, -1].values    # Target (last column)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to create the ANN model
def create_model():
    model = keras.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(len(np.unique(y)), activation='softmax'))  # Output layer for multiclass classification
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Create KerasClassifier
model = KerasClassifier(model=create_model, epochs=100, batch_size=10, verbose=0)

# Cross-validation
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

# Print cross-validation results
print(f'Cross-validation Accuracy: {cross_val_scores.mean()}')

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics with multiclass averaging
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')  # Change to 'macro', 'micro', or 'weighted'
recall = recall_score(y_test, y_pred, average='macro')        # Change to 'macro', 'micro', or 'weighted'
f1 = f1_score(y_test, y_pred, average='macro')                # Change to 'macro', 'micro', or 'weighted'

# Print evaluation results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Cross-validation Accuracy: 0.8653963343177237


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Accuracy: 0.8669590643274854
Precision: 0.8734376218447966
Recall: 0.8759274835880669
F1 Score: 0.8693612887808918


In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset from a CSV file
# Replace 'your_dataset.csv' with the path to your dataset file
data = pd.read_csv('EDD_embedding.csv')

# Assuming the last column is the target variable and the rest are features
X = data.iloc[:, :-1].values  # Features (all columns except the last one)
y = data.iloc[:, -1].values    # Target (last column)

# Adjust the class labels to start from 0 if they start from 1
y = y - 1  # Adjust class labels

# Print unique classes for debugging
print("Unique classes in target variable:", np.unique(y))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the XGBoost model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Cross-validation
cross_val_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

# Print cross-validation results
print(f'Cross-validation Accuracy: {cross_val_scores.mean()}')

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')  # Use 'macro' for multiclass
recall = recall_score(y_test, y_pred, average='macro')        # Use 'macro' for multiclass
f1 = f1_score(y_test, y_pred, average='macro')                # Use 'macro' for multiclass

# Print evaluation results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Unique classes in target variable: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26]


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-validation Accuracy: 0.7414006468851077


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.7573099415204678
Precision: 0.7680466288184599
Recall: 0.7187948733467537
F1 Score: 0.7222358747011022
