In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns  # Optional: for heatmap styling
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the raw accelerometer recording
data = pd.read_csv("data.csv")

# Rows per window
window_size = 15  # Approximately 15 data points for 0.5 seconds

# One feature dict and one label are collected per window
windowed_features = []
windowed_labels = []

# Walk the recording in consecutive, non-overlapping chunks of `window_size` rows.
# The upper bound of the range already excludes a trailing partial chunk.
for start in range(0, len(data) - window_size + 1, window_size):
    window = data.iloc[start:start + window_size]

    # Statistic-major iteration keeps the column order
    # mean_* , std_* , skew_* , kurt_*  (each over x, y, z).
    windowed_features.append({
        f'{stat}_acc_{axis}': getattr(window[f'acc_{axis}'], stat)()
        for stat in ('mean', 'std', 'skew', 'kurt')
        for axis in ('x', 'y', 'z')
    })

    # The window inherits the behaviour of its first row
    # (assumes the label is constant inside a window).
    windowed_labels.append(window['behavior'].iloc[0])

# Feature table (one row per window) and matching label vector
X_windowed = pd.DataFrame(windowed_features)
y_windowed = np.array(windowed_labels)

# Hold out the test windows FIRST. Fitting the imputer/scaler or running SMOTE on the
# full data set lets test information leak into training (synthetic samples are
# interpolated from real neighbours) and puts synthetic samples into the test set.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_windowed, y_windowed, test_size=0.2, stratify=y_windowed, random_state=42
)

# Handle missing values (column means learned from the training windows only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scale the input features (mean/std learned from the training windows only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# The test windows are only transformed with the statistics fitted above
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE on the training portion only.
# SMOTE needs k_neighbors < size of the smallest class, so cap it accordingly.
smallest_class = int(np.unique(y_train, return_counts=True)[1].min())
smote = SMOTE(random_state=42, k_neighbors=max(1, min(5, smallest_class - 1)))
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train)

# Train on the balanced training set; X_test / y_test keep the natural class distribution
X_train, y_train = X_resampled, y_resampled

# Initialize a list to store accuracy results
accuracies = []


def gini_score(y_true, y_pred_proba):
    """Return the Gini coefficient ``2 * AUC - 1`` from predicted probabilities.

    :param y_true: True labels.
    :param y_pred_proba: Output of ``predict_proba`` (one column per class), or a
        1-D score for the class with the greater label in a binary problem.
    :return: Gini coefficient derived from the (weighted one-vs-rest) ROC AUC.
    """
    proba = np.asarray(y_pred_proba)
    if proba.ndim == 2 and proba.shape[1] == 2:
        # Binary problem: roc_auc_score expects a 1-D score for classes_[1]
        auc = roc_auc_score(y_true, proba[:, 1])
    elif proba.ndim == 1:
        auc = roc_auc_score(y_true, proba)
    else:
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='weighted')
    return 2 * auc - 1


# Number of train/evaluate repetitions. The split is fixed, so only the forest's
# seed varies between repetitions (iteration 0 uses seed 42).
n_iterations = 1

for i in range(n_iterations):
    # Instantiate Random Forest classifier with random subspace
    rf_classifier = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42 + i)

    # Train the Random Forest classifier
    rf_classifier.fit(X_train, y_train)

    # Hard predictions for the label metrics, probabilities for the Gini score
    y_pred_rf = rf_classifier.predict(X_test)
    y_pred_proba_rf = rf_classifier.predict_proba(X_test)

    # Evaluate the Random Forest model's performance
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    f1 = f1_score(y_test, y_pred_rf, average='weighted')
    precision = precision_score(y_test, y_pred_rf, average='weighted')
    recall = recall_score(y_test, y_pred_rf, average='weighted')
    gini = gini_score(y_test, y_pred_proba_rf)

    accuracies.append(accuracy_rf)
    print(f"Accuracy on test data (Random Forest with 0.5-second window size) for iteration {i + 1}: {accuracy_rf * 100:.2f}%")
    print(f"F1 Score: {f1:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Gini Score: {gini:.2f}")

# Report the average over the repetitions that actually ran
print(f"Average Accuracy over {len(accuracies)} iteration(s): {np.mean(accuracies) * 100:.2f}%")

# Confusion matrix (raw counts) of the most recent predictions
conf_matrix = confusion_matrix(y_test, y_pred_rf)
class_names = np.unique(y_test)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

# Sorted distinct behaviours and, for every window, the position of its behaviour in that list
unique_labels, label_encoding = np.unique(y_windowed, return_inverse=True)

# List each behaviour next to its position in the sorted label list
for code, behavior in enumerate(unique_labels):
    print(f"Encoded label {code} corresponds to behavior: {behavior}")


In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Confusion matrix of the most recent predictions
conf_matrix = confusion_matrix(y_test, y_pred_rf)

# Express every row as percentages of that row's total (share of each true class)
row_totals = conf_matrix.sum(axis=1)[:, np.newaxis]
conf_matrix_percentage = conf_matrix.astype('float') / row_totals * 100

# Draw the row-normalised matrix
class_names = np.unique(y_test)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix_percentage, annot=True, fmt='.2f', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix (in Percentages)')
plt.show()

# Sorted distinct behaviours and the per-window index into that list
unique_labels, label_encoding = np.unique(y_windowed, return_inverse=True)

# List each behaviour next to its position in the sorted label list
for code, behavior in enumerate(unique_labels):
    print(f"Encoded label {code} corresponds to behavior: {behavior}")

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import numpy as np

# Function to calculate Gini score from predicted probabilities
def gini_score(y_true, y_pred_proba):
    """
    Gini score calculation based on predicted probabilities for binary classification.

    The score is ``2 * AUC - 1``; the AUC is obtained from the rank sum of the
    positive samples (tied scores share their average rank).

    :param y_true: True labels; must contain exactly two distinct values. The
        greater of the two is treated as the positive class.
    :param y_pred_proba: Predicted probabilities for the positive class (1-D)
    :return: Gini coefficient score in [-1, 1]
    :raises ValueError: if the inputs differ in length, the scores are not 1-D,
        or ``y_true`` does not contain exactly two classes
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_pred_proba, dtype=float)
    if scores.ndim != 1 or labels.shape != scores.shape:
        raise ValueError("y_true and y_pred_proba must be 1-D arrays of equal length")

    classes = np.unique(labels)
    if classes.size != 2:
        raise ValueError("gini_score is only defined for exactly two classes in y_true")

    is_positive = labels == classes[1]
    n_pos = int(is_positive.sum())
    n_neg = labels.size - n_pos

    # Average ranks (1-based): every group of tied scores gets the midpoint of its rank range
    _, group_of_sample, group_sizes = np.unique(scores, return_inverse=True, return_counts=True)
    highest_rank = np.cumsum(group_sizes)
    mid_rank = highest_rank - (group_sizes - 1) / 2.0
    ranks = mid_rank[group_of_sample]

    # Mann-Whitney U statistic of the positives, normalised to the ROC AUC
    auc = (ranks[is_positive].sum() - n_pos * (n_pos + 1) / 2.0) / (n_pos * n_neg)
    return 2.0 * auc - 1.0


# Assuming y_pred_proba is the predicted probabilities from RandomForest
# Example usage (predicting probabilities instead of hard predictions):
rf_classifier = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)
rf_classifier.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of rf_classifier.classes_[1]; it is
# "the positive class" only when the problem has exactly two classes.
y_pred_proba_rf = rf_classifier.predict_proba(X_test)[:, 1]
y_pred_rf = rf_classifier.predict(X_test)

# The binary Gini helper is meaningful only for a two-class problem; for a
# multiclass model a single probability column cannot be scored against y_test.
if len(rf_classifier.classes_) == 2:
    gini = gini_score(y_test, y_pred_proba_rf)
else:
    gini = None

# Calculate other metrics
accuracy_rf = accuracy_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf, average='weighted')
precision = precision_score(y_test, y_pred_rf, average='weighted')
recall = recall_score(y_test, y_pred_rf, average='weighted')

print(f"Accuracy: {accuracy_rf * 100:.2f}%")


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Predict the held-out windows with the current classifier
y_pred_rf = rf_classifier.predict(X_test)

# Confusion matrix of counts
cm = confusion_matrix(y_test, y_pred_rf)

# Row-wise percentages: every row is divided by the number of true samples of that class
true_class_totals = cm.sum(axis=1)[:, np.newaxis]
cm_normalized = cm.astype('float') / true_class_totals * 100

# Draw the matrix; without explicit tick labels the axes show class positions 0..n-1
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm_normalized, annot=True, fmt=".2f", cmap="Blues", cbar=False, ax=ax)
ax.set_title("Confusion Matrix (in %)")
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
plt.show()

# Sorted distinct behaviours and the per-window index into that list
unique_labels, label_encoding = np.unique(y_windowed, return_inverse=True)

# Legend for the positional axis ticks above
for code, behavior in enumerate(unique_labels):
    print(f"Encoded label {code} corresponds to behavior: {behavior}")


In [None]:
from sklearn.metrics import roc_auc_score

# Gini score calculation using AUC-ROC
def gini_score(y_true, y_pred_proba):
    """Return the Gini coefficient ``2 * AUC - 1``.

    :param y_true: True labels.
    :param y_pred_proba: ``predict_proba`` output (one column per class) or a 1-D
        score for the class with the greater label in a binary problem.
    :return: Gini coefficient derived from the ROC AUC (weighted one-vs-rest for
        more than two classes).
    """
    proba = np.asarray(y_pred_proba)
    if proba.ndim == 2 and proba.shape[1] == 2:
        # Binary problem: roc_auc_score rejects a 2-column matrix and needs the
        # probability of classes_[1] as a 1-D array.
        auc = roc_auc_score(y_true, proba[:, 1])
    elif proba.ndim == 1:
        auc = roc_auc_score(y_true, proba)
    else:
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='weighted')
    # Gini coefficient is derived from AUC
    return 2 * auc - 1

# Start from an empty list: appending to the list left over from an earlier cell would
# mix experiments and make the average below cover more than this cell's runs.
accuracies = []

# Perform 10 iterations. The split is fixed, so the forest's seed is varied per
# iteration; with a constant seed all ten fits would be bit-for-bit identical.
n_iterations = 10
for i in range(n_iterations):
    # Instantiate Random Forest classifier with random subspace
    rf_classifier = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42 + i)

    # Train the Random Forest classifier
    rf_classifier.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred_rf = rf_classifier.predict(X_test)
    y_pred_proba_rf = rf_classifier.predict_proba(X_test)  # Get predicted probabilities

    # Evaluate the Random Forest model's performance
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    f1 = f1_score(y_test, y_pred_rf, average='weighted')
    precision = precision_score(y_test, y_pred_rf, average='weighted')
    recall = recall_score(y_test, y_pred_rf, average='weighted')

    # Calculate Gini score using predicted probabilities
    gini = gini_score(y_test, y_pred_proba_rf)

    accuracies.append(accuracy_rf)
    print(f"Accuracy on test data (Random Forest with 0.5-second window size) for iteration {i + 1}: {accuracy_rf * 100:.2f}%")
    print(f"F1 Score: {f1:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Gini Score: {gini:.2f}")

# Print average accuracy over all iterations
print(f"Average Accuracy over {len(accuracies)} iterations: {np.mean(accuracies) * 100:.2f}%")

# Confusion matrix (counts) for the predictions of the last iteration
conf_matrix = confusion_matrix(y_test, y_pred_rf)
tick_names = np.unique(y_test)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_names, yticklabels=tick_names, ax=ax)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
plt.show()

In [None]:
# Count matrix of true vs. predicted behaviour for the latest predictions
conf_matrix = confusion_matrix(y_test, y_pred_rf)

behaviours = np.unique(y_test)
heatmap_options = dict(annot=True, fmt='d', cmap='Blues',
                       xticklabels=behaviours, yticklabels=behaviours)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, ax=ax, **heatmap_options)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Confusion matrix of the latest predictions, shown as row percentages
conf_matrix = confusion_matrix(y_test, y_pred_rf)

# Divide each row by its total so a row sums to 100
per_class_totals = conf_matrix.sum(axis=1)[:, np.newaxis]
conf_matrix_percent = conf_matrix.astype('float') / per_class_totals * 100

behaviours = np.unique(y_test)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix_percent, annot=True, fmt='.2f', cmap='Blues',
            xticklabels=behaviours, yticklabels=behaviours, ax=ax)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix (Percentages)')
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns  # Optional: for heatmap styling

# Define file names
file_names = ["cow1.csv", "cow2.csv", "cow3.csv", "cow4.csv", "cow5.csv", "cow6.csv"]

# Read every recording into its own DataFrame
all_data = [pd.read_csv(file_name) for file_name in file_names]

# Concatenate all the data into one DataFrame (kept for inspection)
combined_data = pd.concat(all_data, ignore_index=True)

# Define window size
window_size = 15  # Approximately 15 data points for 0.5 seconds

# Initialize lists to store windowed features and labels
windowed_features = []
windowed_labels = []

# Segment each recording separately. Cutting windows from the concatenated frame
# would let a window start in one file and end in the next, mixing two recordings
# in one set of statistics.
for recording in all_data:
    for start in range(0, len(recording) - window_size + 1, window_size):
        window = recording.iloc[start:start + window_size]

        # mean / std / skew / kurt of AccX, AccY, AccZ, in statistic-major column order
        windowed_features.append({
            f'{stat}_acc_{axis}': getattr(window[f'Acc{axis.upper()}'], stat)()
            for stat in ('mean', 'std', 'skew', 'kurt')
            for axis in ('x', 'y', 'z')
        })

        # Assign label to the window (assuming it's the same for all samples within the window)
        windowed_labels.append(window['Label'].iloc[0])

# Convert lists to DataFrame
X_windowed = pd.DataFrame(windowed_features)
y_windowed = np.array(windowed_labels)

# Hold out the test windows FIRST so that neither the preprocessing statistics nor
# the SMOTE-synthesised samples are derived from windows used for evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_windowed, y_windowed, test_size=0.2, stratify=y_windowed, random_state=42
)

# Handle missing values (column means learned from the training windows only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scale the input features (mean/std learned from the training windows only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# The test windows are only transformed with the statistics fitted above
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE with a reduced number of neighbors, applied to
# the training portion only. k_neighbors must stay below the smallest class size.
smallest_class = int(np.unique(y_train, return_counts=True)[1].min())
smote = SMOTE(random_state=42, k_neighbors=max(1, min(2, smallest_class - 1)))
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train)

# Train on the balanced training set; X_test / y_test keep the natural class distribution
X_train, y_train = X_resampled, y_resampled

# Initialize a list to store accuracy results
accuracies = []

# Gini score calculation using AUC-ROC
def gini_score(y_true, y_pred_proba):
    """Return the Gini coefficient ``2 * AUC - 1``.

    :param y_true: True labels.
    :param y_pred_proba: ``predict_proba`` output (one column per class) or a 1-D
        score for the class with the greater label in a binary problem.
    :return: Gini coefficient derived from the ROC AUC (weighted one-vs-rest for
        more than two classes).
    """
    proba = np.asarray(y_pred_proba)
    if proba.ndim == 2 and proba.shape[1] == 2:
        # Binary problem: roc_auc_score rejects a 2-column matrix and needs the
        # probability of classes_[1] as a 1-D array.
        auc = roc_auc_score(y_true, proba[:, 1])
    elif proba.ndim == 1:
        auc = roc_auc_score(y_true, proba)
    else:
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='weighted')
    # Gini coefficient is derived from AUC
    return 2 * auc - 1

# Single train/evaluate pass (the loop form is kept so the count can be raised)
for i in range(1):
    # Fit a 100-tree forest that considers sqrt(n_features) candidates per split
    rf_classifier = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Hard labels for the label-based metrics, class probabilities for the Gini score
    y_pred_rf = rf_classifier.predict(X_test)
    y_pred_proba_rf = rf_classifier.predict_proba(X_test)

    # Accuracy plus support-weighted F1 / precision / recall
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    f1, precision, recall = (
        scorer(y_test, y_pred_rf, average='weighted')
        for scorer in (f1_score, precision_score, recall_score)
    )

    # Gini coefficient from the predicted probabilities
    gini = gini_score(y_test, y_pred_proba_rf)

    accuracies.append(accuracy_rf)
    print("\n".join([
        f"Accuracy on test data (Random Forest with 0.5-second window size) for iteration {i + 1}: {accuracy_rf * 100:.2f}%",
        f"F1 Score: {f1:.2f}",
        f"Precision: {precision:.2f}",
        f"Recall: {recall:.2f}",
        f"Gini Score: {gini:.2f}",
    ]))

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Confusion matrix of the latest predictions
conf_matrix = confusion_matrix(y_test, y_pred_rf)

# Convert counts to the percentage of each true class (rows sum to 100)
samples_per_true_class = conf_matrix.sum(axis=1)[:, np.newaxis]
conf_matrix_percentage = conf_matrix.astype('float') / samples_per_true_class * 100

# Heatmap of the row-normalised matrix
behaviours = np.unique(y_test)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix_percentage, annot=True, fmt='.2f', cmap='Blues',
            xticklabels=behaviours, yticklabels=behaviours, ax=ax)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix (in Percentages)')
plt.show()

# Sorted distinct behaviours and the per-window index into that list
unique_labels, label_encoding = np.unique(y_windowed, return_inverse=True)

# List each behaviour next to its position in the sorted label list
for position, behaviour in enumerate(unique_labels):
    print(f"Encoded label {position} corresponds to behavior: {behaviour}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Define column labels (adjust as per your dataset)
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load all data files
data_frames = [pd.read_csv(file_name) for file_name in data_files]

# Concatenate all data frames into a single dataframe
data = pd.concat(data_frames, ignore_index=True)

# Assuming 'date' is the column containing timestamps
data['date'] = pd.to_datetime(data['date'])  # Convert to datetime format

# Time step between consecutive rows of the concatenated frame
time_diff = data['date'].diff().dropna()

# Calculate frequency (assuming uniform sampling). The mean step can be zero,
# negative (timestamps restart at each file boundary) or NaN (fewer than two rows);
# in those cases no rate can be derived and the window falls back to one row.
mean_time_diff = time_diff.mean()
mean_seconds = mean_time_diff.total_seconds()
frequency = 1 / mean_seconds if mean_seconds > 0 else 0.0  # Convert to Hz

print("Sampling frequency for combined data:", frequency, "Hz")

# Calculate window size based on frequency
window_duration = 0.5  # Seconds
window_size = int(frequency * window_duration)

if window_size == 0:
    print("Window size is zero, adjusting to 1")
    window_size = 1

# Only numeric, non-target columns are averaged. Averaging the whole window would
# turn a numeric 'label' into an input feature (target leakage) and fails on
# non-numeric columns in pandas >= 2.
feature_columns = [
    column for column in data.select_dtypes(include='number').columns
    if column != 'label'
]

# Extract windowed features and labels
windowed_features = []
windowed_labels = []

for start in range(0, len(data) - window_size + 1, window_size):
    window = data.iloc[start:start + window_size]
    # Compute mean values as features for this window
    windowed_features.append(window[feature_columns].mean())
    # The window takes the label of its first row
    windowed_labels.append(window['label'].iloc[0])

# Convert lists to DataFrame ('date' is not numeric, so it is already excluded)
X = pd.DataFrame(windowed_features)
y = np.array(windowed_labels)

# Hold out the test rows FIRST so that neither the preprocessing statistics nor the
# SMOTE-synthesised samples are derived from rows used for evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Impute missing values (column means learned from the training rows only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scale the input features (mean/std learned from the training rows only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# The test rows are only transformed with the statistics fitted above
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE on the training portion only.
# SMOTE needs k_neighbors < size of the smallest class, so cap it accordingly.
smallest_class = int(np.unique(y_train, return_counts=True)[1].min())
smote = SMOTE(random_state=42, k_neighbors=max(1, min(5, smallest_class - 1)))
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train)

# Train on the balanced training set; X_test / y_test keep the natural class distribution
X_train, y_train = X_resampled, y_resampled

# Build and fit a 100-tree Random Forest in one step (fit returns the estimator)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows
y_pred = rf_classifier.predict(X_test)

# Fraction of correctly classified test rows, reported as a percentage
accuracy = accuracy_score(y_test, y_pred)
print("Test accuracy for combined data:", accuracy * 100, "%")

# Gini score calculation using AUC-ROC
def gini_score(y_true, y_pred_proba):
    """Return the Gini coefficient ``2 * AUC - 1``.

    :param y_true: True labels.
    :param y_pred_proba: ``predict_proba`` output (one column per class) or a 1-D
        score for the class with the greater label in a binary problem.
    :return: Gini coefficient derived from the ROC AUC (weighted one-vs-rest for
        more than two classes).
    """
    proba = np.asarray(y_pred_proba)
    if proba.ndim == 2 and proba.shape[1] == 2:
        # Binary problem: roc_auc_score rejects a 2-column matrix and needs the
        # probability of classes_[1] as a 1-D array.
        auc = roc_auc_score(y_true, proba[:, 1])
    elif proba.ndim == 1:
        auc = roc_auc_score(y_true, proba)
    else:
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='weighted')
    # Gini coefficient is derived from AUC
    return 2 * auc - 1

# This cell never created its own result list; appending to a list left over from an
# earlier experiment would mix unrelated runs, so start from an empty one here.
accuracies = []

# NOTE(review): f1_score, precision_score, recall_score and roc_auc_score are not
# imported in this cell's import block; they must already be in the kernel namespace
# from an earlier cell.

# Perform 1 iteration
for i in range(1):
    # Instantiate Random Forest classifier with random subspace
    rf_classifier = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)

    # Train the Random Forest classifier
    rf_classifier.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred_rf = rf_classifier.predict(X_test)
    y_pred_proba_rf = rf_classifier.predict_proba(X_test)  # Get predicted probabilities

    # Evaluate the Random Forest model's performance
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    f1 = f1_score(y_test, y_pred_rf, average='weighted')
    precision = precision_score(y_test, y_pred_rf, average='weighted')
    recall = recall_score(y_test, y_pred_rf, average='weighted')

    # Calculate Gini score using predicted probabilities
    gini = gini_score(y_test, y_pred_proba_rf)

    accuracies.append(accuracy_rf)
    print(f"Accuracy on test data (Random Forest with 0.5-second window size) for iteration {i + 1}: {accuracy_rf * 100:.2f}%")
    print(f"F1 Score: {f1:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Gini Score: {gini:.2f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw accelerometer recording
data = pd.read_csv("dataset_6.csv")

# Rows per window
window_size = 15  # Approximately 15 data points for 0.5 seconds

# One feature dict and one label are collected per window
windowed_features = []
windowed_labels = []

# Statistics computed per axis, and the axes they are computed on
window_stats = ('mean', 'std', 'skew', 'kurt')
axes = ('x', 'y', 'z')

# Consecutive, non-overlapping chunks; the range bound drops a trailing partial chunk
for start in range(0, len(data) - window_size + 1, window_size):
    window = data.iloc[start:start + window_size]

    # Statistic-major order: mean_* , std_* , skew_* , kurt_*  (each over x, y, z)
    windowed_features.append({
        f'{stat}_acc_{axis}': getattr(window[f'acc_{axis}'], stat)()
        for stat in window_stats
        for axis in axes
    })

    # The window inherits the label of its first row
    # (assumes the label is constant inside a window).
    windowed_labels.append(window['label'].iloc[0])

# Feature table (one row per window) and matching label vector
X_windowed = pd.DataFrame(windowed_features)
y_windowed = np.array(windowed_labels)

# Hold out the test windows FIRST so that neither the preprocessing statistics nor
# the SMOTE-synthesised samples are derived from windows used for evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_windowed, y_windowed, test_size=0.2, stratify=y_windowed, random_state=42
)

# Handle missing values (column means learned from the training windows only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scale the input features (mean/std learned from the training windows only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# The test windows are only transformed with the statistics fitted above
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE on the training portion only.
# SMOTE needs k_neighbors < size of the smallest class, so cap it accordingly.
smallest_class = int(np.unique(y_train, return_counts=True)[1].min())
smote = SMOTE(random_state=42, k_neighbors=max(1, min(5, smallest_class - 1)))
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train)

# Train on the balanced training set; X_test / y_test keep the natural class distribution
X_train, y_train = X_resampled, y_resampled

# Initialize lists to store metrics
f1_scores = []
precisions = []
recalls = []
accuracies = []

# Perform 10 iterations. The train/test split is fixed, so the forest's seed is varied
# per iteration; with a constant seed all ten fits (and therefore the averages below)
# would be identical to a single run.
n_iterations = 10
for i in range(n_iterations):
    # Instantiate Random Forest classifier with random subspace
    rf_classifier = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42 + i)

    # Train the Random Forest classifier
    rf_classifier.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred_rf = rf_classifier.predict(X_test)

    # Evaluate the Random Forest model's performance
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    f1 = f1_score(y_test, y_pred_rf, average='weighted')
    precision = precision_score(y_test, y_pred_rf, average='weighted')
    recall = recall_score(y_test, y_pred_rf, average='weighted')

    accuracies.append(accuracy_rf)
    f1_scores.append(f1)
    precisions.append(precision)
    recalls.append(recall)

    print(f"Iteration {i + 1}:")
    print(f"  Accuracy: {accuracy_rf * 100:.2f}%")
    print(f"  F1 Score: {f1:.2f}")
    print(f"  Precision: {precision:.2f}")
    print(f"  Recall: {recall:.2f}")

# Print average metrics
print(f"\nAverage Metrics over {n_iterations} iterations:")
print(f"  Average Accuracy: {np.mean(accuracies) * 100:.2f}%")
print(f"  Average F1 Score: {np.mean(f1_scores):.2f}")
print(f"  Average Precision: {np.mean(precisions):.2f}")
print(f"  Average Recall: {np.mean(recalls):.2f}")

# Confusion matrix (counts) for the predictions of the last iteration
conf_matrix = confusion_matrix(y_test, y_pred_rf)
behaviours = np.unique(y_test)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=behaviours, yticklabels=behaviours, ax=ax)
ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Confusion matrix of the latest predictions
conf_matrix = confusion_matrix(y_test, y_pred_rf)

# Convert counts to the percentage of each true class (rows sum to 100)
true_class_counts = conf_matrix.sum(axis=1)[:, np.newaxis]
conf_matrix_percentage = conf_matrix.astype('float') / true_class_counts * 100

# Heatmap of the row-normalised matrix
tick_names = np.unique(y_test)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix_percentage, annot=True, fmt='.2f', cmap='Blues',
            xticklabels=tick_names, yticklabels=tick_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix (in Percentages)')
plt.show()

# Sorted distinct behaviours and the per-window index into that list
unique_labels, label_encoding = np.unique(y_windowed, return_inverse=True)

# List each behaviour next to its position in the sorted label list
for position, behaviour in enumerate(unique_labels):
    print(f"Encoded label {position} corresponds to behavior: {behaviour}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns  # Optional: for heatmap styling
# Gini score calculation using AUC-ROC
def gini_score(y_true, y_pred_proba):
    """Return the Gini coefficient ``2 * AUC - 1``.

    :param y_true: True labels.
    :param y_pred_proba: ``predict_proba`` output (one column per class) or a 1-D
        score for the class with the greater label in a binary problem.
    :return: Gini coefficient derived from the ROC AUC (weighted one-vs-rest for
        more than two classes).
    """
    proba = np.asarray(y_pred_proba)
    if proba.ndim == 2 and proba.shape[1] == 2:
        # Binary problem: roc_auc_score rejects a 2-column matrix and needs the
        # probability of classes_[1] as a 1-D array.
        auc = roc_auc_score(y_true, proba[:, 1])
    elif proba.ndim == 1:
        auc = roc_auc_score(y_true, proba)
    else:
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='weighted')
    # Gini coefficient is derived from AUC
    return 2 * auc - 1

# Single train/evaluate pass that reports only the Gini score
for i in range(1):
    # Fit a 100-tree forest that considers sqrt(n_features) candidates per split
    rf_classifier = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Hard labels and class probabilities for the held-out data
    y_pred_rf = rf_classifier.predict(X_test)
    y_pred_proba_rf = rf_classifier.predict_proba(X_test)

    # Label-based metrics are computed (and left in the namespace) but not printed here
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    f1, precision, recall = (
        scorer(y_test, y_pred_rf, average='weighted')
        for scorer in (f1_score, precision_score, recall_score)
    )

    # Gini coefficient from the predicted probabilities
    gini = gini_score(y_test, y_pred_proba_rf)

    print(f"Gini Score: {gini:.2f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import warnings

# Silence all warnings for the rest of the session
warnings.filterwarnings("ignore")

# Read the recording (replace 'CURC.csv' with your actual file path)
data = pd.read_csv('CURC.csv')

# Parse 'Time' as HH:MM:SS and keep only the time-of-day part
data['Time'] = pd.to_datetime(data['Time'], format='%H:%M:%S').dt.time

# Window length; the sample count is taken to be equal to the number of seconds
window_size_seconds = 6  # Adjust this value as needed
window_size_samples = window_size_seconds  # 6 seconds = 6 data points

# One feature dict and one label are collected per window
windowed_features = []
windowed_labels = []

# Short axis name -> accelerometer column in the CSV
axis_columns = {'x': 'X-axis (g)', 'y': 'Y-axis (g)', 'z': 'Z-axis (g)'}

# Consecutive, non-overlapping chunks; the range bound drops a trailing partial chunk
for start in range(0, len(data) - window_size_samples + 1, window_size_samples):
    window = data.iloc[start:start + window_size_samples]

    # Statistic-major order: mean_* , std_* , skew_* , kurt_*  (each over x, y, z)
    windowed_features.append({
        f'{stat}_{axis}': getattr(window[column], stat)()
        for stat in ('mean', 'std', 'skew', 'kurt')
        for axis, column in axis_columns.items()
    })

    # Window label = most frequent locomotion value followed by most frequent feeding value
    locomotion_label = window['IteragreementLocom'].mode().iloc[0]
    feeding_label = window['IteragreementFeeding'].mode().iloc[0]
    windowed_labels.append(f"{locomotion_label}{feeding_label}")

# Feature table (one row per window) and matching label vector
X_windowed = pd.DataFrame(windowed_features)
y_windowed = np.array(windowed_labels)

# Hold out the test windows FIRST so that neither the preprocessing statistics nor
# the SMOTE-synthesised samples are derived from windows used for evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_windowed, y_windowed, test_size=0.2, stratify=y_windowed, random_state=42
)

# Handle missing values (column means learned from the training windows only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scale the input features (mean/std learned from the training windows only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# The test windows are only transformed with the statistics fitted above
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE on the training portion only.
# SMOTE needs k_neighbors < size of the smallest class, so cap it accordingly.
smallest_class = int(np.unique(y_train, return_counts=True)[1].min())
smote = SMOTE(random_state=42, k_neighbors=max(1, min(5, smallest_class - 1)))
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train)

# Train on the balanced training set; X_test / y_test keep the natural class distribution
X_train, y_train = X_resampled, y_resampled

# Initialize lists to store metrics
accuracies = []
f1_scores = []
precisions = []
recalls = []
gini_scores = []

# Gini score calculation using AUC-ROC
def gini_score(y_true, y_pred_proba):
    """Return the Gini coefficient ``2 * AUC - 1``.

    :param y_true: True labels.
    :param y_pred_proba: ``predict_proba`` output (one column per class) or a 1-D
        score for the class with the greater label in a binary problem.
    :return: Gini coefficient derived from the ROC AUC (weighted one-vs-rest for
        more than two classes).
    """
    proba = np.asarray(y_pred_proba)
    if proba.ndim == 1:  # Binary classification, score already 1-D
        auc = roc_auc_score(y_true, proba)
    elif proba.shape[1] == 2:
        # Binary classification with the full predict_proba matrix: roc_auc_score
        # needs the probability of classes_[1] as a 1-D array.
        auc = roc_auc_score(y_true, proba[:, 1])
    else:  # Multi-class classification
        auc = roc_auc_score(y_true, proba, multi_class='ovr', average='weighted')
    return 2 * auc - 1

# Number of independent Random Forest fits to average over
n_iterations = 10

# Perform the iterations. Each fit uses a different seed: with a single fixed
# random_state every iteration would build the exact same forest on the same
# split, so all iterations (and their "average") would be identical.
for i in range(n_iterations):
    # Instantiate Random Forest classifier with random subspace
    rf_classifier = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42 + i)

    # Train the Random Forest classifier
    rf_classifier.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred_rf = rf_classifier.predict(X_test)
    y_pred_proba_rf = rf_classifier.predict_proba(X_test)  # Per-class probabilities

    # Evaluate the Random Forest model's performance (support-weighted averages)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    f1 = f1_score(y_test, y_pred_rf, average='weighted')
    precision = precision_score(y_test, y_pred_rf, average='weighted')
    recall = recall_score(y_test, y_pred_rf, average='weighted')

    # Calculate Gini score using predicted probabilities
    gini = gini_score(y_test, y_pred_proba_rf)

    # Append metrics to lists
    accuracies.append(accuracy_rf)
    f1_scores.append(f1)
    precisions.append(precision)
    recalls.append(recall)
    gini_scores.append(gini)

    print(f"Iteration {i + 1}:")
    print(f"  Accuracy: {accuracy_rf * 100:.2f}%")
    print(f"  F1 Score: {f1:.2f}")
    print(f"  Precision: {precision:.2f}")
    print(f"  Recall: {recall:.2f}")
    print(f"  Gini Score: {gini:.2f}")

# Print average metrics (the last fitted rf_classifier stays available afterwards)
print(f"\nAverage Metrics over {n_iterations} iterations:")
print(f"  Average Accuracy: {np.mean(accuracies) * 100:.2f}%")
print(f"  Average F1 Score: {np.mean(f1_scores):.2f}")
print(f"  Average Precision: {np.mean(precisions):.2f}")
print(f"  Average Recall: {np.mean(recalls):.2f}")
print(f"  Average Gini Score: {np.mean(gini_scores):.2f}")


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Predictions of the most recently fitted forest on the held-out split
y_pred_rf = rf_classifier.predict(X_test)

# Count matrix: one row per true class, one column per predicted class
cm = confusion_matrix(y_test, y_pred_rf)

# Turn every row into percentages of that row's total
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm_normalized = cm.astype('float') / row_totals * 100

# Draw the percentages as an annotated heatmap
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(cm_normalized, annot=True, fmt=".2f", cmap="Blues", cbar=False)
heatmap_ax.set_title("Confusion Matrix (in %)")
heatmap_ax.set_xlabel("Predicted Labels")
heatmap_ax.set_ylabel("True Labels")
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np  # Make sure to import numpy as well

# Predictions of the most recently fitted forest on the held-out split
y_pred_rf = rf_classifier.predict(X_test)

# Count matrix: one row per true class, one column per predicted class
cm = confusion_matrix(y_test, y_pred_rf)

# Turn every row into percentages of that row's total
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm_normalized = cm.astype('float') / row_totals * 100

# Annotated heatmap with enlarged cell, axis-label and tick fonts
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    cm_normalized,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    cbar=False,
    annot_kws={"size": 16},
)
heatmap_ax.set_xlabel("Predicted Labels", fontsize=16)
heatmap_ax.set_ylabel("True Labels", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Example normalized confusion matrix values (replace these with your own).
# The brackets must wrap all rows in ONE outer list: np.array takes the data as
# a single argument, so passing the rows as separate arguments raises TypeError.
cm_normalized = np.array([
    [89.06, 7.81, 3.12],
    [21.54, 70.77, 7.69],
    [0.00, 1.54, 98.46]
])

# Annotated heatmap of the percentage matrix with enlarged fonts
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    cm_normalized,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    cbar=False,
    annot_kws={"size": 16},
)
heatmap_ax.set_xlabel("Predicted Labels", fontsize=16)
heatmap_ax.set_ylabel("True Labels", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Bold title, then tighten the layout before rendering
heatmap_ax.set_title("Normalized Confusion Matrix", fontsize=18, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Hard-coded percentage confusion matrix (replace these with your own);
# rows are drawn on the "True Labels" axis, columns on "Predicted Labels"
cm_rows = [
    [89.06, 7.81, 3.12],
    [21.54, 70.77, 7.69],
    [0.00, 1.54, 98.46],
]
cm_normalized = np.array(cm_rows)

# Annotated heatmap with bold axis labels
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    cm_normalized,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    cbar=False,
    annot_kws={"size": 16},
)
heatmap_ax.set_xlabel("Predicted Labels", fontsize=16, fontweight='bold')
heatmap_ax.set_ylabel("True Labels", fontsize=16, fontweight='bold')
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)




In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Hard-coded percentage confusion matrix (replace these with your own);
# rows are drawn on the "True Labels" axis, columns on "Predicted Labels"
cm_rows = [
    [89.06, 7.81, 3.12],
    [21.54, 70.77, 7.69],
    [0.00, 1.54, 98.46],
]
cm_normalized = np.array(cm_rows)

# Shared font settings
axis_label_font = {"fontsize": 20, "fontweight": 'bold'}
tick_font = {"fontsize": 14, "fontweight": 'bold'}

# Annotated heatmap with bold cell values, axis labels and tick labels
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm_normalized,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    cbar=False,
    annot_kws={"size": 20, "weight": "bold"},
)
plt.xlabel("Predicted Labels", **axis_label_font)
plt.ylabel("True Labels", **axis_label_font)
plt.xticks(**tick_font)
plt.yticks(**tick_font)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Hard-coded percentage confusion matrix (replace these with your own);
# rows are drawn on the "True Labels" axis, columns on "Predicted Labels"
cm_rows = [
    [89.06, 7.81, 3.12],
    [21.54, 70.77, 7.69],
    [0.00, 1.54, 98.46],
]
cm_normalized = np.array(cm_rows)

# Heatmap appearance: bold size-20 annotations and black 1.5-wide cell borders
heatmap_style = dict(
    annot=True,
    fmt=".2f",
    cmap="Blues",
    cbar=False,
    annot_kws={"size": 20, "weight": "bold"},
    linewidths=1.5,
    linecolor='black',
)
bold_20 = {"fontsize": 20, "fontweight": 'bold'}

plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(cm_normalized, **heatmap_style)

# Axis labels and tick labels all use the same bold size-20 font
heatmap_ax.set_xlabel("Predicted Labels", **bold_20)
heatmap_ax.set_ylabel("True Labels", **bold_20)
plt.xticks(**bold_20)
plt.yticks(**bold_20)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Hard-coded percentage confusion matrix (replace these with your own);
# rows are drawn on the "True Labels" axis, columns on "Predicted Labels"
cm_rows = [
    [89.06, 7.81, 3.12],
    [21.54, 70.77, 7.69],
    [0.00, 1.54, 98.46],
]
cm_normalized = np.array(cm_rows)

# Heatmap appearance: bold size-20 annotations, black 2.5-wide borders, square cells
heatmap_style = dict(
    annot=True,
    fmt=".2f",
    cmap="Blues",
    cbar=False,
    annot_kws={"size": 20, "weight": "bold"},
    linewidths=2.5,
    linecolor='black',
    square=True,
)
bold_20 = {"fontsize": 20, "fontweight": 'bold'}

plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(cm_normalized, **heatmap_style)

# Axis labels and tick labels all use the same bold size-20 font
heatmap_ax.set_xlabel("Predicted Labels", **bold_20)
heatmap_ax.set_ylabel("True Labels", **bold_20)
plt.xticks(**bold_20)
plt.yticks(**bold_20)

# Title in the same font, then tighten the layout before rendering
heatmap_ax.set_title("Normalized Confusion Matrix", **bold_20)
plt.tight_layout()
plt.show()


In [None]:
# np.unique returns the sorted distinct labels plus, for every window, the
# position of its label in that sorted array
unique_labels, label_encoding = np.unique(y_windowed, return_inverse=True)

# List each position next to the behavior label it stands for
for code in range(len(unique_labels)):
    print(f"Encoded label {code} corresponds to behavior: {unique_labels[code]}")


In [None]:
import matplotlib.pyplot as plt

# Data: one entry per dataset; "NA" marks datasets without a published figure
datasets = ['1', '2', '3', '4', '5']
model_accuracies = [92.72, 93.41, 99.97, 90.17, 86.08]
published_accuracies = [85.67, 94.43, "NA", 87.15, "NA"]

# Same list with the 'NA' placeholders swapped for None
filtered_published_accuracies = [None if acc == 'NA' else acc for acc in published_accuracies]

# Plotting
plt.figure(figsize=(12, 7))
ax = plt.gca()

bar_width = 0.35
index = range(len(datasets))

# Model accuracies: solid bars at the integer positions
bars_model = ax.bar(index, model_accuracies, bar_width, label='Model Accuracy',
                    color='skyblue', edgecolor='black', hatch='')

# Published accuracies: hatched bars one bar-width to the right, drawn only
# for datasets that actually have a published value
published_positions = []
published_values = []
for position, value in zip(index, filtered_published_accuracies):
    if value is not None:
        published_positions.append(position + bar_width)
        published_values.append(value)
bars_published = ax.bar(published_positions, published_values, bar_width,
                        label='Published Accuracy', color='lightgreen', edgecolor='black', hatch='//')

ax.set_xlabel('Dataset')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Comparison of Model and Published Accuracies Across Datasets')

# Put each dataset label midway between its two bar positions
ax.set_xticks([i + bar_width / 2 for i in index])
ax.set_xticklabels(datasets)

ax.legend()

# Write each bar's value just above it (model bars first, then published bars)
for bar in list(bars_model) + list(bars_published):
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, yval + 0.5, f"{yval:.2f}%", ha='center', va='bottom', fontsize=10, color='black')

# Tighten the layout, add dashed horizontal grid lines, and render
plt.tight_layout()
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Data: one score per dataset for each metric
datasets = [
    '1',
    '2',
    '3',
    '4',
    '5'
]
f1_scores = [0.93, 0.93, 1, 0.90, 0.86]
gini_scores = [0.98, 0.99, 1, 0.98, 0.92]
precision_scores = [0.93, 0.93, 1, 0.90, 0.86]
recall_scores = [0.93, 0.93, 1, 0.90, 0.86]

# Plotting
plt.figure(figsize=(12, 8))

bar_width = 0.2
index = range(len(datasets))

# Four bars per dataset, centred at offsets 2, 3, 4 and 5 bar-widths
bars1 = plt.bar([i + 2 * bar_width for i in index], f1_scores, bar_width, label='F1 Score', color='orange')
bars2 = plt.bar([i + 3 * bar_width for i in index], gini_scores, bar_width, label='Gini Score', color='lightblue')
bars3 = plt.bar([i + 4 * bar_width for i in index], precision_scores, bar_width, label='Precision', color='pink')
bars4 = plt.bar([i + 5 * bar_width for i in index], recall_scores, bar_width, label='Recall', color='lightcoral')

plt.xlabel('Dataset')
plt.ylabel('Score')
# The middle of offsets 2..5 is 3.5 bar-widths; placing the tick there centres
# each dataset label under its group (2.5 left the labels shifted to the left)
plt.xticks([i + 3.5 * bar_width for i in index], datasets, rotation=45, ha='right')

# Annotate bars with values
def autolabel(bars):
    """Write each bar's height (3 decimals) 3 points above the bar's top centre."""
    for bar in bars:
        height = bar.get_height()
        plt.annotate(f'{height:.3f}',
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 3),
                     textcoords="offset points",
                     ha='center', va='bottom')

autolabel(bars1)
autolabel(bars2)
autolabel(bars3)
autolabel(bars4)


# Restrict the y-axis to 0.7-1.0 so differences between scores are visible
plt.ylim(0.7, 1)

# Move legend below the plot
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), shadow=True, ncol=2)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Data: one score per dataset for each metric
datasets = ['1', '2', '3', '4', '5']
f1_scores = [0.93, 0.93, 1, 0.90, 0.86]
gini_scores = [0.98, 0.99, 1, 0.98, 0.92]
precision_scores = [0.93, 0.93, 1, 0.90, 0.86]
recall_scores = [0.93, 0.93, 1, 0.90, 0.86]

# Plotting
plt.figure(figsize=(12, 8))
ax = plt.gca()

bar_width = 0.15
index = range(len(datasets))

# Four bars per dataset, placed symmetrically around the integer position
bars1 = ax.bar([i - 1.5 * bar_width for i in index], f1_scores, bar_width, label='F1 Score', color='orange')
bars2 = ax.bar([i - 0.5 * bar_width for i in index], gini_scores, bar_width, label='Gini Score', color='lightblue')
bars3 = ax.bar([i + 0.5 * bar_width for i in index], precision_scores, bar_width, label='Precision', color='pink')
bars4 = ax.bar([i + 1.5 * bar_width for i in index], recall_scores, bar_width, label='Recall', color='lightcoral')

ax.set_xlabel('Dataset')
ax.set_ylabel('Score')
ax.set_xticks(index)
ax.set_xticklabels(datasets, rotation=45, ha='right')

def autolabel(bars):
    """Write each bar's height (3 decimals) 3 points above the bar's top centre."""
    for rect in bars:
        top = rect.get_height()
        centre_x = rect.get_x() + rect.get_width() / 2
        plt.annotate(f'{top:.3f}',
                     xy=(centre_x, top),
                     xytext=(0, 3),
                     textcoords="offset points",
                     ha='center', va='bottom')

for bar_group in (bars1, bars2, bars3, bars4):
    autolabel(bar_group)

# Restrict the y-axis to the 0.7-1.0 range
ax.set_ylim(0.7, 1)

# Legend below the axes, in two columns
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), shadow=True, ncol=2)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data: per-dataset accuracy (%) for the two behaviors
datasets = ['1', '4', '5']
grazing_acc = [91.64, 71.79, 89.06]
walking_acc = [79.74, 74.87, 98.46]

# Bar geometry: two bars per dataset, side by side
bar_width = 0.35
index = np.arange(len(datasets))

plt.figure(figsize=(10, 6))
ax = plt.gca()

# Grazing at the integer positions, Walking one bar-width to the right
bars1 = ax.bar(index, grazing_acc, bar_width, label='Grazing Accuracy', color='skyblue')
bars2 = ax.bar(index + bar_width, walking_acc, bar_width, label='Walking Accuracy', color='salmon')

# Axis labels, title, and dataset labels centred between each pair of bars
ax.set_xlabel('Dataset')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Accuracy of Grazing and Walking Behaviors for Datasets 1, 4, and 5')
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(datasets)

def autolabel(bars):
    """Write each bar's height as a percentage 3 points above the bar's top centre."""
    for rect in bars:
        top = rect.get_height()
        centre_x = rect.get_x() + rect.get_width() / 2
        plt.annotate(f'{top:.2f}%',
                     xy=(centre_x, top),
                     xytext=(0, 3),
                     textcoords="offset points",
                     ha='center', va='bottom')

for bar_group in (bars1, bars2):
    autolabel(bar_group)

ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data: per-dataset accuracy (%) for the two behaviors
datasets = ['1', '4', '5']
grazing_acc = [91.64, 71.79, 89.06]
walking_acc = [79.74, 74.87, 98.46]

# Bar geometry: two bars per dataset, side by side
bar_width = 0.35
index = np.arange(len(datasets))

plt.figure(figsize=(10, 6))
ax = plt.gca()

# Grazing is a plain bar; Walking is hatched and sits one bar-width to the right
bars1 = ax.bar(index, grazing_acc, bar_width, label='Grazing Accuracy', color='skyblue')
bars2 = ax.bar(index + bar_width, walking_acc, bar_width, label='Walking Accuracy', color='salmon', hatch='//')

# Axis labels, title, and dataset labels centred between each pair of bars
ax.set_xlabel('Dataset')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Accuracy of Grazing and Walking Behaviors for Datasets 1, 4, and 5')
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(datasets)

def autolabel(bars):
    """Write each bar's height as a percentage 3 points above the bar's top centre."""
    for rect in bars:
        top = rect.get_height()
        centre_x = rect.get_x() + rect.get_width() / 2
        plt.annotate(f'{top:.2f}%',
                     xy=(centre_x, top),
                     xytext=(0, 3),
                     textcoords="offset points",
                     ha='center', va='bottom')

for bar_group in (bars1, bars2):
    autolabel(bar_group)

# Two-column legend anchored at the top centre, slightly above the axes
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data: per-dataset accuracy (%) for the two behaviors
datasets = ['1', '4', '5']
grazing_acc = [91.64, 71.79, 89.06]
walking_acc = [79.74, 74.87, 98.46]

# Bar geometry: two bars per dataset, side by side
bar_width = 0.35
index = np.arange(len(datasets))

plt.figure(figsize=(10, 6))
ax = plt.gca()

# Grazing is a plain bar; Walking is hatched and sits one bar-width to the right
bars1 = ax.bar(index, grazing_acc, bar_width, label='Grazing Accuracy', color='skyblue')
bars2 = ax.bar(index + bar_width, walking_acc, bar_width, label='Walking Accuracy', color='salmon', hatch='//')

# Axis labels, title, and dataset labels centred between each pair of bars
ax.set_xlabel('Dataset')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Accuracy of Grazing and Walking Behaviors for Datasets 1, 4, and 5')
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(datasets)

def autolabel(bars):
    """Write each bar's height as a percentage 3 points above the bar's top centre."""
    for rect in bars:
        top = rect.get_height()
        centre_x = rect.get_x() + rect.get_width() / 2
        plt.annotate(f'{top:.2f}%',
                     xy=(centre_x, top),
                     xytext=(0, 3),
                     textcoords="offset points",
                     ha='center', va='bottom')

for bar_group in (bars1, bars2):
    autolabel(bar_group)

# Two-column legend anchored at the top centre, just inside the axes
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 0.95), ncol=2)

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data: per-dataset accuracy (%) for the two behaviors
datasets = ['1', '4', '5']
grazing_acc = [91.64, 71.79, 89.06]
walking_acc = [79.74, 74.87, 98.46]

# Bar geometry: two bars per dataset, side by side
bar_width = 0.35
index = np.arange(len(datasets))

plt.figure(figsize=(10, 6))
ax = plt.gca()

# Grazing is a plain bar; Walking is hatched and sits one bar-width to the right
bars1 = ax.bar(index, grazing_acc, bar_width, label='Grazing Accuracy', color='skyblue')
bars2 = ax.bar(index + bar_width, walking_acc, bar_width, label='Walking Accuracy', color='salmon', hatch='//')

# Size-14 axis labels; size-12 dataset labels centred between each pair of bars
ax.set_xlabel('Dataset', fontsize=14)
ax.set_ylabel('Accuracy (%)', fontsize=14)
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(datasets, fontsize=12)

# Two-column legend anchored at the top centre, just inside the axes
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 0.95), ncol=2, fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data: per-dataset accuracy (%) for the two behaviors
datasets = ['1', '4', '5']
grazing_acc = [91.64, 71.79, 89.06]
walking_acc = [79.74, 74.87, 98.46]

# Bar geometry: two bars per dataset, side by side
bar_width = 0.35
index = np.arange(len(datasets))

# Fill colors for the two series
grazing_color = '#1f77b4'  # Grazing Accuracy
walking_color = '#ff9999'  # Walking Accuracy

# Black outline shared by both series
outline = {"edgecolor": 'black', "linewidth": 1.5}

plt.figure(figsize=(10, 6))
ax = plt.gca()

# Grazing is a plain bar; Walking is hatched and sits one bar-width to the right
bars1 = ax.bar(index, grazing_acc, bar_width, label='Grazing Accuracy', color=grazing_color, **outline)
bars2 = ax.bar(index + bar_width, walking_acc, bar_width, label='Walking Accuracy', color=walking_color, hatch='//', **outline)

# Bold size-16 axis labels
ax.set_xlabel('Dataset', fontsize=16, fontweight='bold')
ax.set_ylabel('Accuracy (%)', fontsize=16, fontweight='bold')

# Dataset labels centred between each pair of bars; legend slightly above the axes
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(datasets, fontsize=14)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2, fontsize=12)

# Larger y tick labels (size only; weight is left unchanged)
plt.yticks(fontsize=14)

# No grid lines
ax.grid(False)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data: per-dataset accuracy (%) for the two behaviors
datasets = ['1', '4', '5']
grazing_acc = [91.64, 71.79, 89.06]
walking_acc = [79.74, 74.87, 98.46]

# Bar geometry: two bars per dataset, side by side
bar_width = 0.35
index = np.arange(len(datasets))

# Light fill colors for the two series
grazing_color = '#a3c1e0'  # Light blue for Grazing Accuracy
walking_color = '#ffcccc'   # Light pink for Walking Accuracy

# Black outline shared by both series, and the bold tick-label font
outline = {"edgecolor": 'black', "linewidth": 1.5}
tick_font = {"fontsize": 14, "fontweight": 'bold'}

plt.figure(figsize=(10, 6))
ax = plt.gca()

# Grazing is a plain bar; Walking is hatched and sits one bar-width to the right
bars1 = ax.bar(index, grazing_acc, bar_width, label='Grazing Accuracy', color=grazing_color, **outline)
bars2 = ax.bar(index + bar_width, walking_acc, bar_width, label='Walking Accuracy', color=walking_color, hatch='//', **outline)

# Bold size-16 axis labels
ax.set_xlabel('Dataset', fontsize=16, fontweight='bold')
ax.set_ylabel('Accuracy (%)', fontsize=16, fontweight='bold')

# Bold tick labels; dataset labels centred between each pair of bars
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(datasets, **tick_font)
plt.yticks(**tick_font)

# Framed two-column legend anchored at the top centre, just inside the axes
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 0.95), ncol=2, fontsize=12, frameon=True)

# No grid lines
ax.grid(False)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data: per-dataset accuracy (%) for the two behaviors
datasets = ['1', '4', '5']
grazing_acc = [91.64, 71.79, 89.06]
walking_acc = [79.74, 74.87, 98.46]

# Bar geometry: two bars per dataset, side by side
bar_width = 0.35
index = np.arange(len(datasets))

# Fill colors for the two series
grazing_color = '#87ceeb'  # Darker light blue for Grazing Accuracy
walking_color = '#ff9999'   # Darker light pink for Walking Accuracy

# Black outline shared by both series, and the bold tick-label font
outline = {"edgecolor": 'black', "linewidth": 1.5}
tick_font = {"fontsize": 14, "fontweight": 'bold'}

plt.figure(figsize=(10, 6))
ax = plt.gca()

# Grazing is a plain bar; Walking is hatched and sits one bar-width to the right
bars1 = ax.bar(index, grazing_acc, bar_width, label='Grazing Accuracy', color=grazing_color, **outline)
bars2 = ax.bar(index + bar_width, walking_acc, bar_width, label='Walking Accuracy', color=walking_color, hatch='//', **outline)

# Bold size-16 axis labels
ax.set_xlabel('Dataset', fontsize=16, fontweight='bold')
ax.set_ylabel('Test Accuracy (%)', fontsize=16, fontweight='bold')

# Bold tick labels; dataset labels centred between each pair of bars
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(datasets, **tick_font)
plt.yticks(**tick_font)

# Framed two-column legend anchored at the top centre, just inside the axes
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 0.95), ncol=2, fontsize=12, frameon=True)

# No grid lines
ax.grid(False)

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd

# Load your dataset (replace 'CURC.csv' with your actual file)
data = pd.read_csv('CURC.csv')

# Column holding the feeding annotation
feeding_column = 'IteragreementFeeding'

# Coerce the annotation to numbers; anything non-numeric becomes NaN
# NOTE(review): assumes the column already stores 1 for grazing and 0 otherwise.
# If it stores text labels every value becomes NaN and both sums below are 0 - confirm.
data[feeding_column] = pd.to_numeric(data[feeding_column], errors='coerce')

# Parse the HH:MM:SS strings in 'Time' into datetimes
data['Time'] = pd.to_datetime(data['Time'], format='%H:%M:%S')

# Daytime window: 06:00:00 through 18:00:00 inclusive; everything else is night
day_start = pd.to_datetime("06:00:00").time()
day_end = pd.to_datetime("18:00:00").time()

# Flag each row as day or night
data['is_daytime'] = data['Time'].apply(lambda stamp: day_start <= stamp.time() <= day_end)
data['is_nighttime'] = ~data['is_daytime']

# Sum the feeding flags separately for day rows and night rows
# NOTE(review): the sums count rows; reading them as seconds presumes one row per second - confirm
daytime_grazing_seconds = data.loc[data['is_daytime'], feeding_column].sum()
nighttime_grazing_seconds = data.loc[data['is_nighttime'], feeding_column].sum()

# Convert to hours
daytime_grazing_hours = daytime_grazing_seconds / 3600
nighttime_grazing_hours = nighttime_grazing_seconds / 3600

# Day and night are each taken to last 12 hours
day_duration_hours = 12
night_duration_hours = 12

# Share of each 12-hour period spent grazing, in percent
daytime_grazing_percentage = (daytime_grazing_hours / day_duration_hours) * 100
nighttime_grazing_percentage = (nighttime_grazing_hours / night_duration_hours) * 100

# Thresholds based on the study
daytime_threshold = 40  # 40% of the day
nighttime_threshold = 16  # 16% of the night

# Report whether each period exceeds its threshold
day_within_limit = not (daytime_grazing_percentage > daytime_threshold)
if day_within_limit:
    print(f"No overgrazing during the day: Grazing {daytime_grazing_percentage:.2f}%, within the {daytime_threshold}% threshold.")
else:
    print(f"Overgrazing detected during the day: Grazing {daytime_grazing_percentage:.2f}%, exceeding the {daytime_threshold}% threshold.")

night_within_limit = not (nighttime_grazing_percentage > nighttime_threshold)
if night_within_limit:
    print(f"No overgrazing during the night: Grazing {nighttime_grazing_percentage:.2f}%, within the {nighttime_threshold}% threshold.")
else:
    print(f"Overgrazing detected during the night: Grazing {nighttime_grazing_percentage:.2f}%, exceeding the {nighttime_threshold}% threshold.")

In [None]:
import pandas as pd

# Load your dataset (replace 'CURC.csv' with your actual file)
data = pd.read_csv('CURC.csv')

# Define the column that indicates grazing/feeding behavior (e.g., 'IteragreementFeeding' might label feeding)
feeding_column = 'IteragreementFeeding'

# Encode the annotation as 1 when the label contains "Eating" (e.g. "EatingEating")
# and 0 otherwise (e.g. "otherother").
# The vectorised, NaN-safe form is used because a plain `'Eating' in x` raises
# TypeError as soon as a cell is missing (NaN) or is not a string.
data[feeding_column] = (
    data[feeding_column]
    .astype(str)
    .str.contains('Eating', regex=False, na=False)
    .astype(int)
)

# Parse the HH:MM:SS strings in 'Time' into datetimes
data['Time'] = pd.to_datetime(data['Time'], format='%H:%M:%S')

# Define daytime and nighttime hours (day: 6 AM to 6 PM inclusive, night: everything else)
day_start = pd.to_datetime("06:00:00").time()
day_end = pd.to_datetime("18:00:00").time()

# Create separate columns for day and night grazing
data['is_daytime'] = data['Time'].apply(lambda x: day_start <= x.time() <= day_end)
data['is_nighttime'] = ~data['is_daytime']

# Calculate total daytime and nighttime grazing instances
# NOTE(review): the sums count rows; reading them as seconds presumes one row per second - confirm
daytime_grazing_seconds = data.loc[data['is_daytime'], feeding_column].sum()
nighttime_grazing_seconds = data.loc[data['is_nighttime'], feeding_column].sum()

# Convert to hours
daytime_grazing_hours = daytime_grazing_seconds / 3600
nighttime_grazing_hours = nighttime_grazing_seconds / 3600

# Assuming daytime and nighttime each last 12 hours
day_duration_hours = 12
night_duration_hours = 12

# Calculate grazing percentages
daytime_grazing_percentage = (daytime_grazing_hours / day_duration_hours) * 100
nighttime_grazing_percentage = (nighttime_grazing_hours / night_duration_hours) * 100

# Set thresholds based on the study
daytime_threshold = 40  # 40% of the day
nighttime_threshold = 16  # 16% of the night

# Determine if overgrazing occurred during the day or night
if daytime_grazing_percentage > daytime_threshold:
    print(f"Overgrazing detected during the day: Grazing {daytime_grazing_percentage:.2f}%, exceeding the {daytime_threshold}% threshold.")
else:
    print(f"No overgrazing during the day: Grazing {daytime_grazing_percentage:.2f}%, within the {daytime_threshold}% threshold.")

if nighttime_grazing_percentage > nighttime_threshold:
    print(f"Overgrazing detected during the night: Grazing {nighttime_grazing_percentage:.2f}%, exceeding the {nighttime_threshold}% threshold.")
else:
    print(f"No overgrazing during the night: Grazing {nighttime_grazing_percentage:.2f}%, within the {nighttime_threshold}% threshold.")

In [None]:
import pandas as pd

# Load your dataset (replace 'CURC.csv' with your actual file)
data = pd.read_csv('CURC.csv')

# Column holding the feeding annotation
feeding_column = 'IteragreementFeeding'

# Label -> numeric flag; any label missing from this mapping becomes NaN
behavior_mapping = dict(
    EatingEating=1,  # Grazing
    otherother=0,    # Not grazing
)

# Replace the text labels by their numeric flags
data[feeding_column] = data[feeding_column].map(behavior_mapping)

# Parse 'Time' (HH:MM:SS) and keep only the time-of-day part
parsed_time = pd.to_datetime(data['Time'], format='%H:%M:%S')
data['Time'] = parsed_time.dt.time

# Daytime window: 6 AM through 6 PM inclusive
daytime_start = pd.to_datetime('06:00:00').time()  # 6 AM
daytime_end = pd.to_datetime('18:00:00').time()    # 6 PM

# Flag each row as day or night
data['is_daytime'] = data['Time'].apply(lambda clock: daytime_start <= clock <= daytime_end)
data['is_nighttime'] = ~data['is_daytime']

# Sanity check: show how many rows carry each flag value
print("Distribution of Grazing Behavior:")
print(data[feeding_column].value_counts())

# Sum the feeding flags separately for day rows and night rows
# NOTE(review): the sums count rows; reading them as seconds presumes one row per second - confirm
day_rows = data['is_daytime']
night_rows = data['is_nighttime']
daytime_grazing_seconds = data.loc[day_rows, feeding_column].sum()
nighttime_grazing_seconds = data.loc[night_rows, feeding_column].sum()

# Convert to hours
daytime_grazing_hours = daytime_grazing_seconds / 3600
nighttime_grazing_hours = nighttime_grazing_seconds / 3600

# Day and night are each taken to last 12 hours
total_day_hours = 12  # 12 hours from 6 AM to 6 PM
total_night_hours = 12

# Thresholds from the study, expressed in hours
# (the checks below compare percentages against the literals 40 and 16 instead)
daytime_grazing_threshold = 0.40 * total_day_hours  # 40% of the daytime
nighttime_grazing_threshold = 0.16 * total_night_hours  # 16% of the nighttime

# Share of each 12-hour period spent grazing, in percent
daytime_grazing_percentage = (daytime_grazing_seconds / (total_day_hours * 3600)) * 100
nighttime_grazing_percentage = (nighttime_grazing_seconds / (total_night_hours * 3600)) * 100

# Report whether each period exceeds its threshold
day_within_limit = not (daytime_grazing_percentage > 40)
if day_within_limit:
    print(f"No overgrazing during the day: Grazing {daytime_grazing_percentage:.2f}%, within the 40% threshold.")
else:
    print(f"Overgrazing during the day: Grazing {daytime_grazing_percentage:.2f}%, exceeding the 40% threshold.")

night_within_limit = not (nighttime_grazing_percentage > 16)
if night_within_limit:
    print(f"No overgrazing during the night: Grazing {nighttime_grazing_percentage:.2f}%, within the 16% threshold.")
else:
    print(f"Overgrazing during the night: Grazing {nighttime_grazing_percentage:.2f}%, exceeding the 16% threshold.")

In [None]:
import pandas as pd

# Load your dataset
data = pd.read_csv('dataset_6.csv')

# Thresholds based on the study (percent)
daytime_grazing_threshold = 40  # 40% grazing during the day
nighttime_grazing_threshold = 16  # 16% grazing during the night

# Behaviors whose share of the records is reported
behavior_labels = ['Grazing', 'Walking']

# Everything below needs the 'label' column, so fail early when it is absent
if 'label' not in data.columns:
    raise KeyError("The column 'label' is missing in the dataset.")

# Number of records per behavior
behavior_counts = data['label'].value_counts()

print("Distribution of Behavior Labels:")
print(behavior_counts)

# Share of all records per behavior, in percent
total_records = data.shape[0]
behavior_percentages = (behavior_counts / total_records) * 100

# Report the share of each behavior of interest (0.00% when it never occurs)
print("\nPercentage of Each Behavior:")
for behavior in behavior_labels:
    if behavior not in behavior_percentages.index:
        print(f"{behavior}: 0.00%")
    else:
        print(f"{behavior}: {behavior_percentages[behavior]:.2f}%")

# Compare the grazing share with the daytime grazing threshold
grazing_pct = behavior_percentages.get('Grazing', 0)
if grazing_pct > daytime_grazing_threshold:
    print(f"Overgrazing detected: Grazing {grazing_pct:.2f}%, exceeding the {daytime_grazing_threshold}% threshold.")
else:
    print(f"No overgrazing detected: Grazing {grazing_pct:.2f}%, within the {daytime_grazing_threshold}% threshold.")

# Compare the walking share with the nighttime grazing threshold
# NOTE(review): walking is checked against a grazing threshold - confirm this is intended
walking_pct = behavior_percentages.get('Walking', 0)
if walking_pct > nighttime_grazing_threshold:
    print(f"Excessive walking detected: Walking {walking_pct:.2f}%, exceeding the {nighttime_grazing_threshold}% threshold.")
else:
    print(f"No excessive walking detected: Walking {walking_pct:.2f}%, within the {nighttime_grazing_threshold}% threshold.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the annotated recording (replace 'CURC.csv' with your actual file path)
data = pd.read_csv('CURC.csv')

# Show which columns the file provides
print(data.columns)

# Build one combined label per row: "<locomotion>-<feeding>"
locomotion_text = data['IteragreementLocom'].astype(str)
feeding_text = data['IteragreementFeeding'].astype(str)
data['Behavior'] = locomotion_text + '-' + feeding_text

# Tally the combined labels once; the same tally is printed and plotted
behavior_counts = data['Behavior'].value_counts()
print(behavior_counts)

# Express each tally as a percentage of all rows
behavior_percentages = behavior_counts.div(len(data)).mul(100)

# Bar chart of the percentage per combined label
plt.figure(figsize=(10, 6))
ax = behavior_percentages.plot.bar(color='skyblue')
ax.set_xlabel('Behavior')
ax.set_ylabel('Percentage (%)')
ax.set_title('Distribution of Different Behaviors')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()  # keep rotated tick labels and title inside the figure
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the annotated recording (replace 'CURC.csv' with your actual file path)
data = pd.read_csv('CURC.csv')

# Build one combined label per row: "<locomotion>-<feeding>"
locomotion_text = data['IteragreementLocom'].astype(str)
feeding_text = data['IteragreementFeeding'].astype(str)
data['Behavior'] = locomotion_text + '-' + feeding_text

# Show how many rows carry each combined label
print(data['Behavior'].value_counts())

# One box-plot panel per accelerometer axis, stacked vertically
fig, axes = plt.subplots(3, 1, figsize=(12, 18))

# (column to plot, panel title) for the three panels, top to bottom
panel_specs = (
    ('X-axis (g)', 'Distribution of X-axis Acceleration by Behavior'),
    ('Y-axis (g)', 'Distribution of Y-axis Acceleration by Behavior'),
    ('Z-axis (g)', 'Distribution of Z-axis Acceleration by Behavior'),
)

for panel, (column, panel_title) in zip(axes, panel_specs):
    sns.boxplot(x='Behavior', y=column, data=data, ax=panel, palette='Set2')
    panel.set_title(panel_title)
    # Rotate the behavior names on the x axis by 45 degrees
    panel.tick_params(axis='x', rotation=45)

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Read the recording
data = pd.read_csv('CURC.csv')

# The three accelerometer columns this plot needs; stop if one is absent
required_columns = ['X-axis (g)', 'Y-axis (g)', 'Z-axis (g)']
for col in required_columns:
    if col in data.columns:
        continue
    raise KeyError(f"Column '{col}' is missing from the dataset.")

# Pull out one series per axis
x, y, z = data['X-axis (g)'], data['Y-axis (g)'], data['Z-axis (g)']

# Single 3D axes filling the figure
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# Every sample as one semi-transparent blue dot
sc = ax.scatter(x, y, z, c='blue', marker='o', alpha=0.5)

# Name each axis after the column it shows
for set_axis_label, axis_text in (
    (ax.set_xlabel, 'X-axis (g)'),
    (ax.set_ylabel, 'Y-axis (g)'),
    (ax.set_zlabel, 'Z-axis (g)'),
):
    set_axis_label(axis_text)

# Title and grid
ax.set_title('3D Scatter Plot of Accelerometer Data')
ax.grid(True)

# Show the plot
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Read the recording
data = pd.read_csv('CURC.csv')

# Tag every row with a combined "<locomotion>-<feeding>" label
locomotion_text = data['IteragreementLocom'].astype(str)
feeding_text = data['IteragreementFeeding'].astype(str)
data['CombinedLabel'] = locomotion_text + '-' + feeding_text

# Distinct combined labels found in the file
behaviors = data['CombinedLabel'].unique()

# Draw one stand-alone 3D scatter figure per behavior
for behavior in behaviors:
    # Rows annotated with the current behavior
    matches_behavior = data['CombinedLabel'] == behavior
    behavior_data = data[matches_behavior]

    # Nothing to draw for a label without rows
    if behavior_data.empty:
        print(f"No data available for behavior: {behavior}")
        continue

    # One series per accelerometer axis
    x, y, z = behavior_data['X-axis (g)'], behavior_data['Y-axis (g)'], behavior_data['Z-axis (g)']

    # A fresh figure with a single 3D axes
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(1, 1, 1, projection='3d')

    # Samples as semi-transparent dots
    sc = ax.scatter(x, y, z, marker='o', alpha=0.5)

    # Axis names, title and grid
    ax.set(
        xlabel='X-axis (g)',
        ylabel='Y-axis (g)',
        zlabel='Z-axis (g)',
        title=f'3D Scatter Plot of Accelerometer Data for {behavior}',
    )
    ax.grid(True)

    # Render this behavior's figure before moving on to the next one
    plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Read the recording
data = pd.read_csv('CURC.csv')

# Tag every row with a combined "<locomotion>-<feeding>" label
locomotion_text = data['IteragreementLocom'].astype(str)
feeding_text = data['IteragreementFeeding'].astype(str)
data['CombinedLabel'] = locomotion_text + '-' + feeding_text

# Distinct combined labels found in the file
behaviors = data['CombinedLabel'].unique()

# Grid geometry: three panels per row, enough rows for every behavior
n_behaviors = len(behaviors)
n_cols = 3
n_rows = -(-n_behaviors // n_cols)  # ceiling division

# One figure sized at 5x5 inches per panel
fig = plt.figure(figsize=(5 * n_cols, 5 * n_rows))

# One 3D panel per behavior, filled row by row
for i, behavior in enumerate(behaviors):
    behavior_data = data.loc[data['CombinedLabel'] == behavior]

    # Skip labels without rows
    if behavior_data.empty:
        print(f"No data available for behavior: {behavior}")
        continue

    x = behavior_data['X-axis (g)']
    y = behavior_data['Y-axis (g)']
    z = behavior_data['Z-axis (g)']

    # Subplot positions are 1-based
    ax = fig.add_subplot(n_rows, n_cols, i + 1, projection='3d')
    sc = ax.scatter(x, y, z, marker='o', alpha=0.5)

    # Axis names, panel title and grid
    ax.set(
        xlabel='X-axis (g)',
        ylabel='Y-axis (g)',
        zlabel='Z-axis (g)',
        title=f'Behavior: {behavior}',
    )
    ax.grid(True)

# Adjust layout to prevent overlap
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Read the recording
data = pd.read_csv('CURC.csv')

# Combined "<locomotion>-<feeding>" label for every row
data['CombinedLabel'] = (
    data['IteragreementLocom'].astype(str)
    + '-'
    + data['IteragreementFeeding'].astype(str)
)

# Distinct combined labels found in the file
behaviors = data['CombinedLabel'].unique()

# Work out the grid: 3 columns, plus one extra row for any leftover panels
n_behaviors = len(behaviors)
n_cols = 3
full_rows, leftover = divmod(n_behaviors, n_cols)
n_rows = full_rows + (1 if leftover else 0)

# One figure sized at 5x5 inches per panel
fig = plt.figure(figsize=(5 * n_cols, 5 * n_rows))

# Fill the grid, one behavior per panel
for i, behavior in enumerate(behaviors):
    # Rows annotated with this behavior
    is_current = data['CombinedLabel'] == behavior
    behavior_data = data[is_current]

    # Skip labels without rows
    if behavior_data.empty:
        print(f"No data available for behavior: {behavior}")
        continue

    # One series per accelerometer axis
    x, y, z = behavior_data['X-axis (g)'], behavior_data['Y-axis (g)'], behavior_data['Z-axis (g)']

    # Panel number i + 1 (subplot positions start at 1)
    ax = fig.add_subplot(n_rows, n_cols, i + 1, projection='3d')

    # Samples as semi-transparent dots
    sc = ax.scatter(x, y, z, marker='o', alpha=0.5)

    # Name each axis after the column it shows
    for set_axis_label, axis_text in (
        (ax.set_xlabel, 'X-axis (g)'),
        (ax.set_ylabel, 'Y-axis (g)'),
        (ax.set_zlabel, 'Z-axis (g)'),
    ):
        set_axis_label(axis_text)

    # Panel title and grid
    ax.set_title(f'Behavior: {behavior}')
    ax.grid(True)

# Adjust layout to prevent overlap
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# Read the recording
data = pd.read_csv('CURC.csv')

# Tag every row with a combined "<locomotion>-<feeding>" label
locomotion_text = data['IteragreementLocom'].astype(str)
feeding_text = data['IteragreementFeeding'].astype(str)
data['CombinedLabel'] = locomotion_text + '-' + feeding_text

# Distinct combined labels found in the file
behaviors = data['CombinedLabel'].unique()

# Grid geometry: three panels per row, enough rows for every behavior
n_behaviors = len(behaviors)
n_cols = 3
n_rows = -(-n_behaviors // n_cols)  # ceiling division

# One figure sized at 7x6 inches per panel
fig = plt.figure(figsize=(7 * n_cols, 6 * n_rows))

# One 'tab10' colour per behavior, sampled evenly across the colormap
colors = plt.cm.tab10(np.linspace(0, 1, n_behaviors))

# Keyword arguments shared by the three axis labels
axis_label_style = dict(fontsize=12)

# One 3D panel per behavior, filled row by row
for i, behavior in enumerate(behaviors):
    behavior_data = data.loc[data['CombinedLabel'] == behavior]

    # Skip labels without rows
    if behavior_data.empty:
        print(f"No data available for behavior: {behavior}")
        continue

    x = behavior_data['X-axis (g)']
    y = behavior_data['Y-axis (g)']
    z = behavior_data['Z-axis (g)']

    # Subplot positions are 1-based
    ax = fig.add_subplot(n_rows, n_cols, i + 1, projection='3d')

    # Samples drawn in this behavior's colour
    ax.scatter(x, y, z, marker='o', alpha=0.7, color=colors[i], s=50)

    # Axis names
    ax.set_xlabel('X-axis (g)', **axis_label_style)
    ax.set_ylabel('Y-axis (g)', **axis_label_style)
    ax.set_zlabel('Z-axis (g)', **axis_label_style)

    # Title lifted 20 points above the axes; dashed, faint grid
    ax.set_title(f'Behavior: {behavior}', fontsize=14, pad=20)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Same camera angle in every panel
    ax.view_init(elev=20, azim=30)

# Widen the gaps between panels
plt.subplots_adjust(hspace=0.4, wspace=0.4)

# Show the plot
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# Read the recording
data = pd.read_csv('CURC.csv')

# Combined "<locomotion>-<feeding>" label for every row
data['CombinedLabel'] = (
    data['IteragreementLocom'].astype(str)
    + '-'
    + data['IteragreementFeeding'].astype(str)
)

# Distinct combined labels found in the file
behaviors = data['CombinedLabel'].unique()

# Work out the grid: 3 columns, plus one extra row for any leftover panels
n_behaviors = len(behaviors)
n_cols = 3
full_rows, leftover = divmod(n_behaviors, n_cols)
n_rows = full_rows + (1 if leftover else 0)

# One figure sized at 7x6 inches per panel
fig = plt.figure(figsize=(7 * n_cols, 6 * n_rows))

# Define a colorblind-friendly color palette
colors = ['#E69F00', '#56B4E9', '#009E73', '#F0E442', '#0072B2', '#D55E00', '#CC79A7', '#999999']

# Marker settings shared by every panel
point_style = dict(marker='o', alpha=0.7, s=50)

# Fill the grid, one behavior per panel
for i, behavior in enumerate(behaviors):
    # Rows annotated with this behavior
    is_current = data['CombinedLabel'] == behavior
    behavior_data = data[is_current]

    # Skip labels without rows
    if behavior_data.empty:
        print(f"No data available for behavior: {behavior}")
        continue

    # One series per accelerometer axis
    x, y, z = behavior_data['X-axis (g)'], behavior_data['Y-axis (g)'], behavior_data['Z-axis (g)']

    # Panel number i + 1 (subplot positions start at 1)
    ax = fig.add_subplot(n_rows, n_cols, i + 1, projection='3d')

    # The palette is reused from the start when there are more behaviors than colours
    ax.scatter(x, y, z, color=colors[i % len(colors)], **point_style)

    # Name each axis after the column it shows
    for set_axis_label, axis_text in (
        (ax.set_xlabel, 'X-axis (g)'),
        (ax.set_ylabel, 'Y-axis (g)'),
        (ax.set_zlabel, 'Z-axis (g)'),
    ):
        set_axis_label(axis_text, fontsize=12)

    # Title lifted 20 points above the axes; dashed, faint grid
    ax.set_title(f'Behavior: {behavior}', fontsize=14, pad=20)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Same camera angle in every panel
    ax.view_init(elev=20, azim=30)

# Widen the gaps between panels
plt.subplots_adjust(hspace=0.4, wspace=0.4)

# Show the plot
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# Read the recording
data = pd.read_csv('CURC.csv')

# Tag every row with a combined "<locomotion>-<feeding>" label
locomotion_text = data['IteragreementLocom'].astype(str)
feeding_text = data['IteragreementFeeding'].astype(str)
data['CombinedLabel'] = locomotion_text + '-' + feeding_text

# Distinct combined labels found in the file
behaviors = data['CombinedLabel'].unique()

# Grid geometry: three panels per row, enough rows for every behavior
n_behaviors = len(behaviors)
n_cols = 3
n_rows = -(-n_behaviors // n_cols)  # ceiling division

# One figure sized at 7x6 inches per panel
fig = plt.figure(figsize=(7 * n_cols, 6 * n_rows))

# Define a colorblind-friendly color palette
colors = ['#E69F00', '#56B4E9', '#009E73', '#F0E442', '#0072B2', '#D55E00', '#CC79A7', '#999999']

# Text settings shared by the three axis labels
axis_label_style = dict(fontsize=12, fontweight='bold')

# One 3D panel per behavior, filled row by row
for i, behavior in enumerate(behaviors):
    behavior_data = data.loc[data['CombinedLabel'] == behavior]

    # Skip labels without rows
    if behavior_data.empty:
        print(f"No data available for behavior: {behavior}")
        continue

    x = behavior_data['X-axis (g)']
    y = behavior_data['Y-axis (g)']
    z = behavior_data['Z-axis (g)']

    # Subplot positions are 1-based
    ax = fig.add_subplot(n_rows, n_cols, i + 1, projection='3d')

    # The palette is reused from the start when there are more behaviors than colours
    ax.scatter(x, y, z, marker='o', alpha=0.7, color=colors[i % len(colors)], s=50)

    # Bold axis names
    ax.set_xlabel('X-axis (g)', **axis_label_style)
    ax.set_ylabel('Y-axis (g)', **axis_label_style)
    ax.set_zlabel('Z-axis (g)', **axis_label_style)

    # Bold title with a 10-point gap above the axes; dashed, faint grid
    ax.set_title(f'Behavior: {behavior}', fontsize=14, fontweight='bold', pad=10)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Same camera angle in every panel
    ax.view_init(elev=20, azim=30)

# Set the gaps between panels
plt.subplots_adjust(hspace=0.3, wspace=0.3)

# Show the plot
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# Read the recording
data = pd.read_csv('CURC.csv')

# Combined "<locomotion>-<feeding>" label for every row
data['CombinedLabel'] = (
    data['IteragreementLocom'].astype(str)
    + '-'
    + data['IteragreementFeeding'].astype(str)
)

# Distinct combined labels found in the file
behaviors = data['CombinedLabel'].unique()

# Work out the grid: 3 columns, plus one extra row for any leftover panels
n_behaviors = len(behaviors)
n_cols = 3
full_rows, leftover = divmod(n_behaviors, n_cols)
n_rows = full_rows + (1 if leftover else 0)

# One figure sized at 7x5 inches per panel
fig = plt.figure(figsize=(7 * n_cols, 5 * n_rows))

# Define a colorblind-friendly color palette
colors = ['#E69F00', '#56B4E9', '#009E73', '#F0E442', '#0072B2', '#D55E00', '#CC79A7', '#999999']

# Settings shared by every panel
point_style = dict(marker='o', alpha=0.7, s=50)
bold_text = dict(fontweight='bold')

# Fill the grid, one behavior per panel
for i, behavior in enumerate(behaviors):
    # Rows annotated with this behavior
    is_current = data['CombinedLabel'] == behavior
    behavior_data = data[is_current]

    # Skip labels without rows
    if behavior_data.empty:
        print(f"No data available for behavior: {behavior}")
        continue

    # One series per accelerometer axis
    x, y, z = behavior_data['X-axis (g)'], behavior_data['Y-axis (g)'], behavior_data['Z-axis (g)']

    # Panel number i + 1 (subplot positions start at 1)
    ax = fig.add_subplot(n_rows, n_cols, i + 1, projection='3d')

    # The palette is reused from the start when there are more behaviors than colours
    ax.scatter(x, y, z, color=colors[i % len(colors)], **point_style)

    # Bold axis names
    for set_axis_label, axis_text in (
        (ax.set_xlabel, 'X-axis (g)'),
        (ax.set_ylabel, 'Y-axis (g)'),
        (ax.set_zlabel, 'Z-axis (g)'),
    ):
        set_axis_label(axis_text, fontsize=12, **bold_text)

    # Bold title with the default padding
    ax.set_title(f'Behavior: {behavior}', fontsize=14, **bold_text)

    # Dashed, faint grid
    ax.grid(True, linestyle='--', alpha=0.5)

    # Same camera angle in every panel
    ax.view_init(elev=20, azim=30)

# Set the gaps between panels
plt.subplots_adjust(hspace=0.25, wspace=0.25)

# Show the plot
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# Load your dataset
data = pd.read_csv('/content/AX3_RAW_DATA06272024_ok.csv')

# Create a combined "<locomotion>-<feeding>" label for every row
data['CombinedLabel'] = data['IteragreementLocom'].astype(str) + '-' + data['IteragreementFeeding'].astype(str)

# Define unique behaviors based on combined labels
behaviors = data['CombinedLabel'].unique()

# Determine the number of subplots needed
n_behaviors = len(behaviors)
n_cols = 3  # Number of columns in the grid
n_rows = (n_behaviors + n_cols - 1) // n_cols  # Number of rows needed (ceiling division)

# Create a figure with a grid of subplots (7x5 inches per panel)
fig = plt.figure(figsize=(7 * n_cols, 5 * n_rows))

# Define a colorblind-friendly color palette
colors = ['#E69F00', '#56B4E9', '#009E73', '#F0E442', '#0072B2', '#D55E00', '#CC79A7', '#999999']

# Compute the min and max of each axis over the whole dataset, so every
# panel can share the same limits and the panels are directly comparable
x_min, x_max = data['X-axis (g)'].min(), data['X-axis (g)'].max()
y_min, y_max = data['Y-axis (g)'].min(), data['Y-axis (g)'].max()
z_min, z_max = data['Z-axis (g)'].min(), data['Z-axis (g)'].max()

# Loop through each behavior and create a subplot for it
for i, behavior in enumerate(behaviors):
    # Filter data for the current behavior
    behavior_data = data[data['CombinedLabel'] == behavior]

    # Skip labels without rows
    if behavior_data.empty:
        print(f"No data available for behavior: {behavior}")
        continue

    # Extract the relevant columns
    x = behavior_data['X-axis (g)']
    y = behavior_data['Y-axis (g)']
    z = behavior_data['Z-axis (g)']

    # Create a subplot (positions are 1-based) and draw the samples; the
    # palette is reused from the start when there are more behaviors than colours
    ax = fig.add_subplot(n_rows, n_cols, i + 1, projection='3d')
    ax.scatter(x, y, z, marker='o', alpha=0.7, color=colors[i % len(colors)], s=50)

    # Bold axis names and title
    ax.set_xlabel('X-axis (g)', fontsize=12, fontweight='bold')
    ax.set_ylabel('Y-axis (g)', fontsize=12, fontweight='bold')
    ax.set_zlabel('Z-axis (g)', fontsize=12, fontweight='bold')
    ax.set_title(f'Behavior: {behavior}', fontsize=14, fontweight='bold')

    # Dashed grid and the same camera angle in every panel
    ax.grid(True, linestyle='--', alpha=0.5)
    ax.view_init(elev=20, azim=30)

    # Apply the global axis limits
    ax.set_xlim([x_min, x_max])
    ax.set_ylim([y_min, y_max])
    ax.set_zlim([z_min, z_max])

# Adjust layout to minimize whitespace
plt.subplots_adjust(hspace=0.25, wspace=0.25)

# Show the plot
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# Read the recording
data = pd.read_csv('CURC.csv')

# Tag every row with a combined "<locomotion>-<feeding>" label
locomotion_text = data['IteragreementLocom'].astype(str)
feeding_text = data['IteragreementFeeding'].astype(str)
data['CombinedLabel'] = locomotion_text + '-' + feeding_text

# Distinct combined labels found in the file
behaviors = data['CombinedLabel'].unique()

# Grid geometry: three panels per row, enough rows for every behavior
n_behaviors = len(behaviors)
n_cols = 3
n_rows = -(-n_behaviors // n_cols)  # ceiling division

# One figure sized at 5x5 inches per panel, on a light-gray background
fig = plt.figure(figsize=(5 * n_cols, 5 * n_rows))
fig.patch.set_facecolor('lightgray')

# One 'viridis' colour per behavior, sampled evenly across the colormap
colors = plt.cm.viridis(np.linspace(0, 1, n_behaviors))

# Text settings shared by the axis labels and the title
large_bold = dict(fontsize=20, fontweight='bold')

# One 3D panel per behavior, filled row by row
for i, behavior in enumerate(behaviors):
    behavior_data = data.loc[data['CombinedLabel'] == behavior]

    # Skip labels without rows
    if behavior_data.empty:
        print(f"No data available for behavior: {behavior}")
        continue

    x = behavior_data['X-axis (g)']
    y = behavior_data['Y-axis (g)']
    z = behavior_data['Z-axis (g)']

    # Subplot positions are 1-based
    ax = fig.add_subplot(n_rows, n_cols, i + 1, projection='3d')

    # Samples drawn in this behavior's colour
    sc = ax.scatter(x, y, z, marker='o', alpha=0.7, color=colors[i % len(colors)], s=50)

    # Axis names
    ax.set_xlabel('X-axis (g)', **large_bold)
    ax.set_ylabel('Y-axis (g)', **large_bold)
    ax.set_zlabel('Z-axis (g)', **large_bold)

    # Panel title and a dashed, faint grid
    ax.set_title(f'Behavior: {behavior}', **large_bold)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Same camera angle in every panel
    ax.view_init(elev=20, azim=30)

# Adjust layout to prevent overlap
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# Read the recording
data = pd.read_csv('CURC.csv')

# Combined "<locomotion>-<feeding>" label for every row
data['CombinedLabel'] = (
    data['IteragreementLocom'].astype(str)
    + '-'
    + data['IteragreementFeeding'].astype(str)
)

# Distinct combined labels found in the file
behaviors = data['CombinedLabel'].unique()

# Work out the grid: 3 columns, plus one extra row for any leftover panels
n_behaviors = len(behaviors)
n_cols = 3
full_rows, leftover = divmod(n_behaviors, n_cols)
n_rows = full_rows + (1 if leftover else 0)

# One figure sized at 7x6 inches per panel, on a light-gray background
fig = plt.figure(figsize=(7 * n_cols, 6 * n_rows))
fig.patch.set_facecolor('lightgray')

# One 'viridis' colour per behavior, sampled evenly across the colormap
colors = plt.cm.viridis(np.linspace(0, 1, n_behaviors))

# Marker settings shared by every panel
point_style = dict(marker='o', alpha=0.7, s=50)

# Fill the grid, one behavior per panel
for i, behavior in enumerate(behaviors):
    # Rows annotated with this behavior
    is_current = data['CombinedLabel'] == behavior
    behavior_data = data[is_current]

    # Skip labels without rows
    if behavior_data.empty:
        print(f"No data available for behavior: {behavior}")
        continue

    # One series per accelerometer axis
    x, y, z = behavior_data['X-axis (g)'], behavior_data['Y-axis (g)'], behavior_data['Z-axis (g)']

    # Panel number i + 1 (subplot positions start at 1)
    ax = fig.add_subplot(n_rows, n_cols, i + 1, projection='3d')

    # Samples drawn in this behavior's colour
    sc = ax.scatter(x, y, z, color=colors[i % len(colors)], **point_style)

    # Large bold axis names
    for set_axis_label, axis_text in (
        (ax.set_xlabel, 'X-axis (g)'),
        (ax.set_ylabel, 'Y-axis (g)'),
        (ax.set_zlabel, 'Z-axis (g)'),
    ):
        set_axis_label(axis_text, fontsize=20, fontweight='bold')

    # Title lifted 20 points above the axes; dashed, faint grid
    ax.set_title(f'Behavior: {behavior}', fontsize=20, fontweight='bold', pad=20)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Same camera angle in every panel
    ax.view_init(elev=20, azim=30)

# Widen the gaps between panels
plt.subplots_adjust(hspace=0.4, wspace=0.4)

# Show the plot
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

# Read the recording
data = pd.read_csv('CURC.csv')

# Tag every row with a combined "<locomotion>-<feeding>" label
locomotion_text = data['IteragreementLocom'].astype(str)
feeding_text = data['IteragreementFeeding'].astype(str)
data['CombinedLabel'] = locomotion_text + '-' + feeding_text

# Distinct combined labels found in the file
behaviors = data['CombinedLabel'].unique()

# Grid geometry: three panels per row, enough rows for every behavior
n_behaviors = len(behaviors)
n_cols = 3
n_rows = -(-n_behaviors // n_cols)  # ceiling division

# One figure sized at 5x5 inches per panel, on a light-gray background
fig = plt.figure(figsize=(5 * n_cols, 5 * n_rows))
fig.patch.set_facecolor('lightgray')

# One 'viridis' colour per behavior, sampled evenly across the colormap
colors = plt.cm.viridis(np.linspace(0, 1, n_behaviors))

# One 3D panel per behavior, filled row by row
for i, behavior in enumerate(behaviors):
    # Rows annotated with this behavior
    is_current = data['CombinedLabel'] == behavior
    behavior_data = data[is_current]

    # Skip labels without rows
    if behavior_data.empty:
        print(f"No data available for behavior: {behavior}")
        continue

    # One series per accelerometer axis
    x, y, z = behavior_data['X-axis (g)'], behavior_data['Y-axis (g)'], behavior_data['Z-axis (g)']

    # Subplot positions are 1-based
    ax = fig.add_subplot(n_rows, n_cols, i + 1, projection='3d')

    # Samples drawn in this behavior's colour
    sc = ax.scatter(x, y, z, marker='o', alpha=0.7, color=colors[i % len(colors)], s=50)

    # Large bold axis names
    for set_axis_label, axis_text in (
        (ax.set_xlabel, 'X-axis (g)'),
        (ax.set_ylabel, 'Y-axis (g)'),
        (ax.set_zlabel, 'Z-axis (g)'),
    ):
        set_axis_label(axis_text, fontsize=20, fontweight='bold')

    # Panel title and default-styled grid
    ax.set_title(f'Behavior: {behavior}', fontsize=20, fontweight='bold')
    ax.grid(True)

    # Same camera angle in every panel
    ax.view_init(elev=20, azim=30)

# Adjust layout to prevent overlap
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
import pandas as pd

# Load your dataset
data = pd.read_csv('data.csv')

# Define thresholds based on the study
daytime_grazing_threshold = 40  # 40% grazing during the day
nighttime_grazing_threshold = 16  # 16% grazing during the night

# Readable name for each numeric behavior code
behavior_labels = {0: 'Feeding', 1: 'Rumination', 2: 'Standing', 3: 'Lying', 4: 'Walking'}

# Check if 'behavior' column exists
if 'behavior' in data.columns:
    # Count occurrences of each behavior code
    behavior_counts = data['behavior'].value_counts()

    # Print the distribution of behaviors
    print("Distribution of Behavior Labels:")
    for behavior_code, count in behavior_counts.items():
        # A code missing from behavior_labels is reported as "Unknown (<code>)"
        # instead of aborting the whole summary with a KeyError
        behavior_name = behavior_labels.get(behavior_code, f"Unknown ({behavior_code})")
        print(f"{behavior_name}: {count}")

    # Calculate each code's share of all rows, in percent
    total_records = data.shape[0]
    behavior_percentages = (behavior_counts / total_records) * 100

    # Print percentage of each behavior
    print("\nPercentage of Each Behavior:")
    for behavior_code, percentage in behavior_percentages.items():
        behavior_name = behavior_labels.get(behavior_code, f"Unknown ({behavior_code})")
        print(f"{behavior_name}: {percentage:.2f}%")

    # Shares used for the threshold checks; a code that never occurs counts as 0%
    grazing_percentage = behavior_percentages.get(0, 0)  # Feeding is labeled as 0
    walking_percentage = behavior_percentages.get(4, 0)  # Walking is labeled as 4

    if grazing_percentage > daytime_grazing_threshold:
        print(f"Overgrazing detected: Grazing {grazing_percentage:.2f}%, exceeding the {daytime_grazing_threshold}% threshold.")
    else:
        print(f"No overgrazing detected: Grazing {grazing_percentage:.2f}%, within the {daytime_grazing_threshold}% threshold.")

    # NOTE(review): the walking share is compared with the *night-time grazing*
    # threshold - confirm that 16% is really the intended limit for walking.
    if walking_percentage > nighttime_grazing_threshold:
        print(f"Excessive walking detected: Walking {walking_percentage:.2f}%, exceeding the {nighttime_grazing_threshold}% threshold.")
    else:
        print(f"No excessive walking detected: Walking {walking_percentage:.2f}%, within the {nighttime_grazing_threshold}% threshold.")
else:
    raise KeyError("The column 'behavior' is missing in the dataset.")

In [None]:
import pandas as pd

# Load your dataset (replace 'CURC.csv' with your actual file)
data = pd.read_csv('CURC.csv')

# Define the column that indicates grazing/feeding behavior
feeding_column = 'IteragreementFeeding'

# Turn the annotation into a 0/1 feeding flag: 1 when the text contains
# "Eating" (e.g. "EatingEating"), 0 otherwise. Casting to str first makes a
# missing annotation (NaN) count as non-feeding instead of raising
# "TypeError: argument of type 'float' is not iterable".
data[feeding_column] = (
    data[feeding_column].astype(str).str.contains('Eating', regex=False).astype(int)
)

# Convert 'Time' to datetime format (only the time of day is used below)
data['Time'] = pd.to_datetime(data['Time'], format='%H:%M:%S')

# Daytime runs from 06:00:00 to 18:00:00, both bounds inclusive
day_start = pd.to_datetime("06:00:00").time()
day_end = pd.to_datetime("18:00:00").time()

# Flag every row as daytime or nighttime
data['is_daytime'] = data['Time'].apply(lambda x: day_start <= x.time() <= day_end)
data['is_nighttime'] = ~data['is_daytime']

# Feeding flags of the rows recorded in each period
daytime_feeding = data.loc[data['is_daytime'], feeding_column]
nighttime_feeding = data.loc[data['is_nighttime'], feeding_column]

# Number of feeding samples in each period
daytime_grazing_samples = daytime_feeding.sum()
nighttime_grazing_samples = nighttime_feeding.sum()

# Share of the recorded samples in each period that are feeding, in percent.
# Dividing by the number of recorded samples (rather than treating each row as
# one second of a fixed 12-hour period) keeps the result correct for any
# sampling rate and for recordings that cover only part of the day or night.
# A period without any samples is reported as 0%.
daytime_grazing_percentage = daytime_feeding.mean() * 100 if len(daytime_feeding) else 0.0
nighttime_grazing_percentage = nighttime_feeding.mean() * 100 if len(nighttime_feeding) else 0.0

# Set thresholds based on the study
daytime_threshold = 40  # 40% for daytime
nighttime_threshold = 16  # 16% for nighttime

# Check for overgrazing
if daytime_grazing_percentage > daytime_threshold:
    print(f"Overgrazing detected during the day: Grazing {daytime_grazing_percentage:.2f}%, exceeding the {daytime_threshold}% threshold.")
else:
    print(f"No overgrazing during the day: Grazing {daytime_grazing_percentage:.2f}%, within the {daytime_threshold}% threshold.")

if nighttime_grazing_percentage > nighttime_threshold:
    print(f"Overgrazing detected during the night: Grazing {nighttime_grazing_percentage:.2f}%, exceeding the {nighttime_threshold}% threshold.")
else:
    print(f"No overgrazing during the night: Grazing {nighttime_grazing_percentage:.2f}%, within the {nighttime_threshold}% threshold.")


In [None]:
import pandas as pd

# Load your dataset (replace 'CURC.csv' with your actual file)
data = pd.read_csv('CURC.csv')

# Define the behavior columns
locomotion_column = 'IteragreementLocom'
feeding_column = 'IteragreementFeeding'

# Convert columns to binary indicators (1 when the annotation contains the
# keyword, 0 otherwise). Casting to str first makes a missing annotation (NaN)
# count as 0 instead of raising "TypeError: argument of type 'float' is not
# iterable".
# NOTE(review): confirm the locomotion annotations really contain the
# substring 'Locomotion'; if they use labels such as 'WalkingWalking' or
# 'StandingStanding', this indicator is always 0.
data[locomotion_column] = (
    data[locomotion_column].astype(str).str.contains('Locomotion', regex=False).astype(int)
)
data[feeding_column] = (
    data[feeding_column].astype(str).str.contains('Eating', regex=False).astype(int)
)

# Calculate the total number of data points (to calculate percentages)
total_points = len(data)

# Count the number of instances for each behavior
locomotion_count = data[locomotion_column].sum()
feeding_count = data[feeding_column].sum()

# Calculate percentages of all rows
locomotion_percentage = (locomotion_count / total_points) * 100
feeding_percentage = (feeding_count / total_points) * 100

# Print out the percentage breakdown of each behavior
print("Percentage breakdown of each behavior:")
print(f"Locomotion: {locomotion_percentage:.2f}%")
print(f"Feeding: {feeding_percentage:.2f}%")

In [None]:
import pandas as pd

# Read the annotated observations (replace 'CURC.csv' with your actual file)
data = pd.read_csv('CURC.csv')

# Raw annotation string -> readable behaviour name
label_mapping = dict(
    WalkingWalking='Walking',
    StandingStanding='Standing',
    EatingEating='Feeding',
)

# Annotations missing from the mapping (including NaN) are grouped as 'Other'.
# NOTE(review): 'EatingEating' is looked up in the locomotion column here, while
# the cell that computes feeding_percentage_total reads it from
# 'IteragreementFeeding' -- confirm which column actually carries feeding labels.
mapped_labels = data['IteragreementLocom'].map(label_mapping)
data['behavior'] = mapped_labels.fillna('Other')

# Absolute frequency, row total and relative frequency (in percent)
behavior_counts = data['behavior'].value_counts()
total_points = len(data)
behavior_percentages = (behavior_counts / total_points) * 100

# Report the raw counts
print("Distribution of Behavior Labels:")
for behavior, count in behavior_counts.items():
    print(f"{behavior}: {count}")

# Report the shares
print("\nPercentage of Each Behavior:")
for behavior, percentage in behavior_percentages.items():
    print(f"{behavior}: {percentage:.2f}%")

# Limits (percent of all rows) above which a behaviour is flagged
daytime_threshold = 40  # % for daytime
walking_threshold = 16  # % for walking

# Shares of the two monitored behaviours; 0 when the behaviour never occurs
feeding_percentage = behavior_percentages.get('Feeding', 0)
walking_percentage = behavior_percentages.get('Walking', 0)

# Verdict on grazing: strictly above the limit counts as overgrazing
print(
    f"\nOvergrazing detected: Grazing {feeding_percentage:.2f}%, exceeding the {daytime_threshold}% threshold."
    if feeding_percentage > daytime_threshold
    else f"\nNo overgrazing detected: Grazing {feeding_percentage:.2f}%, within the {daytime_threshold}% threshold."
)

# Verdict on walking, same rule
print(
    f"Excessive walking detected: Walking {walking_percentage:.2f}%, exceeding the {walking_threshold}% threshold."
    if walking_percentage > walking_threshold
    else f"No excessive walking detected: Walking {walking_percentage:.2f}%, within the {walking_threshold}% threshold."
)

In [None]:
import pandas as pd

# Read the annotated observations (replace 'CURC.csv' with your actual file)
data = pd.read_csv('CURC.csv')

# Raw annotation string -> readable behaviour name
label_mapping = dict([
    ('WalkingWalking', 'Walking'),
    ('StandingStanding', 'Standing'),
    ('EatingEating', 'Feeding'),
    ('otherother', 'Other'),  # explicit entry; unmapped values also fall back to 'Other' below
])

# Add the readable label; anything the mapping does not know becomes 'Other'
data = data.assign(
    behavior=data['IteragreementLocom'].map(label_mapping).fillna('Other')
)

# Absolute frequency of each behaviour
behavior_counts = data['behavior'].value_counts()

# Row total, used as the denominator for the shares
total_points = len(data)

# Relative frequency in percent
behavior_percentages = behavior_counts.div(total_points).mul(100)

# Report the raw counts
print("Distribution of Behavior Labels:")
for behavior, count in behavior_counts.items():
    print(f"{behavior}: {count}")

# Report the shares
print("\nPercentage of Each Behavior:")
for behavior, percentage in behavior_percentages.items():
    print(f"{behavior}: {percentage:.2f}%")

# Limits (percent of all rows) above which a behaviour is flagged
daytime_threshold = 40  # % for daytime grazing
walking_threshold = 16  # % for walking

# Shares of the two monitored behaviours; 0 when the behaviour never occurs
feeding_percentage = behavior_percentages.get('Feeding', 0)
walking_percentage = behavior_percentages.get('Walking', 0)

# Build each verdict first, then print it
if feeding_percentage > daytime_threshold:
    grazing_message = f"\nOvergrazing detected: Grazing {feeding_percentage:.2f}%, exceeding the {daytime_threshold}% threshold."
else:
    grazing_message = f"\nNo overgrazing detected: Grazing {feeding_percentage:.2f}%, within the {daytime_threshold}% threshold."
print(grazing_message)

if walking_percentage > walking_threshold:
    walking_message = f"Excessive walking detected: Walking {walking_percentage:.2f}%, exceeding the {walking_threshold}% threshold."
else:
    walking_message = f"No excessive walking detected: Walking {walking_percentage:.2f}%, within the {walking_threshold}% threshold."
print(walking_message)

In [None]:
import matplotlib.pyplot as plt

# Hard-coded behaviour summaries of the three datasets (labels, counts, shares in %)
data1_labels = ['Standing', 'Walking']
data1_counts = [2987, 268]
data1_percentages = [91.77, 8.23]

data2_labels = ['Grazing', 'Lying-Ruminating', 'Lying-Resting', 'Standing-Resting', 'Walking', 'Standing-Ruminating']
data2_counts = [5732, 2034, 1702, 1559, 1229, 832]
data2_percentages = [43.80, 14.92, 12.58, 11.33, 9.39, 6.29]

data3_labels = ['Walking', 'Feeding', 'Rumination', 'Lying', 'Standing']
data3_counts = [592645, 359711, 294985, 3368, 1827]
data3_percentages = [47.32, 28.72, 23.55, 0.27, 0.15]

# One row of panels per dataset: counts on the left, percentages on the right
fig, axs = plt.subplots(3, 2, figsize=(14, 18))

# (dataset number, labels, counts, percentages, bar colour(s), rotate x tick labels?)
panel_specs = [
    (1, data1_labels, data1_counts, data1_percentages, ['blue', 'orange'], False),
    (2, data2_labels, data2_counts, data2_percentages, 'green', True),
    (3, data3_labels, data3_counts, data3_percentages, 'red', True),
]

for panel_row, (dataset_no, panel_labels, panel_counts, panel_shares, bar_color, rotate_ticks) in enumerate(panel_specs):
    # Left panel: absolute counts
    count_ax = axs[panel_row, 0]
    count_ax.bar(panel_labels, panel_counts, color=bar_color)
    count_ax.set_title(f'Distribution of Behavior Labels (Data {dataset_no})')
    count_ax.set_ylabel('Count')
    if rotate_ticks:
        count_ax.tick_params(axis='x', rotation=45)

    # Right panel: shares in percent
    share_ax = axs[panel_row, 1]
    share_ax.bar(panel_labels, panel_shares, color=bar_color)
    share_ax.set_title(f'Percentage of Each Behavior (Data {dataset_no})')
    share_ax.set_ylabel('Percentage (%)')
    if rotate_ticks:
        share_ax.tick_params(axis='x', rotation=45)

# Adjust layout
plt.tight_layout()

# Show plots
plt.show()

# Threshold verdicts, two lines per dataset in the order Data 1, 2, 3
threshold_messages = [
    "No overgrazing detected: Grazing 0.00%, within the 40% threshold.",
    "No excessive walking detected: Walking 8.23%, within the 16% threshold.",
    "Overgrazing detected: Grazing 43.80%, exceeding the 40% threshold.",
    "No excessive walking detected: Walking 9.39%, within the 16% threshold.",
    "No overgrazing detected: Grazing 28.72%, within the 40% threshold.",
    "Excessive walking detected: Walking 47.32%, exceeding the 16% threshold.",
]
for threshold_message in threshold_messages:
    print(threshold_message)

In [None]:
import matplotlib.pyplot as plt

# Hard-coded behaviour summaries (labels, counts, shares in %) for Dataset 1
labels1 = ['Walking', 'Feeding', 'Rumination', 'Lying', 'Standing']
sizes1 = [592645, 359711, 294985, 3368, 1827]
percentages1 = [47.32, 28.72, 23.55, 0.27, 0.15]

# ... for Dataset 4
labels4 = ['Grazing', 'Lying-Ruminating', 'Lying-Resting', 'Standing-Resting', 'Walking', 'Standing-Ruminating']
sizes4 = [5732, 2034, 1702, 1559, 1229, 832]
percentages4 = [43.80, 14.92, 12.58, 11.33, 9.39, 6.29]

# ... and for Dataset 5
labels5 = ['Standing', 'Walking']
sizes5 = [2987, 268]
percentages5 = [91.77, 8.23]

# One pie per dataset, side by side
fig, axs = plt.subplots(1, 3, figsize=(18, 6))

# Every pie takes the first len(sizes) colours of this palette.
# The wedge percentages drawn by autopct are computed from the sizes lists;
# the percentages* lists above are not used for drawing.
pie_palette = ['blue', 'orange', 'green', 'red', 'purple', 'cyan']
pie_specs = [
    (1, sizes1, labels1),
    (4, sizes4, labels4),
    (5, sizes5, labels5),
]
for pie_ax, (dataset_no, pie_sizes, pie_labels) in zip(axs, pie_specs):
    pie_ax.pie(pie_sizes, labels=pie_labels, autopct='%1.2f%%', colors=pie_palette[:len(pie_sizes)])
    pie_ax.set_title(f'Dataset {dataset_no}: Percentage of Each Behavior')

# Adjust layout
plt.tight_layout()

# Show plots
plt.show()

# Threshold verdicts, grouped per dataset
threshold_report = [
    "Dataset 1:",
    "No overgrazing detected: Grazing 28.72%, within the 40% threshold.",
    "Excessive walking detected: Walking 47.32%, exceeding the 16% threshold.",
    "\nDataset 4:",
    "Overgrazing detected: Grazing 43.80%, exceeding the 40% threshold.",
    "No excessive walking detected: Walking 9.39%, within the 16% threshold.",
    "\nDataset 5:",
    "No overgrazing detected: Grazing 0.00%, within the 40% threshold.",
    "No excessive walking detected: Walking 8.23%, within the 16% threshold.",
]
for report_line in threshold_report:
    print(report_line)

In [None]:
import pandas as pd

# Per-dataset verdicts, one entry per dataset in the same order in all three lists
dataset_names = ['Dataset 1', 'Dataset 4', 'Dataset 5']
overgrazing_status = [
    'No overgrazing detected: Grazing 28.72%, within the 40% threshold.',
    'Overgrazing detected: Grazing 43.80%, exceeding the 40% threshold.',
    'No overgrazing detected: Grazing 0.00%, within the 40% threshold.',
]
walking_status = [
    'Excessive walking detected: Walking 47.32%, exceeding the 16% threshold.',
    'No excessive walking detected: Walking 9.39%, within the 16% threshold.',
    'No excessive walking detected: Walking 8.23%, within the 16% threshold.',
]

# Column name -> column values; insertion order fixes the column order of the table
data = {
    'Dataset': dataset_names,
    'Overgrazing Status': overgrazing_status,
    'Walking Status': walking_status,
}

# Tabulate the verdicts
df = pd.DataFrame(data)

# Show the table as plain text without the row index
print(df.to_string(index=False))

In [None]:
import pandas as pd

# Read the annotated observations (replace 'CURC.csv' with your actual file path)
data = pd.read_csv('CURC.csv')

# Raw annotation string -> readable behaviour name
raw_labels = ('WalkingWalking', 'StandingStanding', 'EatingEating', 'otherother')
readable_labels = ('Walking', 'Standing', 'Feeding', 'Other')
label_mapping = dict(zip(raw_labels, readable_labels))

# Translate the locomotion annotations; values outside the mapping become 'Other'
locomotion_annotations = data['IteragreementLocom']
data['behavior'] = locomotion_annotations.map(label_mapping).fillna('Other')

# Absolute frequency, row total and relative frequency (in percent)
behavior_counts = data['behavior'].value_counts()
total_points = len(data)
behavior_percentages = (behavior_counts / total_points) * 100

# Report the raw counts
print("Distribution of Behavior Labels:")
for behavior, count in behavior_counts.items():
    print(f"{behavior}: {count}")

# Report the shares
print("\nPercentage of Each Behavior:")
for behavior, percentage in behavior_percentages.items():
    print(f"{behavior}: {percentage:.2f}%")

# Limits (percent of all rows) above which a behaviour is flagged
daytime_threshold, walking_threshold = 40, 16  # % for daytime grazing, % for walking

# Shares of the two monitored behaviours; 0 when the behaviour never occurs
feeding_percentage = behavior_percentages.get('Feeding', 0)
walking_percentage = behavior_percentages.get('Walking', 0)

# Feeding verdict: strictly above the limit counts as overgrazing
feeding_exceeded = feeding_percentage > daytime_threshold
if feeding_exceeded:
    print(f"\nOvergrazing detected: Feeding {feeding_percentage:.2f}%, exceeding the {daytime_threshold}% threshold.")
if not feeding_exceeded:
    print(f"\nNo overgrazing detected: Feeding {feeding_percentage:.2f}%, within the {daytime_threshold}% threshold.")

# Walking verdict, same rule
walking_exceeded = walking_percentage > walking_threshold
if walking_exceeded:
    print(f"Excessive walking detected: Walking {walking_percentage:.2f}%, exceeding the {walking_threshold}% threshold.")
if not walking_exceeded:
    print(f"No excessive walking detected: Walking {walking_percentage:.2f}%, within the {walking_threshold}% threshold.")

In [None]:
import pandas as pd

# Read the annotated observations (replace 'CURC.csv' with your actual file)
data = pd.read_csv('CURC.csv')

# Raw annotation string -> readable behaviour name
label_mapping = dict(
    WalkingWalking='Walking',
    StandingStanding='Standing',
    EatingEating='Feeding',
    otherother='Other',  # explicit entry; unmapped values also fall back to 'Other' below
)

# Translate the locomotion annotations; values outside the mapping become 'Other'
data['behavior'] = data['IteragreementLocom'].map(label_mapping).fillna('Other')

# Absolute frequency, row total and relative frequency (in percent)
behavior_counts = data['behavior'].value_counts()
total_points = len(data)
behavior_percentages = (behavior_counts / total_points) * 100

# Report the raw counts
print("Distribution of Behavior Labels:")
for behavior, count in behavior_counts.items():
    print(f"{behavior}: {count}")

# Report the shares
print("\nPercentage of Each Behavior:")
for behavior, percentage in behavior_percentages.items():
    print(f"{behavior}: {percentage:.2f}%")

# Shares taken from the locomotion-derived labels; 0 when the behaviour never occurs.
# feeding_percentage is kept for reference only: the grazing verdict below uses
# feeding_percentage_total, which comes from the feeding column.
feeding_percentage = behavior_percentages.get('Feeding', 0)
walking_percentage = behavior_percentages.get('Walking', 0)

# Grazing share from the dedicated feeding column: 'EatingEating' rows count as 1,
# 'otherother' rows as 0, and any other or missing annotation is treated as 0
feeding_flags = {'EatingEating': 1, 'otherother': 0}
feeding_data = data['IteragreementFeeding'].map(feeding_flags).fillna(0)
feeding_count = feeding_data.sum()
feeding_percentage_total = (feeding_count / total_points) * 100

# Limits (percent of all rows) above which a behaviour is flagged
daytime_threshold = 40  # % for daytime grazing
walking_threshold = 16  # % for walking

# Grazing verdict: strictly above the limit counts as overgrazing
print(
    f"\nOvergrazing detected: Grazing {feeding_percentage_total:.2f}%, exceeding the {daytime_threshold}% threshold."
    if feeding_percentage_total > daytime_threshold
    else f"\nNo overgrazing detected: Grazing {feeding_percentage_total:.2f}%, within the {daytime_threshold}% threshold."
)

# Walking verdict, same rule
print(
    f"Excessive walking detected: Walking {walking_percentage:.2f}%, exceeding the {walking_threshold}% threshold."
    if walking_percentage > walking_threshold
    else f"No excessive walking detected: Walking {walking_percentage:.2f}%, within the {walking_threshold}% threshold."
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns  # Optional: for heatmap styling

# Random Forest behaviour classifier on windowed accelerometer features.
#
# Evaluation protocol: the windows are split into train and test FIRST; the
# imputer, the scaler and SMOTE are then fitted on the training windows only.
# Fitting them (SMOTE in particular) on all windows before splitting lets
# synthetic samples interpolated from test windows end up in the training set,
# which inflates every metric reported below.

# Load the data
data = pd.read_csv("data.csv")

# Define window size
window_size = 15  # Approximately 15 data points for 0.5 seconds

# Statistics computed per accelerometer axis. The nesting order (statistic, then
# axis) yields the columns mean_acc_x, mean_acc_y, mean_acc_z, std_acc_x, ...
feature_stats = ('mean', 'std', 'skew', 'kurt')
acc_axes = ('x', 'y', 'z')

# Initialize lists to store windowed features and labels
windowed_features = []
windowed_labels = []

# Segment the time series into non-overlapping windows; the range bound stops
# before a trailing partial window, so every slice holds window_size rows
for i in range(0, len(data) - window_size + 1, window_size):
    window = data.iloc[i:i + window_size]

    window_features = {
        f'{stat}_acc_{axis}': getattr(window[f'acc_{axis}'], stat)()
        for stat in feature_stats
        for axis in acc_axes
    }
    windowed_features.append(window_features)

    # The window takes the label of its first sample (assumes the label is
    # constant within a window)
    window_label = window['behavior'].iloc[0]  # Adjust based on your specific label
    windowed_labels.append(window_label)

# Convert lists to DataFrame
X_windowed = pd.DataFrame(windowed_features)
y_windowed = np.array(windowed_labels)

# Hold out the test windows before any fitted preprocessing step sees them
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_windowed, y_windowed, test_size=0.2, stratify=y_windowed, random_state=42
)

# Handle missing values (column means learned from the training windows only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scale the input features (statistics learned from the training windows only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# The test windows only go through the already-fitted transformers
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE, on the training windows only.
# SMOTE needs more samples in the smallest class than k_neighbors, so the
# default of 5 is lowered only when the smallest training class requires it.
train_class_sizes = np.unique(y_train_raw, return_counts=True)[1]
smote_neighbors = int(max(1, min(5, train_class_sizes.min() - 1)))
smote = SMOTE(random_state=42, k_neighbors=smote_neighbors)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Instantiate Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)

# Train the Random Forest classifier
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred_rf = rf_classifier.predict(X_test)

# Evaluate the model's performance (class-frequency weighted averages)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf, average='weighted')
precision = precision_score(y_test, y_pred_rf, average='weighted')
recall = recall_score(y_test, y_pred_rf, average='weighted')

print(f"Accuracy: {accuracy_rf * 100:.2f}%")
print(f"F1 Score: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

# Confusion matrix with an explicit label order, so that row/column i always
# refers to class_labels[i] in the heatmap and in the per-class loop below
class_labels = rf_classifier.classes_
conf_matrix = confusion_matrix(y_test, y_pred_rf, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Sensitivity and Specificity calculation (one-vs-rest per class)
sensitivity_specificity = {}
overall_sensitivity = 0
overall_specificity = 0

for i, label in enumerate(class_labels):
    tp = conf_matrix[i, i]                   # class i predicted as class i
    fn = conf_matrix[i, :].sum() - tp        # class i predicted as something else
    fp = conf_matrix[:, i].sum() - tp        # other classes predicted as class i
    tn = conf_matrix.sum() - (tp + fn + fp)  # everything not involving class i

    # A class without any true (or any negative) samples reports 0 instead of dividing by zero
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    sensitivity_specificity[label] = {
        'Sensitivity (Recall)': sensitivity,
        'Specificity': specificity
    }

    overall_sensitivity += sensitivity
    overall_specificity += specificity

# Overall values are the unweighted (macro) mean over the classes
num_classes = len(class_labels)
overall_sensitivity /= num_classes
overall_specificity /= num_classes

# Print sensitivity and specificity for each label
for label, metrics in sensitivity_specificity.items():
    print(f"\nLabel: {label}")
    print(f"Sensitivity: {metrics['Sensitivity (Recall)']:.2f}")
    print(f"Specificity: {metrics['Specificity']:.2f}")

# Print overall sensitivity and specificity
print(f"\nOverall Sensitivity: {overall_sensitivity:.2f}")
print(f"Overall Specificity: {overall_specificity:.2f}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns  # Optional: for heatmap styling

# Recordings to pool, one CSV per cow
file_names = ["cow1.csv", "cow2.csv", "cow3.csv", "cow4.csv", "cow5.csv", "cow6.csv"]

# Read every recording into its own DataFrame
all_data = []
for file_name in file_names:
    data = pd.read_csv(file_name)
    all_data.append(data)

# Stack the recordings into one frame with a fresh 0..n-1 index.
# NOTE(review): the windows below are cut from the stacked frame, so a window
# can straddle the boundary between two cows' recordings -- confirm this is acceptable.
combined_data = pd.concat(all_data, ignore_index=True)

# Define window size
window_size = 15  # Approximately 15 data points for 0.5 seconds

# Statistics computed per accelerometer axis. The nesting order (statistic, then
# axis) yields the columns mean_acc_x, mean_acc_y, mean_acc_z, std_acc_x, ...
feature_stats = ('mean', 'std', 'skew', 'kurt')
acc_axes = ('X', 'Y', 'Z')

# Per-window feature dicts and labels, filled in step
windowed_features = []
windowed_labels = []

# Non-overlapping windows; the range bound stops before a trailing partial
# window, so every slice holds exactly window_size rows
last_window_start = len(combined_data) - window_size
for i in range(0, last_window_start + 1, window_size):
    window = combined_data.iloc[i:i + window_size]

    window_features = {
        f'{stat}_acc_{axis.lower()}': getattr(window[f'Acc{axis}'], stat)()
        for stat in feature_stats
        for axis in acc_axes
    }
    windowed_features.append(window_features)

    # The window takes the label of its first sample (assumes the label is
    # constant within a window)
    window_label = window['Label'].iloc[0]
    windowed_labels.append(window_label)

# Feature matrix (one row per window) and label vector
X_windowed = pd.DataFrame(windowed_features)
y_windowed = np.array(windowed_labels)

# Replace missing feature values by the column mean
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_windowed)

# Standardise every feature
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Balance the classes with SMOTE, using 2 neighbours instead of the default.
# NOTE(review): imputer, scaler and SMOTE are fitted on ALL windows before the
# split below, so synthetic samples interpolated from test windows can land in
# the training set and the reported metrics are optimistic. Splitting first and
# resampling only the training part is the usual remedy -- not changed here
# because very small classes may not survive a split; confirm class sizes.
smote = SMOTE(random_state=42, k_neighbors=2)  # Reduce k_neighbors to 2
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_windowed)

# Stratified 80/20 split of the resampled data
split_parts = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)
X_train, X_test, y_train, y_test = split_parts

# Accuracy of every training iteration is collected here
accuracies = []

# Gini score calculation using AUC-ROC
def gini_score(y_true, y_pred_proba):
    """Return the Gini coefficient (2 * AUC - 1) of predicted class probabilities.

    Parameters
    ----------
    y_true : array-like
        True class labels.
    y_pred_proba : array-like
        Predicted scores. Either a 1-D array of positive-class scores, or a 2-D
        array with one column per class as returned by ``predict_proba``.

    Returns
    -------
    float
        ``2 * auc - 1``, where ``auc`` is the ROC AUC (one-vs-rest, weighted by
        class frequency in the multiclass case).
    """
    scores = np.asarray(y_pred_proba)
    if scores.ndim == 2 and scores.shape[1] == 2:
        # Two-class problem: roc_auc_score expects 1-D scores of the class with
        # the greater label and rejects the two-column predict_proba output
        auc = roc_auc_score(y_true, scores[:, 1])
    else:
        # Multiclass (or already 1-D) scores: one-vs-rest AUC, weighted by support
        auc = roc_auc_score(y_true, scores, multi_class='ovr', average='weighted')
    # Gini coefficient is derived from AUC
    return 2 * auc - 1

# Perform 1 iteration (train, predict, report metrics)
for i in range(1):
    # Instantiate Random Forest classifier with random subspace
    rf_classifier = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)

    # Train the Random Forest classifier
    rf_classifier.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred_rf = rf_classifier.predict(X_test)
    y_pred_proba_rf = rf_classifier.predict_proba(X_test)  # Get predicted probabilities

    # Evaluate the Random Forest model's performance (class-frequency weighted averages)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    f1 = f1_score(y_test, y_pred_rf, average='weighted')
    precision = precision_score(y_test, y_pred_rf, average='weighted')
    recall = recall_score(y_test, y_pred_rf, average='weighted')

    # Calculate Gini score using predicted probabilities
    gini = gini_score(y_test, y_pred_proba_rf)

    accuracies.append(accuracy_rf)
    print(f"Accuracy on test data (Random Forest with 0.5-second window size) for iteration {i + 1}: {accuracy_rf * 100:.2f}%")
    print(f"F1 Score: {f1:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Gini Score: {gini:.2f}")

    # Confusion matrix with an explicit label order (sorted labels seen in the
    # true or predicted values), so row/column k always refers to class_labels[k]
    class_labels = np.unique(np.concatenate([y_test, y_pred_rf]))
    conf_matrix = confusion_matrix(y_test, y_pred_rf, labels=class_labels)

    # Calculate Sensitivity and Specificity (one-vs-rest per class).
    # The matrix is indexed by POSITION, not by the label value itself: label
    # values may be strings or non-contiguous integers.
    sensitivity_specificity = {}
    for class_index, label in enumerate(class_labels):
        tp = conf_matrix[class_index, class_index]  # True Positives
        fn = conf_matrix[class_index, :].sum() - tp  # False Negatives
        fp = conf_matrix[:, class_index].sum() - tp  # False Positives
        tn = conf_matrix.sum() - (tp + fn + fp)  # True Negatives

        # A class without true (or negative) samples reports 0 instead of dividing by zero
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

        sensitivity_specificity[label] = {
            'Sensitivity (Recall)': sensitivity,
            'Specificity': specificity
        }

    # Print Sensitivity and Specificity for each label
    for label, metrics in sensitivity_specificity.items():
        print(f"\nLabel: {label}")
        print(f"Sensitivity: {metrics['Sensitivity (Recall)']:.2f}")
        print(f"Specificity: {metrics['Specificity']:.2f}")

In [None]:
# Per-class sensitivity and specificity (one-vs-rest) from the confusion matrix
# produced by the preceding evaluation cell.
sensitivity_specificity = {}

# Row sums = samples that truly belong to each class; column sums = samples predicted as it
actual_totals = conf_matrix.sum(axis=1)
predicted_totals = conf_matrix.sum(axis=0)
grand_total = conf_matrix.sum()

for i, label in enumerate(np.unique(y_test)):  # position i addresses the matrix, label keys the result
    tp = conf_matrix[i, i]
    fn = actual_totals[i] - tp
    fp = predicted_totals[i] - tp
    tn = grand_total - (tp + fn + fp)

    # Report 0 when a ratio has an empty denominator
    if (tp + fn) > 0:
        sensitivity = tp / (tp + fn)
    else:
        sensitivity = 0
    if (tn + fp) > 0:
        specificity = tn / (tn + fp)
    else:
        specificity = 0

    sensitivity_specificity[label] = {
        'Sensitivity (Recall)': sensitivity,
        'Specificity': specificity
    }

# Print Sensitivity and Specificity for each label
for label, metrics in sensitivity_specificity.items():
    print(f"\nLabel: {label}")
    print(f"Sensitivity: {metrics['Sensitivity (Recall)']:.2f}")
    print(f"Specificity: {metrics['Specificity']:.2f}")

In [None]:
# Per-class sensitivity and specificity, keyed by label
sensitivity_specificity = {}

# Total number of test samples in the confusion matrix (loop invariant)
matrix_total = conf_matrix.sum()

# Walk the classes in sorted order; position i addresses the matrix row/column
for i, label in enumerate(np.unique(y_test)):
    tp = conf_matrix[i, i]             # True Positive: class i predicted as class i
    fn = conf_matrix[i].sum() - tp     # False Negative: class i predicted as another class
    fp = conf_matrix[:, i].sum() - tp  # False Positive: other classes predicted as class i
    tn = matrix_total - (tp + fn + fp)  # True Negative: everything not involving class i

    # Denominators: all true members of the class / all non-members
    positives = tp + fn
    negatives = tn + fp

    # A ratio with an empty denominator is reported as 0
    sensitivity = tp / positives if positives > 0 else 0  # Sensitivity (Recall)
    specificity = tn / negatives if negatives > 0 else 0  # Specificity

    # Store the calculated metrics in the dictionary
    sensitivity_specificity[label] = {
        'Sensitivity (Recall)': sensitivity,
        'Specificity': specificity
    }

# Print Sensitivity and Specificity for each label
for label, metrics in sensitivity_specificity.items():
    print(f"\nLabel: {label}")
    print(f"Sensitivity: {metrics['Sensitivity (Recall)']:.2f}")
    print(f"Specificity: {metrics['Specificity']:.2f}")

In [None]:
# Pooled (micro-averaged) sensitivity and specificity: the one-vs-rest counts of
# every class are summed first, and the ratios are taken on the totals.
total_tp = total_fn = total_fp = total_tn = 0

# Loop invariants: number of classes seen in the test labels, and matrix total
n_test_classes = len(np.unique(y_test))
matrix_total = conf_matrix.sum()

for i in range(n_test_classes):
    tp = conf_matrix[i, i]              # True Positive
    fn = conf_matrix[i, :].sum() - tp   # False Negative
    fp = conf_matrix[:, i].sum() - tp   # False Positive
    tn = matrix_total - (tp + fn + fp)  # True Negative

    total_tp, total_fn = total_tp + tp, total_fn + fn
    total_fp, total_tn = total_fp + fp, total_tn + tn

# Ratios on the pooled counts; an empty denominator is reported as 0
if (total_tp + total_fn) > 0:
    overall_sensitivity = total_tp / (total_tp + total_fn)
else:
    overall_sensitivity = 0

if (total_tn + total_fp) > 0:
    overall_specificity = total_tn / (total_tn + total_fp)
else:
    overall_specificity = 0

# Print overall Sensitivity and Specificity
print(f"\nOverall Sensitivity (Recall): {overall_sensitivity:.2f}")
print(f"Overall Specificity: {overall_specificity:.2f}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Random Forest behaviour classifier on the pooled, pre-aggregated feature files.
#
# Two leakage sources are avoided here:
#   * the 'label' column is kept out of the feature matrix (averaging the whole
#     window would otherwise feed the target to the model as a feature);
#   * the data is split into train and test BEFORE imputing, scaling and SMOTE,
#     so synthetic samples are never interpolated from test windows.

# Define column labels (adjust as per your dataset)
# Reference list of the expected columns; not used by the code below.
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Initialize lists to store all data
data_frames = []

# Load all data files and append to data_frames list
for file_name in data_files:
    df = pd.read_csv(file_name)
    data_frames.append(df)

# Concatenate all data frames into a single dataframe
data = pd.concat(data_frames, ignore_index=True)

# Assuming 'date' is the column containing timestamps
data['date'] = pd.to_datetime(data['date'])  # Convert to datetime format

# Time difference between consecutive rows (the first row has none and is dropped).
# NOTE(review): the differences also span the joins between the concatenated
# files, so the mean below mixes the files' sampling rates -- confirm intended.
time_diff = data['date'].diff().dropna()

# Calculate frequency (assuming uniform sampling)
mean_time_diff = time_diff.mean()
frequency = 1 / mean_time_diff.total_seconds()  # Convert to Hz

print("Sampling frequency for combined data:", frequency, "Hz")

# Calculate window size based on frequency
window_duration = 0.5  # Seconds
window_size = int(frequency * window_duration)

# Any size below 1 (zero, or negative when the mean time step is negative) is
# raised to 1; a negative step would otherwise make the range below empty
if window_size < 1:
    print("Window size is zero, adjusting to 1")
    window_size = 1

# Feature columns: everything except the timestamp and the target
feature_columns = [column for column in data.columns if column not in ('date', 'label')]

# Extract windowed features and labels
windowed_features = []
windowed_labels = []

# Non-overlapping windows; the range bound stops before a trailing partial window
for i in range(0, len(data) - window_size + 1, window_size):
    window = data.iloc[i:i + window_size]
    # Mean of every numeric feature column over the window
    window_features = window[feature_columns].mean(numeric_only=True)
    # The window takes the label of its first row
    window_label = window['label'].iloc[0]
    windowed_features.append(window_features)
    windowed_labels.append(window_label)

# Convert lists to DataFrame ('date' and 'label' were never part of the features)
X = pd.DataFrame(windowed_features)
y = np.array(windowed_labels)

# Hold out the test windows before any fitted preprocessing step sees them
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Impute missing values (column means learned from the training windows only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scale the input features (statistics learned from the training windows only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# The test windows only go through the already-fitted transformers
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE, on the training windows only.
# SMOTE needs more samples in the smallest class than k_neighbors, so the
# default of 5 is lowered only when the smallest training class requires it.
train_class_sizes = np.unique(y_train_raw, return_counts=True)[1]
smote_neighbors = int(max(1, min(5, train_class_sizes.min() - 1)))
smote = SMOTE(random_state=42, k_neighbors=smote_neighbors)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test accuracy for combined data:", accuracy * 100, "%")

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Pooled (micro-averaged) sensitivity and specificity: the one-vs-rest counts
# of every class are summed first, the ratios are taken on the totals
total_tp = 0  # Total True Positives
total_fn = 0  # Total False Negatives
total_fp = 0  # Total False Positives
total_tn = 0  # Total True Negatives

# Loop through the confusion matrix to aggregate TP, FN, FP, TN
num_classes = conf_matrix.shape[0]  # Get number of classes

for i in range(num_classes):
    TP = conf_matrix[i, i]  # True Positive for class i
    FN = conf_matrix[i, :].sum() - TP  # False Negative for class i
    FP = conf_matrix[:, i].sum() - TP  # False Positive for class i
    TN = conf_matrix.sum() - (TP + FN + FP)  # True Negative for class i

    total_tp += TP
    total_fn += FN
    total_fp += FP
    total_tn += TN

# Ratios on the pooled counts; an empty denominator is reported as 0
overall_sensitivity = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
overall_specificity = total_tn / (total_tn + total_fp) if (total_tn + total_fp) > 0 else 0

# Print overall sensitivity and specificity
print(f"\nOverall Sensitivity (Recall): {overall_sensitivity:.2f}")
print(f"Overall Specificity: {overall_specificity:.2f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
data = pd.read_csv("dataset_6.csv")

# Define window size
window_size = 15  # Approximately 15 data points for 0.5 seconds

# Segment the time series into non-overlapping windows and summarise each window
# with the mean, standard deviation, skewness and kurtosis of every axis.
windowed_features = []
windowed_labels = []
for start in range(0, len(data) - window_size + 1, window_size):
    # The range() bounds guarantee that every slice holds exactly window_size rows
    window = data.iloc[start:start + window_size]
    windowed_features.append({
        f'{stat}_acc_{axis}': getattr(window[f'acc_{axis}'], stat)()
        for stat in ('mean', 'std', 'skew', 'kurt')
        for axis in ('x', 'y', 'z')
    })

    # The first sample's label stands for the whole window
    # (assumes the label is constant within a window)
    windowed_labels.append(window['label'].iloc[0])

X_windowed = pd.DataFrame(windowed_features)
y_windowed = np.array(windowed_labels)

# Hold out the test windows BEFORE fitting anything. Fitting the imputer/scaler
# or running SMOTE on the full data lets test information leak into training
# (synthetic samples interpolated from test windows) and inflates every metric.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_windowed, y_windowed, test_size=0.2, stratify=y_windowed, random_state=42
)

# Learn imputation means and scaling statistics on the training windows only,
# then apply the fitted transforms to the test windows
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

# Balance the classes with SMOTE on the training split only. k_neighbors is
# capped so that a small minority class does not make SMOTE fail; with at
# least 6 samples per class this is the default of 5.
min_class_count = pd.Series(y_train_raw).value_counts().min()
smote = SMOTE(random_state=42, k_neighbors=int(max(1, min(5, min_class_count - 1))))
X_train, y_train = smote.fit_resample(X_train_scaled, y_train_raw)

# Train the Random Forest classifier and predict the untouched test windows
rf_classifier = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)

# Evaluate the Random Forest model's performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf, average='weighted')
precision = precision_score(y_test, y_pred_rf, average='weighted')
recall = recall_score(y_test, y_pred_rf, average='weighted')

# Print metrics for this single iteration
print("Single Iteration Performance:")
print(f"  Accuracy: {accuracy_rf * 100:.2f}%")
print(f"  F1 Score: {f1:.2f}")
print(f"  Precision: {precision:.2f}")
print(f"  Recall: {recall:.2f}")

# Confusion matrix with an explicit label order so that the heatmap tick labels
# are guaranteed to line up with the matrix rows/columns
class_labels = np.unique(np.concatenate((y_test, y_pred_rf)))
conf_matrix = confusion_matrix(y_test, y_pred_rf, labels=class_labels)

# One-vs-rest counts per class (rows = true class, columns = predicted class)
tp_per_class = conf_matrix.diagonal()
fn_per_class = conf_matrix.sum(axis=1) - tp_per_class
fp_per_class = conf_matrix.sum(axis=0) - tp_per_class
tn_per_class = conf_matrix.sum() - (tp_per_class + fn_per_class + fp_per_class)

# Pool the counts over classes. True negatives must be counted per class:
# deriving them from the pooled totals yields a negative number and a
# specificity that is always reported as 0.
total_tp = tp_per_class.sum()
total_fn = fn_per_class.sum()
total_fp = fp_per_class.sum()
total_tn = tn_per_class.sum()

overall_sensitivity = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
overall_specificity = total_tn / (total_tn + total_fp) if (total_tn + total_fp) > 0 else 0

# Print overall sensitivity and specificity
print(f"Overall Sensitivity (Recall): {overall_sensitivity:.2f}")
print(f"Overall Specificity: {overall_specificity:.2f}")

# Plot confusion matrix for the single iteration
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Load the dataset (replace 'CURC.csv' with your actual file path)
data = pd.read_csv('CURC.csv')

# Parse 'Time' as HH:MM:SS and keep only the time-of-day part
data['Time'] = pd.to_datetime(data['Time'], format='%H:%M:%S').dt.time

# Window length
window_size_seconds = 6  # Adjust this value as needed
# NOTE(review): this treats one row as one second — confirm the sampling rate
window_size_samples = window_size_seconds

# Accelerometer columns, keyed by the short axis name used in the feature names
axis_columns = {'x': 'X-axis (g)', 'y': 'Y-axis (g)', 'z': 'Z-axis (g)'}

# Segment the time series into non-overlapping windows and summarise each window
# with the mean, standard deviation, skewness and kurtosis of every axis.
windowed_features = []
windowed_labels = []
for start in range(0, len(data) - window_size_samples + 1, window_size_samples):
    # The range() bounds guarantee that every slice holds exactly window_size_samples rows
    window = data.iloc[start:start + window_size_samples]
    windowed_features.append({
        f'{stat}_{axis}': getattr(window[column], stat)()
        for stat in ('mean', 'std', 'skew', 'kurt')
        for axis, column in axis_columns.items()
    })

    # Window label = most frequent locomotion value followed by most frequent feeding value
    locomotion_label = window['IteragreementLocom'].mode().iloc[0]
    feeding_label = window['IteragreementFeeding'].mode().iloc[0]
    windowed_labels.append(f"{locomotion_label}{feeding_label}")

X_windowed = pd.DataFrame(windowed_features)
y_windowed = np.array(windowed_labels)

# Hold out the test windows BEFORE fitting anything. Fitting the imputer/scaler
# or running SMOTE on the full data lets test information leak into training
# (synthetic samples interpolated from test windows) and inflates every metric.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_windowed, y_windowed, test_size=0.2, stratify=y_windowed, random_state=42
)

# Learn imputation means and scaling statistics on the training windows only
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

# Balance the classes with SMOTE on the training split only. k_neighbors is
# capped so that a small minority class does not make SMOTE fail; with at
# least 6 samples per class this is the default of 5.
min_class_count = pd.Series(y_train_raw).value_counts().min()
smote = SMOTE(random_state=42, k_neighbors=int(max(1, min(5, min_class_count - 1))))
X_train, y_train = smote.fit_resample(X_train_scaled, y_train_raw)

# Initialize lists to store metrics
accuracies = []
f1_scores = []
precisions = []
recalls = []

# Number of train/evaluate repetitions. Each repetition uses its own forest
# seed (42, 43, ...) so that raising this value produces distinct runs.
n_iterations = 1
for iteration in range(n_iterations):
    rf_classifier = RandomForestClassifier(
        n_estimators=100, max_features='sqrt', random_state=42 + iteration
    )
    rf_classifier.fit(X_train, y_train)
    y_pred_rf = rf_classifier.predict(X_test)

    # Evaluate the Random Forest model's performance
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    f1 = f1_score(y_test, y_pred_rf, average='weighted')
    precision = precision_score(y_test, y_pred_rf, average='weighted')
    recall = recall_score(y_test, y_pred_rf, average='weighted')

    accuracies.append(accuracy_rf)
    f1_scores.append(f1)
    precisions.append(precision)
    recalls.append(recall)

    print(f"Iteration {iteration + 1}:")
    print(f"  Accuracy: {accuracy_rf * 100:.2f}%")
    print(f"  F1 Score: {f1:.2f}")
    print(f"  Precision: {precision:.2f}")
    print(f"  Recall: {recall:.2f}")

# Print average metrics (the message reports the number of runs actually made)
print(f"\nAverage Metrics over {n_iterations} iteration(s):")
print(f"  Average Accuracy: {np.mean(accuracies) * 100:.2f}%")
print(f"  Average F1 Score: {np.mean(f1_scores):.2f}")
print(f"  Average Precision: {np.mean(precisions):.2f}")
print(f"  Average Recall: {np.mean(recalls):.2f}")

# Confusion matrix of the last run (rows = true class, columns = predicted class)
conf_matrix = confusion_matrix(y_test, y_pred_rf)

# One-vs-rest counts per class; unlike unpacking conf_matrix.ravel() into four
# values, this works for any number of classes.
tp_per_class = conf_matrix.diagonal()
fn_per_class = conf_matrix.sum(axis=1) - tp_per_class
fp_per_class = conf_matrix.sum(axis=0) - tp_per_class
tn_per_class = conf_matrix.sum() - (tp_per_class + fn_per_class + fp_per_class)

if conf_matrix.shape == (2, 2):
    # Binary case: report the second (positive) class, exactly as
    # tn, fp, fn, tp = conf_matrix.ravel() would
    tp, fn, fp, tn = tp_per_class[1], fn_per_class[1], fp_per_class[1], tn_per_class[1]
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
else:
    # Multi-class case: unweighted mean of the per-class sensitivities and
    # specificity from the counts pooled over all classes
    sensitivity = np.mean([
        tp / (tp + fn) if (tp + fn) > 0 else 0
        for tp, fn in zip(tp_per_class, fn_per_class)
    ])
    pooled_tn = tn_per_class.sum()
    pooled_fp = fp_per_class.sum()
    specificity = pooled_tn / (pooled_tn + pooled_fp) if (pooled_tn + pooled_fp) > 0 else 0

print(f"\nOverall Sensitivity (Recall): {sensitivity:.2f}")
print(f"Overall Specificity: {specificity:.2f}")

In [None]:
import matplotlib.pyplot as plt

# Data: one entry per dataset; "NA" marks a dataset without a published accuracy
datasets = ['1', '2', '3', '4', '5']
model_accuracies = [92.72, 93.41, 99.97, 90.17, 86.08]
published_accuracies = [85.67, 94.43, "NA", 87.15, "NA"]
f1_scores = [0.93, 0.93, 1.00, 0.90, 0.86]

# Same length as `datasets`, with None where nothing was published
filtered_published_accuracies = [acc if acc != 'NA' else None for acc in published_accuracies]

# Plotting
plt.figure(figsize=(12, 7))

bar_width = 0.25
index = range(len(datasets))

# Model accuracies: plain bars at the integer positions
bars_model = plt.bar(index, model_accuracies, bar_width, label='Model Accuracy', color='skyblue', edgecolor='black', hatch='')

# Published accuracies: hatched bars, drawn only for datasets that have a value
bars_published = plt.bar([i + bar_width for i in index if filtered_published_accuracies[i] is not None],
                         [acc for acc in filtered_published_accuracies if acc is not None],
                         bar_width, label='Published Accuracy', color='lightgreen', edgecolor='black', hatch='//')

# F1 scores are stored on a 0-1 scale; convert them to percentages so the bars
# are visible on the same axis as the accuracies (raw 0-1 values are only about
# 1% of the axis height).
bars_f1 = plt.bar([i + 2 * bar_width for i in index], [score * 100 for score in f1_scores],
                  bar_width, label='F1 Score (%)', color='salmon', edgecolor='black', hatch='..')

# Labels
plt.xlabel('Dataset', fontsize=14)
plt.ylabel('Scores', fontsize=14)

# Centre the dataset labels under the middle bar of each group
plt.xticks([i + bar_width for i in index], datasets, fontsize=12)

plt.legend(fontsize=12)

# Annotate every bar with its height; all three series are percentages now
for bar in (*bars_model, *bars_published, *bars_f1):
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.5, f"{yval:.1f}%", ha='center', va='bottom', fontsize=10, color='black')

# Improve layout and show plot
plt.tight_layout()
plt.grid(axis='y', linestyle='--', alpha=0.7)  # Add grid for better readability
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Per-dataset results; "NA" marks a dataset without a published accuracy
datasets = ['1', '2', '3', '4', '5']
model_accuracies = [92.72, 93.41, 99.97, 90.17, 86.08]
published_accuracies = [85.67, 94.43, "NA", 87.15, "NA"]
f1_scores = [0.93, 0.93, 1.00, 0.90, 0.86]

# (slot, accuracy) pairs for the datasets that do have a published figure
published_points = [(slot, acc) for slot, acc in enumerate(published_accuracies) if acc != 'NA']

fig, ax = plt.subplots(figsize=(12, 7))

bar_width = 0.25
slots = list(range(len(datasets)))

# Three bars per dataset: model accuracy, published accuracy, F1 as a percentage
bars_model = ax.bar(slots, model_accuracies, bar_width,
                    label='Model Accuracy', color='skyblue', edgecolor='black', hatch='')
bars_published = ax.bar([slot + bar_width for slot, _ in published_points],
                        [acc for _, acc in published_points],
                        bar_width,
                        label='Published Accuracy', color='lightgreen', edgecolor='black', hatch='//')
bars_f1 = ax.bar([slot + 2 * bar_width for slot in slots],
                 [100 * score for score in f1_scores],
                 bar_width,
                 label='F1 Score (%)', color='salmon', edgecolor='black', hatch='..')

# Axis titles
ax.set_xlabel('Dataset', fontsize=14)
ax.set_ylabel('Scores', fontsize=14)

# Put each dataset name under the middle bar of its group
ax.set_xticks([slot + bar_width for slot in slots])
ax.set_xticklabels(datasets, fontsize=12)

ax.legend(fontsize=12)

# Write the height above every bar of all three series
for bar in (*bars_model, *bars_published, *bars_f1):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, height + 0.5, f"{height:.1f}%",
            ha='center', va='bottom', fontsize=10, color='black')

# Tighten the layout, add dashed horizontal grid lines, and render
fig.tight_layout()
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Per-dataset results; "NA" marks a dataset without a published accuracy
datasets = ['1', '2', '3', '4', '5']
model_accuracies = [92.72, 93.41, 99.97, 90.17, 86.08]
published_accuracies = [85.67, 94.43, "NA", 87.15, "NA"]
f1_scores = [0.93, 0.93, 1.00, 0.90, 0.86]

# (slot, accuracy) pairs for the datasets that do have a published figure
published_points = [(slot, acc) for slot, acc in enumerate(published_accuracies) if acc != 'NA']

fig, ax = plt.subplots(figsize=(12, 7))

bar_width = 0.25
slots = list(range(len(datasets)))

# Three bars per dataset: model accuracy, published accuracy, F1 as a percentage
bars_model = ax.bar(slots, model_accuracies, bar_width,
                    label='Model Accuracy', color='skyblue', edgecolor='black', hatch='')
bars_published = ax.bar([slot + bar_width for slot, _ in published_points],
                        [acc for _, acc in published_points],
                        bar_width,
                        label='Published Accuracy', color='lightgreen', edgecolor='black', hatch='//')
bars_f1 = ax.bar([slot + 2 * bar_width for slot in slots],
                 [100 * score for score in f1_scores],
                 bar_width,
                 label='F1 Score (%)', color='salmon', edgecolor='black', hatch='..')

# Axis titles
ax.set_xlabel('Dataset', fontsize=14)
ax.set_ylabel('Scores', fontsize=14)

# Put each dataset name under the middle bar of its group
ax.set_xticks([slot + bar_width for slot in slots])
ax.set_xticklabels(datasets, fontsize=12)

ax.legend(fontsize=12)

# Tighten the layout, add dashed horizontal grid lines, and render
fig.tight_layout()
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Per-dataset results; "NA" marks a dataset without a published accuracy
datasets = ['1', '2', '3', '4', '5']
model_accuracies = [92.72, 93.41, 99.97, 90.17, 86.08]
published_accuracies = [85.67, 94.43, "NA", 87.15, "NA"]
f1_scores = [0.93, 0.93, 1.00, 0.90, 0.86]

# (slot, accuracy) pairs for the datasets that do have a published figure
published_points = [(slot, acc) for slot, acc in enumerate(published_accuracies) if acc != 'NA']

fig, ax = plt.subplots(figsize=(12, 7))

bar_width = 0.25
slots = list(range(len(datasets)))

# Three bars per dataset: model accuracy, published accuracy, F1 as a percentage
bars_model = ax.bar(slots, model_accuracies, bar_width,
                    label='Model Accuracy', color='skyblue', edgecolor='black', hatch='')
bars_published = ax.bar([slot + bar_width for slot, _ in published_points],
                        [acc for _, acc in published_points],
                        bar_width,
                        label='Published Accuracy', color='lightgreen', edgecolor='black', hatch='//')
bars_f1 = ax.bar([slot + 2 * bar_width for slot in slots],
                 [100 * score for score in f1_scores],
                 bar_width,
                 label='F1 Score (%)', color='salmon', edgecolor='black', hatch='..')

# Axis titles in a larger font
ax.set_xlabel('Dataset', fontsize=18)
ax.set_ylabel('Scores', fontsize=18)

# Put each dataset name under the middle bar of its group
ax.set_xticks([slot + bar_width for slot in slots])
ax.set_xticklabels(datasets, fontsize=16)

ax.legend(fontsize=16)

# Tighten the layout, add dashed horizontal grid lines, and render
fig.tight_layout()
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Per-dataset results; "NA" marks a dataset without a published accuracy
datasets = ['1', '2', '3', '4', '5']
model_accuracies = [92.72, 93.41, 99.97, 90.17, 86.08]
published_accuracies = [85.67, 94.43, "NA", 87.15, "NA"]
f1_scores = [0.93, 0.93, 1.00, 0.90, 0.86]

# (slot, accuracy) pairs for the datasets that do have a published figure
published_points = [(slot, acc) for slot, acc in enumerate(published_accuracies) if acc != 'NA']

fig, ax = plt.subplots(figsize=(12, 7))

bar_width = 0.25
slots = list(range(len(datasets)))

# Three bars per dataset: model accuracy, published accuracy, F1 as a percentage
bars_model = ax.bar(slots, model_accuracies, bar_width,
                    label='Model Accuracy', color='skyblue', edgecolor='black', hatch='')
bars_published = ax.bar([slot + bar_width for slot, _ in published_points],
                        [acc for _, acc in published_points],
                        bar_width,
                        label='Published Accuracy', color='lightgreen', edgecolor='black', hatch='//')
bars_f1 = ax.bar([slot + 2 * bar_width for slot in slots],
                 [100 * score for score in f1_scores],
                 bar_width,
                 label='F1 Score (%)', color='salmon', edgecolor='black', hatch='..')

# Axis titles in a larger font
ax.set_xlabel('Dataset', fontsize=18)
ax.set_ylabel('Scores', fontsize=18)

# Put each dataset name under the middle bar of its group
ax.set_xticks([slot + bar_width for slot in slots])
ax.set_xticklabels(datasets, fontsize=16)
# Enlarge the existing y-axis tick labels without changing the tick positions
plt.setp(ax.get_yticklabels(), fontsize=16)

ax.legend(fontsize=16)

# Tighten the layout, add dashed horizontal grid lines, and render
fig.tight_layout()
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Per-dataset results; "NA" marks a dataset without a published accuracy
datasets = ['1', '2', '3', '4', '5']
model_accuracies = [92.72, 93.41, 99.97, 90.17, 86.08]
published_accuracies = [85.67, 94.43, "NA", 87.15, "NA"]
f1_scores = [0.93, 0.93, 1.00, 0.90, 0.86]

# (slot, accuracy) pairs for the datasets that do have a published figure
published_points = [(slot, acc) for slot, acc in enumerate(published_accuracies) if acc != 'NA']

fig, ax = plt.subplots(figsize=(12, 7))

bar_width = 0.25
slots = list(range(len(datasets)))

# Three bars per dataset: model accuracy, published accuracy, F1 scaled by 100
bars_model = ax.bar(slots, model_accuracies, bar_width,
                    label='Model Accuracy', color='skyblue', edgecolor='black', hatch='')
bars_published = ax.bar([slot + bar_width for slot, _ in published_points],
                        [acc for _, acc in published_points],
                        bar_width,
                        label='Published Accuracy', color='lightgreen', edgecolor='black', hatch='//')
bars_f1 = ax.bar([slot + 2 * bar_width for slot in slots],
                 [100 * score for score in f1_scores],
                 bar_width,
                 label='F1 Score', color='salmon', edgecolor='black', hatch='..')

# Axis titles in a larger font
ax.set_xlabel('Dataset', fontsize=18)
ax.set_ylabel('Scores', fontsize=18)

# Put each dataset name under the middle bar of its group
ax.set_xticks([slot + bar_width for slot in slots])
ax.set_xticklabels(datasets, fontsize=16)
# Enlarge the existing y-axis tick labels without changing the tick positions
plt.setp(ax.get_yticklabels(), fontsize=16)

# Three-column legend anchored just below the axes
ax.legend(fontsize=16, loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3)

# Tighten the layout, add dashed horizontal grid lines, and render
fig.tight_layout()
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Per-dataset results; "NA" marks a dataset without a published accuracy
datasets = ['1', '2', '3', '4', '5']
model_accuracies = [92.72, 93.41, 99.97, 90.17, 86.08]
published_accuracies = [85.67, 94.43, "NA", 87.15, "NA"]
f1_scores = [0.93, 0.93, 1.00, 0.90, 0.86]

# (slot, accuracy) pairs for the datasets that do have a published figure
published_points = [(slot, acc) for slot, acc in enumerate(published_accuracies) if acc != 'NA']

fig, ax = plt.subplots(figsize=(12, 7))

bar_width = 0.25
slots = list(range(len(datasets)))

# Fill colours for the three series, as hex codes
model_color = '#87ceeb'      # sky blue
published_color = '#ffcc99'  # light orange
f1_color = '#ff9999'         # light red

# Three bars per dataset: model accuracy, published accuracy, F1 scaled by 100
bars_model = ax.bar(slots, model_accuracies, bar_width,
                    label='Model Accuracy', color=model_color, edgecolor='black')
bars_published = ax.bar([slot + bar_width for slot, _ in published_points],
                        [acc for _, acc in published_points],
                        bar_width,
                        label='Published Accuracy', color=published_color, edgecolor='black', hatch='//')
bars_f1 = ax.bar([slot + 2 * bar_width for slot in slots],
                 [100 * score for score in f1_scores],
                 bar_width,
                 label='F1 Score', color=f1_color, edgecolor='black', hatch='..')

# Bold axis titles in a larger font
ax.set_xlabel('Dataset', fontsize=18, fontweight='bold')
ax.set_ylabel('Scores', fontsize=18, fontweight='bold')

# Put each dataset name under the middle bar of its group
ax.set_xticks([slot + bar_width for slot in slots])
ax.set_xticklabels(datasets, fontsize=16, fontweight='bold')
# Restyle the existing y-axis tick labels without changing the tick positions
plt.setp(ax.get_yticklabels(), fontsize=16, fontweight='bold')

# Three-column legend anchored just below the axes
ax.legend(fontsize=16, loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3)

# Tighten the layout, add dashed horizontal grid lines, and render
fig.tight_layout()
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Per-dataset results; "NA" marks a dataset without a published accuracy
datasets = ['1', '2', '3', '4', '5']
model_accuracies = [92.72, 93.41, 99.97, 90.17, 86.08]
published_accuracies = [85.67, 94.43, "NA", 87.15, "NA"]
f1_scores = [0.93, 0.93, 1.00, 0.90, 0.86]

# (slot, accuracy) pairs for the datasets that do have a published figure
published_points = [(slot, acc) for slot, acc in enumerate(published_accuracies) if acc != 'NA']

fig, ax = plt.subplots(figsize=(12, 7))

bar_width = 0.25
slots = list(range(len(datasets)))

# Fill colours for the three series, as named matplotlib colours
model_color = 'skyblue'
published_color = 'lightgreen'
f1_color = 'salmon'

# Three bars per dataset: model accuracy, published accuracy, F1 scaled by 100
bars_model = ax.bar(slots, model_accuracies, bar_width,
                    label='Model Accuracy', color=model_color, edgecolor='black')
bars_published = ax.bar([slot + bar_width for slot, _ in published_points],
                        [acc for _, acc in published_points],
                        bar_width,
                        label='Published Accuracy', color=published_color, edgecolor='black', hatch='//')
bars_f1 = ax.bar([slot + 2 * bar_width for slot in slots],
                 [100 * score for score in f1_scores],
                 bar_width,
                 label='F1 Score', color=f1_color, edgecolor='black', hatch='..')

# Bold axis titles in a larger font
ax.set_xlabel('Dataset', fontsize=18, fontweight='bold')
ax.set_ylabel('Scores', fontsize=18, fontweight='bold')

# Put each dataset name under the middle bar of its group
ax.set_xticks([slot + bar_width for slot in slots])
ax.set_xticklabels(datasets, fontsize=16, fontweight='bold')
# Restyle the existing y-axis tick labels without changing the tick positions
plt.setp(ax.get_yticklabels(), fontsize=16, fontweight='bold')

# Three-column legend anchored just below the axes
ax.legend(fontsize=16, loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3)

# Tighten the layout, add dashed horizontal grid lines, and render
fig.tight_layout()
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Per-dataset results; "NA" marks a dataset without a published accuracy
datasets = ['1', '2', '3', '4', '5']
model_accuracies = [92.72, 93.41, 99.97, 90.17, 86.08]
published_accuracies = [85.67, 94.43, "NA", 87.15, "NA"]
f1_scores = [0.93, 0.93, 1.00, 0.90, 0.86]

# (slot, accuracy) pairs for the datasets that do have a published figure
published_points = [(slot, acc) for slot, acc in enumerate(published_accuracies) if acc != 'NA']

fig, ax = plt.subplots(figsize=(12, 7))

bar_width = 0.25
slots = list(range(len(datasets)))

# Fill colours for the three series, as named matplotlib colours
model_color = 'skyblue'
published_color = 'lightgreen'
f1_color = 'salmon'

# Three bars per dataset: model accuracy, published accuracy, F1 scaled by 100
bars_model = ax.bar(slots, model_accuracies, bar_width,
                    label='Model Accuracy', color=model_color, edgecolor='black')
bars_published = ax.bar([slot + bar_width for slot, _ in published_points],
                        [acc for _, acc in published_points],
                        bar_width,
                        label='Published Accuracy', color=published_color, edgecolor='black', hatch='//')
bars_f1 = ax.bar([slot + 2 * bar_width for slot in slots],
                 [100 * score for score in f1_scores],
                 bar_width,
                 label='F1 Score', color=f1_color, edgecolor='black', hatch='..')

# Bold axis titles in a larger font
ax.set_xlabel('Dataset', fontsize=18, fontweight='bold')
ax.set_ylabel('Test Accuracy %', fontsize=18, fontweight='bold')

# Put each dataset name under the middle bar of its group
ax.set_xticks([slot + bar_width for slot in slots])
ax.set_xticklabels(datasets, fontsize=16, fontweight='bold')
# Restyle the existing y-axis tick labels without changing the tick positions
plt.setp(ax.get_yticklabels(), fontsize=16, fontweight='bold')

# Three-column legend anchored just below the axes
ax.legend(fontsize=16, loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3)

# Tighten the layout, switch the grid off, and render
fig.tight_layout()
ax.grid(False)
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Load the dataset (replace 'CURC.csv' with your actual file path)
data = pd.read_csv('CURC.csv')

# Parse 'Time' as HH:MM:SS and keep only the time-of-day part
data['Time'] = pd.to_datetime(data['Time'], format='%H:%M:%S').dt.time

# Window length
window_size_seconds = 6  # Adjust this value as needed
# NOTE(review): this treats one row as one second — confirm the sampling rate
window_size_samples = window_size_seconds

# Accelerometer columns, keyed by the short axis name used in the feature names
axis_columns = {'x': 'X-axis (g)', 'y': 'Y-axis (g)', 'z': 'Z-axis (g)'}

# Segment the time series into non-overlapping windows and summarise each window
# with the mean, standard deviation, skewness and kurtosis of every axis.
windowed_features = []
windowed_labels = []
for start in range(0, len(data) - window_size_samples + 1, window_size_samples):
    # The range() bounds guarantee that every slice holds exactly window_size_samples rows
    window = data.iloc[start:start + window_size_samples]
    windowed_features.append({
        f'{stat}_{axis}': getattr(window[column], stat)()
        for stat in ('mean', 'std', 'skew', 'kurt')
        for axis, column in axis_columns.items()
    })

    # Window label = most frequent locomotion value followed by most frequent feeding value
    locomotion_label = window['IteragreementLocom'].mode().iloc[0]
    feeding_label = window['IteragreementFeeding'].mode().iloc[0]
    windowed_labels.append(f"{locomotion_label}{feeding_label}")

X_windowed = pd.DataFrame(windowed_features)
y_windowed = np.array(windowed_labels)

# Hold out the test windows BEFORE fitting anything. Fitting the imputer/scaler
# or running SMOTE on the full data lets test information leak into training
# (synthetic samples interpolated from test windows) and inflates every metric.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_windowed, y_windowed, test_size=0.2, stratify=y_windowed, random_state=42
)

# Learn imputation means and scaling statistics on the training windows only
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

# Balance the classes with SMOTE on the training split only. k_neighbors is
# capped so that a small minority class does not make SMOTE fail; with at
# least 6 samples per class this is the default of 5.
min_class_count = pd.Series(y_train_raw).value_counts().min()
smote = SMOTE(random_state=42, k_neighbors=int(max(1, min(5, min_class_count - 1))))
X_train, y_train = smote.fit_resample(X_train_scaled, y_train_raw)

# Initialize lists to store metrics
accuracies = []
f1_scores = []
precisions = []
recalls = []

# Number of train/evaluate repetitions. Each repetition uses its own forest
# seed (42, 43, ...) so that raising this value produces distinct runs.
n_iterations = 1
for iteration in range(n_iterations):
    rf_classifier = RandomForestClassifier(
        n_estimators=100, max_features='sqrt', random_state=42 + iteration
    )
    rf_classifier.fit(X_train, y_train)
    y_pred_rf = rf_classifier.predict(X_test)

    # Evaluate the Random Forest model's performance
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    f1 = f1_score(y_test, y_pred_rf, average='weighted')
    precision = precision_score(y_test, y_pred_rf, average='weighted')
    recall = recall_score(y_test, y_pred_rf, average='weighted')

    accuracies.append(accuracy_rf)
    f1_scores.append(f1)
    precisions.append(precision)
    recalls.append(recall)

    print(f"Iteration {iteration + 1}:")
    print(f"  Accuracy: {accuracy_rf * 100:.2f}%")
    print(f"  F1 Score: {f1:.2f}")
    print(f"  Precision: {precision:.2f}")
    print(f"  Recall: {recall:.2f}")

# Print average metrics (the message reports the number of runs actually made)
print(f"\nAverage Metrics over {n_iterations} iteration(s):")
print(f"  Average Accuracy: {np.mean(accuracies) * 100:.2f}%")
print(f"  Average F1 Score: {np.mean(f1_scores):.2f}")
print(f"  Average Precision: {np.mean(precisions):.2f}")
print(f"  Average Recall: {np.mean(recalls):.2f}")

# Confusion matrix of the last run (rows = true class, columns = predicted class)
conf_matrix = confusion_matrix(y_test, y_pred_rf)
print("\nConfusion Matrix:")
print(conf_matrix)

# One-vs-rest counts for every class
tp_per_class = conf_matrix.diagonal()
fn_per_class = conf_matrix.sum(axis=1) - tp_per_class
fp_per_class = conf_matrix.sum(axis=0) - tp_per_class
tn_per_class = conf_matrix.sum() - (tp_per_class + fn_per_class + fp_per_class)

# Per-class sensitivity (true positive rate) and specificity (true negative rate);
# a class with an empty denominator scores 0
sensitivity_per_class = [
    tp / (tp + fn) if (tp + fn) > 0 else 0
    for tp, fn in zip(tp_per_class, fn_per_class)
]
specificity_per_class = [
    tn / (tn + fp) if (tn + fp) > 0 else 0
    for tn, fp in zip(tn_per_class, fp_per_class)
]

# Counts pooled over all classes for the global specificity
total_true_negatives = tn_per_class.sum()
total_false_positives = fp_per_class.sum()

# Overall sensitivity: unweighted (macro) mean of the per-class sensitivities
overall_sensitivity = np.mean(sensitivity_per_class)
# Overall specificity: ratio of the pooled counts, guarded against an empty denominator
pooled_negatives = total_true_negatives + total_false_positives
overall_specificity = total_true_negatives / pooled_negatives if pooled_negatives > 0 else 0

print(f"\nOverall Sensitivity (Recall): {overall_sensitivity:.2f}")
print(f"Overall Specificity: {overall_specificity:.2f}")

# Display classification report (includes precision, recall, and F1-score for each class)
class_report = classification_report(y_test, y_pred_rf)
print("\nClassification Report:")
print(class_report)

In [None]:
!pip install lazypredict

In [None]:
!pip install scikit-learn pandas numpy

In [None]:
# Benchmark many classifiers on the raw accelerometer axes with LazyPredict.
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier

# Step 1: Load the dataset
# Update this path so it points at your copy of the file
data = pd.read_csv('/content/AX3_RAW_DATA06272024_ok.csv')

# Convert the 'Time' column to time-of-day values (expects HH:MM:SS strings)
data['Time'] = pd.to_datetime(data['Time'], format='%H:%M:%S').dt.time

# Preview the dataset
print(data.head())
data.info()  # info() prints its own report and returns None, so it is not wrapped in print()

# Step 2: Define Features and Target
# Replace 'acc_x', 'acc_y', 'acc_z' with your feature columns
# Replace 'behavior' with your target column
# NOTE(review): another cell in this notebook reads this same file with the
# columns 'X-axis (g)', 'Y-axis (g)', 'Z-axis (g)' - confirm which names the file really has.
X = data[['acc_x', 'acc_y', 'acc_z']]
y = data['behavior']

# Check for missing values
print(X.isnull().sum())
print(y.isnull().sum())

# Rows without a target cannot be used for supervised training or scoring
has_label = y.notna()
X = X[has_label]
y = y[has_label]

# Handle missing feature values if necessary
X = X.fillna(0)  # Replace missing values with 0 (adjust if needed)

# Step 3: Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Use Lazy Predict for Classification
# Initialize LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit models and evaluate
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display results
print(models)

# Step 5: Save Results to CSV (Optional)
# The model names are stored in the index of `models`, so the index must be
# written out; index=False would produce a file of anonymous metric rows.
models.to_csv('model_comparison.csv')

In [None]:
# DATASET 5
# Benchmark classifiers on a combined locomotion/feeding behavior label.
# Import necessary libraries (all imports for this cell live here)
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lazypredict.Supervised import LazyClassifier

# Load the dataset (update the path so it points at your copy of the file)
data = pd.read_csv('/content/AX3_RAW_DATA06272024_ok.csv')

# Step 1: Combine relevant columns to create a behavior label
# Combine 'IteragreementLocom' and 'IteragreementFeeding' into a single 'Behavior' column
data['Behavior'] = data['IteragreementLocom'].astype(str) + '-' + data['IteragreementFeeding'].astype(str)

# Step 2: Encode the 'Behavior' column to numeric labels
label_encoder = LabelEncoder()
data['Behavior_Label'] = label_encoder.fit_transform(data['Behavior'])

# Print the mapping of behaviors to numeric labels
print("Behavior Mapping:")
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# Step 3: Define features (accelerometer data) and target (behavior labels)
X = data[['X-axis (g)', 'Y-axis (g)', 'Z-axis (g)']]  # Feature columns
y = data['Behavior_Label']                           # Target column

# Step 4: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Initialize LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Step 6: Fit LazyClassifier on the training data and evaluate on the test data
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display the results
print("\nLazyClassifier Results:")
print(models)

# Optional: Save the LazyClassifier results to a CSV file.
# The model names are stored in the index of `models`, so keep the index in
# the file; index=False would drop the names and leave unlabeled metric rows.
models.to_csv('lazy_classifier_results.csv')

# Step 7: Visualize behavior labels (optional)
plt.figure(figsize=(10, 6))
data['Behavior'].value_counts().plot(kind='bar')
plt.title('Behavior Label Distribution')
plt.xlabel('Behavior')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Hard-coded copy of a LazyClassifier results table.
# Replace this with your LazyClassifier results DataFrame.
data = {
    "Model": [
        "XGBClassifier", "LGBMClassifier", "KNeighborsClassifier", "RandomForestClassifier",
        "BaggingClassifier", "LabelSpreading", "PassiveAggressiveClassifier",
        "DecisionTreeClassifier", "LabelPropagation", "NearestCentroid",
        "ExtraTreesClassifier", "GaussianNB", "ExtraTreeClassifier", "AdaBoostClassifier",
        "QuadraticDiscriminantAnalysis", "SVC", "SGDClassifier", "BernoulliNB",
        "Perceptron", "LogisticRegression", "LinearDiscriminantAnalysis",
        "LinearSVC", "CalibratedClassifierCV", "RidgeClassifierCV", "RidgeClassifier",
        "DummyClassifier"
    ],
    "Accuracy": [
        0.64, 0.65, 0.63, 0.63, 0.61, 0.62, 0.55, 0.57, 0.61, 0.49, 0.61, 0.59,
        0.55, 0.58, 0.60, 0.63, 0.58, 0.58, 0.45, 0.58, 0.58, 0.58, 0.58, 0.58,
        0.58, 0.57
    ],
    "Balanced Accuracy": [
        0.49, 0.48, 0.47, 0.47, 0.47, 0.46, 0.46, 0.46, 0.46, 0.46, 0.44, 0.44,
        0.43, 0.43, 0.41, 0.41, 0.41, 0.40, 0.39, 0.37, 0.37, 0.37, 0.37, 0.36,
        0.36, 0.33
    ],
    "F1 Score": [
        0.63, 0.63, 0.61, 0.61, 0.60, 0.60, 0.52, 0.57, 0.59, 0.51, 0.59, 0.57,
        0.54, 0.55, 0.56, 0.58, 0.44, 0.55, 0.46, 0.51, 0.51, 0.51, 0.51, 0.51,
        0.51, 0.42
    ],
    "Time Taken": [
        1.26, 0.20, 0.06, 0.35, 0.08, 0.27, 0.02, 0.02, 0.26, 0.02, 0.29, 0.01,
        0.01, 0.15, 0.03, 0.21, 0.03, 0.02, 0.02, 0.05, 0.03, 0.02, 0.08, 0.01,
        0.01, 0.01
    ]
}

results = pd.DataFrame(data).set_index("Model")

# One horizontal bar chart per metric, drawn in this order.
# Each entry: (results column, bar color, figure title, x-axis label)
metric_plots = [
    ('Accuracy', 'skyblue', 'Model Accuracy', 'Accuracy'),
    ('Balanced Accuracy', 'lightgreen', 'Model Balanced Accuracy', 'Balanced Accuracy'),
    ('F1 Score', 'salmon', 'Model F1 Score', 'F1 Score'),
    ('Time Taken', 'orange', 'Model Time Taken', 'Time Taken (seconds)'),
]

for metric_column, bar_color, figure_title, x_label in metric_plots:
    plt.figure(figsize=(10, 6))
    # Sort ascending so the best (or slowest) model ends up at the top of the chart
    ordered_values = results[metric_column].sort_values()
    ordered_values.plot(kind='barh', color=bar_color)
    plt.title(figure_title)
    plt.xlabel(x_label)
    plt.ylabel('Model')
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Hard-coded copy of a LazyClassifier results table (accuracy and runtime only)
data = {
    "Model": [
        "XGBClassifier", "LGBMClassifier", "KNeighborsClassifier", "RandomForestClassifier",
        "BaggingClassifier", "LabelSpreading", "PassiveAggressiveClassifier",
        "DecisionTreeClassifier", "LabelPropagation", "NearestCentroid",
        "ExtraTreesClassifier", "GaussianNB", "ExtraTreeClassifier", "AdaBoostClassifier",
        "QuadraticDiscriminantAnalysis", "SVC", "SGDClassifier", "BernoulliNB",
        "Perceptron", "LogisticRegression", "LinearDiscriminantAnalysis",
        "LinearSVC", "CalibratedClassifierCV", "RidgeClassifierCV", "RidgeClassifier",
        "DummyClassifier"
    ],
    "Accuracy": [
        0.64, 0.65, 0.63, 0.63, 0.61, 0.62, 0.55, 0.57, 0.61, 0.49, 0.61, 0.59,
        0.55, 0.58, 0.60, 0.63, 0.58, 0.58, 0.45, 0.58, 0.58, 0.58, 0.58, 0.58,
        0.58, 0.57
    ],
    "Time Taken": [
        1.26, 0.20, 0.06, 0.35, 0.08, 0.27, 0.02, 0.02, 0.26, 0.02, 0.29, 0.01,
        0.01, 0.15, 0.03, 0.21, 0.03, 0.02, 0.02, 0.05, 0.03, 0.02, 0.08, 0.01,
        0.01, 0.01
    ]
}

# Build the table with one row per model
results = pd.DataFrame(data).set_index("Model")

# Each entry: (results column, bar color, figure title, x-axis label)
metric_plots = [
    ('Accuracy', 'skyblue', 'Model Accuracy', 'Accuracy'),
    ('Time Taken', 'orange', 'Model Time Taken', 'Time Taken (seconds)'),
]

for metric_column, bar_color, figure_title, x_label in metric_plots:
    plt.figure(figsize=(10, 6))
    # Ascending sort puts the largest value at the top of the horizontal chart
    ordered_values = results[metric_column].sort_values()
    ordered_values.plot(kind='barh', color=bar_color)
    plt.title(figure_title)
    plt.xlabel(x_label)
    plt.ylabel('Model')
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

In [None]:
pip install lazypredict


In [None]:
#dataset 2
# Benchmark classifiers on windowed statistical features of the cow accelerometer files.
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from lazypredict.Supervised import LazyClassifier
import matplotlib.pyplot as plt
import seaborn as sns  # used by the bar plot at the end of this cell

# Define file names
file_names = ["cow1.csv", "cow2.csv", "cow3.csv", "cow4.csv", "cow5.csv", "cow6.csv"]

# Define window size
window_size = 15  # Approximately 15 data points for 0.5 seconds


def extract_window_features(frame, size):
    """Cut `frame` into consecutive non-overlapping windows of `size` rows.

    Returns a pair (features, labels): `features` is a list with one dict per
    complete window holding the mean, std, skew and kurtosis of the AccX, AccY
    and AccZ columns; `labels` holds the 'Label' value of the first row of each
    window. A trailing partial window is discarded.
    """
    axis_columns = {'x': 'AccX', 'y': 'AccY', 'z': 'AccZ'}
    statistics = ('mean', 'std', 'skew', 'kurt')
    features, labels = [], []
    for start in range(0, len(frame) - size + 1, size):
        window = frame.iloc[start:start + size]
        features.append({
            f'{stat}_acc_{axis}': getattr(window[column], stat)()
            for stat in statistics
            for axis, column in axis_columns.items()
        })
        # Assign label to the window (assuming it's the same for all samples within the window)
        labels.append(window['Label'].iloc[0])
    return features, labels


# Read each file and window it on its own, so that no window mixes samples
# from the end of one file with the start of the next.
all_data = []
windowed_features = []
windowed_labels = []
for file_name in file_names:
    data = pd.read_csv(file_name)
    all_data.append(data)
    file_features, file_labels = extract_window_features(data, window_size)
    windowed_features.extend(file_features)
    windowed_labels.extend(file_labels)

# Concatenate all the data into one DataFrame
combined_data = pd.concat(all_data, ignore_index=True)

# Convert lists to DataFrame
X_windowed = pd.DataFrame(windowed_features)
y_windowed = np.array(windowed_labels)

# Split BEFORE imputing, scaling and oversampling. Fitting any of those on the
# full data lets information from the test rows (and synthetic neighbours of
# them) leak into training and inflates every reported score.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_windowed, y_windowed, test_size=0.2, stratify=y_windowed, random_state=42
)

# Handle missing values and scale, learning the statistics from the training split only
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE on the training split only.
# k_neighbors stays at the reduced value 2, but is lowered further when the
# rarest training class has too few samples to supply 2 neighbours.
smallest_train_class = int(pd.Series(y_train_raw).value_counts().min())
smote_neighbors = max(1, min(2, smallest_train_class - 1))
smote = SMOTE(random_state=42, k_neighbors=smote_neighbors)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Apply LazyPredict for model benchmarking
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit and evaluate the models (the test split contains only real, un-resampled windows)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display results
print("\nLazyPredict Results:")
print(models)

# Optional: Save the results to a CSV file.
# The model names are the index of `models`; keep the index so they are written too.
models.to_csv('lazy_classifier_results.csv')

# Plot the performance of the first 10 models in the results table
plt.figure(figsize=(12, 8))
sns.barplot(x=models.index[:10], y=models['Accuracy'][:10], palette='viridis')
plt.title("Top 10 Model Accuracies")
plt.ylabel("Accuracy")
plt.xlabel("Model")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Benchmark classifiers on the raw accelerometer axes of the cow files.
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Define file names
file_names = ["cow1.csv", "cow2.csv", "cow3.csv", "cow4.csv", "cow5.csv", "cow6.csv"]

# Behavior codes found in the 'Label' column and their descriptions
behavior_mapping = {
    "RES": "Resting in standing position",
    "RUS": "Ruminating in standing position",
    "MOV": "Moving",
    "GRZ": "Grazing",
    "SLT": "Salt licking",
    "FES": "Feeding in stanchion",
    "DRN": "Drinking",
    "LCK": "Licking",
    "REL": "Resting in lying position",
    "URI": "Urinating",
    "ATT": "Attacking",
    "ESC": "Escaping",
    "BMN": "Being mounted",
    "ETC": "Other behaviors",
    "BLN": "Data without video, no label",
}

# Read every file and concatenate all datasets into a single DataFrame
all_data = [pd.read_csv(file_name) for file_name in file_names]
data = pd.concat(all_data, ignore_index=True)

# Step 1: Clean column names (strip any leading/trailing spaces).
# This has to happen before any column is accessed by name.
data.columns = data.columns.str.strip()

# Step 2: Check available columns and fail fast if a required one is absent
print("Available Columns in Data:")
print(data.columns)

required_columns = ['Label', 'AccX', 'AccY', 'AccZ']
missing_columns = [column for column in required_columns if column not in data.columns]
if missing_columns:
    raise KeyError(f"One or more required columns are missing: {missing_columns}")

# Step 3: Remove rows with labels like 'ETC' (Other behaviors) and 'BLN' (No label).
# .copy() makes the result an independent frame so new columns can be added safely.
unwanted_labels = ['ETC', 'BLN']
data = data[~data['Label'].isin(unwanted_labels)].copy()

# Step 4: Map the behavior codes to descriptions; codes not in the mapping become NaN
data['Behavior_Description'] = data['Label'].map(behavior_mapping)

# Step 5: Keep only rows with complete accelerometer values and a known behavior.
# This must precede encoding: once encoded, a missing description is no longer
# detectable as NaN and could not be dropped.
data_cleaned = data.dropna(subset=['AccX', 'AccY', 'AccZ', 'Behavior_Description']).copy()

# Step 6: Encode the 'Behavior_Description' column to numeric labels
label_encoder = LabelEncoder()
data_cleaned['Behavior_Label'] = label_encoder.fit_transform(data_cleaned['Behavior_Description'])

# Print the mapping of labels to numeric values
print("Label Mapping to Numeric Labels:")
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# Step 7: Define features (accelerometer data) and target (behavior labels)
X_cleaned = data_cleaned[['AccX', 'AccY', 'AccZ']]
y_cleaned = data_cleaned['Behavior_Label']
X, y = X_cleaned, y_cleaned

# Step 8: Split the cleaned dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.3, random_state=42)

# Step 9: Initialize LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit LazyClassifier on the training data and evaluate on the test data
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display the results
print("\nLazyClassifier Results:")
print(models)

# Optional: Save the LazyClassifier results to a CSV file.
# The model names are the index of `models`; keep the index so they are written too.
models.to_csv('lazy_classifier_results.csv')

# Step 10: Visualize label distribution
plt.figure(figsize=(10, 6))
data['Behavior_Description'].value_counts().plot(kind='bar')
plt.title('Behavior Label Distribution')
plt.xlabel('Behavior')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Benchmark classifiers on a 1% random sample of the raw cow accelerometer data.
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Define file names
file_names = ["cow1.csv", "cow2.csv", "cow3.csv", "cow4.csv", "cow5.csv", "cow6.csv"]

# Behavior codes found in the 'Label' column and their descriptions
behavior_mapping = {
    "RES": "Resting in standing position",
    "RUS": "Ruminating in standing position",
    "MOV": "Moving",
    "GRZ": "Grazing",
    "SLT": "Salt licking",
    "FES": "Feeding in stanchion",
    "DRN": "Drinking",
    "LCK": "Licking",
    "REL": "Resting in lying position",
    "URI": "Urinating",
    "ATT": "Attacking",
    "ESC": "Escaping",
    "BMN": "Being mounted",
    "ETC": "Other behaviors",
    "BLN": "Data without video, no label",
}

# Read every file and concatenate all datasets into a single DataFrame
all_data = [pd.read_csv(file_name) for file_name in file_names]
data = pd.concat(all_data, ignore_index=True)

# Step 1: Clean column names (strip any leading/trailing spaces).
# This has to happen before any column is accessed by name.
data.columns = data.columns.str.strip()

# Step 2: Check available columns and fail fast if a required one is absent
print("Available Columns in Data:")
print(data.columns)

required_columns = ['Label', 'AccX', 'AccY', 'AccZ']
missing_columns = [column for column in required_columns if column not in data.columns]
if missing_columns:
    raise KeyError(f"One or more required columns are missing: {missing_columns}")

# Step 3: Remove rows with labels like 'ETC' (Other behaviors) and 'BLN' (No label).
# .copy() makes the result an independent frame so new columns can be added safely.
unwanted_labels = ['ETC', 'BLN']
data = data[~data['Label'].isin(unwanted_labels)].copy()

# Step 4: Map the behavior codes to descriptions; codes not in the mapping become NaN
data['Behavior_Description'] = data['Label'].map(behavior_mapping)

# Step 5: Limit the dataset size to 1% of the original data
data = data.sample(frac=0.01, random_state=42)

# Step 6: Keep only rows with complete accelerometer values and a known behavior.
# This must precede encoding: once encoded, a missing description is no longer
# detectable as NaN and could not be dropped.
data_cleaned = data.dropna(subset=['AccX', 'AccY', 'AccZ', 'Behavior_Description']).copy()

# Step 7: Encode the 'Behavior_Description' column to numeric labels.
# Encoding after sampling keeps the class codes contiguous even when a rare
# behavior does not survive the 1% sample.
label_encoder = LabelEncoder()
data_cleaned['Behavior_Label'] = label_encoder.fit_transform(data_cleaned['Behavior_Description'])

# Print the mapping of labels to numeric values
print("Label Mapping to Numeric Labels:")
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# Step 8: Define features (accelerometer data) and target (behavior labels)
X_cleaned = data_cleaned[['AccX', 'AccY', 'AccZ']]
y_cleaned = data_cleaned['Behavior_Label']
X, y = X_cleaned, y_cleaned

# Step 9: Split the cleaned dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.3, random_state=42)

# Step 10: Initialize LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit LazyClassifier on the training data and evaluate on the test data
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display the results
print("\nLazyClassifier Results:")
print(models)

# Optional: Save the LazyClassifier results to a CSV file.
# The model names are the index of `models`; keep the index so they are written too.
models.to_csv('lazy_classifier_results.csv')

# Step 11: Visualize label distribution
plt.figure(figsize=(10, 6))
data['Behavior_Description'].value_counts().plot(kind='bar')
plt.title('Behavior Label Distribution')
plt.xlabel('Behavior')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# dataset 3
# Benchmark classifiers on the pre-computed window statistics of the resampled files.
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Column labels provided for accelerometer data and statistics
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Step 1: Load all data files and combine them into a single DataFrame
data_frames = [pd.read_csv(file_name, names=column_labels, header=None) for file_name in data_files]
data = pd.concat(data_frames, ignore_index=True)

# Step 2: Map numeric labels to meaningful behavior descriptions
# Here we assume the labels 1.0-6.0 map to specific behaviors (adjust this as necessary)
behavior_mapping = {
    1.0: 'Behavior 1',
    2.0: 'Behavior 2',
    3.0: 'Behavior 3',
    4.0: 'Behavior 4',
    5.0: 'Behavior 5',
    6.0: 'Behavior 6'
}

# The mapping keys are numbers, so make the label column numeric first; values
# that cannot be parsed (e.g. a header line read as data) become NaN.
data['label'] = pd.to_numeric(data['label'], errors='coerce')
data['Behavior_Description'] = data['label'].map(behavior_mapping)

# Rows whose label is not covered by the mapping have no target: drop them
# instead of letting NaN reach the encoder as if it were a behavior.
data = data.dropna(subset=['Behavior_Description']).reset_index(drop=True)
if data.empty:
    raise ValueError("No rows have a label covered by behavior_mapping; check the 'label' column.")

# Step 3: Encode the 'Behavior_Description' to numeric labels
label_encoder = LabelEncoder()
data['Behavior_Label'] = label_encoder.fit_transform(data['Behavior_Description'])

# Step 4: Define feature columns and target column
# We use all the columns except 'date' and 'label' as features.
# Coercing to numeric is a no-op for columns that are already numeric.
X = data.drop(columns=['date', 'label', 'Behavior_Description', 'Behavior_Label'])
X = X.apply(pd.to_numeric, errors='coerce')
y = data['Behavior_Label']

# Step 5: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Initialize LazyClassifier and fit the model
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display the model results
print("\nLazyClassifier Results:")
print(models)

# Optional: Save the results to a CSV file.
# The model names are the index of `models`; keep the index so they are written too.
models.to_csv('lazy_classifier_results.csv')

# Step 7: Visualize the label distribution
plt.figure(figsize=(10, 6))
data['Behavior_Description'].value_counts().plot(kind='bar')
plt.title('Behavior Label Distribution')
plt.xlabel('Behavior')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()


In [None]:
# dataset 3
# Benchmark classifiers on the pre-computed window statistics of the resampled files.
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Column labels provided for accelerometer data and statistics
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Step 1: Load all data files and combine them into a single DataFrame
data_frames = [pd.read_csv(file_name, names=column_labels, header=None) for file_name in data_files]
data = pd.concat(data_frames, ignore_index=True)

# Step 2: Map numeric labels to meaningful behavior descriptions
# Here we assume the labels 1-6 map to specific behaviors (adjust this as necessary)
behavior_mapping = {
    1: 'Behavior 1',
    2: 'Behavior 2',
    3: 'Behavior 3',
    4: 'Behavior 4',
    5: 'Behavior 5',
    6: 'Behavior 6'
}

# The mapping keys are numbers, so make the label column numeric first; values
# that cannot be parsed (e.g. a header line read as data) become NaN.
data['label'] = pd.to_numeric(data['label'], errors='coerce')
data['Behavior_Description'] = data['label'].map(behavior_mapping)

# Rows whose label is not covered by the mapping have no target: drop them
# instead of letting NaN reach the encoder as if it were a behavior.
data = data.dropna(subset=['Behavior_Description']).reset_index(drop=True)
if data.empty:
    raise ValueError("No rows have a label covered by behavior_mapping; check the 'label' column.")

# Step 3: Encode the 'Behavior_Description' to numeric labels
label_encoder = LabelEncoder()
data['Behavior_Label'] = label_encoder.fit_transform(data['Behavior_Description'])

# Step 4: Define feature columns and target column
# We use all the columns except 'date' and 'label' as features.
# Coercing to numeric is a no-op for columns that are already numeric.
X = data.drop(columns=['date', 'label', 'Behavior_Description', 'Behavior_Label'])
X = X.apply(pd.to_numeric, errors='coerce')
y = data['Behavior_Label']

# Step 5: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Initialize LazyClassifier and fit the model
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display the model results
print("\nLazyClassifier Results:")
print(models)

# Optional: Save the results to a CSV file.
# The model names are the index of `models`; keep the index so they are written too.
models.to_csv('lazy_classifier_results.csv')

# Step 7: Visualize the label distribution
plt.figure(figsize=(10, 6))
data['Behavior_Description'].value_counts().plot(kind='bar')
plt.title('Behavior Label Distribution')
plt.xlabel('Behavior')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Benchmark classifiers on three accelerometer axis columns of the resampled files.
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Step 1: Load all data files into a single DataFrame
# NOTE(review): other cells in this notebook read these same files with 31
# column names (date, per-axis statistics, ..., label). Confirm that the files
# really have exactly these five columns before trusting this cell's results.
column_labels = ['timestamp', 'X-axis (g)', 'Y-axis (g)', 'Z-axis (g)', 'label']  # Define the column names for your dataset
data_frames = [pd.read_csv(file_name, names=column_labels, header=None) for file_name in data_files]

# Combine all data into a single DataFrame
data = pd.concat(data_frames, ignore_index=True)

# Step 2: Map numeric labels to meaningful behavior descriptions
behavior_mapping = {
    1: 'Behavior 1',
    2: 'Behavior 2',
    3: 'Behavior 3',
    4: 'Behavior 4',
    5: 'Behavior 5',
    6: 'Behavior 6'
}

# The mapping keys are numbers, so make the label column numeric first; values
# that cannot be parsed (e.g. a header line read as data) become NaN.
data['label'] = pd.to_numeric(data['label'], errors='coerce')
data['Behavior_Description'] = data['label'].map(behavior_mapping)

# Rows whose label is not covered by the mapping have no target: drop them
# instead of letting NaN reach the encoder as if it were a behavior.
data = data.dropna(subset=['Behavior_Description']).reset_index(drop=True)
if data.empty:
    raise ValueError("No rows have a label covered by behavior_mapping; check the 'label' column.")

# Step 3: Encode the 'Behavior_Description' column to numeric labels
label_encoder = LabelEncoder()
data['Behavior_Label'] = label_encoder.fit_transform(data['Behavior_Description'])

# Print the mapping of behaviors to numeric labels
print("Behavior Mapping:")
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# Step 4: Define features (accelerometer data) and target (behavior labels)
# Coercing to numeric is a no-op for columns that are already numeric.
X = data[['X-axis (g)', 'Y-axis (g)', 'Z-axis (g)']].apply(pd.to_numeric, errors='coerce')  # Feature columns
y = data['Behavior_Label']                           # Target column

# Step 5: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Initialize LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Step 7: Fit LazyClassifier on the training data and evaluate on the test data
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display the results
print("\nLazyClassifier Results:")
print(models)

# Optional: Save the LazyClassifier results to a CSV file.
# The model names are the index of `models`; keep the index so they are written too.
models.to_csv('lazy_classifier_results.csv')

# Step 8: Visualize behavior label distribution (optional)
plt.figure(figsize=(10, 6))
data['Behavior_Description'].value_counts().plot(kind='bar')
plt.title('Behavior Label Distribution')
plt.xlabel('Behavior')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()


In [None]:
pip install lazypredict


In [None]:
# Benchmark classifiers on the resampled-window statistics with SMOTE balancing.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from lazypredict.Supervised import LazyClassifier

# Define column labels (adjust as per your dataset)
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load all data files and concatenate them into a single dataframe
data_frames = [pd.read_csv(file_name, names=column_labels, header=None) for file_name in data_files]
data = pd.concat(data_frames, ignore_index=True)

# Assuming 'date' is the column containing timestamps.
# errors='coerce' turns unparseable entries into NaT instead of aborting the cell.
data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Extract features (drop 'date' and 'label'); the mean imputer and the scaler
# below need numeric input, so coerce (a no-op for already numeric columns).
X = data.drop(columns=['date', 'label']).apply(pd.to_numeric, errors='coerce')
y = data['label']

# Discard rows that carry no usable information: no numeric feature at all
# (e.g. a header line read as data) or no label.
usable_rows = X.notna().any(axis=1) & y.notna()
X = X[usable_rows]
y = y[usable_rows]

# Split BEFORE imputing, scaling and oversampling. Fitting any of those on the
# full data lets information from the test rows (and synthetic neighbours of
# them) leak into training and inflates every reported score.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Impute missing values and scale, learning the statistics from the training split only
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE on the training split only.
# SMOTE's default of 5 neighbours is kept unless the rarest training class is
# too small to supply them.
smallest_train_class = int(y_train_raw.value_counts().min())
smote_neighbors = max(1, min(5, smallest_train_class - 1))
smote = SMOTE(random_state=42, k_neighbors=smote_neighbors)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Initialize LazyClassifier and perform lazy prediction
# (the test split contains only real, un-resampled rows)
lazy_clf = LazyClassifier()
models, predictions = lazy_clf.fit(X_train, X_test, y_train, y_test)

# Print the results
print(models)

In [None]:
# Benchmark classifiers on the resampled-window statistics with SMOTE balancing.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from lazypredict.Supervised import LazyClassifier

# Define column labels (as per your dataset)
# ('z_kurtosis' was misspelled 'z_kosis' here; it now matches the other cells)
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load all data files and concatenate them into a single dataframe
data_frames = [pd.read_csv(file_name, names=column_labels, header=None) for file_name in data_files]
data = pd.concat(data_frames, ignore_index=True)

# Convert 'date' column to datetime format.
# errors='coerce' turns unparseable entries into NaT instead of aborting the cell.
data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Extract features (drop 'date' and 'label'); the mean imputer and the scaler
# below need numeric input, so coerce (a no-op for already numeric columns).
X = data.drop(columns=['date', 'label']).apply(pd.to_numeric, errors='coerce')
y = data['label']

# Discard rows that carry no usable information: no numeric feature at all
# (e.g. a header line read as data) or no label.
usable_rows = X.notna().any(axis=1) & y.notna()
X = X[usable_rows]
y = y[usable_rows]

# Split BEFORE imputing, scaling and oversampling. Fitting any of those on the
# full data lets information from the test rows (and synthetic neighbours of
# them) leak into training and inflates every reported score.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Impute missing values and scale, learning the statistics from the training split only
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE on the training split only.
# SMOTE's default of 5 neighbours is kept unless the rarest training class is
# too small to supply them.
smallest_train_class = int(y_train_raw.value_counts().min())
smote_neighbors = max(1, min(5, smallest_train_class - 1))
smote = SMOTE(random_state=42, k_neighbors=smote_neighbors)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Initialize LazyClassifier and perform lazy prediction
# (the test split contains only real, un-resampled rows)
lazy_clf = LazyClassifier()
models, predictions = lazy_clf.fit(X_train, X_test, y_train, y_test)

# Print the results
print(models)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from lazypredict.Supervised import LazyClassifier

# Benchmark many classifiers with LazyPredict on the pre-computed feature files,
# printing a few diagnostics about the parsed 'date' column along the way.
# Pipeline: load -> split -> impute/scale (fit on train) -> SMOTE (train only).

# Column names applied to every file (they replace the files' own header row).
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load all data files; header=0 discards each file's own header row in favour
# of column_labels.
data_frames = [
    pd.read_csv(file_name, names=column_labels, header=0)
    for file_name in data_files
]

# Concatenate all data frames into a single dataframe
data = pd.concat(data_frames, ignore_index=True)

# Check the first few rows to confirm if the 'date' column is correctly loaded
print(data.head())

# Convert 'date' column to datetime format; unparseable values become NaT
data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Check for any rows where the 'date' column could not be parsed
print(data[data['date'].isna()])

# Extract features (drop 'date' and 'label')
X = data.drop(columns=['date', 'label'])
y = data['label']

# Split BEFORE fitting the imputer/scaler and BEFORE SMOTE: otherwise test rows
# influence the preprocessing statistics and synthetic samples interpolated
# from test points end up in the training set, inflating every score.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Impute missing values (statistics learned from the training split only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scale the input features (fitted on the training split only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Apply the already-fitted transforms to the held-out rows; the test set keeps
# its natural class distribution.
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE on the training split only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Initialize LazyClassifier and perform lazy prediction
lazy_clf = LazyClassifier()
models, predictions = lazy_clf.fit(X_train, X_test, y_train, y_test)

# Print the results
print(models)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from lazypredict.Supervised import LazyClassifier

# Benchmark many classifiers with LazyPredict on the pre-computed feature files,
# forcing the target to integer class ids first.
# Pipeline: load -> split -> impute/scale (fit on train) -> SMOTE (train only).

# Column names applied to every file (they replace the files' own header row).
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load all data files; header=0 discards each file's own header row in favour
# of column_labels.
data_frames = [
    pd.read_csv(file_name, names=column_labels, header=0)
    for file_name in data_files
]

# Concatenate all data frames into a single dataframe
data = pd.concat(data_frames, ignore_index=True)

# Convert 'date' column to datetime format; unparseable values become NaT
data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Check for any rows where the 'date' column could not be parsed
print(data[data['date'].isna()])

# Cast 'label' to integer class ids so downstream estimators treat the task as
# classification. This raises if the column holds NaN or non-numeric values.
data['label'] = data['label'].astype(int)

# Show which class ids are present
print(data['label'].unique())

# Extract features (drop 'date' and 'label')
X = data.drop(columns=['date', 'label'])
y = data['label']

# Split BEFORE fitting the imputer/scaler and BEFORE SMOTE: otherwise test rows
# influence the preprocessing statistics and synthetic samples interpolated
# from test points end up in the training set, inflating every score.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Impute missing values (statistics learned from the training split only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scale the input features (fitted on the training split only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Apply the already-fitted transforms to the held-out rows; the test set keeps
# its natural class distribution.
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE on the training split only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Initialize LazyClassifier and perform lazy prediction
lazy_clf = LazyClassifier()
models, predictions = lazy_clf.fit(X_train, X_test, y_train, y_test)

# Print the results
print(models)

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import lazypredict
from lazypredict.Supervised import LazyClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Average the feature rows over fixed-size windows (0.5 s at the estimated
# sampling rate), then benchmark many classifiers with LazyPredict.
# Pipeline: load -> window -> split -> impute/scale (fit on train) -> SMOTE (train only).

# Column names applied to every file (they replace the files' own header row).
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load all data files; header=0 discards each file's own header row in favour
# of column_labels.
data_frames = [
    pd.read_csv(file_name, names=column_labels, header=0)
    for file_name in data_files
]

# Concatenate all data frames into a single dataframe
data = pd.concat(data_frames, ignore_index=True)

# Parse timestamps; unparseable values become NaT instead of raising
data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Check if there are any invalid date values
if data['date'].isna().any():
    print("Warning: There are invalid date values in the dataset.")
    print(data[data['date'].isna()])

# Time difference between consecutive rows (NaT gaps are dropped)
time_diff = data['date'].diff().dropna()

# Calculate frequency (assuming uniform sampling)
mean_time_diff = time_diff.mean()
mean_interval_s = mean_time_diff.total_seconds()
# An empty, zero or negative mean interval (no parseable dates, or timestamps
# that go backwards where the files are concatenated) would otherwise produce
# a division by zero or a negative window size further down.
if not mean_interval_s > 0:
    raise ValueError(
        f"Cannot derive a sampling frequency from a mean interval of {mean_interval_s} s"
    )
frequency = 1 / mean_interval_s  # Convert to Hz
print(f"Sampling frequency for combined data: {frequency:.2f} Hz")

# Calculate window size based on frequency
window_duration = 0.5  # Seconds
window_size = int(frequency * window_duration)

# Ensure window size is at least 1
if window_size < 1:
    print("Window size is zero, adjusting to 1")
    window_size = 1

# Only genuine predictors are averaged. 'label' must not be part of X (its
# window mean would hand the target to the model) and 'date' is not a feature.
feature_columns = [col for col in data.columns if col not in ('date', 'label')]

# Keep complete windows only; a trailing partial window is discarded
n_windows = len(data) // window_size
if n_windows == 0:
    raise ValueError("Not enough rows to build a single complete window")
windowed_rows = data.iloc[:n_windows * window_size]
window_ids = np.arange(len(windowed_rows)) // window_size

# One row of mean feature values per window; the window's label is that of its
# first row (assumes the label is constant within a window)
X = windowed_rows[feature_columns].groupby(window_ids).mean()
y = windowed_rows['label'].to_numpy()[::window_size]

# Split BEFORE fitting the imputer/scaler and BEFORE SMOTE: otherwise test rows
# influence the preprocessing statistics and synthetic samples interpolated
# from test points end up in the training set, inflating every score.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Impute missing values (statistics learned from the training split only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scale the input features (fitted on the training split only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Apply the already-fitted transforms to the held-out rows
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE on the training split only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Use LazyPredict to compare models
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Print the model performance summary
print(models)


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import lazypredict
from lazypredict.Supervised import LazyClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Average the feature rows over fixed-size windows (0.5 s at the estimated
# sampling rate), make sure the labels are discrete, then benchmark many
# classifiers with LazyPredict.
# Pipeline: load -> window -> split -> impute/scale (fit on train) -> SMOTE (train only).

# Column names applied to every file (they replace the files' own header row).
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load all data files; header=0 discards each file's own header row in favour
# of column_labels.
data_frames = [
    pd.read_csv(file_name, names=column_labels, header=0)
    for file_name in data_files
]

# Concatenate all data frames into a single dataframe
data = pd.concat(data_frames, ignore_index=True)

# Parse timestamps; unparseable values become NaT instead of raising
data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Check if there are any invalid date values
if data['date'].isna().any():
    print("Warning: There are invalid date values in the dataset.")
    print(data[data['date'].isna()])

# Time difference between consecutive rows (NaT gaps are dropped)
time_diff = data['date'].diff().dropna()

# Calculate frequency (assuming uniform sampling)
mean_time_diff = time_diff.mean()
mean_interval_s = mean_time_diff.total_seconds()
# An empty, zero or negative mean interval (no parseable dates, or timestamps
# that go backwards where the files are concatenated) would otherwise produce
# a division by zero or a negative window size further down.
if not mean_interval_s > 0:
    raise ValueError(
        f"Cannot derive a sampling frequency from a mean interval of {mean_interval_s} s"
    )
frequency = 1 / mean_interval_s  # Convert to Hz
print(f"Sampling frequency for combined data: {frequency:.2f} Hz")

# Calculate window size based on frequency
window_duration = 0.5  # Seconds
window_size = int(frequency * window_duration)

# Ensure window size is at least 1
if window_size < 1:
    print("Window size is zero, adjusting to 1")
    window_size = 1

# Only genuine predictors are averaged. 'label' must not be part of X (its
# window mean would hand the target to the model) and 'date' is not a feature.
feature_columns = [col for col in data.columns if col not in ('date', 'label')]

# Keep complete windows only; a trailing partial window is discarded
n_windows = len(data) // window_size
if n_windows == 0:
    raise ValueError("Not enough rows to build a single complete window")
windowed_rows = data.iloc[:n_windows * window_size]
window_ids = np.arange(len(windowed_rows)) // window_size

# One row of mean feature values per window; the window's label is that of its
# first row (assumes the label is constant within a window)
X = windowed_rows[feature_columns].groupby(window_ids).mean()
y = windowed_rows['label'].to_numpy()[::window_size]

# Check if the labels are continuous (for regression) or discrete (for classification)
print(np.unique(y))

if np.issubdtype(y.dtype, np.floating):
    if np.all(np.isfinite(y)) and np.all(y == np.round(y)):
        # Class ids that merely happen to be stored as floats (1.0, 2.0, ...):
        # cast them so every class keeps its identity. Binning them would
        # silently merge distinct classes into three arbitrary groups.
        y_discrete = y.astype(int)
        print("Float labels were integer-valued and have been cast to int.")
    else:
        # Genuinely continuous target: fall back to three equal-width bins
        # (codes 0, 1, 2). Adjust as needed for your dataset.
        y_discrete = pd.cut(y, bins=3, labels=False)
        print("Labels have been binned into discrete classes.")
else:
    y_discrete = y  # No change if labels are already discrete

# Split BEFORE fitting the imputer/scaler and BEFORE SMOTE: otherwise test rows
# influence the preprocessing statistics and synthetic samples interpolated
# from test points end up in the training set, inflating every score.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y_discrete, test_size=0.2, stratify=y_discrete, random_state=42
)

# Impute missing values (statistics learned from the training split only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scale the input features (fitted on the training split only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Apply the already-fitted transforms to the held-out rows
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE on the training split only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Use LazyPredict to compare models
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Print the model performance summary
print(models)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import lazypredict
from lazypredict.Supervised import LazyClassifier

# Average the feature rows over fixed-size windows (0.5 s at the estimated
# sampling rate), then benchmark many classifiers with LazyPredict.
# Pipeline: load -> window -> split -> impute/scale (fit on train) -> SMOTE (train only).

# Expected column layout. The files are read with their own header row below,
# so this list is kept for reference only.
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load all data files (column names come from each file's header row)
data_frames = [pd.read_csv(file_name) for file_name in data_files]

# Concatenate all data frames into a single dataframe
data = pd.concat(data_frames, ignore_index=True)

# Assuming 'date' is the column containing timestamps
data['date'] = pd.to_datetime(data['date'])  # Convert to datetime format

# Time difference between consecutive rows
time_diff = data['date'].diff().dropna()

# Calculate frequency (assuming uniform sampling)
mean_time_diff = time_diff.mean()
mean_interval_s = mean_time_diff.total_seconds()
# An empty, zero or negative mean interval (fewer than two rows, or timestamps
# that go backwards where the files are concatenated) would otherwise produce
# a division by zero or a negative window size further down.
if not mean_interval_s > 0:
    raise ValueError(
        f"Cannot derive a sampling frequency from a mean interval of {mean_interval_s} s"
    )
frequency = 1 / mean_interval_s  # Convert to Hz

print("Sampling frequency for combined data:", frequency, "Hz")

# Calculate window size based on frequency
window_duration = 0.5  # Seconds
window_size = int(frequency * window_duration)

if window_size < 1:
    print("Window size is zero, adjusting to 1")
    window_size = 1

# Only genuine predictors are averaged. 'label' must not be part of X (its
# window mean would hand the target to the model) and 'date' is not a feature.
feature_columns = [col for col in data.columns if col not in ('date', 'label')]

# Keep complete windows only; a trailing partial window is discarded
n_windows = len(data) // window_size
if n_windows == 0:
    raise ValueError("Not enough rows to build a single complete window")
windowed_rows = data.iloc[:n_windows * window_size]
window_ids = np.arange(len(windowed_rows)) // window_size

# One row of mean feature values per window; the window's label is that of its
# first row (assumes the label is constant within a window)
X = windowed_rows[feature_columns].groupby(window_ids).mean()
y = windowed_rows['label'].to_numpy()[::window_size]

# Split BEFORE fitting the imputer/scaler and BEFORE SMOTE: otherwise test rows
# influence the preprocessing statistics and synthetic samples interpolated
# from test points end up in the training set, inflating every score.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Impute missing values (statistics learned from the training split only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scale the input features (fitted on the training split only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Apply the already-fitted transforms to the held-out rows
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE on the training split only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Use LazyPredict to evaluate models; fit returns (scores, predictions)
lazy_classifier = LazyClassifier()
models = lazy_classifier.fit(X_train, X_test, y_train, y_test)

# Show model performance comparison
print(models[0])  # This prints out performance metrics of all models

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import lazypredict
from lazypredict.Supervised import LazyClassifier

# Window-averaged features -> LazyPredict model comparison.
# Steps: read the CSVs, estimate the sampling rate, average every complete
# 0.5 s window, split, preprocess on the training split, oversample it, benchmark.

# Expected column layout. The files are read with their own header row below,
# so this list is kept for reference only.
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load all data files (column names come from each file's header row)
data_frames = [pd.read_csv(file_name) for file_name in data_files]

# Concatenate all data frames into a single dataframe
data = pd.concat(data_frames, ignore_index=True)

# Assuming 'date' is the column containing timestamps
data['date'] = pd.to_datetime(data['date'])  # Convert to datetime format

# Time difference between consecutive rows
time_diff = data['date'].diff().dropna()

# Calculate frequency (assuming uniform sampling)
mean_time_diff = time_diff.mean()
mean_interval_s = mean_time_diff.total_seconds()
# Guard the division: the mean interval is NaN with fewer than two rows and
# can be zero or negative when timestamps repeat or jump backwards at the
# points where the files are concatenated.
if not mean_interval_s > 0:
    raise ValueError(
        f"Cannot derive a sampling frequency from a mean interval of {mean_interval_s} s"
    )
frequency = 1 / mean_interval_s  # Convert to Hz

print("Sampling frequency for combined data:", frequency, "Hz")

# Calculate window size based on frequency
window_duration = 0.5  # Seconds
window_size = int(frequency * window_duration)

if window_size < 1:
    print("Window size is zero, adjusting to 1")
    window_size = 1

# Predictors exclude 'date' (not a feature) and 'label' (the target: averaging
# it into X would leak the answer to every model).
feature_columns = [col for col in data.columns if col not in ('date', 'label')]

# Number of complete windows; leftover rows at the end are ignored
n_windows = len(data) // window_size
if n_windows == 0:
    raise ValueError("Not enough rows to build a single complete window")
windowed_rows = data.iloc[:n_windows * window_size]
window_ids = np.arange(len(windowed_rows)) // window_size

# Mean of each feature per window, labelled with the window's first row
# (assumes the label is constant within a window)
X = windowed_rows[feature_columns].groupby(window_ids).mean()
y = windowed_rows['label'].to_numpy()[::window_size]

# Hold out the test set first. Fitting the imputer, the scaler or SMOTE on the
# full data would let test rows shape the training set and inflate the scores.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Impute missing values (statistics learned from the training split only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scale the input features (fitted on the training split only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Apply the already-fitted transforms to the held-out rows
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE on the training split only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Use LazyPredict to evaluate models; fit returns (scores, predictions)
lazy_classifier = LazyClassifier()
models = lazy_classifier.fit(X_train, X_test, y_train, y_test)

# Show model performance comparison
print(models[0])  # This prints out performance metrics of all models

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import lazypredict
from lazypredict.Supervised import LazyClassifier
import warnings

# Window-averaged features -> LazyPredict model comparison, with Dask
# FutureWarnings silenced.
# Pipeline: load -> window -> split -> impute/scale (fit on train) -> SMOTE (train only).

# Suppress Dask warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="dask")

# Expected column layout. The files are read with their own header row below,
# so this list is kept for reference only.
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load all data files (column names come from each file's header row)
data_frames = [pd.read_csv(file_name) for file_name in data_files]

# Concatenate all data frames into a single dataframe
data = pd.concat(data_frames, ignore_index=True)

# Assuming 'date' is the column containing timestamps
data['date'] = pd.to_datetime(data['date'])  # Convert to datetime format

# Time difference between consecutive rows
time_diff = data['date'].diff().dropna()

# Calculate frequency (assuming uniform sampling)
mean_time_diff = time_diff.mean()
mean_interval_s = mean_time_diff.total_seconds()
# max(1, ...) below protects against a small or negative frequency, but not
# against a zero interval (division by zero) or a NaN one (int(nan) raises),
# so reject those explicitly.
if not mean_interval_s > 0:
    raise ValueError(
        f"Cannot derive a sampling frequency from a mean interval of {mean_interval_s} s"
    )
frequency = 1 / mean_interval_s  # Convert to Hz

print("Sampling frequency for combined data:", frequency, "Hz")

# Calculate window size based on frequency
window_duration = 0.5  # Seconds
window_size = max(1, int(frequency * window_duration))  # Ensure window size is at least 1

# Only genuine predictors are averaged. 'label' must not be part of X (its
# window mean would hand the target to the model) and 'date' is not a feature.
feature_columns = [col for col in data.columns if col not in ('date', 'label')]

# Keep complete windows only; a trailing partial window is discarded
n_windows = len(data) // window_size
if n_windows == 0:
    raise ValueError("Not enough rows to build a single complete window")
windowed_rows = data.iloc[:n_windows * window_size]
window_ids = np.arange(len(windowed_rows)) // window_size

# One row of mean feature values per window; the window's label is that of its
# first row (assumes the label is constant within a window)
X = windowed_rows[feature_columns].groupby(window_ids).mean()
y = windowed_rows['label'].to_numpy()[::window_size]

# Split BEFORE fitting the imputer/scaler and BEFORE SMOTE: otherwise test rows
# influence the preprocessing statistics and synthetic samples interpolated
# from test points end up in the training set, inflating every score.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Impute missing values (statistics learned from the training split only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scale the input features (fitted on the training split only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Apply the already-fitted transforms to the held-out rows
X_test = scaler.transform(imputer.transform(X_test_raw))

# Deal with class imbalance using SMOTE on the training split only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Use LazyPredict to evaluate models; fit returns (scores, predictions)
lazy_classifier = LazyClassifier()
models = lazy_classifier.fit(X_train, X_test, y_train, y_test)

# Show model performance comparison
print(models[0])  # This prints out performance metrics of all models

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import lazypredict
from lazypredict.Supervised import LazyClassifier
import warnings

# Benchmark many classifiers with LazyPredict on the per-row features.
# Pipeline: load -> split -> impute/scale (fit on train) -> SMOTE (train only).

# Suppress Dask warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="dask")

# Column layout of the data files: timestamp, features, then the target
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load all data files (column names come from each file's header row)
data_frames = [pd.read_csv(file_name) for file_name in data_files]

# Concatenate all data frames into a single dataframe
data = pd.concat(data_frames, ignore_index=True)

# Assuming 'date' is the column containing timestamps
data['date'] = pd.to_datetime(data['date'])  # Convert to datetime format

# Feature columns: everything except 'date' and 'label'.
# column_labels[:-1] only removes 'label'; the datetime 'date' column it leaves
# behind makes the mean imputer fail, so both are excluded by name.
feature_columns = [col for col in column_labels if col not in ('date', 'label')]
X = data[feature_columns]

# Target variable
y = data['label']

# Split BEFORE fitting the imputer/scaler and BEFORE SMOTE: otherwise test rows
# influence the preprocessing statistics and synthetic samples interpolated
# from test points end up in the training set, inflating every score.
# Stratify so that rare classes are present in both splits.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Handle missing values (statistics learned from the training split only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scaling features (fitted on the training split only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Apply the already-fitted transforms to the held-out rows
X_test = scaler.transform(imputer.transform(X_test_raw))

# Apply SMOTE for class balancing on the training split only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Apply LazyPredict classifier
clf = LazyClassifier()
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# View results
print(models)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import lazypredict
from lazypredict.Supervised import LazyClassifier
import warnings

# Benchmark many classifiers with LazyPredict on the per-row features.
# Pipeline: load -> split -> impute/scale (fit on train) -> SMOTE (train only).

# Suppress Dask warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="dask")

# Column layout of the data files: timestamp, features, then the target
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load all data files (column names come from each file's header row)
data_frames = [pd.read_csv(file_name) for file_name in data_files]

# Concatenate all data frames into a single dataframe
data = pd.concat(data_frames, ignore_index=True)

# Assuming 'date' is the column containing timestamps
data['date'] = pd.to_datetime(data['date'])  # Convert to datetime format

# Remove both 'date' and 'label' from the features.
# column_labels[:-1] only removes 'label'; the datetime 'date' column it leaves
# behind makes the mean imputer fail, so both are excluded by name.
# If calendar information is wanted as a predictor, derive numeric columns
# (e.g. data['date'].dt.year / .dt.month / .dt.day) and add them to X instead.
feature_columns = [col for col in column_labels if col not in ('date', 'label')]
X = data[feature_columns]

# Target variable
y = data['label']

# Split BEFORE fitting the imputer/scaler and BEFORE SMOTE: otherwise test rows
# influence the preprocessing statistics and synthetic samples interpolated
# from test points end up in the training set, inflating every score.
# Stratify so that rare classes are present in both splits.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Handle missing values (statistics learned from the training split only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scaling features (fitted on the training split only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Apply the already-fitted transforms to the held-out rows
X_test = scaler.transform(imputer.transform(X_test_raw))

# Apply SMOTE for class balancing on the training split only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Apply LazyPredict classifier
clf = LazyClassifier()
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# View results
print(models)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import lazypredict
from lazypredict.Supervised import LazyClassifier
import warnings

# Benchmark many classifiers with LazyPredict on the per-row features.
# Pipeline: load -> split -> impute/scale (fit on train) -> SMOTE (train only).

# Suppress Dask warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="dask")

# Column layout of the data files: timestamp, features, then the target
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load all data files (column names come from each file's header row)
data_frames = [pd.read_csv(file_name) for file_name in data_files]

# Concatenate all data frames into a single dataframe
data = pd.concat(data_frames, ignore_index=True)

# Assuming 'date' is the column containing timestamps
data['date'] = pd.to_datetime(data['date'])  # Convert to datetime format

# Features: every listed column except the timestamp and the target
feature_columns = [col for col in column_labels if col not in ('date', 'label')]
X = data[feature_columns]

# Target variable
y = data['label']

# Split BEFORE fitting the imputer/scaler and BEFORE SMOTE: otherwise test rows
# influence the preprocessing statistics and synthetic samples interpolated
# from test points end up in the training set, inflating every score.
# Stratify so that rare classes are present in both splits.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Handle missing values (statistics learned from the training split only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scaling features (fitted on the training split only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Apply the already-fitted transforms to the held-out rows
X_test = scaler.transform(imputer.transform(X_test_raw))

# Apply SMOTE for class balancing on the training split only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Apply LazyPredict classifier
clf = LazyClassifier()
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# View results
print(models)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import lazypredict
from lazypredict.Supervised import LazyClassifier
import warnings

# Benchmark many classifiers with LazyPredict on a 10% random sample of the
# rows, to keep memory usage down.
# Pipeline: load -> sample -> split -> impute/scale (fit on train) -> SMOTE (train only).

# Suppress Dask warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="dask")

# Column layout of the data files: timestamp, features, then the target
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load all data files (column names come from each file's header row)
data_frames = [pd.read_csv(file_name) for file_name in data_files]

# Concatenate all data frames into a single dataframe
data = pd.concat(data_frames, ignore_index=True)

# Assuming 'date' is the column containing timestamps
data['date'] = pd.to_datetime(data['date'])  # Convert to datetime format

# Features: every listed column except the timestamp and the target
feature_columns = [col for col in column_labels if col not in ('date', 'label')]

# Full-size features/target (not used by the model run below)
X = data[feature_columns]
y = data['label']

# Downsample to 10% of the data to reduce memory usage
data_downsampled = data.sample(frac=0.1, random_state=42)

# Extract features and target from downsampled data.
# Columns are selected by name: a boolean mask built from the full frame's
# columns has one entry more than a frame already reduced to
# column_labels[:-1], so using it with .loc fails on the length mismatch.
X_downsampled = data_downsampled[feature_columns]
y_downsampled = data_downsampled['label']

# Split BEFORE fitting the imputer/scaler and BEFORE SMOTE: otherwise test rows
# influence the preprocessing statistics and synthetic samples interpolated
# from test points end up in the training set, inflating every score.
# Stratify so that rare classes are present in both splits.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_downsampled, y_downsampled, test_size=0.2, stratify=y_downsampled, random_state=42
)

# Handle missing values (statistics learned from the training split only)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scaling features (fitted on the training split only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Apply the already-fitted transforms to the held-out rows
X_test = scaler.transform(imputer.transform(X_test_raw))

# Apply SMOTE for class balancing on the training split only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Apply LazyPredict classifier
clf = LazyClassifier()
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# View results
print(models)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import lazypredict
from lazypredict.Supervised import LazyClassifier
import warnings

# Suppress Dask warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="dask")

# Define column labels (adjust as per your dataset)
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load every file and stack the frames into a single dataframe
data_frames = []
for file_name in data_files:
    df = pd.read_csv(file_name)
    data_frames.append(df)
data = pd.concat(data_frames, ignore_index=True)

# Assuming 'date' is the column containing timestamps
data['date'] = pd.to_datetime(data['date'])  # Convert to datetime format

# Downsample to 30% of the rows for quicker testing (adjust as necessary)
data_downsampled = data.sample(frac=0.3, random_state=42)

# Extract features and target from downsampled data
X_downsampled = data_downsampled.drop(columns=['date', 'label'])  # Drop 'date' and 'label' columns
y_downsampled = data_downsampled['label']  # 'label' is the target variable

# Split BEFORE any fitted preprocessing or oversampling: fitting the imputer,
# scaler or SMOTE on rows that later land in the test set leaks information
# and fills the test set with synthetic samples, which inflates every score.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_downsampled, y_downsampled, test_size=0.2, random_state=42, stratify=y_downsampled
)

# Handle missing values: statistics come from the training rows only
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scaling features: fitted on the training rows only
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# The test rows are only transformed with the already-fitted steps
X_test = scaler.transform(imputer.transform(X_test_raw))

# Apply SMOTE for class balancing on the training set only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Apply LazyPredict classifier
clf = LazyClassifier()
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# View results
print(models)


In [None]:
pip install xgboost

In [None]:
# Dataset 4 (dataset_6.csv)
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Load the data from CSV file
data = pd.read_csv("dataset_6.csv")

# Display the first few rows to understand the data structure
print(data.head())

# Step 1: Clean the data - remove rows with missing values.
# .copy() gives an independent frame so the column assignment below cannot
# trigger SettingWithCopyWarning.
data_cleaned = data.dropna().copy()

# Step 2: Map behavior labels (column 'label') to numeric class ids
behavior_mapping = {
    'Grazing': 0,
    'Lying-Resting': 1,
    'Lying-Ruminating': 2,
    'Standing-Resting': 3,
    'Standing-Ruminating': 4,
    'Walking': 5
}
data_cleaned['Behavior_Label'] = data_cleaned['label'].map(behavior_mapping)

# Any label absent from behavior_mapping maps to NaN, which would silently
# corrupt the target; fail loudly and name the offending labels instead.
unmapped_labels = data_cleaned.loc[data_cleaned['Behavior_Label'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels missing from behavior_mapping: {sorted(map(str, unmapped_labels))}")
data_cleaned['Behavior_Label'] = data_cleaned['Behavior_Label'].astype(int)

# Step 3: Define feature columns (exclude 'label' and 'Behavior_Label' columns)
# Assuming every remaining column is a relevant feature for classification
X = data_cleaned.drop(columns=['label', 'Behavior_Label'])

# Define the target variable (Behavior labels)
y = data_cleaned['Behavior_Label']

# Step 4: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Initialize LazyClassifier and fit the model
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display the model results
print("\nLazyClassifier Results:")
print(models)

# Optional: Save the results to a CSV file.
# The model names are the index of the results frame, so the index must be
# written; index=False would produce a file of scores with no model names.
# NOTE(review): other cells write to this same file name and overwrite it.
models.to_csv('lazy_classifier_results.csv')

# Step 6: Visualize the label distribution
plt.figure(figsize=(10, 6))
data_cleaned['label'].value_counts().plot(kind='bar')
plt.title('Behavior Label Distribution')
plt.xlabel('Behavior')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
pip install pandas numpy scikit-learn lazypredict matplotlib

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import lazypredict
from lazypredict.Supervised import LazyClassifier
import xgboost as xgb
from sklearn.metrics import classification_report
import warnings

# Suppress Dask warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="dask")

# Define column labels (adjust as per your dataset)
column_labels = ['date', 'x_mean', 'x_max', 'x_min', 'x_std', 'x_var', 'x_skew', 'x_kurtosis',
                 'y_mean', 'y_max', 'y_min', 'y_std', 'y_var', 'y_skew', 'y_kurtosis',
                 'z_mean', 'z_max', 'z_min', 'z_std', 'z_var', 'z_skew', 'z_kurtosis',
                 'n', 'x_range', 'y_range', 'z_range', 'svm', 'vmm', 'sma', 'ai', 'label']

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load every file and stack the frames into a single dataframe
data_frames = []
for file_name in data_files:
    df = pd.read_csv(file_name)
    data_frames.append(df)
data = pd.concat(data_frames, ignore_index=True)

# Assuming 'date' is the column containing timestamps
data['date'] = pd.to_datetime(data['date'])  # Convert to datetime format

# Downsample to 30% of the rows for quicker testing (adjust as necessary)
data_downsampled = data.sample(frac=0.3, random_state=42)

# Extract features from downsampled data
X_downsampled = data_downsampled.drop(columns=['date', 'label'])  # Drop 'date' and 'label' columns

# XGBClassifier expects integer classes 0..K-1. factorize(sort=True) assigns
# those codes in sorted label order and keeps the original labels in
# label_classes, so position i of label_classes is the label of class i.
label_codes, label_classes = pd.factorize(data_downsampled['label'], sort=True)
y_downsampled = pd.Series(label_codes, index=data_downsampled.index, name='label')
print("Class code -> original label:", dict(enumerate(label_classes)))

# Split BEFORE any fitted preprocessing or oversampling: fitting the imputer,
# scaler or SMOTE on rows that later land in the test set leaks information
# and fills the test set with synthetic samples, which inflates every score.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_downsampled, y_downsampled, test_size=0.2, random_state=42, stratify=y_downsampled
)

# Handle missing values: statistics come from the training rows only
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scaling features: fitted on the training rows only
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# The test rows are only transformed with the already-fitted steps
X_test = scaler.transform(imputer.transform(X_test_raw))

# Apply SMOTE for class balancing on the training set only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Apply LazyPredict classifier
clf = LazyClassifier()
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# View results from LazyPredict
print(models)

# Now, let's add XGBClassifier for comparison
xgb_clf = xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)
xgb_clf.fit(X_train, y_train)

# Predict with XGBClassifier
y_pred_xgb = xgb_clf.predict(X_test)

# Display classification report for XGBClassifier, using the original label
# values as row names; `labels` keeps the rows aligned with label_classes.
print("\nXGBoost Classification Report:")
print(classification_report(
    y_test,
    y_pred_xgb,
    labels=list(range(len(label_classes))),
    target_names=[str(label) for label in label_classes],
))


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.metrics import accuracy_score
import time
import warnings

# Suppress Dask warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="dask")

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load every file and stack the frames into a single dataframe
data_frames = []
for file_name in data_files:
    df = pd.read_csv(file_name)
    data_frames.append(df)
data = pd.concat(data_frames, ignore_index=True)

# Assuming 'date' is the column containing timestamps
data['date'] = pd.to_datetime(data['date'])  # Convert to datetime format

# Downsampling (example step, adjust as necessary)
data_downsampled = data.sample(frac=0.3, random_state=42)

# Extract features from downsampled data
X_downsampled = data_downsampled.drop(columns=['date', 'label'])  # Drop 'date' and 'label' columns

# XGBClassifier expects integer classes 0..K-1. factorize(sort=True) assigns
# those codes in sorted label order and keeps the original labels in
# label_classes, so position i of label_classes is the label of class i.
label_codes, label_classes = pd.factorize(data_downsampled['label'], sort=True)
y_downsampled = pd.Series(label_codes, index=data_downsampled.index, name='label')

# Split BEFORE any fitted preprocessing or oversampling: fitting the imputer,
# scaler or SMOTE on rows that later land in the test set leaks information
# and fills the test set with synthetic samples, which inflates the accuracy.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_downsampled, y_downsampled, test_size=0.2, random_state=42, stratify=y_downsampled
)

# Handle missing values: statistics come from the training rows only
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scaling features: fitted on the training rows only
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# The test rows are only transformed with the already-fitted steps
X_test = scaler.transform(imputer.transform(X_test_raw))

# Apply SMOTE for class balancing on the training set only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Initialize the XGBClassifier
xgb_clf = xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)

# Time the fit with a monotonic, high-resolution clock meant for intervals
start_time = time.perf_counter()
xgb_clf.fit(X_train, y_train)
end_time = time.perf_counter()

# Calculate training time
train_time = end_time - start_time

# Make predictions on the test set
y_pred_xgb = xgb_clf.predict(X_test)

# Calculate accuracy (both sides are class codes, see label_classes)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# Print results
print(f"XGBoost Accuracy: {accuracy_xgb:.4f}")
print(f"XGBoost Training Time: {train_time:.4f} seconds")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.metrics import accuracy_score
import time
import warnings

# Suppress Dask warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="dask")

# Data files
data_files = ["resampled_2_1S.csv", "resampled_2_3S.csv", "resampled_2_5S.csv"]

# Load every file and stack the frames into a single dataframe
data_frames = []
for file_name in data_files:
    df = pd.read_csv(file_name)
    data_frames.append(df)
data = pd.concat(data_frames, ignore_index=True)

# Assuming 'date' is the column containing timestamps
data['date'] = pd.to_datetime(data['date'])  # Convert to datetime format

# Downsampling (example step, adjust as necessary)
data_downsampled = data.sample(frac=0.3, random_state=42)

# Extract features from downsampled data
X_downsampled = data_downsampled.drop(columns=['date', 'label'])  # Drop 'date' and 'label' columns

# Make the class labels start from 0, as XGBClassifier requires.
# factorize(sort=True) yields codes 0..K-1 in sorted label order, which equals
# "label - 1" when the labels are exactly 1..K but also works when they are
# not contiguous, and it does not modify data_downsampled in place.
label_codes, label_classes = pd.factorize(data_downsampled['label'], sort=True)
y_downsampled = pd.Series(label_codes, index=data_downsampled.index, name='label')

# Split BEFORE any fitted preprocessing or oversampling: fitting the imputer,
# scaler or SMOTE on rows that later land in the test set leaks information
# and fills the test set with synthetic samples, which inflates the accuracy.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_downsampled, y_downsampled, test_size=0.2, random_state=42, stratify=y_downsampled
)

# Handle missing values: statistics come from the training rows only
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train_raw)

# Scaling features: fitted on the training rows only
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# The test rows are only transformed with the already-fitted steps
X_test = scaler.transform(imputer.transform(X_test_raw))

# Apply SMOTE for class balancing on the training set only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Initialize the XGBClassifier
xgb_clf = xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)

# Time the fit with a monotonic, high-resolution clock meant for intervals
start_time = time.perf_counter()
xgb_clf.fit(X_train, y_train)
end_time = time.perf_counter()

# Calculate training time
train_time = end_time - start_time

# Make predictions on the test set
y_pred_xgb = xgb_clf.predict(X_test)

# Calculate accuracy (both sides are class codes, see label_classes)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# Print results
print(f"XGBoost Accuracy: {accuracy_xgb:.4f}")
print(f"XGBoost Training Time: {train_time:.4f} seconds")

In [None]:
#dataset 1 (data)
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Load the data from CSV file
data = pd.read_csv("data.csv")

# Display the first few rows to understand the data structure
print(data.head())

# Step 1: Clean the data - remove rows with missing values
data_cleaned = data.dropna()

# Step 2: Behavior labels are used as-is (already numeric in the provided dataset).
# Assuming the 'behavior' column encodes:
# Feeding: 0, Rumination: 1, Standing: 2, Lying: 3, Walking: 4
behavior_names = {0: "Feeding", 1: "Rumination", 2: "Standing", 3: "Lying", 4: "Walking"}

# Step 3: Define feature columns (exclude 'date', 'time', 'cow_num', 'behavior')
X = data_cleaned.drop(columns=['date', 'time', 'cow_num', 'behavior'])

# Define the target variable (Behavior labels)
y = data_cleaned['behavior']  # This column contains the behavior labels (already numeric)

# Step 4: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Initialize LazyClassifier and fit the model
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display the model results
print("\nLazyClassifier Results:")
print(models)

# Optional: Save the results to a CSV file.
# The model names are the index of the results frame, so the index must be
# written; index=False would produce a file of scores with no model names.
# NOTE(review): other cells write to this same file name and overwrite it.
models.to_csv('lazy_classifier_results.csv')

# Step 6: Visualize the behavior label distribution.
# value_counts() orders bars by frequency, so a fixed list of tick labels would
# be attached to the wrong bars (and plt.xticks rejects `labels` without
# `ticks`). Sorting by code and renaming the index keeps each name on its bar;
# codes missing from behavior_names keep their numeric label.
behavior_counts = (
    data_cleaned['behavior']
    .value_counts()
    .sort_index()
    .rename(index=behavior_names)
)
plt.figure(figsize=(10, 6))
behavior_counts.plot(kind='bar')
plt.title('Behavior Label Distribution')
plt.xlabel('Behavior')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
import matplotlib.pyplot as plt

# Load the data from CSV file
data = pd.read_csv("data.csv")

# Display the first few rows to understand the data structure
print(data.head())

# Step 1: Clean the data - remove rows with missing values
data_cleaned = data.dropna()

# Step 2: Randomly sample a smaller subset of the data (10% of the cleaned rows)
sampled_data = data_cleaned.sample(frac=0.1, random_state=42)  # Adjust frac for different sizes

# Step 3: Define feature columns (exclude 'date', 'time', 'cow_num', 'behavior')
X = sampled_data.drop(columns=['date', 'time', 'cow_num', 'behavior'])

# Define the target variable (Behavior labels)
y = sampled_data['behavior']  # This column contains the behavior labels (already numeric)

# Step 4: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Initialize LazyClassifier and fit the model
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display the model results
print("\nLazyClassifier Results:")
print(models)

# Optional: Save the results to a CSV file.
# The model names are the index of the results frame, so the index must be
# written; index=False would produce a file of scores with no model names.
# NOTE(review): other cells write to this same file name and overwrite it.
models.to_csv('lazy_classifier_results.csv')

# Step 6: Visualize the behavior label distribution.
# Assumes behavior codes 0-4 mean Feeding, Rumination, Standing, Lying,
# Walking in that order - TODO confirm against the dataset description.
behavior_names = {0: "Feeding", 1: "Rumination", 2: "Standing", 3: "Lying", 4: "Walking"}

# value_counts() orders bars by frequency, so a fixed list of tick labels would
# be attached to the wrong bars (and plt.xticks rejects `labels` without
# `ticks`). Sorting by code and renaming the index keeps each name on its bar;
# codes missing from behavior_names keep their numeric label.
behavior_counts = (
    sampled_data['behavior']
    .value_counts()
    .sort_index()
    .rename(index=behavior_names)
)
plt.figure(figsize=(10, 6))
behavior_counts.plot(kind='bar')
plt.title('Behavior Label Distribution')
plt.xlabel('Behavior')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
import matplotlib.pyplot as plt

# Load only the necessary columns to save memory
cols_to_use = ['acc_x', 'acc_y', 'acc_z', 'behavior']  # Essential columns

# All rows are read here (only the column set is restricted);
# the reduction to 1% of the rows happens in Step 2 below.
data = pd.read_csv("data.csv", usecols=cols_to_use)

# Display the first few rows to understand the data structure
print(data.head())

# Step 1: Clean the data - remove rows with missing values
data_cleaned = data.dropna()

# Step 2: Randomly sample a smaller subset of the data (1% of the cleaned rows)
sampled_data = data_cleaned.sample(frac=0.01, random_state=42)  # Adjust frac for different sizes

# Step 3: Define feature columns (exclude 'behavior' column)
X = sampled_data.drop(columns=['behavior'])

# Define the target variable (Behavior labels)
y = sampled_data['behavior']  # This column contains the behavior labels (already numeric)

# Step 4: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Initialize LazyClassifier and fit the model
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display the model results
print("\nLazyClassifier Results:")
print(models)

# Optional: Save the results to a CSV file.
# The model names are the index of the results frame, so the index must be
# written; index=False would produce a file of scores with no model names.
# NOTE(review): other cells write to this same file name and overwrite it.
models.to_csv('lazy_classifier_results.csv')

# Step 6: Visualize the behavior label distribution.
# Assumes behavior codes 0-4 mean Feeding, Rumination, Standing, Lying,
# Walking in that order - TODO confirm against the dataset description.
behavior_names = {0: "Feeding", 1: "Rumination", 2: "Standing", 3: "Lying", 4: "Walking"}

# value_counts() orders bars by frequency, so a fixed list of tick labels would
# be attached to the wrong bars (and plt.xticks rejects `labels` without
# `ticks`). Sorting by code and renaming the index keeps each name on its bar;
# codes missing from behavior_names keep their numeric label.
behavior_counts = (
    sampled_data['behavior']
    .value_counts()
    .sort_index()
    .rename(index=behavior_names)
)
plt.figure(figsize=(10, 6))
behavior_counts.plot(kind='bar')
plt.title('Behavior Label Distribution')
plt.xlabel('Behavior')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Hand-entered results per dataset: parallel lists where position i of
# 'accuracies' and 'time' is meant to belong to position i of 'models'.
# NOTE(review): several lists are NOT the same length as their 'models' list,
# so the pairing is broken and bar() on them raises a shape-mismatch error:
#   Dataset 1: 26 models, 25 accuracies, 26 times
#   Dataset 2: 25 models, 25 accuracies, 26 times
#   Dataset 3: 26 models, 28 accuracies, 26 times
#   Dataset 4: 18 models, 18 accuracies, 18 times (consistent)
#   Dataset 5: 26 models, 25 accuracies, 25 times
# The missing/extra values cannot be identified from this cell; re-check them
# against the original LazyClassifier output.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.16],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04],
    },
    'Dataset 4': {
        'models': ['ExtraTreesClassifier', 'XGBClassifier', 'LGBMClassifier', 'RandomForestClassifier', 'LabelPropagation',
                   'KNeighborsClassifier', 'LabelSpreading', 'BaggingClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                   'SVC', 'QuadraticDiscriminantAnalysis', 'GaussianNB', 'AdaBoostClassifier', 'NearestCentroid', 'LogisticRegression',
                   'LinearDiscriminantAnalysis', 'CalibratedClassifierCV'],
        'accuracies': [0.91, 0.89, 0.89, 0.89, 0.86, 0.87, 0.86, 0.86, 0.82, 0.81, 0.82, 0.73, 0.69, 0.65, 0.63, 0.71, 0.70, 0.70],
        'time': [2.98, 9.66, 4.62, 1.31, 2.44, 0.70, 1.50, 1.31, 1.31, 1.14, 2.10, 0.07, 0.02, 0.04, 0.62, 0.07, 0.07, 0.07],
    },
    'Dataset 5': {
        'models': ['XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier', 'RandomForestClassifier', 'BaggingClassifier',
                   'LabelSpreading', 'PassiveAggressiveClassifier', 'DecisionTreeClassifier', 'LabelPropagation', 'NearestCentroid',
                   'ExtraTreesClassifier', 'GaussianNB', 'ExtraTreeClassifier', 'AdaBoostClassifier', 'QuadraticDiscriminantAnalysis',
                   'SVC', 'SGDClassifier', 'BernoulliNB', 'Perceptron', 'LogisticRegression', 'LinearDiscriminantAnalysis',
                   'LinearSVC', 'CalibratedClassifierCV', 'RidgeClassifierCV', 'RidgeClassifier', 'DummyClassifier'],
        'accuracies': [0.64, 0.65, 0.63, 0.63, 0.61, 0.62, 0.55, 0.57, 0.61, 0.49, 0.61, 0.59, 0.55, 0.58, 0.60, 0.63, 0.58,
                       0.58, 0.58, 0.45, 0.58, 0.58, 0.58, 0.58, 0.57],
        'time': [1.26, 0.20, 0.06, 0.35, 0.08, 0.27, 0.02, 0.02, 0.26, 0.02, 0.29, 0.01, 0.01, 0.15, 0.03, 0.21, 0.03,
                 0.02, 0.02, 0.03, 0.03, 0.05, 0.02, 0.04, 0.01],
    }
}

# Creating plots for accuracy and time: one panel per metric, with every
# dataset drawn as a semi-transparent bar series over the shared model axis.
fig, axes = plt.subplots(2, 1, figsize=(12, 12))

# (axis, key inside each dataset's results, panel title, y-axis label)
panels = [
    (axes[0], 'accuracies', 'Model Accuracy Comparison', 'Accuracy'),
    (axes[1], 'time', 'Model Time Comparison', 'Time (seconds)'),
]

for ax, metric_key, panel_title, y_label in panels:
    # `results` (not `data`) so the global DataFrame name `data` is not rebound
    for dataset_name, results in datasets.items():
        model_names = results['models']
        metric_values = results[metric_key]
        n_pairs = min(len(model_names), len(metric_values))
        if len(model_names) != len(metric_values):
            # bar() raises on unequal lengths; plot the common prefix so the
            # cell still runs, and say so loudly because the pairing between
            # names and values may be shifted until the data is corrected.
            print(f"Warning: {dataset_name} has {len(model_names)} models but "
                  f"{len(metric_values)} '{metric_key}' values; plotting only the first {n_pairs} pairs.")
        ax.bar(model_names[:n_pairs], metric_values[:n_pairs], label=dataset_name, alpha=0.7)
    ax.set_title(panel_title)
    ax.set_xlabel('Models')
    ax.set_ylabel(y_label)
    ax.tick_params(axis='x', rotation=90)
    ax.legend(title="Datasets", bbox_to_anchor=(1.05, 1), loc='upper left')

# Displaying the plots
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Hand-entered results per dataset: parallel lists where position i of
# 'accuracies' and 'time' is meant to belong to position i of 'models'.
# NOTE(review): several lists are NOT the same length as their 'models' list,
# so the pairing is broken and bar() on them raises a shape-mismatch error:
#   Dataset 1: 26 models, 25 accuracies, 26 times
#   Dataset 2: 25 models, 24 accuracies, 26 times
#   Dataset 3: 26 models, 28 accuracies, 26 times
#   Dataset 4: 18 models, 18 accuracies, 18 times (consistent)
#   Dataset 5: 26 models, 24 accuracies, 25 times
# The missing/extra values cannot be identified from this cell; re-check them
# against the original LazyClassifier output.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.16],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04],
    },
    'Dataset 4': {
        'models': ['ExtraTreesClassifier', 'XGBClassifier', 'LGBMClassifier', 'RandomForestClassifier', 'LabelPropagation',
                   'KNeighborsClassifier', 'LabelSpreading', 'BaggingClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                   'SVC', 'QuadraticDiscriminantAnalysis', 'GaussianNB', 'AdaBoostClassifier', 'NearestCentroid', 'LogisticRegression',
                   'LinearDiscriminantAnalysis', 'CalibratedClassifierCV'],
        'accuracies': [0.91, 0.89, 0.89, 0.89, 0.86, 0.87, 0.86, 0.86, 0.82, 0.81, 0.82, 0.73, 0.69, 0.65, 0.63, 0.71, 0.70, 0.70],
        'time': [2.98, 9.66, 4.62, 1.31, 2.44, 0.70, 1.50, 1.31, 1.31, 1.14, 2.10, 0.07, 0.02, 0.04, 0.62, 0.07, 0.07, 0.07],
    },
    'Dataset 5': {
        'models': ['XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier', 'RandomForestClassifier', 'BaggingClassifier',
                   'LabelSpreading', 'PassiveAggressiveClassifier', 'DecisionTreeClassifier', 'LabelPropagation', 'NearestCentroid',
                   'ExtraTreesClassifier', 'GaussianNB', 'ExtraTreeClassifier', 'AdaBoostClassifier', 'QuadraticDiscriminantAnalysis',
                   'SVC', 'SGDClassifier', 'BernoulliNB', 'Perceptron', 'LogisticRegression', 'LinearDiscriminantAnalysis',
                   'LinearSVC', 'CalibratedClassifierCV', 'RidgeClassifierCV', 'RidgeClassifier', 'DummyClassifier'],
        'accuracies': [0.64, 0.65, 0.63, 0.63, 0.61, 0.62, 0.55, 0.57, 0.61, 0.49, 0.61, 0.59, 0.55, 0.58, 0.60, 0.63, 0.58,
                       0.58, 0.58, 0.45, 0.58, 0.58, 0.58, 0.57],
        'time': [1.26, 0.20, 0.06, 0.35, 0.08, 0.27, 0.02, 0.13, 0.41, 0.32, 0.02, 0.31, 0.36, 0.03, 0.07, 0.06, 0.09, 0.19,
                 0.08, 0.11, 0.07, 0.01, 0.03, 0.07, 0.02],
    }
}

# Creating plots for accuracy and time: one panel per metric, with every
# dataset drawn as a semi-transparent bar series over the shared model axis.
# The length check that used to run separately (and only for accuracies) is
# done inside the loop for both metrics, and is acted on instead of only
# being printed.
fig, axes = plt.subplots(2, 1, figsize=(12, 12))

# (axis, key inside each dataset's results, panel title, y-axis label)
panels = [
    (axes[0], 'accuracies', 'Model Accuracy Comparison', 'Accuracy'),
    (axes[1], 'time', 'Model Time Comparison', 'Time (seconds)'),
]

for ax, metric_key, panel_title, y_label in panels:
    # `results` (not `data`) so the global DataFrame name `data` is not rebound
    for dataset_name, results in datasets.items():
        model_names = results['models']
        metric_values = results[metric_key]
        n_pairs = min(len(model_names), len(metric_values))
        if len(model_names) != len(metric_values):
            # bar() raises on unequal lengths; plot the common prefix so the
            # cell still runs, and say so loudly because the pairing between
            # names and values may be shifted until the data is corrected.
            print(f"Mismatch in dataset {dataset_name}: Models count = {len(model_names)}, "
                  f"'{metric_key}' count = {len(metric_values)}; plotting only the first {n_pairs} pairs.")
        ax.bar(model_names[:n_pairs], metric_values[:n_pairs], label=dataset_name, alpha=0.7)
    ax.set_title(panel_title)
    ax.set_xlabel('Models')
    ax.set_ylabel(y_label)
    ax.tick_params(axis='x', rotation=90)
    ax.legend(title="Datasets", bbox_to_anchor=(1.05, 1), loc='upper left')

# Displaying the plots
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset.
# Each entry maps a dataset label to three lists -- 'models', 'accuracies' and
# 'time' -- that the code in this cell reads position by position (index i of
# every list is treated as belonging to the same model).
# 'time' is presumably in seconds (the plot below labels it 'Time (s)').
# NOTE(review): in several datasets the three lists do not have the same length
# (e.g. Dataset 1 has 26 model names but 25 accuracies; Dataset 3 has 26 model
# names but 28 accuracies), so values are not guaranteed to line up with the
# model at the same position -- verify against the original results table.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.16],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04],
    },
    'Dataset 4': {
        'models': ['ExtraTreesClassifier', 'XGBClassifier', 'LGBMClassifier', 'RandomForestClassifier', 'LabelPropagation',
                   'KNeighborsClassifier', 'LabelSpreading', 'BaggingClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                   'SVC', 'QuadraticDiscriminantAnalysis', 'GaussianNB', 'AdaBoostClassifier', 'NearestCentroid', 'LogisticRegression',
                   'LinearDiscriminantAnalysis', 'CalibratedClassifierCV'],
        'accuracies': [0.91, 0.89, 0.89, 0.89, 0.86, 0.87, 0.86, 0.86, 0.82, 0.81, 0.82, 0.73, 0.69, 0.65, 0.63, 0.71, 0.70, 0.70],
        'time': [2.98, 9.66, 4.62, 1.31, 2.44, 0.70, 1.50, 1.31, 1.31, 1.14, 2.10, 0.07, 0.02, 0.04, 0.62, 0.07, 0.07, 0.07],
    },
    'Dataset 5': {
        'models': ['XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier', 'RandomForestClassifier', 'BaggingClassifier',
                   'LabelSpreading', 'PassiveAggressiveClassifier', 'DecisionTreeClassifier', 'LabelPropagation', 'NearestCentroid',
                   'ExtraTreesClassifier', 'GaussianNB', 'ExtraTreeClassifier', 'AdaBoostClassifier', 'QuadraticDiscriminantAnalysis',
                   'SVC', 'SGDClassifier', 'BernoulliNB', 'Perceptron', 'LogisticRegression', 'LinearDiscriminantAnalysis',
                   'LinearSVC', 'CalibratedClassifierCV', 'RidgeClassifierCV', 'RidgeClassifier', 'DummyClassifier'],
        'accuracies': [0.64, 0.65, 0.63, 0.63, 0.61, 0.62, 0.55, 0.57, 0.61, 0.49, 0.61, 0.59, 0.55, 0.58, 0.60, 0.63, 0.58,
                       0.58, 0.58, 0.45, 0.58, 0.58, 0.58, 0.58, 0.58, 0.57],
        'time': [1.26, 0.20, 0.06, 0.35, 0.08, 0.27, 0.02, 0.02, 0.26, 0.02, 0.29, 0.06, 0.06, 0.29, 0.16, 0.29, 0.02,
                 0.09, 0.16, 0.16, 0.30, 0.16, 0.29, 0.27, 0.13],
    }
}

# Function to filter each dataset's results down to the selected models
def filter_selected_models(datasets, selected_models):
    """Keep only the entries of each dataset whose model name is selected.

    Parameters
    ----------
    datasets : dict
        Maps a dataset label to a dict holding three sequences under the keys
        'models', 'accuracies' and 'time'. The sequences are read by position:
        index i of each one is treated as describing the same model.
    selected_models : container of str
        Model names to keep.

    Returns
    -------
    dict
        One entry per dataset label. Each value is a dict with NumPy arrays
        under 'models', 'accuracies' and 'time', restricted to the selected
        models and kept in the dataset's original order.

    Warns
    -----
    UserWarning
        When the three sequences of a dataset differ in length, because the
        positional pairing of names and values is then unreliable.

    Raises
    ------
    IndexError
        When a selected model sits at a position beyond the end of the
        dataset's 'accuracies' or 'time' sequence.
    """
    import warnings

    keys = ('models', 'accuracies', 'time')
    filtered_data = {}
    for dataset, data in datasets.items():
        lengths = {key: len(data[key]) for key in keys}
        if len(set(lengths.values())) > 1:
            # Do not abort: the selection below is unchanged, but the reader
            # must know the values may belong to a different model.
            warnings.warn(
                f"Dataset {dataset!r} has result lists of different lengths {lengths}; "
                "values may not correspond to the model at the same position.",
                stacklevel=2,
            )
        # Explicit integer dtype keeps an empty selection a valid index array.
        selected_indices = np.array(
            [i for i, model in enumerate(data['models']) if model in selected_models],
            dtype=int,
        )
        filtered_data[dataset] = {key: np.array(data[key])[selected_indices] for key in keys}
    return filtered_data

# Restrict every dataset to the selected models before plotting
filtered_data = filter_selected_models(datasets, selected_models)

# Two side-by-side panels: accuracy on the left, time on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))

# (axis, key in the filtered results, x-axis label, panel title)
panel_specs = (
    (ax1, 'accuracies', 'Accuracy', 'Top Models by Accuracy (Selected Models)'),
    (ax2, 'time', 'Time (s)', 'Top Models by Time Performance (Selected Models)'),
)

for panel, metric, axis_label, heading in panel_specs:
    # Every dataset is drawn on the same categorical y positions, so bars of
    # different datasets overlap; alpha keeps the lower layers visible.
    for dataset, data in filtered_data.items():
        panel.barh(data['models'], data[metric], label=dataset, alpha=0.7)
    panel.set_xlabel(axis_label)
    panel.set_title(heading)
    panel.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset.
# Each entry maps a dataset label to three lists -- 'models', 'accuracies' and
# 'time' -- that the code in this cell reads position by position (index i of
# every list is treated as belonging to the same model).
# 'time' is presumably in seconds (the plot below labels it 'Time (s)').
# NOTE(review): in several datasets the three lists do not have the same length
# (e.g. Dataset 1 has 26 model names but 25 accuracies; Dataset 3 has 26 model
# names but 28 accuracies), so values are not guaranteed to line up with the
# model at the same position -- verify against the original results table.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.16],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04],
    },
    'Dataset 4': {
        'models': ['ExtraTreesClassifier', 'XGBClassifier', 'LGBMClassifier', 'RandomForestClassifier', 'LabelPropagation',
                   'KNeighborsClassifier', 'LabelSpreading', 'BaggingClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                   'SVC', 'QuadraticDiscriminantAnalysis', 'GaussianNB', 'AdaBoostClassifier', 'NearestCentroid', 'LogisticRegression',
                   'LinearDiscriminantAnalysis', 'CalibratedClassifierCV'],
        'accuracies': [0.91, 0.89, 0.89, 0.89, 0.86, 0.87, 0.86, 0.86, 0.82, 0.81, 0.82, 0.73, 0.69, 0.65, 0.63, 0.71, 0.70, 0.70],
        'time': [2.98, 9.66, 4.62, 1.31, 2.44, 0.70, 1.50, 1.31, 1.31, 1.14, 2.10, 0.07, 0.02, 0.04, 0.62, 0.07, 0.07, 0.07],
    },
    'Dataset 5': {
        'models': ['XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier', 'RandomForestClassifier', 'BaggingClassifier',
                   'LabelSpreading', 'PassiveAggressiveClassifier', 'DecisionTreeClassifier', 'LabelPropagation', 'NearestCentroid',
                   'ExtraTreesClassifier', 'GaussianNB', 'ExtraTreeClassifier', 'AdaBoostClassifier', 'QuadraticDiscriminantAnalysis',
                   'SVC', 'SGDClassifier', 'BernoulliNB', 'Perceptron', 'LogisticRegression', 'LinearDiscriminantAnalysis',
                   'LinearSVC', 'CalibratedClassifierCV', 'RidgeClassifierCV', 'RidgeClassifier', 'DummyClassifier'],
        'accuracies': [0.64, 0.65, 0.63, 0.63, 0.61, 0.62, 0.55, 0.57, 0.61, 0.49, 0.61, 0.59, 0.55, 0.58, 0.60, 0.63, 0.58,
                       0.58, 0.58, 0.45, 0.58, 0.58, 0.58, 0.58, 0.57],
        'time': [1.26, 0.20, 0.06, 0.35, 0.08, 1.10, 0.21, 0.02, 0.31, 0.31, 0.49, 0.07, 0.03, 0.21, 0.23, 0.40, 0.50,
                 0.06, 0.05, 0.14, 0.05, 0.16, 0.30, 0.03, 0.08, 0.08],
    }
}

# Function to filter selected models
def filter_selected_models(datasets, selected_models):
    """Keep only the entries of each dataset whose model name is selected.

    Parameters
    ----------
    datasets : dict
        Maps a dataset label to a dict holding three sequences under the keys
        'models', 'accuracies' and 'time'. The sequences are read by position:
        index i of each one is treated as describing the same model.
    selected_models : container of str
        Model names to keep.

    Returns
    -------
    dict
        One entry per dataset label. Each value is a dict with NumPy arrays
        under 'models', 'accuracies' and 'time', restricted to the selected
        models and kept in the dataset's original order.

    Warns
    -----
    UserWarning
        When the three sequences of a dataset differ in length, because the
        positional pairing of names and values is then unreliable.

    Raises
    ------
    IndexError
        When a selected model sits at a position beyond the end of the
        dataset's 'accuracies' or 'time' sequence.
    """
    import warnings

    keys = ('models', 'accuracies', 'time')
    filtered_data = {}
    for dataset, data in datasets.items():
        lengths = {key: len(data[key]) for key in keys}
        if len(set(lengths.values())) > 1:
            # Do not abort: the selection below is unchanged, but the reader
            # must know the values may belong to a different model.
            warnings.warn(
                f"Dataset {dataset!r} has result lists of different lengths {lengths}; "
                "values may not correspond to the model at the same position.",
                stacklevel=2,
            )
        # Explicit integer dtype keeps an empty selection a valid index array.
        selected_indices = np.array(
            [i for i, model in enumerate(data['models']) if model in selected_models],
            dtype=int,
        )
        filtered_data[dataset] = {key: np.array(data[key])[selected_indices] for key in keys}
    return filtered_data

# Restrict every dataset to the selected models before plotting
filtered_data = filter_selected_models(datasets, selected_models)

# Two side-by-side panels: accuracy on the left, time on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

# (axis, key in the filtered results, x-axis label, panel title)
panel_specs = (
    (ax1, 'accuracies', 'Accuracy', 'Top Models by Accuracy (Selected Models)'),
    (ax2, 'time', 'Time (s)', 'Top Models by Time Performance (Selected Models)'),
)

for panel, metric, axis_label, heading in panel_specs:
    # Every dataset is drawn on the same categorical y positions, so bars of
    # different datasets overlap; alpha keeps the lower layers visible.
    for dataset, data in filtered_data.items():
        panel.barh(data['models'], data[metric], label=dataset, alpha=0.7)
    panel.set_xlabel(axis_label)
    panel.set_title(heading)
    panel.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset.
# Each entry maps a dataset label to three lists -- 'models', 'accuracies' and
# 'time' -- that the code in this cell reads position by position (index i of
# every list is treated as belonging to the same model).
# 'time' is presumably in seconds (the plot below labels it 'Time (s)').
# NOTE(review): in several datasets the three lists do not have the same length
# (e.g. Dataset 1 has 26 model names but 25 accuracies; Dataset 3 has 26 model
# names but 28 accuracies), so values are not guaranteed to line up with the
# model at the same position -- verify against the original results table.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.16],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04],
    },
    'Dataset 4': {
        'models': ['ExtraTreesClassifier', 'XGBClassifier', 'LGBMClassifier', 'RandomForestClassifier', 'LabelPropagation',
                   'KNeighborsClassifier', 'LabelSpreading', 'BaggingClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                   'SVC', 'QuadraticDiscriminantAnalysis', 'GaussianNB', 'AdaBoostClassifier', 'NearestCentroid', 'LogisticRegression',
                   'LinearDiscriminantAnalysis', 'CalibratedClassifierCV'],
        'accuracies': [0.91, 0.89, 0.89, 0.89, 0.86, 0.87, 0.86, 0.86, 0.82, 0.81, 0.82, 0.73, 0.69, 0.65, 0.63, 0.71, 0.70, 0.70],
        'time': [2.98, 9.66, 4.62, 1.31, 2.44, 0.70, 1.50, 1.31, 1.31, 1.14, 2.10, 0.07, 0.02, 0.04, 0.62, 0.07, 0.07, 0.07],
    },
    'Dataset 5': {
        'models': ['XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier', 'RandomForestClassifier', 'BaggingClassifier',
                   'LabelSpreading', 'PassiveAggressiveClassifier', 'DecisionTreeClassifier', 'LabelPropagation', 'NearestCentroid',
                   'ExtraTreesClassifier', 'GaussianNB', 'ExtraTreeClassifier', 'AdaBoostClassifier', 'QuadraticDiscriminantAnalysis',
                   'SVC', 'SGDClassifier', 'BernoulliNB', 'Perceptron', 'LogisticRegression', 'LinearDiscriminantAnalysis',
                   'LinearSVC', 'CalibratedClassifierCV', 'RidgeClassifierCV', 'RidgeClassifier', 'DummyClassifier'],
        'accuracies': [0.64, 0.65, 0.63, 0.63, 0.61, 0.62, 0.55, 0.57, 0.61, 0.49, 0.61, 0.59, 0.55, 0.58, 0.60, 0.63, 0.58,
                       0.58, 0.58, 0.45, 0.58, 0.58, 0.58, 0.58, 0.57],
        'time': [3.10, 1.97, 2.44, 2.50, 0.33, 4.44, 2.44, 2.44, 2.10, 0.09, 0.05, 0.04, 1.12, 0.03, 0.05, 0.06, 0.07, 0.12,
                 0.03, 0.19, 0.17, 0.10, 0.07, 0.10, 0.15],
    }
}

# Function to filter selected models
def filter_selected_models(datasets, selected_models):
    """Keep only the entries of each dataset whose model name is selected.

    Parameters
    ----------
    datasets : dict
        Maps a dataset label to a dict holding three sequences under the keys
        'models', 'accuracies' and 'time'. The sequences are read by position:
        index i of each one is treated as describing the same model.
    selected_models : container of str
        Model names to keep.

    Returns
    -------
    dict
        One entry per dataset label. Each value is a dict with NumPy arrays
        under 'models', 'accuracies' and 'time', restricted to the selected
        models and kept in the dataset's original order.

    Warns
    -----
    UserWarning
        When the three sequences of a dataset differ in length, because the
        positional pairing of names and values is then unreliable.

    Raises
    ------
    IndexError
        When a selected model sits at a position beyond the end of the
        dataset's 'accuracies' or 'time' sequence.
    """
    import warnings

    keys = ('models', 'accuracies', 'time')
    filtered_data = {}
    for dataset, data in datasets.items():
        lengths = {key: len(data[key]) for key in keys}
        if len(set(lengths.values())) > 1:
            # Do not abort: the selection below is unchanged, but the reader
            # must know the values may belong to a different model.
            warnings.warn(
                f"Dataset {dataset!r} has result lists of different lengths {lengths}; "
                "values may not correspond to the model at the same position.",
                stacklevel=2,
            )
        # Explicit integer dtype keeps an empty selection a valid index array.
        selected_indices = np.array(
            [i for i, model in enumerate(data['models']) if model in selected_models],
            dtype=int,
        )
        filtered_data[dataset] = {key: np.array(data[key])[selected_indices] for key in keys}
    return filtered_data

# Get the filtered data for the selected models
filtered_data = filter_selected_models(datasets, selected_models)

# Plotting accuracy and time performance for selected models
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))

# Grouped layout: every selected model owns one slot on the x axis and the
# datasets sit side by side inside it. Passing the model names straight to
# Axes.bar would put every dataset at the same categorical x, so the narrow
# bars would be drawn on top of each other.
bar_width = 0.15
slot_positions = np.arange(len(selected_models))
slot_of = {model: slot for slot, model in enumerate(selected_models)}
# Shift that centres the whole group of dataset bars on its slot
group_shift = bar_width * (len(filtered_data) - 1) / 2

# (axis, key in the filtered results, y-axis label, panel title)
panel_specs = (
    (ax1, 'accuracies', 'Accuracy', 'Top Models by Accuracy (Selected Models)'),
    (ax2, 'time', 'Time (s)', 'Top Models by Time Performance (Selected Models)'),
)

for ax, key, ylabel, title in panel_specs:
    for i, (dataset, results) in enumerate(filtered_data.items()):
        # Datasets list their models in different orders (and may lack some),
        # so each bar is placed by model name rather than by list position.
        x = np.array([slot_of[model] for model in results['models']], dtype=float)
        ax.bar(x + i * bar_width - group_shift, results[key], width=bar_width, align='center', label=dataset)
    ax.set_xticks(slot_positions)
    ax.set_xticklabels(selected_models, rotation=45, ha='right')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset.
# Each entry maps a dataset label to three lists -- 'models', 'accuracies' and
# 'time' -- that the code in this cell reads position by position (index i of
# every list is treated as belonging to the same model).
# 'time' is presumably in seconds (the plot below labels it 'Time (s)').
# NOTE(review): Dataset 1 has 26 model names and 26 times but only 25
# accuracies, so accuracies are not guaranteed to line up with the model at the
# same position -- verify against the original results table.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    # Add other datasets here...
}

# Function to filter selected models
def filter_selected_models(datasets, selected_models):
    """Keep only the entries of each dataset whose model name is selected.

    Parameters
    ----------
    datasets : dict
        Maps a dataset label to a dict holding three sequences under the keys
        'models', 'accuracies' and 'time'. The sequences are read by position:
        index i of each one is treated as describing the same model.
    selected_models : container of str
        Model names to keep.

    Returns
    -------
    dict
        One entry per dataset label. Each value is a dict with NumPy arrays
        under 'models', 'accuracies' and 'time', restricted to the selected
        models and kept in the dataset's original order.

    Warns
    -----
    UserWarning
        When the three sequences of a dataset differ in length, because the
        positional pairing of names and values is then unreliable.

    Raises
    ------
    IndexError
        When a selected model sits at a position beyond the end of the
        dataset's 'accuracies' or 'time' sequence.
    """
    import warnings

    keys = ('models', 'accuracies', 'time')
    filtered_data = {}
    for dataset, data in datasets.items():
        lengths = {key: len(data[key]) for key in keys}
        if len(set(lengths.values())) > 1:
            # Do not abort: the selection below is unchanged, but the reader
            # must know the values may belong to a different model.
            warnings.warn(
                f"Dataset {dataset!r} has result lists of different lengths {lengths}; "
                "values may not correspond to the model at the same position.",
                stacklevel=2,
            )
        # Explicit integer dtype keeps an empty selection a valid index array.
        selected_indices = np.array(
            [i for i, model in enumerate(data['models']) if model in selected_models],
            dtype=int,
        )
        filtered_data[dataset] = {key: np.array(data[key])[selected_indices] for key in keys}
    return filtered_data

# Restrict every dataset to the selected models before plotting
filtered_data = filter_selected_models(datasets, selected_models)

# Two side-by-side panels: accuracy on the left, time on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

# Bar thickness and the base y position of each model group.
# The groups (and their tick labels) follow Dataset 1's filtered model order.
bar_width = 0.15
index = np.arange(len(filtered_data['Dataset 1']['models']))

# (axis, key in the filtered results, x-axis label, panel title)
panel_specs = (
    (ax1, 'accuracies', 'Accuracy', 'Top Models by Accuracy (Selected Models)'),
    (ax2, 'time', 'Time (s)', 'Top Models by Time Performance (Selected Models)'),
)

for panel, metric, axis_label, heading in panel_specs:
    # Dataset number i is shifted by i bar widths inside every group
    for i, (dataset, data) in enumerate(filtered_data.items()):
        panel.barh(index + i * bar_width, data[metric], bar_width, label=dataset, alpha=0.7)
    # Ticks sit at the middle of each group of dataset bars
    panel.set_yticks(index + bar_width * (len(filtered_data) - 1) / 2)
    panel.set_yticklabels(filtered_data['Dataset 1']['models'])
    panel.set_xlabel(axis_label)
    panel.set_title(heading)
    panel.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset.
# Each entry maps a dataset label to three lists: 'models', 'accuracies' and
# 'time'.
# NOTE(review): every dataset lists more models (18 to 26) than the 10 names in
# selected_models, and in several datasets the three lists do not have the same
# length (e.g. Dataset 1 has 26 model names but 25 accuracies; Dataset 3 has 26
# model names but 28 accuracies), so values are not guaranteed to line up with
# the model at the same position -- verify against the original results table.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.16],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04],
    },
    'Dataset 4': {
        'models': ['ExtraTreesClassifier', 'XGBClassifier', 'LGBMClassifier', 'RandomForestClassifier', 'LabelPropagation',
                   'KNeighborsClassifier', 'LabelSpreading', 'BaggingClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                   'SVC', 'QuadraticDiscriminantAnalysis', 'GaussianNB', 'AdaBoostClassifier', 'NearestCentroid', 'LogisticRegression',
                   'LinearDiscriminantAnalysis', 'CalibratedClassifierCV'],
        'accuracies': [0.91, 0.89, 0.89, 0.89, 0.86, 0.87, 0.86, 0.86, 0.82, 0.81, 0.82, 0.73, 0.69, 0.65, 0.63, 0.71, 0.70, 0.70],
        'time': [2.98, 9.66, 4.62, 1.31, 2.44, 0.70, 1.50, 1.31, 1.31, 1.14, 2.10, 0.07, 0.02, 0.04, 0.62, 0.07, 0.07, 0.07],
    },
    'Dataset 5': {
        'models': ['XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier', 'RandomForestClassifier', 'BaggingClassifier',
                   'LabelSpreading', 'PassiveAggressiveClassifier', 'DecisionTreeClassifier', 'LabelPropagation', 'NearestCentroid',
                   'ExtraTreesClassifier', 'GaussianNB', 'ExtraTreeClassifier', 'AdaBoostClassifier', 'QuadraticDiscriminantAnalysis',
                   'SVC', 'SGDClassifier', 'BernoulliNB', 'Perceptron', 'LogisticRegression', 'LinearDiscriminantAnalysis',
                   'LinearSVC', 'CalibratedClassifierCV', 'RidgeClassifierCV', 'RidgeClassifier', 'DummyClassifier'],
        'accuracies': [0.64, 0.65, 0.63, 0.63, 0.61, 0.62, 0.55, 0.57, 0.61, 0.49, 0.61, 0.59, 0.55, 0.58, 0.60, 0.63, 0.58,
                       0.58, 0.58, 0.45, 0.58, 0.58, 0.58, 0.58, 0.57],
        'time': [1.00, 2.47, 1.50, 1.10, 0.53, 0.37, 0.02, 1.01, 0.07, 0.04, 0.13, 0.04, 0.09, 0.06, 0.01, 0.18, 0.13,
                 0.06, 0.17, 0.09, 0.03, 0.07, 0.04, 0.08, 0.14],
    }
}

# Set positions for bars: one group per selected model, and inside each group
# one bar_width-wide slot per dataset.
bar_width = 0.15
index = np.arange(len(selected_models))

fig, ax = plt.subplots(figsize=(14, 8))

# Plot accuracies and times for each dataset and model.
# The raw result lists hold every model that was evaluated, not just the
# selected ones, so they cannot be passed to ax.bar against `index` directly
# (the lengths differ and Axes.bar raises ValueError). Values are looked up by
# model name instead.
for i, (dataset, results) in enumerate(datasets.items()):
    # zip() pairs each value with the model name at the same position and
    # stops at the shorter list, so unpaired trailing entries are ignored.
    accuracy_of = dict(zip(results['models'], results['accuracies']))
    time_of = dict(zip(results['models'], results['time']))
    slot_centre = index + i * bar_width

    # Each dataset slot is split in two half-width bars: accuracy on the left,
    # time on the right, so the two series no longer overlap.
    for lookup, shift, series in ((accuracy_of, -bar_width / 4, 'Accuracy'),
                                  (time_of, bar_width / 4, 'Time')):
        # Only draw the selected models this dataset actually reports
        present = [k for k, model in enumerate(selected_models) if model in lookup]
        heights = [lookup[selected_models[k]] for k in present]
        ax.bar(slot_centre[present] + shift, heights, bar_width / 2, label=f'{dataset} {series}')

# Add labels, title, and legend
ax.set_xlabel('Models')
ax.set_ylabel('Scores')
ax.set_title('Model Performance across Datasets')
# Centre each tick under its group of dataset slots
ax.set_xticks(index + bar_width * (len(datasets) - 1) / 2)
ax.set_xticklabels(selected_models, rotation=45)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot; this list is also used as the x-axis tick labels below.
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset.
# Every entry holds three lists: 'models' (classifier names), 'accuracies' and 'time'.
# The plotting code below pairs a value with a model by list position.
# NOTE(review): the three lists are not always the same length -- e.g. 'Dataset 1' has
# 26 models but 25 accuracies, and 'Dataset 5' has 26 models but only 18 accuracies and
# 18 times -- so positional lookups can be misaligned or out of range. Verify these
# numbers against the original benchmark output.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.16],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04],
    },
    'Dataset 4': {
        'models': ['ExtraTreesClassifier', 'XGBClassifier', 'LGBMClassifier', 'RandomForestClassifier', 'LabelPropagation',
                   'KNeighborsClassifier', 'LabelSpreading', 'BaggingClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                   'SVC', 'QuadraticDiscriminantAnalysis', 'GaussianNB', 'AdaBoostClassifier', 'NearestCentroid', 'LogisticRegression',
                   'LinearDiscriminantAnalysis', 'CalibratedClassifierCV'],
        'accuracies': [0.91, 0.89, 0.89, 0.89, 0.86, 0.87, 0.86, 0.86, 0.82, 0.81, 0.82, 0.73, 0.69, 0.65, 0.63, 0.71, 0.70, 0.70],
        'time': [2.98, 9.66, 4.62, 1.31, 2.44, 0.70, 1.50, 1.31, 1.31, 1.14, 2.10, 0.07, 0.02, 0.04, 0.62, 0.07, 0.07, 0.07],
    },
    'Dataset 5': {
        'models': ['XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier', 'RandomForestClassifier', 'BaggingClassifier',
                   'LabelSpreading', 'PassiveAggressiveClassifier', 'DecisionTreeClassifier', 'LabelPropagation', 'NearestCentroid',
                   'ExtraTreesClassifier', 'GaussianNB', 'ExtraTreeClassifier', 'AdaBoostClassifier', 'QuadraticDiscriminantAnalysis',
                   'SVC', 'SGDClassifier', 'BernoulliNB', 'Perceptron', 'LogisticRegression', 'LinearDiscriminantAnalysis',
                   'LinearSVC', 'CalibratedClassifierCV', 'RidgeClassifierCV', 'RidgeClassifier', 'DummyClassifier'],
        'accuracies': [0.64, 0.65, 0.63, 0.63, 0.62, 0.61, 0.60, 0.60, 0.60, 0.59, 0.59, 0.58, 0.57, 0.56, 0.55, 0.53, 0.53, 0.52],
        'time': [0.16, 2.70, 1.21, 0.43, 2.29, 0.39, 0.37, 1.57, 2.48, 2.65, 0.33, 0.57, 0.12, 0.37, 0.57, 0.08, 0.09, 0.11],
    }
}

# Filter the accuracies and times to only include the selected models
bar_width = 0.15  # horizontal step between consecutive datasets inside one model group
index = np.arange(len(selected_models))

fig, ax = plt.subplots(figsize=(14, 8))

# Plot accuracies and times for each dataset and model
for i, dataset in enumerate(datasets.keys()):
    entry = datasets[dataset]
    # Look values up by model name rather than with list.index(): not every dataset
    # contains every selected model, and some value lists are shorter than 'models'.
    # zip() stops at the shorter list, so anything without a recorded value becomes
    # NaN (no bar drawn) instead of raising ValueError/IndexError.
    accuracy_by_model = dict(zip(entry['models'], entry['accuracies']))
    time_by_model = dict(zip(entry['models'], entry['time']))
    accuracies = [accuracy_by_model.get(model, np.nan) for model in selected_models]
    times = [time_by_model.get(model, np.nan) for model in selected_models]

    # Each bar is half a step wide so the accuracy/time pair tiles the step without overlap.
    ax.bar(index + i * bar_width, accuracies, bar_width / 2, label=f'{dataset} Accuracy')
    ax.bar(index + i * bar_width + bar_width / 2, times, bar_width / 2, label=f'{dataset} Time')

# Add labels, title, and legend
ax.set_xlabel('Models')
ax.set_ylabel('Scores')
ax.set_title('Model Performance across Datasets')
# Centre each tick under its whole group of bars.
ax.set_xticks(index + (len(datasets) - 1) * bar_width / 2 + bar_width / 4)
ax.set_xticklabels(selected_models, rotation=45)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot; this list is also used as the x-axis tick labels below.
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset.
# Every entry holds three lists: 'models' (classifier names), 'accuracies' and 'time'.
# The plotting code below pairs a value with a model by list position.
# NOTE(review): the three lists are not always the same length -- e.g. 'Dataset 1' has
# 26 models but 25 accuracies, and 'Dataset 5' has 26 models but only 18 accuracies and
# 18 times -- so positional lookups can be misaligned or out of range. Verify these
# numbers against the original benchmark output.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.16],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04],
    },
    'Dataset 4': {
        'models': ['ExtraTreesClassifier', 'XGBClassifier', 'LGBMClassifier', 'RandomForestClassifier', 'LabelPropagation',
                   'KNeighborsClassifier', 'LabelSpreading', 'BaggingClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                   'SVC', 'QuadraticDiscriminantAnalysis', 'GaussianNB', 'AdaBoostClassifier', 'NearestCentroid', 'LogisticRegression',
                   'LinearDiscriminantAnalysis', 'CalibratedClassifierCV'],
        'accuracies': [0.91, 0.89, 0.89, 0.89, 0.86, 0.87, 0.86, 0.86, 0.82, 0.81, 0.82, 0.73, 0.69, 0.65, 0.63, 0.71, 0.70, 0.70],
        'time': [2.98, 9.66, 4.62, 1.31, 2.44, 0.70, 1.50, 1.31, 1.31, 1.14, 2.10, 0.07, 0.02, 0.04, 0.62, 0.07, 0.07, 0.07],
    },
    'Dataset 5': {
        'models': ['XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier', 'RandomForestClassifier', 'BaggingClassifier',
                   'LabelSpreading', 'PassiveAggressiveClassifier', 'DecisionTreeClassifier', 'LabelPropagation', 'NearestCentroid',
                   'ExtraTreesClassifier', 'GaussianNB', 'ExtraTreeClassifier', 'AdaBoostClassifier', 'QuadraticDiscriminantAnalysis',
                   'SVC', 'SGDClassifier', 'BernoulliNB', 'Perceptron', 'LogisticRegression', 'LinearDiscriminantAnalysis',
                   'LinearSVC', 'CalibratedClassifierCV', 'RidgeClassifierCV', 'RidgeClassifier', 'DummyClassifier'],
        'accuracies': [0.64, 0.65, 0.63, 0.63, 0.62, 0.61, 0.60, 0.60, 0.60, 0.59, 0.59, 0.58, 0.57, 0.56, 0.55, 0.53, 0.53, 0.52],
        'time': [0.16, 2.70, 1.21, 0.43, 2.29, 0.39, 0.37, 1.57, 2.48, 2.65, 0.33, 0.57, 0.12, 0.37, 0.57, 0.08, 0.09, 0.11],
    }
}

# Filter the accuracies and times to only include the selected models
bar_width = 0.15  # horizontal step between consecutive datasets inside one model group
index = np.arange(len(selected_models))

fig, ax = plt.subplots(figsize=(14, 8))

# Plot accuracies and times for each dataset and model
for i, dataset in enumerate(datasets.keys()):
    entry = datasets[dataset]
    # Keep one slot per selected model so every dataset lines up with `index`.
    # Dropping the models a dataset lacks would shorten the lists and make ax.bar fail
    # with a shape mismatch; instead a missing model (or one whose value was never
    # recorded -- zip() stops at the shorter list) becomes NaN and simply draws no bar.
    accuracy_by_model = dict(zip(entry['models'], entry['accuracies']))
    time_by_model = dict(zip(entry['models'], entry['time']))
    accuracies = [accuracy_by_model.get(model, np.nan) for model in selected_models]
    times = [time_by_model.get(model, np.nan) for model in selected_models]

    # Plot bars; each is half a step wide so the accuracy/time pair does not overlap.
    ax.bar(index + i * bar_width, accuracies, bar_width / 2, label=f'{dataset} Accuracy')
    ax.bar(index + i * bar_width + bar_width / 2, times, bar_width / 2, label=f'{dataset} Time')

# Add labels, title, and legend
ax.set_xlabel('Models')
ax.set_ylabel('Scores')
ax.set_title('Model Performance across Datasets')
# Centre each tick under its whole group of bars.
ax.set_xticks(index + (len(datasets) - 1) * bar_width / 2 + bar_width / 4)
ax.set_xticklabels(selected_models, rotation=45)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot; this list is also used as the x-axis tick labels below.
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset.
# Every entry holds three lists: 'models' (classifier names), 'accuracies' and 'time'.
# The plotting code below pairs a value with a model by list position.
# NOTE(review): the three lists are not always the same length -- e.g. 'Dataset 1' has
# 26 models but 25 accuracies, and 'Dataset 5' has 26 models but only 18 accuracies and
# 18 times -- so positional lookups can be misaligned or out of range. Verify these
# numbers against the original benchmark output.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.16],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04],
    },
    'Dataset 4': {
        'models': ['ExtraTreesClassifier', 'XGBClassifier', 'LGBMClassifier', 'RandomForestClassifier', 'LabelPropagation',
                   'KNeighborsClassifier', 'LabelSpreading', 'BaggingClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                   'SVC', 'QuadraticDiscriminantAnalysis', 'GaussianNB', 'AdaBoostClassifier', 'NearestCentroid', 'LogisticRegression',
                   'LinearDiscriminantAnalysis', 'CalibratedClassifierCV'],
        'accuracies': [0.91, 0.89, 0.89, 0.89, 0.86, 0.87, 0.86, 0.86, 0.82, 0.81, 0.82, 0.73, 0.69, 0.65, 0.63, 0.71, 0.70, 0.70],
        'time': [2.98, 9.66, 4.62, 1.31, 2.44, 0.70, 1.50, 1.31, 1.31, 1.14, 2.10, 0.07, 0.02, 0.04, 0.62, 0.07, 0.07, 0.07],
    },
    'Dataset 5': {
        'models': ['XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier', 'RandomForestClassifier', 'BaggingClassifier',
                   'LabelSpreading', 'PassiveAggressiveClassifier', 'DecisionTreeClassifier', 'LabelPropagation', 'NearestCentroid',
                   'ExtraTreesClassifier', 'GaussianNB', 'ExtraTreeClassifier', 'AdaBoostClassifier', 'QuadraticDiscriminantAnalysis',
                   'SVC', 'SGDClassifier', 'BernoulliNB', 'Perceptron', 'LogisticRegression', 'LinearDiscriminantAnalysis',
                   'LinearSVC', 'CalibratedClassifierCV', 'RidgeClassifierCV', 'RidgeClassifier', 'DummyClassifier'],
        'accuracies': [0.64, 0.65, 0.63, 0.63, 0.62, 0.61, 0.60, 0.60, 0.60, 0.59, 0.59, 0.58, 0.57, 0.56, 0.55, 0.53, 0.53, 0.52],
        'time': [0.16, 2.70, 1.21, 0.43, 2.29, 0.39, 0.37, 1.57, 2.48, 2.65, 0.33, 0.57, 0.12, 0.37, 0.57, 0.08, 0.09, 0.11],
    }
}

# Filter the accuracies and times to only include the selected models
bar_width = 0.15  # horizontal step between consecutive datasets inside one model group
index = np.arange(len(selected_models))

fig, ax = plt.subplots(figsize=(14, 8))

# Plot accuracies and times for each dataset and model
for i, dataset in enumerate(datasets.keys()):
    entry = datasets[dataset]
    # Keep one slot per selected model so every dataset lines up with `index`.
    # Dropping the models a dataset lacks would shorten the lists and make ax.bar fail
    # with a shape mismatch; instead a missing model (or one whose value was never
    # recorded -- zip() stops at the shorter list) becomes NaN and simply draws no bar.
    accuracy_by_model = dict(zip(entry['models'], entry['accuracies']))
    time_by_model = dict(zip(entry['models'], entry['time']))
    accuracies = [accuracy_by_model.get(model, np.nan) for model in selected_models]
    times = [time_by_model.get(model, np.nan) for model in selected_models]

    # Plot bars; each is half a step wide so the accuracy/time pair does not overlap.
    ax.bar(index + i * bar_width, accuracies, bar_width / 2, label=f'{dataset} Accuracy')
    ax.bar(index + i * bar_width + bar_width / 2, times, bar_width / 2, label=f'{dataset} Time')

# Add labels, title, and legend
ax.set_xlabel('Models')
ax.set_ylabel('Scores')
ax.set_title('Model Performance across Datasets')
# Centre each tick under its whole group of bars.
ax.set_xticks(index + (len(datasets) - 1) * bar_width / 2 + bar_width / 4)
ax.set_xticklabels(selected_models, rotation=45)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset.
# Every entry holds three lists: 'models' (classifier names), 'accuracies' and 'time'.
# NOTE(review): the three lists are not always the same length -- e.g. 'Dataset 1' has
# 26 models but 25 accuracies, and 'Dataset 5' has 26 models but only 18 accuracies and
# 18 times -- so positional lookups can be misaligned or out of range. Verify these
# numbers against the original benchmark output.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.16],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04],
    },
    'Dataset 4': {
        'models': ['ExtraTreesClassifier', 'XGBClassifier', 'LGBMClassifier', 'RandomForestClassifier', 'LabelPropagation',
                   'KNeighborsClassifier', 'LabelSpreading', 'BaggingClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                   'SVC', 'QuadraticDiscriminantAnalysis', 'GaussianNB', 'AdaBoostClassifier', 'NearestCentroid', 'LogisticRegression',
                   'LinearDiscriminantAnalysis', 'CalibratedClassifierCV'],
        'accuracies': [0.91, 0.89, 0.89, 0.89, 0.86, 0.87, 0.86, 0.86, 0.82, 0.81, 0.82, 0.73, 0.69, 0.65, 0.63, 0.71, 0.70, 0.70],
        'time': [2.98, 9.66, 4.62, 1.31, 2.44, 0.70, 1.50, 1.31, 1.31, 1.14, 2.10, 0.07, 0.02, 0.04, 0.62, 0.07, 0.07, 0.07],
    },
    'Dataset 5': {
        'models': ['XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier', 'RandomForestClassifier', 'BaggingClassifier',
                   'LabelSpreading', 'PassiveAggressiveClassifier', 'DecisionTreeClassifier', 'LabelPropagation', 'NearestCentroid',
                   'ExtraTreesClassifier', 'GaussianNB', 'ExtraTreeClassifier', 'AdaBoostClassifier', 'QuadraticDiscriminantAnalysis',
                   'SVC', 'SGDClassifier', 'BernoulliNB', 'Perceptron', 'LogisticRegression', 'LinearDiscriminantAnalysis',
                   'LinearSVC', 'CalibratedClassifierCV', 'RidgeClassifierCV', 'RidgeClassifier', 'DummyClassifier'],
        'accuracies': [0.64, 0.65, 0.63, 0.63, 0.62, 0.61, 0.60, 0.60, 0.60, 0.59, 0.59, 0.58, 0.57, 0.56, 0.55, 0.53, 0.53, 0.52],
        'time': [0.16, 2.70, 1.21, 0.43, 2.29, 0.39, 0.37, 1.57, 2.48, 2.65, 0.33, 0.57, 0.12, 0.37, 0.57, 0.08, 0.09, 0.11],
    },
}

# Sorting models based on the selected models
def filter_selected_models(datasets, selected_models):
    """Extract the results of ``selected_models`` from every dataset, in one common order.

    Args:
        datasets: Mapping of dataset name -> dict with the sequences 'models',
            'accuracies' and 'time', where a value belongs to the model at the same
            position in 'models'.
        selected_models: Model names to keep. Their order defines the order of every
            returned array, so results from different datasets line up element-wise.

    Returns:
        dict mapping dataset name -> {'models', 'accuracies', 'time'}, each a NumPy
        array of length ``len(selected_models)``. A model the dataset does not list,
        or whose position lies beyond the end of a (shorter) value list, is reported
        as ``np.nan`` instead of raising IndexError or shifting the other entries.
    """
    wanted = list(selected_models)
    filtered_data = {}
    for dataset, data in datasets.items():
        # Position of each model in this dataset; the first occurrence wins.
        position = {}
        for i, model in enumerate(data['models']):
            position.setdefault(model, i)

        def pick(values, position=position):
            """Return ``values`` re-ordered to follow ``wanted`` (NaN where unavailable)."""
            return np.array(
                [values[position[m]] if m in position and position[m] < len(values) else np.nan
                 for m in wanted],
                dtype=float,
            )

        filtered_data[dataset] = {
            'models': np.array(wanted, dtype=str),
            'accuracies': pick(data['accuracies']),
            'time': pick(data['time']),
        }
    return filtered_data

# Get the filtered data for the selected models
filtered_data = filter_selected_models(datasets, selected_models)

# Plotting accuracy and time performance for selected models
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 6))

bar_width = 0.15  # Width of the bars
row_positions = np.arange(len(selected_models))  # one row per selected model

# (axes, key in filtered_data, x-axis label, title) for the two panels
panels = (
    (ax1, 'accuracies', 'Accuracy', 'Top Models by Accuracy (Selected Models)'),
    (ax2, 'time', 'Time (s)', 'Top Models by Time Performance (Selected Models)'),
)

for axis, key, xlabel, title in panels:
    for i, (dataset, data) in enumerate(filtered_data.items()):
        # Match values to rows by model name, not by position: the filtered arrays of
        # different datasets need not share an order or a length, so positional
        # plotting would put bars next to the wrong label. Absent models become NaN.
        value_by_model = dict(zip(data['models'], data[key]))
        values = [value_by_model.get(model, np.nan) for model in selected_models]

        # Spread the datasets symmetrically around the row's tick.
        positions = row_positions + (i - (len(filtered_data) - 1) / 2) * bar_width
        axis.barh(positions, values, height=bar_width, label=dataset, alpha=0.7)

    axis.set_yticks(row_positions)
    axis.set_yticklabels(selected_models)
    axis.set_xlabel(xlabel)
    axis.set_title(title)
    axis.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot; this list is also used as the x-axis tick labels below.
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset.
# Every entry holds three lists: 'models' (classifier names), 'accuracies' and 'time'.
# The plotting code below passes 'accuracies' and 'time' to ax.bar as-is and never
# consults 'models'.
# NOTE(review): the three lists are not always the same length -- e.g. 'Dataset 1' has
# 26 models but 25 accuracies, and 'Dataset 5' has 26 models but only 18 accuracies and
# 18 times -- and none of them has len(selected_models) entries. Verify these numbers
# against the original benchmark output.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.16],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04],
    },
    'Dataset 4': {
        'models': ['ExtraTreesClassifier', 'XGBClassifier', 'LGBMClassifier', 'RandomForestClassifier', 'LabelPropagation',
                   'KNeighborsClassifier', 'LabelSpreading', 'BaggingClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier',
                   'SVC', 'QuadraticDiscriminantAnalysis', 'GaussianNB', 'AdaBoostClassifier', 'NearestCentroid', 'LogisticRegression',
                   'LinearDiscriminantAnalysis', 'CalibratedClassifierCV'],
        'accuracies': [0.91, 0.89, 0.89, 0.89, 0.86, 0.87, 0.86, 0.86, 0.82, 0.81, 0.82, 0.73, 0.69, 0.65, 0.63, 0.71, 0.70, 0.70],
        'time': [2.98, 9.66, 4.62, 1.31, 2.44, 0.70, 1.50, 1.31, 1.31, 1.14, 2.10, 0.07, 0.02, 0.04, 0.62, 0.07, 0.07, 0.07],
    },
    'Dataset 5': {
        'models': ['XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier', 'RandomForestClassifier', 'BaggingClassifier',
                   'LabelSpreading', 'PassiveAggressiveClassifier', 'DecisionTreeClassifier', 'LabelPropagation', 'NearestCentroid',
                   'ExtraTreesClassifier', 'GaussianNB', 'ExtraTreeClassifier', 'AdaBoostClassifier', 'QuadraticDiscriminantAnalysis',
                   'SVC', 'SGDClassifier', 'BernoulliNB', 'Perceptron', 'LogisticRegression', 'LinearDiscriminantAnalysis',
                   'LinearSVC', 'CalibratedClassifierCV', 'RidgeClassifierCV', 'RidgeClassifier', 'DummyClassifier'],
        'accuracies': [0.64, 0.65, 0.63, 0.63, 0.62, 0.61, 0.60, 0.60, 0.60, 0.59, 0.59, 0.58, 0.57, 0.56, 0.55, 0.53, 0.53, 0.52],
        'time': [0.16, 2.70, 1.21, 0.43, 2.29, 0.39, 0.37, 1.57, 2.48, 2.65, 2.56, 2.36, 3.45, 0.05, 0.40, 0.03, 0.13, 0.18],
    }
}

# Create the plot
fig, ax = plt.subplots(figsize=(14, 8))

# Color map: one colour per dataset, shared by its accuracy and time bars
colors = plt.cm.Paired(np.arange(len(datasets)))

bar_width = 0.15  # horizontal step between consecutive datasets inside one model group
group_positions = np.arange(len(selected_models))

# Plot for each dataset
for i, (dataset, data) in enumerate(datasets.items()):
    x_pos = group_positions + i * bar_width

    # The raw lists cover every benchmarked model (in a per-dataset order), not just the
    # selected ones, so they cannot be plotted against x_pos directly. Pick each selected
    # model's value by name; zip() stops at the shorter list, so a model without a
    # recorded value becomes NaN and draws no bar.
    accuracy_by_model = dict(zip(data['models'], data['accuracies']))
    time_by_model = dict(zip(data['models'], data['time']))
    accuracies = [accuracy_by_model.get(model, np.nan) for model in selected_models]
    times = [time_by_model.get(model, np.nan) for model in selected_models]

    # Each bar is half a step wide so the accuracy/time pair tiles the step without overlap.
    ax.bar(x_pos, accuracies, bar_width / 2, label=f'{dataset} Accuracies', color=colors[i])
    ax.bar(x_pos + bar_width / 2, times, bar_width / 2, label=f'{dataset} Time', color=colors[i], alpha=0.6)

# Centre each tick under its whole group of bars.
ax.set_xticks(group_positions + (len(datasets) - 1) * bar_width / 2 + bar_width / 4)
ax.set_xticklabels(selected_models, rotation=45)
ax.set_xlabel('Models')
ax.set_ylabel('Scores and Time (in seconds)')
ax.set_title('Model Performance Comparison')
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot (these should match the models available in each dataset)
# NOTE(review): 'Dataset 3' below lists no 'XGBClassifier', so not every
# selected model is available in every dataset.
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset
# Each entry holds three lists that the code below pairs up by position:
# 'models' (classifier names), 'accuracies' and 'time'.
# NOTE(review): the list lengths disagree (models / accuracies / time):
#   Dataset 1: 26 / 25 / 26
#   Dataset 2: 25 / 24 / 26
#   Dataset 3: 26 / 28 / 26
# A positional lookup can therefore pair a model with another model's value;
# re-check these numbers against the original result tables.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.16],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04],
    },
}

# Filter the datasets to include only selected models.
# The values are re-ordered to follow `selected_models`, because the plot
# labels its x axis with that list and expects one value per selected model.
# A model a dataset does not list (or whose position lies beyond the end of a
# value list) becomes NaN, which matplotlib leaves blank instead of raising a
# shape mismatch. After this loop every dataset's 'models' entry equals
# `selected_models`.
for dataset in datasets:
    results = datasets[dataset]
    names = list(results['models'])
    aligned = {'accuracies': [], 'time': []}
    for model in selected_models:
        position = names.index(model) if model in names else None
        for key, values in aligned.items():
            source = results[key]
            available = position is not None and position < len(source)
            values.append(source[position] if available else np.nan)
    results['models'] = np.array(selected_models)
    results['accuracies'] = np.array(aligned['accuracies'], dtype=float)
    results['time'] = np.array(aligned['time'], dtype=float)

# Create the plot: for every model, one accuracy bar and one time bar per dataset.
fig, ax = plt.subplots(figsize=(14, 8))

# One colour per dataset; the time bar reuses it at lower opacity.
colors = plt.cm.Paired(np.arange(len(datasets)))

# Each model owns a group of 2 * len(datasets) slots (accuracy + time per
# dataset); deriving the width from the slot count keeps a group from spilling
# into the neighbouring model.
n_slots = 2 * max(len(datasets), 1)
bar_width = 0.8 / n_slots
model_pos = np.arange(len(selected_models))

# Plot for each dataset. The loop variable is called `results` rather than
# `data` so it does not overwrite the DataFrame loaded at the top of the notebook.
for i, (dataset, results) in enumerate(datasets.items()):
    # Dataset i uses slots 2*i (accuracy) and 2*i + 1 (time); previously the
    # time bar of one dataset was drawn on top of the next dataset's accuracy bar.
    acc_pos = model_pos + (2 * i) * bar_width
    ax.bar(acc_pos, results['accuracies'], bar_width, label=f'{dataset} Accuracies', color=colors[i])
    ax.bar(acc_pos + bar_width, results['time'], bar_width, label=f'{dataset} Time', color=colors[i], alpha=0.6)

# Put each tick label under the middle of its group of bars.
ax.set_xticks(model_pos + bar_width * (n_slots - 1) / 2)
ax.set_xticklabels(selected_models, rotation=45)
ax.set_xlabel('Models')
ax.set_ylabel('Scores and Time (in seconds)')
ax.set_title('Model Performance Comparison')
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot (these should match the models available in each dataset)
# NOTE(review): 'Dataset 3' below lists no 'XGBClassifier', so not every
# selected model is available in every dataset.
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset (5 datasets in total)
# Each entry holds three lists that the plotting code pairs up by position:
# 'models' (classifier names), 'accuracies' and 'time'.
# NOTE(review): the list lengths disagree (models / accuracies / time):
#   Dataset 1: 26 / 25 / 26
#   Dataset 2: 25 / 24 / 26
#   Dataset 3: 26 / 28 / 26
#   Dataset 4: 25 / 26 / 26
#   Dataset 5: 24 / 25 / 25
# A positional lookup can therefore pair a model with another model's value;
# re-check these numbers against the original result tables.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.16],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04],
    },
    'Dataset 4': {
        'models': ['KNeighborsClassifier', 'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'SVC', 'GaussianNB',
                   'ExtraTreesClassifier', 'LogisticRegression', 'BaggingClassifier', 'DecisionTreeClassifier', 'AdaBoostClassifier',
                   'LinearDiscriminantAnalysis', 'QuadraticDiscriminantAnalysis', 'LabelSpreading', 'LabelPropagation',
                   'CalibratedClassifierCV', 'RidgeClassifier', 'RidgeClassifierCV', 'SGDClassifier', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'BernoulliNB', 'ExtraTreeClassifier', 'LinearSVC'],
        'accuracies': [0.73, 0.72, 0.76, 0.75, 0.75, 0.62, 0.79, 0.75, 0.61, 0.67, 0.68, 0.68, 0.68, 0.61, 0.75, 0.64, 0.56,
                       0.60, 0.69, 0.69, 0.64, 0.61, 0.61, 0.50, 0.46, 0.40],
        'time': [0.05, 0.09, 0.22, 0.09, 0.02, 0.02, 1.03, 0.07, 0.10, 0.09, 0.32, 0.04, 0.08, 0.22, 0.18, 0.03, 0.10,
                 0.09, 0.05, 0.04, 0.15, 0.17, 0.10, 0.07, 0.04, 0.07],
    },
    'Dataset 5': {
        'models': ['RandomForestClassifier', 'LogisticRegression', 'SVC', 'XGBClassifier', 'LGBMClassifier', 'GaussianNB',
                   'AdaBoostClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier',
                   'BaggingClassifier', 'LabelSpreading', 'LabelPropagation', 'SGDClassifier', 'Perceptron', 'LinearSVC',
                   'CalibratedClassifierCV', 'RidgeClassifier', 'RidgeClassifierCV', 'QuadraticDiscriminantAnalysis', 'LinearDiscriminantAnalysis',
                   'NearestCentroid', 'ExtraTreeClassifier', 'BernoulliNB'],
        'accuracies': [0.69, 0.67, 0.66, 0.66, 0.74, 0.64, 0.61, 0.68, 0.64, 0.68, 0.67, 0.64, 0.63, 0.59, 0.63, 0.61, 0.55,
                       0.52, 0.51, 0.50, 0.45, 0.48, 0.48, 0.46, 0.47],
        'time': [0.10, 0.16, 0.14, 0.04, 0.05, 0.01, 0.06, 0.07, 0.16, 0.23, 0.17, 0.02, 0.11, 0.14, 0.08, 0.15, 0.05,
                 0.03, 0.18, 0.09, 0.07, 0.12, 0.16, 0.03, 0.04],
    }
}

# Function to filter and plot results
def plot_dataset(dataset_name, dataset_results, selected_models):
    """Draw accuracy and time of the selected models for a single dataset.

    Accuracy is drawn as blue bars on the left axis and time as red bars on a
    twin right axis, side by side for each model.

    Args:
        dataset_name: Label used in the printed summary and the figure title.
        dataset_results: Mapping with the keys 'models', 'accuracies' and
            'time'; the three sequences are paired by position.
        selected_models: Model names to show, in display order.

    A selected model is skipped when the dataset does not list it, or when its
    position lies beyond the end of the accuracy or time sequence.
    """
    model_names = list(dataset_results['models'])
    accuracies = dataset_results['accuracies']
    times = dataset_results['time']

    # Filter models, looking each position up once.
    filtered_models, filtered_accuracies, filtered_times = [], [], []
    for model in selected_models:
        if model not in model_names:
            continue
        position = model_names.index(model)
        if position >= len(accuracies) or position >= len(times):
            continue  # the value lists are shorter than the model list
        filtered_models.append(model)
        filtered_accuracies.append(accuracies[position])
        filtered_times.append(times[position])

    # Number of models selected for plotting
    print(f"{dataset_name}: {len(filtered_models)} models selected")

    # Plot the accuracies and times for the filtered models
    x_pos = np.arange(len(filtered_models))
    fig, ax1 = plt.subplots(figsize=(10, 6))

    ax1.bar(x_pos - 0.2, filtered_accuracies, 0.4, label='Accuracy', color='b')
    ax1.set_xlabel('Model')
    ax1.set_ylabel('Accuracy', color='b')
    ax1.set_xticks(x_pos)
    ax1.set_xticklabels(filtered_models, rotation=90)
    ax1.tick_params(axis='y', labelcolor='b')

    ax2 = ax1.twinx()
    # 0.4 is a bar width: `plot` would read it as an extra data series instead,
    # so the time series is drawn with `bar`, next to the accuracy bar.
    ax2.bar(x_pos + 0.2, filtered_times, 0.4, label='Time', color='r')
    ax2.set_ylabel('Time (seconds)', color='r')
    ax2.tick_params(axis='y', labelcolor='r')

    # One legend covering both axes, so the labels set above are actually shown.
    handles = ax1.get_legend_handles_labels()[0] + ax2.get_legend_handles_labels()[0]
    ax1.legend(handles=handles, loc='upper right')

    plt.title(f"Models Performance: {dataset_name}")
    plt.tight_layout()
    plt.show()

# Plot all datasets
for dataset_name, dataset_results in datasets.items():
    plot_dataset(dataset_name, dataset_results, selected_models)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot (these should match the models available in each dataset)
# NOTE(review): 'Dataset 3' below lists no 'XGBClassifier', so not every
# selected model is available in every dataset.
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset (5 datasets in total)
# Each entry holds three lists that the plotting code pairs up by position:
# 'models' (classifier names), 'accuracies' and 'time'.
# NOTE(review): the list lengths disagree (models / accuracies / time):
#   Dataset 1: 26 / 25 / 26
#   Dataset 2: 25 / 24 / 26
#   Dataset 3: 26 / 28 / 26
#   Dataset 4: 25 / 26 / 26
#   Dataset 5: 24 / 25 / 25
# A positional lookup can therefore pair a model with another model's value;
# re-check these numbers against the original result tables.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.16],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04],
    },
    'Dataset 4': {
        'models': ['KNeighborsClassifier', 'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'SVC', 'GaussianNB',
                   'ExtraTreesClassifier', 'LogisticRegression', 'BaggingClassifier', 'DecisionTreeClassifier', 'AdaBoostClassifier',
                   'LinearDiscriminantAnalysis', 'QuadraticDiscriminantAnalysis', 'LabelSpreading', 'LabelPropagation',
                   'CalibratedClassifierCV', 'RidgeClassifier', 'RidgeClassifierCV', 'SGDClassifier', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'BernoulliNB', 'ExtraTreeClassifier', 'LinearSVC'],
        'accuracies': [0.73, 0.72, 0.76, 0.75, 0.75, 0.62, 0.79, 0.75, 0.61, 0.67, 0.68, 0.68, 0.68, 0.61, 0.75, 0.64, 0.56,
                       0.60, 0.69, 0.69, 0.64, 0.61, 0.61, 0.50, 0.46, 0.40],
        'time': [0.05, 0.09, 0.22, 0.09, 0.02, 0.02, 1.03, 0.07, 0.10, 0.09, 0.32, 0.04, 0.08, 0.22, 0.18, 0.03, 0.10,
                 0.09, 0.05, 0.04, 0.15, 0.17, 0.10, 0.07, 0.04, 0.07],
    },
    'Dataset 5': {
        'models': ['RandomForestClassifier', 'LogisticRegression', 'SVC', 'XGBClassifier', 'LGBMClassifier', 'GaussianNB',
                   'AdaBoostClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier',
                   'BaggingClassifier', 'LabelSpreading', 'LabelPropagation', 'SGDClassifier', 'Perceptron', 'LinearSVC',
                   'CalibratedClassifierCV', 'RidgeClassifier', 'RidgeClassifierCV', 'QuadraticDiscriminantAnalysis', 'LinearDiscriminantAnalysis',
                   'NearestCentroid', 'ExtraTreeClassifier', 'BernoulliNB'],
        'accuracies': [0.69, 0.67, 0.66, 0.66, 0.74, 0.64, 0.61, 0.72, 0.71, 0.60, 0.67, 0.68, 0.69, 0.64, 0.61, 0.62, 0.60,
                       0.58, 0.61, 0.56, 0.59, 0.58, 0.62, 0.55, 0.50],
        'time': [0.45, 0.53, 0.71, 0.59, 0.46, 0.19, 0.36, 0.69, 0.58, 0.33, 0.31, 0.24, 0.41, 0.50, 0.38, 0.21, 0.36,
                 0.22, 0.15, 0.12, 0.08, 0.16, 0.07, 0.11, 0.08],
    }
}

def _values_for_selected(dataset_results, key, selected_models):
    """Return dataset_results[key] re-ordered to follow selected_models (NaN where unavailable)."""
    model_names = list(dataset_results['models'])
    values = dataset_results[key]
    aligned = []
    for model in selected_models:
        position = model_names.index(model) if model in model_names else None
        available = position is not None and position < len(values)
        aligned.append(values[position] if available else np.nan)
    return aligned


# Function to plot all datasets on one plot
def plot_all_datasets_on_one_plot(datasets, selected_models):
    """Compare all datasets in one figure: accuracy as grouped bars, time as lines.

    Args:
        datasets: Mapping of dataset name to a mapping with the keys 'models',
            'accuracies' and 'time' (three sequences paired by position).
        selected_models: Model names shown on the x axis, in display order.

    Every dataset contributes exactly one value per selected model. A model
    that a dataset does not list, or whose position lies beyond the end of the
    value sequence, is plotted as NaN (left blank), so the series always match
    the x positions.
    """
    fig, ax1 = plt.subplots(figsize=(12, 8))
    ax2 = ax1.twinx()

    x_pos = np.arange(len(selected_models))

    colors = ['b', 'g', 'r', 'c', 'm']  # Different colors for each dataset (cycled if there are more)

    # Split a 0.8-wide group evenly between the datasets so bars never overlap.
    bar_width = 0.8 / max(len(datasets), 1)

    for i, (dataset_name, dataset_results) in enumerate(datasets.items()):
        accuracies = _values_for_selected(dataset_results, 'accuracies', selected_models)
        times = _values_for_selected(dataset_results, 'time', selected_models)
        color = colors[i % len(colors)]

        # Centre of this dataset's bar inside each model's group.
        slot = x_pos - 0.4 + (i + 0.5) * bar_width

        ax1.bar(slot, accuracies, bar_width, label=f'Accuracy - {dataset_name}', color=color)
        # No positional 0.4 here: `plot` has no width argument and would read
        # it as an extra data series. Markers keep isolated points visible
        # next to NaN gaps.
        ax2.plot(slot, times, marker='o', linestyle='--', label=f'Time - {dataset_name}', color=color)

    ax1.set_xlabel('Model')
    ax1.set_ylabel('Accuracy', color='b')
    ax1.set_xticks(x_pos)
    ax1.set_xticklabels(selected_models, rotation=90)
    ax1.tick_params(axis='y', labelcolor='b')

    ax2.set_ylabel('Time (seconds)', color='r')
    ax2.tick_params(axis='y', labelcolor='r')

    plt.title("Models Performance Across Datasets")
    ax1.legend(loc='upper left')
    ax2.legend(loc='upper right')
    plt.tight_layout()
    plt.show()

# Plot all datasets on one plot
plot_all_datasets_on_one_plot(datasets, selected_models)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot (these should match the models available in each dataset)
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset (5 datasets in total)
# Each entry holds three lists that the plotting code pairs up by position:
# 'models' (classifier names), 'accuracies' and 'time'.
# NOTE(review): apart from Dataset 5 the list lengths disagree
# (models / accuracies / time):
#   Dataset 1: 26 / 25 / 26
#   Dataset 2: 25 / 24 / 26
#   Dataset 3: 28 / 29 / 28  ('XGBClassifier' appears first and last)
#   Dataset 4: 25 / 26 / 26
# A positional lookup can therefore pair a model with another model's value;
# re-check these numbers against the original result tables.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        # 'XGBClassifier' and 'ExtraTreesClassifier' used to be fused into one
        # string, which made both names impossible to look up.
        'models': ['XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier', 'XGBClassifier'],
        'accuracies': [0.97, 0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.9660],
        'time': [34.4, 2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04, 3.4],
    },
    'Dataset 4': {
        'models': ['KNeighborsClassifier', 'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'SVC', 'GaussianNB',
                   'ExtraTreesClassifier', 'LogisticRegression', 'BaggingClassifier', 'DecisionTreeClassifier', 'AdaBoostClassifier',
                   'LinearDiscriminantAnalysis', 'QuadraticDiscriminantAnalysis', 'LabelSpreading', 'LabelPropagation',
                   'CalibratedClassifierCV', 'RidgeClassifier', 'RidgeClassifierCV', 'SGDClassifier', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'BernoulliNB', 'ExtraTreeClassifier', 'LinearSVC'],
        'accuracies': [0.73, 0.72, 0.76, 0.75, 0.75, 0.62, 0.79, 0.75, 0.61, 0.67, 0.68, 0.68, 0.68, 0.61, 0.75, 0.64, 0.56,
                       0.60, 0.69, 0.69, 0.64, 0.61, 0.61, 0.50, 0.46, 0.40],
        'time': [0.05, 0.09, 0.22, 0.09, 0.02, 0.02, 1.03, 0.07, 0.10, 0.09, 0.32, 0.04, 0.08, 0.22, 0.18, 0.03, 0.10,
                 0.09, 0.05, 0.04, 0.15, 0.17, 0.10, 0.07, 0.04, 0.07],
    },
    # Data for Dataset 5. It used to be written as bare `models = ...`,
    # `accuracies = ...`, `times = ...` assignments inside this dict literal,
    # which is a SyntaxError; it is now a regular entry with the same keys as
    # the other datasets.
    'Dataset 5': {
        'models': [
            'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier', 'RandomForestClassifier',
            'BaggingClassifier', 'LabelSpreading', 'PassiveAggressiveClassifier', 'DecisionTreeClassifier',
            'LabelPropagation', 'NearestCentroid', 'ExtraTreesClassifier', 'GaussianNB',
            'ExtraTreeClassifier', 'AdaBoostClassifier', 'QuadraticDiscriminantAnalysis', 'SVC',
            'SGDClassifier', 'BernoulliNB', 'Perceptron', 'LogisticRegression',
            'LinearDiscriminantAnalysis', 'LinearSVC', 'CalibratedClassifierCV', 'RidgeClassifierCV',
            'RidgeClassifier', 'DummyClassifier'
        ],
        'accuracies': [
            0.64, 0.65, 0.63, 0.63, 0.61, 0.62, 0.55, 0.57, 0.61, 0.49, 0.61, 0.59,
            0.55, 0.58, 0.60, 0.63, 0.58, 0.58, 0.45, 0.58, 0.58, 0.58, 0.58, 0.58,
            0.58, 0.57
        ],
        'time': [
            1.26, 0.20, 0.06, 0.35, 0.08, 0.27, 0.02, 0.02, 0.26, 0.02, 0.29, 0.01,
            0.01, 0.15, 0.03, 0.21, 0.03, 0.02, 0.02, 0.05, 0.03, 0.02, 0.08, 0.01,
            0.01, 0.01
        ],
    },
}

def _value_for_model(result, key, model):
    """Return result[key] at the position of `model` in result['models'], or NaN if unavailable."""
    model_names = list(result['models'])
    if model not in model_names:
        return np.nan
    position = model_names.index(model)
    values = result[key]
    return values[position] if position < len(values) else np.nan


# Function to plot model comparison
def plot_comparison(dataset_results):
    """Draw one figure per dataset: accuracy bars plus a training-time line.

    Args:
        dataset_results: Mapping of dataset name to a mapping with the keys
            'models', 'accuracies' and 'time' (three sequences paired by
            position).

    The models shown are those in the module-level `selected_models`. Each
    value is looked up at the model's position in that dataset's own 'models'
    list. Models that are unavailable are plotted as NaN, which matplotlib
    leaves blank.
    """
    for dataset_name, result in dataset_results.items():
        # Only plot selected models. The position must come from
        # result['models']; indexing with the position in `selected_models`
        # returned other models' values.
        accuracies_selected = [_value_for_model(result, 'accuracies', model) for model in selected_models]
        time_selected = [_value_for_model(result, 'time', model) for model in selected_models]

        # Create the figure
        fig, ax1 = plt.subplots(figsize=(10, 6))

        ax2 = ax1.twinx()
        ax1.bar(selected_models, accuracies_selected, color='b', alpha=0.6, label="Accuracy")
        # Markers keep single points visible when a neighbour is NaN.
        ax2.plot(selected_models, time_selected, 'g', marker='o', label="Training Time (s)")

        ax1.set_xlabel('Model')
        ax1.set_ylabel('Accuracy', color='b')
        ax2.set_ylabel('Training Time (s)', color='g')

        ax1.tick_params(axis='x', rotation=90)
        ax1.set_title(f"Model Comparison for {dataset_name}")
        plt.tight_layout()
        plt.show()

# Plot the model comparisons for each dataset
plot_comparison(datasets)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Selected models to plot (these should match the models available in each dataset)
selected_models = [
    'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'KNeighborsClassifier',
    'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'AdaBoostClassifier',
    'GaussianNB', 'ExtraTreesClassifier'
]

# Dataset results for each dataset (5 datasets in total)
# Each entry holds three lists: 'models' (classifier names), 'accuracies' and
# 'time'.
# NOTE(review): presumably the three lists are meant to be paired by position,
# but their lengths disagree (models / accuracies / time):
#   Dataset 1: 26 / 25 / 26
#   Dataset 2: 25 / 24 / 26
#   Dataset 3: 27 / 28 / 27
#   Dataset 4: 25 / 26 / 26
#   Dataset 5: 25 / 26 / 26
# Re-check these numbers against the original result tables.
datasets = {
    'Dataset 1': {
        'models': ['LGBMClassifier', 'RandomForestClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier',
                   'BaggingClassifier', 'LabelPropagation', 'LabelSpreading', 'AdaBoostClassifier', 'GaussianNB',
                   'QuadraticDiscriminantAnalysis', 'ExtraTreeClassifier', 'SVC', 'DecisionTreeClassifier', 'LinearSVC',
                   'RidgeClassifierCV', 'RidgeClassifier', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV',
                   'SGDClassifier', 'LogisticRegression', 'BernoulliNB', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'DummyClassifier'],
        'accuracies': [0.65, 0.64, 0.64, 0.63, 0.62, 0.62, 0.64, 0.64, 0.60, 0.52, 0.51, 0.57, 0.63, 0.56, 0.61, 0.61,
                       0.61, 0.61, 0.60, 0.61, 0.60, 0.39, 0.59, 0.27, 0.48],
        'time': [1.66, 1.97, 1.55, 1.45, 0.33, 0.41, 2.44, 5.71, 0.49, 0.02, 0.06, 0.03, 6.17, 0.09, 0.08, 0.03, 0.03,
                 0.08, 0.30, 0.14, 0.19, 0.04, 0.12, 0.12, 0.05, 0.03],
    },
    'Dataset 2': {
        'models': ['BaggingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'NearestCentroid',
                   'ExtraTreeClassifier', 'XGBClassifier', 'ExtraTreesClassifier', 'RandomForestClassifier', 'GaussianNB',
                   'LabelPropagation', 'LabelSpreading', 'PassiveAggressiveClassifier', 'LGBMClassifier', 'Perceptron',
                   'LogisticRegression', 'LinearDiscriminantAnalysis', 'CalibratedClassifierCV', 'SVC', 'DummyClassifier',
                   'BernoulliNB', 'AdaBoostClassifier', 'SGDClassifier', 'RidgeClassifier', 'RidgeClassifierCV', 'LinearSVC'],
        'accuracies': [0.67, 0.61, 0.67, 0.16, 0.60, 0.68, 0.69, 0.69, 0.44, 0.65, 0.65, 0.23, 0.40, 0.53, 0.63, 0.63,
                       0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63, 0.63],
        'time': [0.48, 0.09, 0.31, 0.02, 0.03, 4.64, 1.50, 1.31, 0.03, 2.17, 2.74, 0.07, 2.10, 0.07, 0.34, 0.06, 0.57,
                 3.15, 0.03, 0.05, 0.05, 0.90, 0.15, 0.03, 0.05, 0.07],
    },
    'Dataset 3': {
        'models': ['ExtraTreesClassifier', 'RandomForestClassifier', 'LGBMClassifier', 'LabelSpreading', 'LabelPropagation',
                   'BaggingClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier', 'SVC', 'NuSVC',
                   'LogisticRegression', 'CalibratedClassifierCV', 'LinearSVC', 'SGDClassifier', 'QuadraticDiscriminantAnalysis',
                   'RidgeClassifier', 'RidgeClassifierCV', 'AdaBoostClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier',
                   'Perceptron', 'NearestCentroid', 'BernoulliNB', 'GaussianNB', 'DummyClassifier', 'XGBClassifier'],
        'accuracies': [0.97, 0.96, 0.96, 0.94, 0.94, 0.94, 0.89, 0.89, 0.86, 0.81, 0.81, 0.73, 0.64, 0.62, 0.62, 0.58, 0.58,
                       0.54, 0.54, 0.54, 0.53, 0.53, 0.53, 0.49, 0.40, 0.39, 0.35, 0.9660],
        'time': [2.98, 9.66, 4.62, 23.73, 14.60, 8.72, 0.70, 1.14, 0.07, 13.10, 32.57, 0.62, 10.63, 1.62, 0.64, 0.15, 0.05,
                 0.20, 8.19, 0.13, 0.30, 0.26, 0.05, 0.09, 0.07, 0.04, 3.4],
    },
    'Dataset 4': {
        'models': ['KNeighborsClassifier', 'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier', 'SVC', 'GaussianNB',
                   'ExtraTreesClassifier', 'LogisticRegression', 'BaggingClassifier', 'DecisionTreeClassifier', 'AdaBoostClassifier',
                   'LinearDiscriminantAnalysis', 'QuadraticDiscriminantAnalysis', 'LabelSpreading', 'LabelPropagation',
                   'CalibratedClassifierCV', 'RidgeClassifier', 'RidgeClassifierCV', 'SGDClassifier', 'Perceptron', 'PassiveAggressiveClassifier',
                   'NearestCentroid', 'BernoulliNB', 'ExtraTreeClassifier', 'LinearSVC'],
        'accuracies': [0.73, 0.72, 0.76, 0.75, 0.75, 0.62, 0.79, 0.75, 0.61, 0.67, 0.68, 0.68, 0.68, 0.61, 0.75, 0.64, 0.56,
                       0.60, 0.69, 0.69, 0.64, 0.61, 0.61, 0.50, 0.46, 0.40],
        'time': [0.05, 0.09, 0.22, 0.09, 0.02, 0.02, 1.03, 0.07, 0.10, 0.09, 0.32, 0.04, 0.08, 0.22, 0.18, 0.03, 0.10,
                 0.09, 0.05, 0.04, 0.15, 0.17, 0.10, 0.07, 0.04, 0.07],
    },
    'Dataset 5': {
        'models': ['RandomForestClassifier', 'LogisticRegression', 'SVC', 'XGBClassifier', 'LGBMClassifier', 'GaussianNB',
                   'AdaBoostClassifier', 'ExtraTreesClassifier', 'KNeighborsClassifier', 'DecisionTreeClassifier',
                   'BaggingClassifier', 'LabelSpreading', 'LabelPropagation', 'SGDClassifier', 'Perceptron', 'LinearSVC',
                   'CalibratedClassifierCV', 'RidgeClassifier', 'RidgeClassifierCV', 'QuadraticDiscriminantAnalysis', 'LinearDiscriminantAnalysis',
                   'PassiveAggressiveClassifier', 'ExtraTreeClassifier', 'NearestCentroid', 'BernoulliNB'],
        'accuracies': [0.97, 0.96, 0.95, 0.97, 0.96, 0.90, 0.93, 0.92, 0.91, 0.94, 0.93, 0.78, 0.82, 0.75, 0.77, 0.76, 0.79,
                       0.72, 0.74, 0.74, 0.73, 0.71, 0.72, 0.70, 0.67, 0.65],
        'time': [0.07, 0.02, 1.64, 4.15, 3.20, 2.61, 2.34, 1.89, 1.25, 0.40, 0.58, 0.04, 0.02, 0.32, 0.34, 0.32, 0.10, 0.05,
                 0.04, 0.07, 0.03, 0.12, 0.18, 0.02, 0.09, 0.11],
    }
}

# Create a combined plot for all datasets: one accuracy bar per dataset and model.
fig, ax1 = plt.subplots(figsize=(12, 8))

model_pos = np.arange(len(selected_models))
bar_width = 0.1

# Plot the accuracies for each dataset
for idx, (dataset_name, result) in enumerate(datasets.items()):
    model_names = list(result['models'])
    accuracy_values = result['accuracies']

    # Only plot selected models. Each accuracy is read at the model's position
    # in this dataset's own 'models' list; indexing with the position in
    # `selected_models` returned other models' accuracies. Unavailable models
    # become NaN, which matplotlib leaves blank (None is not a valid height).
    accuracies_selected = []
    for model in selected_models:
        position = model_names.index(model) if model in model_names else None
        available = position is not None and position < len(accuracy_values)
        accuracies_selected.append(accuracy_values[position] if available else np.nan)

    ax1.bar(model_pos + (idx * bar_width), accuracies_selected, width=bar_width, label=f"Accuracy - {dataset_name}")

# Add labels and title; ticks sit under the middle of each group of bars.
ax1.set_xticks(model_pos + bar_width * (len(datasets) - 1) / 2)
ax1.set_xticklabels(selected_models, rotation=90)
ax1.set_ylabel('Test Accuracy')
ax1.set_xlabel('Models')
ax1.set_title('Model Comparison across Datasets')

# Display the legend
ax1.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Models shown on the x-axis, in plotting order (every dataset below reports
# its results in this same order).
selected_models = [
    'RandomForestClassifier',
    'XGBClassifier',
    'LGBMClassifier',
    'KNeighborsClassifier',
    'LogisticRegression',
    'DecisionTreeClassifier',
    'SVC',
    'AdaBoostClassifier',
    'GaussianNB',
    'ExtraTreesClassifier',
]

# Results per dataset. Each row is (name, accuracies, training times), with the
# values aligned position-by-position with the model list.
datasets = {
    dataset_label: {
        'models': list(selected_models),
        'accuracies': accuracy_row,
        'time': time_row,
    }
    for dataset_label, accuracy_row, time_row in (
        ('Dataset 1',
         [0.65, 0.64, 0.64, 0.62, 0.61, 0.56, 0.63, 0.60, 0.52, 0.63],
         [1.66, 1.97, 1.55, 0.33, 0.14, 0.09, 6.17, 0.49, 0.02, 1.45]),
        ('Dataset 2',
         [0.69, 0.68, 0.69, 0.67, 0.63, 0.61, 0.63, 0.63, 0.44, 0.69],
         [1.31, 4.64, 2.10, 0.31, 0.34, 0.09, 3.15, 0.05, 0.03, 1.50]),
        ('Dataset 3',
         [0.97, 0.97, 0.96, 0.94, 0.81, 0.89, 0.86, 0.54, 0.49, 0.96],
         [2.98, 34.4, 9.66, 8.72, 32.57, 0.70, 13.10, 0.20, 0.09, 1.14]),
        ('Dataset 4',
         [0.72, 0.76, 0.75, 0.73, 0.75, 0.67, 0.75, 0.68, 0.62, 0.79],
         [0.09, 0.22, 0.09, 0.05, 0.07, 0.09, 0.02, 0.32, 0.02, 1.03]),
        ('Dataset 5',
         [0.63, 0.64, 0.65, 0.63, 0.58, 0.57, 0.63, 0.58, 0.59, 0.61],
         [0.35, 1.26, 0.20, 0.06, 0.05, 0.02, 0.21, 0.08, 0.01, 0.29]),
    )
}

# One matplotlib colour code per dataset, in dataset order
colors = ['b', 'g', 'r', 'c', 'm']

# Function to plot model comparison
def plot_all_datasets(dataset_results):
    """Plot accuracy bars and training-time lines for every dataset on one figure.

    Accuracy is drawn as grouped bars on the left y-axis and training time as
    one line per dataset on a twin right y-axis. The models to show come from
    the module-level ``selected_models`` list and the colours from the
    module-level ``colors`` list.

    Args:
        dataset_results: Mapping of dataset name to a dict holding the parallel
            lists ``'models'``, ``'accuracies'`` and ``'time'``.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    bar_width = 0.15
    fig, ax1 = plt.subplots(figsize=(12, 8))

    ax2 = ax1.twinx()
    base_positions = np.arange(len(selected_models))

    for i, (dataset_name, result) in enumerate(dataset_results.items()):
        model_names = result['models']
        # Position of each selected model inside this dataset's lists (None if absent).
        lookup = [model_names.index(model) if model in model_names else None
                  for model in selected_models]
        # Missing models become NaN so matplotlib leaves a gap instead of receiving None.
        accuracies_selected = [result['accuracies'][pos] if pos is not None else np.nan
                               for pos in lookup]
        time_selected = [result['time'][pos] if pos is not None else np.nan
                         for pos in lookup]

        # Cycle through the colour list so more datasets than colours cannot raise IndexError.
        series_color = colors[i % len(colors)]
        series_positions = base_positions + i * bar_width

        # Plot accuracy as bar plots
        ax1.bar(series_positions, accuracies_selected,
                width=bar_width, color=series_color, alpha=0.6, label=f"{dataset_name} Accuracy")

        # Plot training time as line plots
        ax2.plot(series_positions, time_selected,
                 marker='o', color=series_color, label=f"{dataset_name} Training Time")

    ax1.set_xlabel('Model')
    ax1.set_ylabel('Test Accuracy')
    ax2.set_ylabel('Training Time (s)')

    # Put each tick under the centre of its bar group rather than under the first bar.
    group_offset = bar_width * (max(len(dataset_results), 1) - 1) / 2
    ax1.set_xticks(base_positions + group_offset)
    ax1.set_xticklabels(selected_models, rotation=90)

    ax1.legend(loc='upper left', bbox_to_anchor=(1.05, 1), title="Accuracy")
    ax2.legend(loc='lower left', bbox_to_anchor=(1.05, 0), title="Training Time")

    plt.tight_layout()
    plt.show()

# Render the combined figure: accuracy bars and training-time lines for every
# entry of the `datasets` dict on a single pair of axes.
plot_all_datasets(datasets)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Classifier names to plot, in x-axis order; the per-dataset value lists below
# follow the same order.
selected_models = [
    'RandomForestClassifier',
    'XGBClassifier',
    'LGBMClassifier',
    'KNeighborsClassifier',
    'LogisticRegression',
    'DecisionTreeClassifier',
    'SVC',
    'AdaBoostClassifier',
    'GaussianNB',
    'ExtraTreesClassifier',
]

# Hard-coded results table: (dataset name, accuracies, training times) per row.
datasets = {
    dataset_label: {
        'models': list(selected_models),
        'accuracies': accuracy_row,
        'time': time_row,
    }
    for dataset_label, accuracy_row, time_row in (
        ('Dataset 1',
         [0.65, 0.64, 0.64, 0.62, 0.61, 0.56, 0.63, 0.60, 0.52, 0.63],
         [1.66, 1.97, 1.55, 0.33, 0.14, 0.09, 6.17, 0.49, 0.02, 1.45]),
        ('Dataset 2',
         [0.69, 0.68, 0.69, 0.67, 0.63, 0.61, 0.63, 0.63, 0.44, 0.69],
         [1.31, 4.64, 2.10, 0.31, 0.34, 0.09, 3.15, 0.05, 0.03, 1.50]),
        ('Dataset 3',
         [0.97, 0.97, 0.96, 0.94, 0.81, 0.89, 0.86, 0.54, 0.49, 0.96],
         [2.98, 34.4, 9.66, 8.72, 32.57, 0.70, 13.10, 0.20, 0.09, 1.14]),
        ('Dataset 4',
         [0.72, 0.76, 0.75, 0.73, 0.75, 0.67, 0.75, 0.68, 0.62, 0.79],
         [0.09, 0.22, 0.09, 0.05, 0.07, 0.09, 0.02, 0.32, 0.02, 1.03]),
        ('Dataset 5',
         [0.63, 0.64, 0.65, 0.63, 0.58, 0.57, 0.63, 0.58, 0.59, 0.61],
         [0.35, 1.26, 0.20, 0.06, 0.05, 0.02, 0.21, 0.08, 0.01, 0.29]),
    )
}

# Colorblind-friendly palette, one hex colour per dataset
colors = ['#117733', '#44AA99', '#88CCEE', '#DDCC77', '#CC6677']

# Function to plot model comparison
def plot_all_datasets(dataset_results):
    """Draw grouped accuracy bars plus training-time lines for all datasets.

    Bars (left y-axis) show accuracy and lines (twin right y-axis) show
    training time, one series per dataset. Model order is taken from the
    module-level ``selected_models`` and colours from the module-level
    ``colors``.

    Args:
        dataset_results: Mapping of dataset name to a dict holding the parallel
            lists ``'models'``, ``'accuracies'`` and ``'time'``.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    bar_width = 0.15
    fig, ax1 = plt.subplots(figsize=(12, 8))

    ax2 = ax1.twinx()
    base_positions = np.arange(len(selected_models))

    for i, (dataset_name, result) in enumerate(dataset_results.items()):
        model_names = result['models']
        # Index of each selected model in this dataset's lists, or None when absent.
        lookup = [model_names.index(model) if model in model_names else None
                  for model in selected_models]
        # NaN (not None) marks a missing model so the bar/line point is simply skipped.
        accuracies_selected = [result['accuracies'][pos] if pos is not None else np.nan
                               for pos in lookup]
        time_selected = [result['time'][pos] if pos is not None else np.nan
                         for pos in lookup]

        # Wrap around the colour list instead of failing on a sixth dataset.
        series_color = colors[i % len(colors)]
        series_positions = base_positions + i * bar_width

        # Plot accuracy as translucent bar plots
        ax1.bar(series_positions, accuracies_selected,
                width=bar_width, color=series_color, alpha=0.8, label=f"{dataset_name} Accuracy")

        # Plot training time as line plots
        ax2.plot(series_positions, time_selected,
                 marker='o', color=series_color, label=f"{dataset_name} Training Time", linewidth=2)

    ax1.set_xlabel('Model')
    ax1.set_ylabel('Test Accuracy')
    ax2.set_ylabel('Training Time (s)')

    # Centre each tick under its group of bars instead of under the first dataset's bar.
    group_offset = bar_width * (max(len(dataset_results), 1) - 1) / 2
    ax1.set_xticks(base_positions + group_offset)
    ax1.set_xticklabels(selected_models, rotation=90)

    ax1.legend(loc='upper left', bbox_to_anchor=(1.05, 1), title="Accuracy")
    ax2.legend(loc='lower left', bbox_to_anchor=(1.05, 0), title="Training Time")

    plt.tight_layout()
    plt.show()

# Render the combined figure for every entry of the `datasets` dict, using the
# colorblind-friendly `colors` list defined in this cell.
plot_all_datasets(datasets)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Models drawn along the x-axis, in order; each dataset's value lists are
# aligned with this list.
selected_models = [
    'RandomForestClassifier',
    'XGBClassifier',
    'LGBMClassifier',
    'KNeighborsClassifier',
    'LogisticRegression',
    'DecisionTreeClassifier',
    'SVC',
    'AdaBoostClassifier',
    'GaussianNB',
    'ExtraTreesClassifier',
]

# Results table, one (name, accuracies, training times) row per dataset.
datasets = {
    dataset_label: {
        'models': list(selected_models),
        'accuracies': accuracy_row,
        'time': time_row,
    }
    for dataset_label, accuracy_row, time_row in (
        ('Dataset 1',
         [0.65, 0.64, 0.64, 0.62, 0.61, 0.56, 0.63, 0.60, 0.52, 0.63],
         [1.66, 1.97, 1.55, 0.33, 0.14, 0.09, 6.17, 0.49, 0.02, 1.45]),
        ('Dataset 2',
         [0.69, 0.68, 0.69, 0.67, 0.63, 0.61, 0.63, 0.63, 0.44, 0.69],
         [1.31, 4.64, 2.10, 0.31, 0.34, 0.09, 3.15, 0.05, 0.03, 1.50]),
        ('Dataset 3',
         [0.97, 0.97, 0.96, 0.94, 0.81, 0.89, 0.86, 0.54, 0.49, 0.96],
         [2.98, 34.4, 9.66, 8.72, 32.57, 0.70, 13.10, 0.20, 0.09, 1.14]),
        ('Dataset 4',
         [0.72, 0.76, 0.75, 0.73, 0.75, 0.67, 0.75, 0.68, 0.62, 0.79],
         [0.09, 0.22, 0.09, 0.05, 0.07, 0.09, 0.02, 0.32, 0.02, 1.03]),
        ('Dataset 5',
         [0.63, 0.64, 0.65, 0.63, 0.58, 0.57, 0.63, 0.58, 0.59, 0.61],
         [0.35, 1.26, 0.20, 0.06, 0.05, 0.02, 0.21, 0.08, 0.01, 0.29]),
    )
}

# Colorblind-friendly seaborn palette with one colour per dataset;
# plot_model_performance indexes it by dataset position (palette[i]).
palette = sns.color_palette("colorblind", len(datasets))

# Plot the results
def plot_model_performance(dataset_results):
    """Plot accuracy and training-time curves for each dataset on twin y-axes.

    For every dataset a solid line with 'o' markers (accuracy, left axis) and
    a dashed line with 'x' markers (training time, right axis) are drawn over
    the module-level ``selected_models``; colours come from the module-level
    ``palette``.

    Args:
        dataset_results: Mapping of dataset name to a dict holding the parallel
            lists ``'models'``, ``'accuracies'`` and ``'time'``.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    figure, accuracy_axis = plt.subplots(figsize=(14, 8))
    time_axis = accuracy_axis.twinx()
    axis_label_style = {'fontsize': 16, 'weight': 'bold'}
    legend_position = {'accuracy': (1.05, 1), 'time': (1.05, 0)}

    for series_no, (dataset_name, entry) in enumerate(dataset_results.items()):
        # Where each selected model lives inside this dataset's lists
        # (raises ValueError if a selected model is missing).
        slots = [entry['models'].index(name) for name in selected_models]
        accuracy_values = [entry['accuracies'][slot] for slot in slots]
        time_values = [entry['time'][slot] for slot in slots]
        series_color = palette[series_no]

        # Accuracy curve on the left axis
        accuracy_axis.plot(selected_models, accuracy_values, marker='o',
                           label=f"{dataset_name} Accuracy", color=series_color)

        # Training-time curve on the right axis
        time_axis.plot(selected_models, time_values, marker='x', linestyle='--',
                       label=f"{dataset_name} Training Time", color=series_color)

    accuracy_axis.set_xlabel('Model', **axis_label_style)
    accuracy_axis.set_ylabel('Accuracy', **axis_label_style)
    time_axis.set_ylabel('Training Time (s)', **axis_label_style)

    accuracy_axis.set_xticks(range(len(selected_models)))
    accuracy_axis.set_xticklabels(selected_models, rotation=70, fontsize=16)

    # Both legends are anchored just outside the right edge of the axes
    accuracy_axis.legend(loc='upper left', bbox_to_anchor=legend_position['accuracy'],
                         title="Accuracy", fontsize=14)
    time_axis.legend(loc='lower left', bbox_to_anchor=legend_position['time'],
                     title="Training Time", fontsize=14)

    plt.tight_layout()
    plt.show()

# Render the accuracy / training-time line chart for every entry of `datasets`.
plot_model_performance(datasets)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Ordered list of classifiers to plot; per-dataset values below use this order.
selected_models = [
    'RandomForestClassifier',
    'XGBClassifier',
    'LGBMClassifier',
    'KNeighborsClassifier',
    'LogisticRegression',
    'DecisionTreeClassifier',
    'SVC',
    'AdaBoostClassifier',
    'GaussianNB',
    'ExtraTreesClassifier',
]

# Results table: each row carries a dataset name, its accuracies and its
# training times, position-aligned with the model list.
datasets = {
    dataset_label: {
        'models': list(selected_models),
        'accuracies': accuracy_row,
        'time': time_row,
    }
    for dataset_label, accuracy_row, time_row in (
        ('Dataset 1',
         [0.65, 0.64, 0.64, 0.62, 0.61, 0.56, 0.63, 0.60, 0.52, 0.63],
         [1.66, 1.97, 1.55, 0.33, 0.14, 0.09, 6.17, 0.49, 0.02, 1.45]),
        ('Dataset 2',
         [0.69, 0.68, 0.69, 0.67, 0.63, 0.61, 0.63, 0.63, 0.44, 0.69],
         [1.31, 4.64, 2.10, 0.31, 0.34, 0.09, 3.15, 0.05, 0.03, 1.50]),
        ('Dataset 3',
         [0.97, 0.97, 0.96, 0.94, 0.81, 0.89, 0.86, 0.54, 0.49, 0.96],
         [2.98, 34.4, 9.66, 8.72, 32.57, 0.70, 13.10, 0.20, 0.09, 1.14]),
        ('Dataset 4',
         [0.72, 0.76, 0.75, 0.73, 0.75, 0.67, 0.75, 0.68, 0.62, 0.79],
         [0.09, 0.22, 0.09, 0.05, 0.07, 0.09, 0.02, 0.32, 0.02, 1.03]),
        ('Dataset 5',
         [0.63, 0.64, 0.65, 0.63, 0.58, 0.57, 0.63, 0.58, 0.59, 0.61],
         [0.35, 1.26, 0.20, 0.06, 0.05, 0.02, 0.21, 0.08, 0.01, 0.29]),
    )
}

# Colorblind-friendly seaborn palette sized to the number of datasets;
# plot_model_performance picks palette[i] for the i-th dataset.
palette = sns.color_palette("colorblind", len(datasets))

# Plot the results
def plot_model_performance(dataset_results):
    """Plot per-dataset accuracy and training-time lines on a shared x-axis.

    Accuracy is drawn as solid lines with 'o' markers on the left y-axis and
    training time as dashed lines with 'x' markers on a twin right y-axis.
    The x-axis categories are the module-level ``selected_models`` and the
    colours come from the module-level ``palette``.

    Args:
        dataset_results: Mapping of dataset name to a dict holding the parallel
            lists ``'models'``, ``'accuracies'`` and ``'time'``.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    figure, accuracy_axis = plt.subplots(figsize=(14, 6))
    time_axis = accuracy_axis.twinx()
    axis_label_style = {'fontsize': 14, 'weight': 'bold'}

    # One accuracy line and one training-time line per dataset
    for series_no, (dataset_name, entry) in enumerate(dataset_results.items()):
        # Index of each selected model in this dataset's lists
        # (raises ValueError if a selected model is missing).
        slots = [entry['models'].index(name) for name in selected_models]
        accuracy_values = [entry['accuracies'][slot] for slot in slots]
        time_values = [entry['time'][slot] for slot in slots]
        series_color = palette[series_no]

        # Solid line for accuracy
        accuracy_axis.plot(selected_models, accuracy_values, marker='o',
                           label=f"{dataset_name} Accuracy", color=series_color, linestyle='-')

        # Dashed line with 'x' markers for training time
        time_axis.plot(selected_models, time_values, marker='x', linestyle='--',
                       label=f"{dataset_name} Training Time", color=series_color)

    # Axis labels share one bold, enlarged style
    accuracy_axis.set_xlabel('Model', **axis_label_style)
    accuracy_axis.set_ylabel('Accuracy', **axis_label_style)
    time_axis.set_ylabel('Training Time (s)', **axis_label_style)

    accuracy_axis.set_xticks(range(len(selected_models)))
    accuracy_axis.set_xticklabels(selected_models, rotation=45, fontsize=12)

    # Legends are anchored just outside the right edge of the axes
    accuracy_axis.legend(loc='upper left', bbox_to_anchor=(1.05, 1),
                         title="Accuracy", fontsize=12)
    time_axis.legend(loc='lower left', bbox_to_anchor=(1.05, 0),
                     title="Training Time", fontsize=12)

    # Fit everything into the figure, then display it
    plt.tight_layout()
    plt.show()

# Render the accuracy / training-time line chart for every entry of `datasets`.
plot_model_performance(datasets)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Models to plot, in x-axis order; each dataset lists its values in this order.
selected_models = [
    'RandomForestClassifier',
    'XGBClassifier',
    'LGBMClassifier',
    'KNeighborsClassifier',
    'LogisticRegression',
    'DecisionTreeClassifier',
    'SVC',
    'AdaBoostClassifier',
    'GaussianNB',
    'ExtraTreesClassifier',
]

# Results per dataset, written as (name, accuracies, training times) rows.
datasets = {
    dataset_label: {
        'models': list(selected_models),
        'accuracies': accuracy_row,
        'time': time_row,
    }
    for dataset_label, accuracy_row, time_row in (
        ('Dataset 1',
         [0.65, 0.64, 0.64, 0.62, 0.61, 0.56, 0.63, 0.60, 0.52, 0.63],
         [1.66, 1.97, 1.55, 0.33, 0.14, 0.09, 6.17, 0.49, 0.02, 1.45]),
        ('Dataset 2',
         [0.69, 0.68, 0.69, 0.67, 0.63, 0.61, 0.63, 0.63, 0.44, 0.69],
         [1.31, 4.64, 2.10, 0.31, 0.34, 0.09, 3.15, 0.05, 0.03, 1.50]),
        ('Dataset 3',
         [0.97, 0.97, 0.96, 0.94, 0.81, 0.89, 0.86, 0.54, 0.49, 0.96],
         [2.98, 34.4, 9.66, 8.72, 32.57, 0.70, 13.10, 0.20, 0.09, 1.14]),
        ('Dataset 4',
         [0.72, 0.76, 0.75, 0.73, 0.75, 0.67, 0.75, 0.68, 0.62, 0.79],
         [0.09, 0.22, 0.09, 0.05, 0.07, 0.09, 0.02, 0.32, 0.02, 1.03]),
        ('Dataset 5',
         [0.63, 0.64, 0.65, 0.63, 0.58, 0.57, 0.63, 0.58, 0.59, 0.61],
         [0.35, 1.26, 0.20, 0.06, 0.05, 0.02, 0.21, 0.08, 0.01, 0.29]),
    )
}

# Colorblind-friendly hex colours, one per dataset in dataset order
colors = ['#D55E00', '#0072B2', '#F0E442', '#009E73', '#CC79A7']

# Function to plot model comparison
def plot_all_datasets(dataset_results):
    """Show accuracy (grouped bars) and training time (lines) for all datasets.

    Accuracy uses the left y-axis and training time a twin right y-axis, with
    one series of each per dataset. The models to show come from the
    module-level ``selected_models`` and the colours from the module-level
    ``colors``.

    Args:
        dataset_results: Mapping of dataset name to a dict holding the parallel
            lists ``'models'``, ``'accuracies'`` and ``'time'``.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    bar_width = 0.15
    fig, ax1 = plt.subplots(figsize=(12, 8))

    ax2 = ax1.twinx()
    base_positions = np.arange(len(selected_models))

    for i, (dataset_name, result) in enumerate(dataset_results.items()):
        model_names = result['models']
        # Index of each selected model in this dataset's lists, or None when absent.
        lookup = [model_names.index(model) if model in model_names else None
                  for model in selected_models]
        # Use NaN rather than None for missing models so matplotlib skips the point.
        accuracies_selected = [result['accuracies'][pos] if pos is not None else np.nan
                               for pos in lookup]
        time_selected = [result['time'][pos] if pos is not None else np.nan
                         for pos in lookup]

        # Reuse colours cyclically so extra datasets do not raise IndexError.
        series_color = colors[i % len(colors)]
        series_positions = base_positions + i * bar_width

        # Plot accuracy as bar plots
        ax1.bar(series_positions, accuracies_selected,
                width=bar_width, color=series_color, alpha=0.8, label=f"{dataset_name} Accuracy")

        # Plot training time as line plots
        ax2.plot(series_positions, time_selected,
                 marker='o', color=series_color, linewidth=2, label=f"{dataset_name} Training Time")

    ax1.set_xlabel('Model')
    ax1.set_ylabel('Test Accuracy', fontsize=12)
    ax2.set_ylabel('Training Time (s)', fontsize=12)

    # Ticks go under the middle of each bar group, not under the first dataset's bar.
    group_offset = bar_width * (max(len(dataset_results), 1) - 1) / 2
    ax1.set_xticks(base_positions + group_offset)
    ax1.set_xticklabels(selected_models, rotation=45, ha='right')

    ax1.legend(loc='upper left', bbox_to_anchor=(1.05, 1), title="Accuracy")
    ax2.legend(loc='lower left', bbox_to_anchor=(1.05, 0), title="Training Time")

    plt.tight_layout()
    plt.show()

# Render the combined bar/line figure for every entry of the `datasets` dict.
plot_all_datasets(datasets)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Results per dataset. Each row is (name, accuracies, training times); every
# dataset reports the same ten models, in the order of the list below.
datasets = {
    dataset_label: {
        'models': [
            'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier',
            'KNeighborsClassifier', 'LogisticRegression', 'DecisionTreeClassifier',
            'SVC', 'AdaBoostClassifier', 'GaussianNB', 'ExtraTreesClassifier',
        ],
        'accuracies': accuracy_row,
        'time': time_row,
    }
    for dataset_label, accuracy_row, time_row in (
        ('Dataset 1',
         [0.65, 0.64, 0.64, 0.62, 0.61, 0.56, 0.63, 0.60, 0.52, 0.63],
         [1.66, 1.97, 1.55, 0.33, 0.14, 0.09, 6.17, 0.49, 0.02, 1.45]),
        ('Dataset 2',
         [0.69, 0.68, 0.69, 0.67, 0.63, 0.61, 0.63, 0.63, 0.44, 0.69],
         [1.31, 4.64, 2.10, 0.31, 0.34, 0.09, 3.15, 0.05, 0.03, 1.50]),
        ('Dataset 3',
         [0.97, 0.97, 0.96, 0.94, 0.81, 0.89, 0.86, 0.54, 0.49, 0.96],
         [2.98, 34.4, 9.66, 8.72, 32.57, 0.70, 13.10, 0.20, 0.09, 1.14]),
        ('Dataset 4',
         [0.72, 0.76, 0.75, 0.73, 0.75, 0.67, 0.75, 0.68, 0.62, 0.79],
         [0.09, 0.22, 0.09, 0.05, 0.07, 0.09, 0.02, 0.32, 0.02, 1.03]),
        ('Dataset 5',
         [0.63, 0.64, 0.65, 0.63, 0.58, 0.57, 0.63, 0.58, 0.59, 0.61],
         [0.35, 1.26, 0.20, 0.06, 0.05, 0.02, 0.21, 0.08, 0.01, 0.29]),
    )
}

# Function to plot accuracy and training time side by side for each dataset
def plot_accuracy_time_side_by_side(dataset_results):
    """Draw one row of two bar charts per dataset: accuracy (left) and training time (right).

    Args:
        dataset_results: Mapping of dataset name to a dict holding the parallel
            lists ``'models'``, ``'accuracies'`` and ``'time'``.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    n_datasets = len(dataset_results)
    # squeeze=False keeps `axes` two-dimensional even when only one dataset is
    # passed; with the default squeezing a single row becomes a 1-D array and
    # indexing it as axes[row, col] fails.
    fig, axes = plt.subplots(n_datasets, 2, figsize=(15, n_datasets * 5), squeeze=False)
    fig.subplots_adjust(hspace=0.5)

    for i, (dataset_name, result) in enumerate(dataset_results.items()):
        models = result['models']
        accuracies = result['accuracies']
        # Named `times` so the local does not shadow the name of the stdlib `time` module.
        times = result['time']
        accuracy_ax, time_ax = axes[i, 0], axes[i, 1]

        # Plot Accuracy
        accuracy_ax.bar(models, accuracies, color='skyblue', alpha=0.8)
        accuracy_ax.set_title(f"{dataset_name} - Accuracy")
        accuracy_ax.set_ylabel('Accuracy')
        accuracy_ax.set_xlabel('Models')
        accuracy_ax.set_ylim(0, 1)
        accuracy_ax.tick_params(axis='x', rotation=45)

        # Plot Training Time
        time_ax.bar(models, times, color='orange', alpha=0.8)
        time_ax.set_title(f"{dataset_name} - Training Time")
        time_ax.set_ylabel('Training Time (s)')
        time_ax.set_xlabel('Models')
        time_ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

# Render one accuracy chart and one training-time chart per entry of `datasets`.
plot_accuracy_time_side_by_side(datasets)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Hard-coded results table. Each row is (name, accuracies, training times);
# all datasets report the same ten models in the order listed below.
datasets = {
    dataset_label: {
        'models': [
            'RandomForestClassifier', 'XGBClassifier', 'LGBMClassifier',
            'KNeighborsClassifier', 'LogisticRegression', 'DecisionTreeClassifier',
            'SVC', 'AdaBoostClassifier', 'GaussianNB', 'ExtraTreesClassifier',
        ],
        'accuracies': accuracy_row,
        'time': time_row,
    }
    for dataset_label, accuracy_row, time_row in (
        ('Dataset 1',
         [0.65, 0.64, 0.64, 0.62, 0.61, 0.56, 0.63, 0.60, 0.52, 0.63],
         [1.66, 1.97, 1.55, 0.33, 0.14, 0.09, 6.17, 0.49, 0.02, 1.45]),
        ('Dataset 2',
         [0.69, 0.68, 0.69, 0.67, 0.63, 0.61, 0.63, 0.63, 0.44, 0.69],
         [1.31, 4.64, 2.10, 0.31, 0.34, 0.09, 3.15, 0.05, 0.03, 1.50]),
        ('Dataset 3',
         [0.97, 0.97, 0.96, 0.94, 0.81, 0.89, 0.86, 0.54, 0.49, 0.96],
         [2.98, 34.4, 9.66, 8.72, 32.57, 0.70, 13.10, 0.20, 0.09, 1.14]),
        ('Dataset 4',
         [0.72, 0.76, 0.75, 0.73, 0.75, 0.67, 0.75, 0.68, 0.62, 0.79],
         [0.09, 0.22, 0.09, 0.05, 0.07, 0.09, 0.02, 0.32, 0.02, 1.03]),
        ('Dataset 5',
         [0.63, 0.64, 0.65, 0.63, 0.58, 0.57, 0.63, 0.58, 0.59, 0.61],
         [0.35, 1.26, 0.20, 0.06, 0.05, 0.02, 0.21, 0.08, 0.01, 0.29]),
    )
}

# Every dataset lists the same models, so the first dataset's list labels the x-axis
models = [*datasets['Dataset 1']['models']]

# One row of accuracies / training times per dataset, in dataset order
dataset_names = [*datasets]
accuracy_data = [entry['accuracies'] for entry in datasets.values()]
time_data = [entry['time'] for entry in datasets.values()]

# Grouped-bar geometry: one slot per model, one bar per dataset inside each slot
width = 0.15  # Width of each bar
x = np.arange(len(models))  # Positions for the models
group_centres = x + width * (len(dataset_names) - 1) / 2

fig, axes = plt.subplots(2, 1, figsize=(16, 12))

# (axis, data rows, title, y-label) for the accuracy panel and the training-time panel
panels = (
    (axes[0], accuracy_data, 'Model Accuracy Across Datasets', 'Accuracy'),
    (axes[1], time_data, 'Training Time Across Datasets', 'Training Time (s)'),
)
for panel_ax, panel_rows, panel_title, panel_ylabel in panels:
    for i, dataset_name in enumerate(dataset_names):
        panel_ax.bar(x + i * width, panel_rows[i], width, label=dataset_name)
    panel_ax.set_title(panel_title, fontsize=16)
    panel_ax.set_ylabel(panel_ylabel, fontsize=12)
    panel_ax.set_xticks(group_centres)
    panel_ax.set_xticklabels(models, rotation=45, ha='right', fontsize=10)
    panel_ax.legend(title='Datasets', fontsize=10)

# Only the accuracy panel gets a fixed 0-1 y-range
axes[0].set_ylim(0, 1)

# Adjust layout and display the plot
plt.tight_layout()
plt.show()