In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

def get_dataset(path):
    """Loads the dataset from an Excel file."""
    df = pd.read_excel(path)
    return df

def split_dataset(df, test_size=0.2, random_state=42):
    """
    Splits the dataset into training and testing sets.

    Parameters:
    - df: The complete dataframe.
    - test_size: The proportion of the dataset to include in the test split.
    - random_state: Controls the shuffling applied to the data before splitting.

    Returns:
    - X_train, X_test, y_train, y_test: Split data (features and labels).
    """
    # Assuming 'species' is the label column and the rest are features. Adjust this as necessary.
    X = df.drop(columns=['species'])  # Replace 'species' with the actual label column
    y = df['species']  # Replace 'species' with the actual label column

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

def main():
    # File path to your dataset
    path = '/content/bird_species_features.xlsx'

    # Load the dataset
    df = get_dataset(path)

    # Split the dataset
    X_train, X_test, y_train, y_test = split_dataset(df, test_size=0.2, random_state=42)

    # Display the split data
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_train shape:", y_train.shape)
    print("y_test shape:", y_test.shape)

# Run the main function
if __name__ == "__main__":
    main()


X_train shape: (77, 33)
X_test shape: (20, 33)
y_train shape: (77,)
y_test shape: (20,)


In [12]:
#A7

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def load_dataset(path):
    """Loads the dataset from an Excel file."""
    return pd.read_excel(path)

def prepare_data(df, label_column, drop_columns):
    """
    Prepares the features and labels from the DataFrame.

    Parameters:
    - df: DataFrame containing the data.
    - label_column: Name of the label column.
    - drop_columns: List of columns to drop from the DataFrame (non-features).

    Returns:
    - X: Feature matrix.
    - y: Labels.
    """
    X = df.drop(columns=drop_columns)  # Feature matrix
    y = df[label_column]  # Labels
    return X, y

def split_and_scale_data(X, y, test_size=0.3, random_state=42):
    """
    Splits the data into training and testing sets, and scales the feature matrix.

    Parameters:
    - X: Feature matrix.
    - y: Labels.
    - test_size: Proportion of the data to include in the test split.
    - random_state: Random seed for reproducibility.

    Returns:
    - X_train_scaled: Scaled training features.
    - X_test_scaled: Scaled testing features.
    - y_train: Training labels.
    - y_test: Testing labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Standardize the feature matrix
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

def train_knn_classifier(X_train, y_train, k=3):
    """
    Trains a k-NN classifier.

    Parameters:
    - X_train: Scaled training features.
    - y_train: Training labels.
    - k: Number of neighbors to use for k-NN.

    Returns:
    - knn: Trained k-NN model.
    """
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    return knn

def evaluate_model(knn, X_test, y_test):
    """
    Evaluates the k-NN classifier on the test data.

    Parameters:
    - knn: Trained k-NN model.
    - X_test: Scaled testing features.
    - y_test: Testing labels.

    Returns:
    - y_pred: Predicted labels for the test set.
    - accuracy: Accuracy of the model.
    - conf_matrix: Confusion matrix.
    - class_report: Classification report.
    """
    y_pred = knn.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    return y_pred, accuracy, conf_matrix, class_report

def study_prediction_behavior(y_test, y_pred):
    """
    Analyzes the prediction behavior of the classifier on the test set.

    Parameters:
    - y_test: Actual labels for the test set.
    - y_pred: Predicted labels for the test set.

    Prints:
    - Per-instance comparison of actual vs predicted.
    - Summary of misclassifications.
    """
    correct_predictions = 0
    incorrect_predictions = 0
    misclassified_samples = []

    print("Instance-wise prediction behavior:\n")

    for i in range(len(y_test)):
        actual = y_test.iloc[i]
        predicted = y_pred[i]

        # Track correct vs incorrect predictions
        if actual == predicted:
            correct_predictions += 1
        else:
            incorrect_predictions += 1
            misclassified_samples.append((i, actual, predicted))

        # Print instance-wise comparison
        print(f"Test Sample {i+1}: Actual = {actual}, Predicted = {predicted}")

    # Print summary
    print("\nSummary of prediction behavior:")
    print(f"Total correct predictions: {correct_predictions}")
    print(f"Total incorrect predictions: {incorrect_predictions}")

    # Print misclassified samples
    if misclassified_samples:
        print("\nMisclassified instances:")
        for sample in misclassified_samples:
            print(f"Index: {sample[0]}, Actual: {sample[1]}, Predicted: {sample[2]}")
    else:
        print("No misclassified samples found.")

def main():
    # Load the dataset
    path = '/content/bird_species_features (1).xlsx'
    features_df = load_dataset(path)

    # Prepare the data
    drop_columns = ['filename', 'species']
    X, y = prepare_data(features_df, label_column='species', drop_columns=drop_columns)

    # Split and scale the data
    X_train_scaled, X_test_scaled, y_train, y_test = split_and_scale_data(X, y)

    # Train the k-NN classifier
    knn = train_knn_classifier(X_train_scaled, y_train, k=3)

    # Evaluate the classifier
    y_pred, accuracy, conf_matrix, class_report = evaluate_model(knn, X_test_scaled, y_test)

    # Display evaluation results
    print(f"Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)

    # Study prediction behavior
    study_prediction_behavior(y_test, y_pred)

if __name__ == "__main__":
    main()


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Index: 27325, Actual: Painted Francolin, Predicted: Black Francolin
Index: 27333, Actual: Painted Francolin, Predicted: Black Francolin
Index: 27340, Actual: Northern Shoveler, Predicted: Black Francolin
Index: 27341, Actual: Lesser Whistling Duck, Predicted: Indian Spot-billed Duck
Index: 27342, Actual: Grey Francolin, Predicted: Black Francolin
Index: 27343, Actual: Lesser Whistling Duck, Predicted: Black Francolin
Index: 27348, Actual: Grey Francolin, Predicted: Ruddy Shelduck
Index: 27357, Actual: Lesser Whistling Duck, Predicted: Black Francolin
Index: 27358, Actual: Bar-headed Goose, Predicted: Grey Francolin
Index: 27361, Actual: Indian Spot-billed Duck, Predicted: Black Francolin
Index: 27368, Actual: Indian Spot-billed Duck, Predicted: Garganey
Index: 27369, Actual: Northern Shoveler, Predicted: Black Francolin
Index: 27380, Actual: Painted Francolin, Predicted: Black Francolin
Index: 27384, Actual: Lesser Whistl

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def load_dataset(path):
    """Loads the dataset from an Excel file."""
    return pd.read_excel(path)

def prepare_data(df, label_column, drop_columns):
    """
    Prepares the features and labels from the DataFrame.

    Parameters:
    - df: DataFrame containing the data.
    - label_column: Name of the label column.
    - drop_columns: List of columns to drop from the DataFrame (non-features).

    Returns:
    - X: Feature matrix.
    - y: Labels.
    """
    X = df.drop(columns=drop_columns)  # Feature matrix
    y = df[label_column]  # Labels
    return X, y

def split_and_scale_data(X, y, test_size=0.3, random_state=42):
    """
    Splits the data into training and testing sets, and scales the feature matrix.

    Parameters:
    - X: Feature matrix.
    - y: Labels.
    - test_size: Proportion of the data to include in the test split.
    - random_state: Random seed for reproducibility.

    Returns:
    - X_train_scaled: Scaled training features.
    - X_test_scaled: Scaled testing features.
    - y_train: Training labels.
    - y_test: Testing labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Standardize the feature matrix
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

def train_knn_classifier(X_train, y_train, k=3):
    """
    Trains a k-NN classifier.

    Parameters:
    - X_train: Scaled training features.
    - y_train: Training labels.
    - k: Number of neighbors to use for k-NN.

    Returns:
    - knn: Trained k-NN model.
    """
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    return knn

def evaluate_model(knn, X_test, y_test):
    """
    Evaluates the k-NN classifier on the test data.

    Parameters:
    - knn: Trained k-NN model.
    - X_test: Scaled testing features.
    - y_test: Testing labels.

    Returns:
    - y_pred: Predicted labels for the test set.
    - accuracy: Accuracy of the model.
    - conf_matrix: Confusion matrix.
    - class_report: Classification report.
    """
    y_pred = knn.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    return y_pred, accuracy, conf_matrix, class_report

def study_prediction_behavior(y_test, y_pred):
    """
    Analyzes the prediction behavior of the classifier on the test set.

    Parameters:
    - y_test: Actual labels for the test set.
    - y_pred: Predicted labels for the test set.

    Prints:
    - Per-instance comparison of actual vs predicted.
    - Summary of misclassifications.
    """
    correct_predictions = 0
    incorrect_predictions = 0
    misclassified_samples = []

    print("Instance-wise prediction behavior:\n")

    for i in range(len(y_test)):
        actual = y_test.iloc[i]
        predicted = y_pred[i]

        # Track correct vs incorrect predictions
        if actual == predicted:
            correct_predictions += 1
        else:
            incorrect_predictions += 1
            misclassified_samples.append((i, actual, predicted))

        # Print instance-wise comparison
        print(f"Test Sample {i+1}: Actual = {actual}, Predicted = {predicted}")

    # Print summary
    print("\nSummary of prediction behavior:")
    print(f"Total correct predictions: {correct_predictions}")
    print(f"Total incorrect predictions: {incorrect_predictions}")

    # Print misclassified samples
    if misclassified_samples:
        print("\nMisclassified instances:")
        for sample in misclassified_samples:
            print(f"Index: {sample[0]}, Actual: {sample[1]}, Predicted: {sample[2]}")
    else:
        print("No misclassified samples found.")

def main():
    # Load the dataset
    path = '/content/bird_species_features (1).xlsx'
    features_df = load_dataset(path)

    # Prepare the data
    drop_columns = ['filename', 'species']
    X, y = prepare_data(features_df, label_column='species', drop_columns=drop_columns)

    # Split and scale the data
    X_train_scaled, X_test_scaled, y_train, y_test = split_and_scale_data(X, y)

    # Train the k-NN classifier
    knn = train_knn_classifier(X_train_scaled, y_train, k=3)

    # Evaluate the classifier
    y_pred, accuracy, conf_matrix, class_report = evaluate_model(knn, X_test_scaled, y_test)

    # Display evaluation results
    print(f"Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)

    # Study prediction behavior
    study_prediction_behavior(y_test, y_pred)

if __name__ == "__main__":
    main()


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Index: 27325, Actual: Painted Francolin, Predicted: Black Francolin
Index: 27333, Actual: Painted Francolin, Predicted: Black Francolin
Index: 27340, Actual: Northern Shoveler, Predicted: Black Francolin
Index: 27341, Actual: Lesser Whistling Duck, Predicted: Indian Spot-billed Duck
Index: 27342, Actual: Grey Francolin, Predicted: Black Francolin
Index: 27343, Actual: Lesser Whistling Duck, Predicted: Black Francolin
Index: 27348, Actual: Grey Francolin, Predicted: Ruddy Shelduck
Index: 27357, Actual: Lesser Whistling Duck, Predicted: Black Francolin
Index: 27358, Actual: Bar-headed Goose, Predicted: Grey Francolin
Index: 27361, Actual: Indian Spot-billed Duck, Predicted: Black Francolin
Index: 27368, Actual: Indian Spot-billed Duck, Predicted: Garganey
Index: 27369, Actual: Northern Shoveler, Predicted: Black Francolin
Index: 27380, Actual: Painted Francolin, Predicted: Black Francolin
Index: 27384, Actual: Lesser Whistl

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

def get_dataset(path):
    """Loads the dataset from an Excel file."""
    df = pd.read_excel(path)
    return df

def split_dataset(df, test_size=0.2, random_state=42):
    """
    Splits the dataset into training and testing sets using only two classes.

    Parameters:
    - df: The complete dataframe.
    - test_size: The proportion of the dataset to include in the test split.
    - random_state: Controls the shuffling applied to the data before splitting.

    Returns:
    - X_train, X_test, y_train, y_test: Split data (features and labels).
    """
    # Assuming 'species' is the label column and the rest are features. Adjust this as necessary.
    if 'species' not in df.columns:
        raise ValueError("The column 'species' is not present in the DataFrame.")

    X = df.drop(columns=['species'])  # Replace 'species' with the actual label column if different
    y = df['species']  # Replace 'species' with the actual label column if different

    # Get unique classes
    unique_classes = y.unique()
    print("Unique classes in the label column:", unique_classes)

    # If more than two classes, select two classes randomly
    if len(unique_classes) > 2:
        selected_classes = unique_classes[:2]  # Select the first two classes, or modify to select randomly
        df = df[df['species'].isin(selected_classes)]
        y = df['species']
        X = df.drop(columns=['species'])

    # Print unique values in the label column for debugging
    unique_classes = y.unique()
    print("Filtered unique classes in the label column:", unique_classes)

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

def main():
    # File path to your dataset
    path = '/content/bird_species_features (1).xlsx'

    # Load the dataset
    df = get_dataset(path)

    # Split the dataset
    X_train, X_test, y_train, y_test = split_dataset(df, test_size=0.2, random_state=42)

    # Display the split data
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_train shape:", y_train.shape)
    print("y_test shape:", y_test.shape)

# Run the main function
if __name__ == "__main__":
    main()


Unique classes in the label column: ['Fulvous Whistling Duck' 'Lesser Whistling Duck' 'Bar-headed Goose'
 'Greylag Goose' 'Greater White-fronted Goose' 'Knob-billed Duck'
 'Common Shelduck' 'Ruddy Shelduck' 'Cotton Pygmy Goose' 'Garganey'
 'Northern Shoveler' 'Gadwall' 'Eurasian Wigeon' 'Indian Spot-billed Duck'
 'Mallard' 'Northern Pintail' 'Eurasian Teal' 'Andaman Teal'
 'Common Pochard' 'Common Merganser' 'Snow Partridge' 'Chukar Partridge'
 'Black Francolin' 'Painted Francolin' 'Grey Francolin' 'Swamp Francolin'
 'Common Quail' 'Rain Quail']
Filtered unique classes in the label column: ['Fulvous Whistling Duck' 'Lesser Whistling Duck']
X_train shape: (5861, 17)
X_test shape: (1466, 17)
y_train shape: (5861,)
y_test shape: (1466,)


ValueError: The feature or label data is empty after filtering.

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

def get_dataset(path):
    """Loads the dataset from an Excel file."""
    df = pd.read_excel(path)
    return df

def split_dataset(df, test_size=0.2, random_state=42):
    """
    Splits the dataset into training and testing sets using only two classes.

    Parameters:
    - df: The complete dataframe.
    - test_size: The proportion of the dataset to include in the test split.
    - random_state: Controls the shuffling applied to the data before splitting.

    Returns:
    - X_train, X_test, y_train, y_test: Split data (features and labels).
    """
    # Assuming 'species' is the label column and the rest are features. Adjust this as necessary.
    if 'species' not in df.columns:
        raise ValueError("The column 'species' is not present in the DataFrame.")

    X = df.drop(columns=['species'])  # Replace 'species' with the actual label column if different
    y = df['species']  # Replace 'species' with the actual label column if different

    # Get unique classes
    unique_classes = y.unique()
    print("Unique classes in the label column:", unique_classes)

    # If more than two classes, select two classes randomly
    if len(unique_classes) > 2:
        selected_classes = unique_classes[:2]  # Select the first two classes, or modify to select randomly
        df = df[df['species'].isin(selected_classes)]
        y = df['species']
        X = df.drop(columns=['species'])

    # Print unique values in the label column for debugging
    unique_classes = y.unique()
    print("Filtered unique classes in the label column:", unique_classes)

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

def train_knn_classifier(X_train, y_train, X_test, y_test, k=3):
    """
    Trains a k-NN classifier and evaluates its performance.

    Parameters:
    - X_train: Training features.
    - y_train: Training labels.
    - X_test: Testing features.
    - y_test: Testing labels.
    - k: Number of neighbors for k-NN.

    Returns:
    - None
    """
    # Initialize the k-NN classifier with k=3
    knn = KNeighborsClassifier(n_neighbors=k)

    # Fit the classifier to the training data
    knn.fit(X_train, y_train)

    # Predict on the test set
    y_pred = knn.predict(X_test)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Accuracy of k-NN classifier with k={k}: {accuracy:.2f}")
    print("Classification Report:")
    print(report)

def main():
    # File path to your dataset
    path = '/content/bird_species_features (1).xlsx'

    # Load the dataset
    df = get_dataset(path)

    # Split the dataset
    X_train, X_test, y_train, y_test = split_dataset(df, test_size=0.2, random_state=42)

    # Train and evaluate the k-NN classifier
    train_knn_classifier(X_train, y_train, X_test, y_test, k=3)

# Run the main function
if __name__ == "__main__":
    main()


Unique classes in the label column: ['Fulvous Whistling Duck' 'Lesser Whistling Duck' 'Bar-headed Goose'
 'Greylag Goose' 'Greater White-fronted Goose' 'Knob-billed Duck'
 'Common Shelduck' 'Ruddy Shelduck' 'Cotton Pygmy Goose' 'Garganey'
 'Northern Shoveler' 'Gadwall' 'Eurasian Wigeon' 'Indian Spot-billed Duck'
 'Mallard' 'Northern Pintail' 'Eurasian Teal' 'Andaman Teal'
 'Common Pochard' 'Common Merganser' 'Snow Partridge' 'Chukar Partridge'
 'Black Francolin' 'Painted Francolin' 'Grey Francolin' 'Swamp Francolin'
 'Common Quail' 'Rain Quail']
Filtered unique classes in the label column: ['Fulvous Whistling Duck' 'Lesser Whistling Duck']


ValueError: could not convert string to float: 'D:\\bird_songs\\351852.mp3'

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the features DataFrame from the Excel file
features_df = pd.read_excel('/content/bird_species_features (1).xlsx')

# Prepare features and labels
X = features_df.drop(columns=['filename', 'species'])  # Feature matrix
y = features_df['species']  # Labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the feature matrix
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the k-NN classifier
k = 3  # You can adjust the number of neighbors
knn = KNeighborsClassifier(n_neighbors=k)

# Train the classifier
knn.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.75
Confusion Matrix:
[[   48     2    40     0     0     0     0     0     0     0     0     1
      0     5     1    33     3    10     0     4     0     0     0    12
      3     3     1     8]
 [    3   440   120    10     0     0     0     0     2     0     4     0
      1     5     2    48    12    33     0    10     0     0     1    22
      8    13     2    18]
 [   11    32 10590   105     7     1     4     0     1     0     4     1
      3    32     1   210    15   168     1    39     0     0     3   279
     70    38    15    39]
 [    2     4   160   893     0     0     0     0     1     0     4     0
      4     1     1    11     2     7     0     8     0     0     2    20
      6     5     5     1]
 [    0     4    71     2    49     0     0     0     0     0     1     0
      0     7     0     5     1     6     0     3     0     1     0    11
      6     1     0     2]
 [    0     1    11     0     0    24     0     0     0     0     0     0
      0     2     

  _warn_prf(average, modifier, msg_start, len(result))
