In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
import pickle
import os
from collections import Counter
import random

# Ensure CUDA visibility if using GPU
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '2'


In [25]:
class TripletDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
        self.labels_set = set(labels.numpy())
        self.label_to_indices = {label: np.where(labels.numpy() == label)[0]
                                 for label in self.labels_set}

    def __getitem__(self, index):
        anchor = self.features[index]
        anchor_label = self.labels[index].item()
        positive_index = index
        while positive_index == index:
            positive_index = random.choice(self.label_to_indices[anchor_label])
        negative_label = random.choice(list(self.labels_set - {anchor_label}))
        negative_index = random.choice(self.label_to_indices[negative_label])
        positive = self.features[positive_index]
        negative = self.features[negative_index]
        return anchor, positive, negative

    def __len__(self):
        return len(self.features)


In [26]:
all_blocks = []
all_labels = []
all_files_path = glob('merged_data_pkl/*')

for path in tqdm(all_files_path):
    with open(path, 'rb') as file:
        data_dict = pickle.load(file)
        all_blocks.append(data_dict['data'])
        all_labels.append(data_dict['label'])

X = np.array(all_blocks).reshape((-1, 1, 32, 250))
Y = np.array(all_labels)

tensor_X = torch.tensor(X).float()  # Ensure tensor is float for input to model
tensor_Y = torch.tensor(Y).long()

dataset = TripletDataset(tensor_X, tensor_Y)
train_loader = DataLoader(dataset, batch_size=64, shuffle=True)


100%|██████████| 8400/8400 [00:00<00:00, 14618.95it/s]


In [40]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import os
import numpy as np
from collections import Counter
import random

class EmbeddingCNN(nn.Module):
    def __init__(self):
        super(EmbeddingCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=(3, 3), padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=(3, 3), padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=(3, 3), padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * 4 * 31, 128)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = self.flatten(x)
        x = self.fc1(x)
        return x

class TripletLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative):
        distance_positive = (anchor - positive).pow(2).sum(1)
        distance_negative = (anchor - negative).pow(2).sum(1)
        losses = F.relu(distance_positive - distance_negative + self.margin)
        return losses.mean()

def train_model(model, train_loader, optimizer, num_epochs, device):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for anchor, positive, negative in train_loader:
            anchor, positive, negative = anchor.to(device), positive.to(device), negative.to(device)
            optimizer.zero_grad()
            anchor_emb = model(anchor)
            positive_emb = model(positive)
            negative_emb = model(negative)
            loss = triplet_loss(anchor_emb, positive_emb, negative_emb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}: Loss {total_loss / len(train_loader)}')

# Set up device, model, optimizer, and loss
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EmbeddingCNN().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
triplet_loss = TripletLoss()

# Assuming train_loader is defined and set up properly
train_model(model, train_loader, optimizer, 20, device)


Epoch 1: Loss 13344.880397004741
Epoch 2: Loss 5.146810209751129
Epoch 3: Loss 1.67455412092663
Epoch 4: Loss 1.1888260994638715
Epoch 5: Loss 1.260702571414766
Epoch 6: Loss 1.1301394672620864
Epoch 7: Loss 1.1509428268387205
Epoch 8: Loss 1.0799174700464522
Epoch 9: Loss 1.0306017052559626
Epoch 10: Loss 1.024756306126004
Epoch 11: Loss 1.0249864481744313
Epoch 12: Loss 1.0417901958738054
Epoch 13: Loss 1.0278615037600198
Epoch 14: Loss 1.0094020026070731
Epoch 15: Loss 1.0078024574688502
Epoch 16: Loss 1.0125827397618974
Epoch 17: Loss 1.0556970210302443
Epoch 18: Loss 1.3372499465942382
Epoch 19: Loss 1.0589542922519501
Epoch 20: Loss 1.0601246300197782


In [41]:
test_data = pd.read_excel('/home/project/new/Test.xlsx')

In [42]:
import numpy as np
import pandas as pd
import torch

# Assuming `test_data` is already loaded and is a Pandas DataFrame
all_test_blocks = []
block_height = 32
num_columns_in_test = 750  # Columns per test block

# Iterate over the dataset in chunks of 32 rows
for start_row in range(0, test_data.shape[0], block_height):
    if start_row + block_height > test_data.shape[0]:
        continue  # Skip incomplete blocks
    chunk = test_data.iloc[start_row:start_row + block_height]

    # Split each 750-column block into three 250-column blocks
    for i in range(3):  # Since 750/250 = 3
        start_col = i * 250
        end_col = start_col + 250
        block = chunk.iloc[:, start_col:end_col]
        all_test_blocks.append(block.values.reshape(1, block_height, 250))  # Adding channel dimension

# Convert blocks to PyTorch tensors
X_test_blocks = torch.tensor(all_test_blocks, dtype=torch.float)


  X_test_blocks = torch.tensor(all_test_blocks, dtype=torch.float)


In [43]:
X_test_blocks = X_test_blocks.to(device)  # Move the test data to the same device as the model
with torch.no_grad():  # Ensure no gradients are computed during inference
    predictions = model(X_test_blocks)
    predicted_classes = torch.argmax(predictions, dim=1).cpu().numpy()  # Move predictions back to CPU and convert to numpy


In [44]:
import pandas as pd

# `predicted_classes` is the numpy array containing the class indices predicted by the model
predictions_df = pd.DataFrame(predicted_classes, columns=['PredictedClass'])


In [45]:
# Specify the path and name of the CSV file
csv_file_path = './predictions.csv'

# Save the DataFrame to a CSV file without the index column
predictions_df.to_csv(csv_file_path, index=False)


In [46]:
label_map = {
    0: "Tiger",
    1: "Snake",
    2: "Wolf",
    3: "Bear",
    4: "Rabbit",
    5: "Monkey",
    6: "Eagle",
    7: "Dolphin",
    8: "Koala"
}


In [47]:
import pandas as pd

# Assuming `predicted_classes` is already available as a numpy array
predictions_df = pd.DataFrame(predicted_classes, columns=['PredictedClass'])

# Replace numeric labels with animal names using the mapping
predictions_df['PredictedClass'] = predictions_df['PredictedClass'].replace(label_map)
# Specify the path and name of the CSV file
csv_file_path = './animal_predictions.csv'

# Save the DataFrame to a CSV file without the index column
predictions_df.to_csv(csv_file_path, index=False)
import pandas as pd
import numpy as np
from collections import Counter
import random

# Load predictions
predictions_df = pd.read_csv('./animal_predictions.csv')

# Function to determine the majority or random in case of a tie
def majority_or_random(labels):
    count = Counter(labels)
    max_freq = max(count.values())
    candidates = [label for label, freq in count.items() if freq == max_freq]
    return random.choice(candidates)

# Group predictions in groups of 3 and apply the function
grouped_labels = []
for i in range(0, len(predictions_df), 3):
    labels = predictions_df['PredictedClass'][i:i+3].tolist()
    if len(labels) == 3:  # Ensure it's a full group of 3
        grouped_label = majority_or_random(labels)
        grouped_labels.append(grouped_label)

# Create new DataFrame
final_predictions_df = pd.DataFrame(grouped_labels, columns=['MajorityVotedClass'])
# Save the DataFrame to a CSV file
final_csv_file_path = './final_animal_predictions.csv'
final_predictions_df.to_csv(final_csv_file_path, index=False)

import pandas as pd

# Load the CSV file
file_path = './final_animal_predictions.csv'
predictions_df = pd.read_csv(file_path)
# Generate the 'ID' column
predictions_df['ID'] = ['id_' + str(index + 1) for index in predictions_df.index]
# Reorder columns to make 'ID' the first column
column_order = ['ID', 'MajorityVotedClass']
predictions_df = predictions_df[column_order]

# Save the updated DataFrame to a new CSV file
updated_csv_file_path = './updated_final_animal_predictions.csv'
predictions_df.to_csv(updated_csv_file_path, index=False)


In [31]:
def validate_embeddings(model, val_loader, device):
    model.eval()
    embeddings = []
    labels = []

    with torch.no_grad():
        for anchor, positive, negative in val_loader:
            anchor = anchor.to(device)
            emb = model(anchor)
            embeddings.append(emb)
            labels.append(anchor[1])  # Assuming you pass labels in your dataset loader

    # Convert lists to tensors
    embeddings = torch.cat(embeddings)
    labels = torch.cat(labels)

    # Here we use a simple nearest neighbors approach on the embeddings
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import accuracy_score

    classifier = KNeighborsClassifier(n_neighbors=1)
    embeddings = embeddings.cpu().numpy()  # Convert embeddings to numpy array
    labels = labels.cpu().numpy()
    classifier.fit(embeddings, labels)  # Train KNN on embeddings

    # Validation on the same embeddings (for demonstration, split or use separate data ideally)
    predicted_labels = classifier.predict(embeddings)
    accuracy = accuracy_score(labels, predicted_labels)

    model.train()  # Set model back to training mode
    return accuracy


In [35]:
from torch.utils.data import DataLoader, Dataset, Subset, random_split


In [36]:
from sklearn.model_selection import train_test_split

# Split indices for training and validation
indices = list(range(len(tensor_X)))  # Assuming tensor_X is your full dataset tensor
train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=42)

# Create training and validation subsets using these indices
train_subset = torch.utils.data.Subset(dataset, train_indices)
val_subset = torch.utils.data.Subset(dataset, val_indices)

# Create DataLoaders for train and validation subsets
train_loader = DataLoader(train_subset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=64, shuffle=False)


In [38]:
def validate_embeddings(model, val_loader, device):
    model.eval()
    embeddings = []
    labels = []

    with torch.no_grad():
        for anchor, _, _ in val_loader:  # Adjusted to process anchor only
            anchor = anchor.to(device)
            emb = model(anchor)
            embeddings.append(emb)
            # Assuming labels are also passed in the loader and correct
            # If not, you'll need to adjust how labels are passed or handled

    # Convert list of tensors to a single tensor
    embeddings = torch.cat(embeddings, dim=0)  # Ensure correct dimension concatenation
    # Assume labels are collected somehow, if not, adjust accordingly

    # Since we're using embeddings directly, ensure they're in the right shape for scikit-learn
    embeddings = embeddings.view(embeddings.size(0), -1).cpu().numpy()  # Flatten embeddings and move to CPU and numpy
    labels = labels.cpu().numpy()  # Make sure labels are numpy array

    # Use KNN to validate
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import accuracy_score

    classifier = KNeighborsClassifier(n_neighbors=1)
    classifier.fit(embeddings, labels)  # Train KNN on embeddings

    predicted_labels = classifier.predict(embeddings)
    accuracy = accuracy_score(labels, predicted_labels)
    model.train()  # Set model back to train mode
    return accuracy


In [39]:
    for anchor, _, _ in val_loader:
        anchor = anchor.to(device)
        emb = model(anchor)
        embeddings.append(emb)
        # You need to adjust here to collect labels if not being done so already


NameError: name 'embeddings' is not defined

In [37]:
# Assuming tensor_X and tensor_Y are already defined as your dataset
dataset = TripletDataset(tensor_X, tensor_Y)

# Split the dataset into training and validation
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders for training and validation
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the model, optimizer, and loss function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EmbeddingCNN().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
triplet_loss = TripletLoss()

# Training the model
def train_model(model, train_loader, val_loader, optimizer, num_epochs, device):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for anchor, positive, negative in train_loader:
            anchor, positive, negative = anchor.to(device), positive.to(device), negative.to(device)
            optimizer.zero_grad()
            anchor_emb = model(anchor)
            positive_emb = model(positive)
            negative_emb = model(negative)
            loss = triplet_loss(anchor_emb, positive_emb, negative_emb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        val_accuracy = validate_embeddings(model, val_loader, device)
        print(f'Epoch {epoch+1}: Loss {total_loss / len(train_loader)} - Val Accuracy: {val_accuracy * 100:.2f}%')

train_model(model, train_loader, val_loader, optimizer, 10, device)

ValueError: Found array with dim 3. KNeighborsClassifier expected <= 2.