In [None]:
import argparse
import os
!pip install kaggle
from PIL import Image
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset
import numpy as np
import pickle
import sys
import matplotlib.pyplot as plt

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from google.colab import drive

In [None]:
class LogisticRegressionModel:
    """Small wrapper giving ``LogisticRegression`` a train/predict/evaluate API.

    The fitted estimator is kept on ``self.model``.
    """

    def __init__(self, C=1.0, max_iter=100, random_state=42):
        """Create the wrapped estimator.

        Args:
            C: Forwarded to ``LogisticRegression`` as ``C``.
            max_iter: Forwarded to ``LogisticRegression`` as ``max_iter``.
            random_state: Forwarded to ``LogisticRegression`` as ``random_state``.
        """
        self.model = LogisticRegression(
            C=C,
            max_iter=max_iter,
            random_state=random_state,
        )

    def train(self, X_train, y_train):
        """Fit the wrapped estimator on ``X_train`` / ``y_train``."""
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped estimator's predictions for ``X_test``."""
        predicted = self.model.predict(X_test)
        return predicted

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test`` and return ``accuracy_score`` against ``y_test``."""
        return accuracy_score(y_test, self.model.predict(X_test))

    def get_params(self):
        """Return the wrapped estimator's ``get_params()`` result."""
        estimator_params = self.model.get_params()
        return estimator_params




class RandomForestModel:
    """Small wrapper giving ``RandomForestClassifier`` a train/predict/evaluate API.

    The fitted estimator is kept on ``self.model``.
    """

    def __init__(self, n_estimators=100, max_depth=None, random_state=42):
        """Create the wrapped estimator.

        Args:
            n_estimators: Forwarded to ``RandomForestClassifier`` as ``n_estimators``.
            max_depth: Forwarded to ``RandomForestClassifier`` as ``max_depth``.
            random_state: Forwarded to ``RandomForestClassifier`` as ``random_state``.
        """
        self.model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=random_state,
        )

    def train(self, X_train, y_train):
        """Fit the wrapped estimator on ``X_train`` / ``y_train``."""
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped estimator's predictions for ``X_test``."""
        predicted = self.model.predict(X_test)
        return predicted

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test`` and return ``accuracy_score`` against ``y_test``."""
        return accuracy_score(y_test, self.model.predict(X_test))

    def get_params(self):
        """Return the wrapped estimator's ``get_params()`` result."""
        estimator_params = self.model.get_params()
        return estimator_params




class SVMModel:
    """Small wrapper giving ``SVC`` a train/predict/evaluate API.

    The fitted estimator is kept on ``self.model``.
    """

    def __init__(self, kernel='rbf', C=1.0, gamma='scale', random_state=42):
        """Create the wrapped estimator.

        Args:
            kernel: Forwarded to ``SVC`` as ``kernel``.
            C: Forwarded to ``SVC`` as ``C``.
            gamma: Forwarded to ``SVC`` as ``gamma``.
            random_state: Forwarded to ``SVC`` as ``random_state``.
        """
        self.model = SVC(
            kernel=kernel,
            C=C,
            gamma=gamma,
            random_state=random_state,
        )

    def train(self, X_train, y_train):
        """Fit the wrapped estimator on ``X_train`` / ``y_train``."""
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped estimator's predictions for ``X_test``."""
        predicted = self.model.predict(X_test)
        return predicted

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test`` and return ``accuracy_score`` against ``y_test``."""
        return accuracy_score(y_test, self.model.predict(X_test))

    def get_params(self):
        """Return the wrapped estimator's ``get_params()`` result."""
        estimator_params = self.model.get_params()
        return estimator_params


In [None]:
# code for importing and processing the imported data directly from Kaggle
drive.mount('/content/drive')
os.environ["KAGGLE_CONFIG_DIR"] = "/content/drive/MyDrive/kaggle"
!kaggle competitions download -c vub-ml-project-2024-animal-classification
!mkdir -p /content/drive/MyDrive/kaggle
!unzip vub-ml-project-2024-animal-classification.zip -d /content/

In [None]:

# Load the pre-computed VGG training features, print a quick sanity check of
# their structure, and show the label distribution.

# The starter kit directory provides the `features` module used below; it has
# to be on sys.path before it can be imported (and before unpickling objects
# that reference it).
starterskit_dir = '/content/vub-ml-project-2024-animal-classification/starterskit/'
if starterskit_dir not in sys.path:  # avoid stacking duplicate entries when the cell is re-run
    sys.path.append(starterskit_dir)
import features

# Path to your feature file
feature_file_path = '/content/vub-ml-project-2024-animal-classification/starterskit/features/train/train_features_vgg.pkl'

# Load the pickle file.
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load feature files that come from a trusted source.
with open(feature_file_path, 'rb') as f:
    features_data = pickle.load(f)

# Check the type of the loaded data
print(type(features_data))

# Check the number of feature vectors in the list
print(f"Number of features: {len(features_data)}")

# Inspect the first feature vector
first_feature = features_data[0]
print(f"Type of the first feature: {type(first_feature)}")
print(f"First feature content: {first_feature}")
if isinstance(first_feature, features.ImageFeatures):
    print(f"Label of the first feature: {first_feature.label}")

# Check the shape of a feature vector
first_feature_data = first_feature.data
print(f"Shape of the first feature vector: {first_feature_data.shape}")

# Preview up to five examples; slicing (instead of indexing 0..4) avoids an
# IndexError when fewer than five features were loaded.
for i, feature in enumerate(features_data[:5]):
    print(f"Feature {i}: Shape: {feature.data.shape}, Label: {feature.label}")

# Extract labels
labels = [feature.label for feature in features_data]
label_df = pd.Series(labels)
label_counts = label_df.value_counts()  # computed once, reused for print and plot

# Print the distribution of labels
print("Label distribution:")
print(label_counts)

# Plot the distribution of labels
ax = label_counts.plot(kind='bar', figsize=(12, 6))
ax.set_title('Label Distribution')
ax.set_xlabel('Label')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
from collections import Counter

# Collect the shape of every feature's data array.
feature_shapes = [item.data.shape for item in features_data]

# Tally how often each shape occurs, and keep the distinct shapes as a set.
shape_counts = Counter(feature_shapes)
unique_shapes = set(feature_shapes)

print(f"Unique feature shapes: {unique_shapes}")
print(f"Shape counts: {shape_counts}")


In [None]:
# Zero-pad every feature matrix to a common (max_height, max_width) size and
# flatten each padded matrix into one row of X_padded.

def _pad_and_flatten(matrix, height, width):
    """Pad ``matrix`` with zeros at the end of both axes up to (height, width), then flatten it."""
    extra_rows = height - matrix.shape[0]
    extra_cols = width - matrix.shape[1]
    padded = np.pad(matrix,
                    ((0, extra_rows), (0, extra_cols)),
                    mode='constant', constant_values=0)
    return padded.flatten()

# Largest extent along each of the two axes across all features.
max_height = max(item.data.shape[0] for item in features_data)
max_width = max(item.data.shape[1] for item in features_data)

# One flattened, padded row per feature.
X_padded = np.array([_pad_and_flatten(item.data, max_height, max_width)
                     for item in features_data])

# Expected to be (num_samples, max_height * max_width).
print(f"Feature matrix shape: {X_padded.shape}")


In [None]:
# Assemble the modelling inputs. X is the zero-padded, flattened feature matrix
# (no scaling step is applied in the cells visible here); y holds one label
# per feature.
X = X_padded
y = np.array([item.label for item in features_data])

# Hold out 20% of the samples for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Pick the model wrapper to use and fit it on the training split.
model = RandomForestModel(n_estimators=100, max_depth=10)
model.train(X_train, y_train)

# Report accuracy on the held-out split.
accuracy = model.evaluate(X_test, y_test)
print(f"Model Accuracy: {accuracy:.4f}")
