# Loading data and libraries

In [1]:
# Import necessary libraries
import numpy as np 
import pandas as pd 
import os
from glob import glob
from sklearn.neighbors import KNeighborsRegressor

# Number of regression targets; one model is trained per target column
# (the ground-truth loader below selects P, K, Mg and pH).
num_output = 4

In [2]:
#Download dataset

!wget -q https://s3.waw2-1.cloudferro.com/swift/v1/AUTH_afccea586afd4ef3bb11fe37dd1ddfbf/Download_KPLabs_Chellenge/train_data.zip
!wget -q https://s3.waw2-1.cloudferro.com/swift/v1/AUTH_afccea586afd4ef3bb11fe37dd1ddfbf/Download_KPLabs_Chellenge/test_data.zip

In [3]:
# Extracting dataset

!unzip -q train_data.zip
!unzip -q test_data.zip

In [4]:
class SpectralCurveFiltering:
    """
    Collapse a 3D cube into a 1D spectral curve.

    Every band (first axis) is reduced to a single value by applying
    ``merge_function`` over the two remaining axes, so the returned array
    has the shape [CHANNELS_COUNT].
    """

    def __init__(self, merge_function = np.mean):
        # Any reducer that accepts an ``axis`` keyword can be plugged in.
        self.merge_function = merge_function

    def __call__(self, sample: np.ndarray):
        # Axes 1 and 2 are reduced; axis 0 (one entry per band) is kept.
        spatial_axes = (1, 2)
        reduce_band = self.merge_function
        return reduce_band(sample, axis=spatial_axes)

In [5]:
def load_data(directory: str, merge_function=np.sum):
    """Load each cube in a directory and reduce it to a spectral curve.

    Files matching ``*.npz`` are processed in ascending order of the integer
    parsed from their file name, so the rows of the result follow that order.

    Args:
        directory (str): Directory to either train or test set.
        merge_function: Reducer applied to each cube by
            ``SpectralCurveFiltering``; it must accept an ``axis`` keyword.
            Defaults to ``np.sum``.
    Returns:
        np.ndarray: One spectral curve per sample, stacked along axis 0.
    Raises:
        FileNotFoundError: If ``directory`` contains no ``.npz`` files.
        ValueError: If a matching file name is not an integer.
    """
    all_files = sorted(
        glob(os.path.join(directory, "*.npz")),
        key=lambda x: int(os.path.basename(x).replace(".npz", "")),
    )
    if not all_files:
        # Fail here with a clear message instead of returning an empty array
        # that only breaks later, far away from the wrong path.
        raise FileNotFoundError(f"No .npz files found in directory: {directory!r}")

    filtering = SpectralCurveFiltering(merge_function)
    data = []
    for file_name in all_files:
        with np.load(file_name) as npz:
            # The archive's entries are forwarded as MaskedArray keyword
            # arguments (presumably ``data`` and ``mask`` - confirm against
            # the dataset). Built inside the ``with`` so every array is read
            # before the file is closed.
            arr = np.ma.MaskedArray(**npz)
        data.append(filtering(arr))
    return np.array(data)


def load_gt(file_path: str):
    """Read the ground truth file and return the train-set labels.

    Args:
        file_path (str): Path to the ground truth .csv file.
    Returns:
        np.ndarray: 2D array with one row per sample and the columns
            P, K, Mg and pH, in that order.
    """
    target_columns = ["P", "K", "Mg", "pH"]
    ground_truth = pd.read_csv(file_path)
    return ground_truth[target_columns].values


# Resolve the dataset locations from the working directory, which is where
# the archives are downloaded and extracted, instead of hardcoding an
# absolute path that only exists on one platform.
DATA_ROOT = os.getcwd()
TRAIN_DIR = os.path.join(DATA_ROOT, "train_data", "train_data")
TRAIN_GT_PATH = os.path.join(DATA_ROOT, "train_data", "train_gt.csv")
TEST_DIR = os.path.join(DATA_ROOT, "test_data")

X_train = load_data(TRAIN_DIR)
y_train = load_gt(TRAIN_GT_PATH)
X_test = load_data(TEST_DIR)

# Every training sample needs exactly one row of labels.
assert len(X_train) == len(y_train), (
    f"{len(X_train)} training samples but {len(y_train)} label rows"
)

print(f"Train data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

Train data shape: (1732, 150)
Test data shape: (1154, 150)


# Preprocessing

## Scaler

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training curves only, then apply that same fitted
# transform to both splits.
scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Model

In [7]:
def pseudoLabelModels(params):
    """Fit one pseudo-labelled KNN regressor per target and predict the test set.

    This follows a pseudo-labeling strategy, repeated independently for each
    output dimension:

    1. fit the model on the training data for that target,
    2. predict the test data and treat the predictions as pseudo labels,
    3. re-fit the same model on training + test data using the true labels
       and the pseudo labels,
    4. predict the test data again to obtain the final predictions.

    The function reads the module-level ``X_train``, ``y_train``, ``X_test``
    and ``num_output``.

    Args:
        params: Sequence of keyword-argument dicts for ``KNeighborsRegressor``;
            ``params[i]`` configures the model for column ``i`` of ``y_train``.
    Returns:
        list: ``num_output`` arrays with the final test-set predictions, one
            per output dimension, in the same order as ``params``.
    """
    # The stacked feature matrix is identical for every target, so build it
    # once instead of on every loop iteration.
    combined_x = np.vstack((X_train, X_test))

    # List to store predictions for each output dimension
    y = []

    for i in range(num_output):
        # Set model hyperparameters for the i-th output
        model = KNeighborsRegressor(**params[i])

        # Train the model on the i-th output of the training data
        model.fit(X_train, y_train[:, i])

        # Generate pseudo labels for the test data
        pseudo_labels = model.predict(X_test)

        # Combine true labels and pseudo labels (same row order as combined_x)
        combined_y = np.concatenate((y_train[:, i], pseudo_labels))

        # Re-fit the model using the combined dataset
        model.fit(combined_x, combined_y)

        # Final predictions for the test data for the i-th output
        y.append(model.predict(X_test))

    # Return the list of predictions for all output dimensions
    return y

In [8]:
# Defining the parameters for each model

# One hyperparameter set per target, in label order: P, K, Mg, pH.
params_knn = [
    dict(n_neighbors=224, metric='cosine'),     # P
    dict(n_neighbors=55, metric='cosine'),      # K
    dict(n_neighbors=49, metric='cosine'),      # Mg
    dict(n_neighbors=80, metric='chebyshev'),   # pH
]

In [9]:
# Train one pseudo-labelled model per target, each with its own
# hyperparameters, and collect the test-set predictions (one array per target).

y = pseudoLabelModels(params_knn)

# Submission

In [10]:
# Convert the dataframe into the required format for submission
def submission(df, file_name = 'subsmission.csv'):
    """Write a wide prediction table as a long-format submission CSV.

    Each input row yields one output row per non-``ID`` column. The output
    has the columns ``sample_index`` (``"<ID>_<column name>"``) and
    ``Target`` (the cell value), ordered row by row and, within a row, in
    the column order of ``df``.

    The table is built directly from the columns rather than with
    ``DataFrame.iterrows``, which converts every row to a single common
    dtype and would therefore pass integer IDs through float.

    Args:
        df: DataFrame with an ``ID`` column plus one column per target.
        file_name: Destination CSV path. The default keeps its historical
            spelling so existing callers that rely on it are unaffected.
    """
    value_columns = [col_name for col_name in df.columns if col_name != 'ID']
    sample_ids = [int(sample_id) for sample_id in df['ID']]

    # Row-major order: every target of sample 0, then every target of sample 1, ...
    sample_index = [
        f"{sample_id}_{col_name}"
        for sample_id in sample_ids
        for col_name in value_columns
    ]
    # ravel() flattens in row-major index order, matching sample_index above.
    targets = df[value_columns].to_numpy().ravel()

    df_result = pd.DataFrame({'sample_index': sample_index, 'Target': targets})
    df_result.to_csv(file_name, index=False)

In [11]:
# Generate submission file
# Per-target divisors applied to the raw predictions before export
# (presumably normalisation constants for P, K, Mg and pH - TODO confirm
# where these values come from).
target_names = ["P", "K", "Mg", "pH"]
target_divisors = [70.3026558891455, 227.9885103926097, 159.28123556581986, 6.782719399538106]

# One column per target, one row per test sample.
stacked_predictions = np.array([y[0], y[1], y[2], y[3]]).T
df_knn = pd.DataFrame(data=stacked_predictions / target_divisors, columns=target_names)

# Sample IDs are the row positions; put the ID column first.
df_knn['ID'] = range(0, len(df_knn))
df_knn = df_knn[['ID'] + target_names]
submission(df_knn, "final_submission.csv")