In [1]:
# GENERAL UTILITIES
import os
import pywt
import numpy as np
import pandas as pd
from glob import glob
import seaborn as sns
from  tqdm.notebook import tqdm
from matplotlib import pyplot as plt
%matplotlib inline

# MODEL DEVELOPMENT DEPENDENCIES


from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing



In [2]:
!wget "https://s3.waw2-1.cloudferro.com/swift/v1/AUTH_afccea586afd4ef3bb11fe37dd1ddfbf/Download_KPLabs_Chellenge/test_data.zip"
!wget "https://s3.waw2-1.cloudferro.com/swift/v1/AUTH_afccea586afd4ef3bb11fe37dd1ddfbf/Download_KPLabs_Chellenge/train_data.zip"
!unzip test_data.zip
!unzip train_data.zip

--2023-11-09 19:08:47--  https://s3.waw2-1.cloudferro.com/swift/v1/AUTH_afccea586afd4ef3bb11fe37dd1ddfbf/Download_KPLabs_Chellenge/test_data.zip
Resolving s3.waw2-1.cloudferro.com (s3.waw2-1.cloudferro.com)... 185.48.234.249
Connecting to s3.waw2-1.cloudferro.com (s3.waw2-1.cloudferro.com)|185.48.234.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1314110621 (1.2G) [application/zip]
Saving to: ‘test_data.zip’


2023-11-09 19:09:28 (30.8 MB/s) - ‘test_data.zip’ saved [1314110621/1314110621]

--2023-11-09 19:09:29--  https://s3.waw2-1.cloudferro.com/swift/v1/AUTH_afccea586afd4ef3bb11fe37dd1ddfbf/Download_KPLabs_Chellenge/train_data.zip
Resolving s3.waw2-1.cloudferro.com (s3.waw2-1.cloudferro.com)... 185.48.234.249
Connecting to s3.waw2-1.cloudferro.com (s3.waw2-1.cloudferro.com)|185.48.234.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1970208481 (1.8G) [application/zip]
Saving to: ‘train_data.zip’


2023-11-09 19:10:31 (30.8 M

In [3]:
# Notebook-wide configuration values.

# When True, load_data keeps only the first 100 files (and labels) for a quick run.
DEBUG = False

# Augmentation multipliers: the KNN value is passed to load_data as the number of
# augmentation passes, the RF value sizes the slice of augmented samples used later.
AUGMENT_CONSTANT_RF = 1
AUGMENT_CONSTANT_KNN = 1

# Ground-truth columns, in the order they are read from the CSV.
LABEL_NAMES = ["P", "K", "Mg", "pH"]

# Per-label divisors applied in load_gt. All ones, i.e. labels stay in their raw units.
# Previously tried scaling: np.array([325.0, 625.0, 400.0, 7.8])
# Previously tried baseline:
#   Y_BASE_FACT = np.array([121764.2 / 1731.0, 394876.1 / 1731.0, 275875.1 / 1731.0, 11747.67 / 1731.0]) / LABEL_MAXS
LABEL_MAXS = np.ones(len(LABEL_NAMES), dtype=int)

# Indices of the label columns selected for modelling (all four).
COL_IX = list(range(len(LABEL_NAMES)))

In [4]:
def load_data(directory: str, gt_file_path: str, is_train=True, augment_constant: int = 0):
    """Load every ``*.npz`` cube of a data set and, for training, build augmented crops.

    Args:
        directory (str): Directory holding the ``<integer>.npz`` files of the train or test set.
        gt_file_path (str): Path to the ground-truth CSV file; only read when ``is_train`` is True.
        is_train (boolean): True to also load labels and create augmented samples,
            False to load the cubes only (test set).
        augment_constant (int): Number of augmentation passes. Each pass adds, for every field,
            one randomly positioned square crop with a small amount of uniform noise.
    Returns:
        tuple: For training data ``(datalist, masklist, labels, aug_datalist, aug_masklist,
            aug_labels)``, where the first two are lists of the raw ``data`` / ``mask`` arrays,
            ``labels`` is the array returned by ``load_gt`` and ``aug_labels`` is a numpy array
            of slightly perturbed labels (one row per augmented sample).
            For test data ``(datalist, masklist)``.

    Note:
        When the module-level ``DEBUG`` flag is set, only the first 100 files (and labels)
        are used.
    """

    datalist = []
    masklist = []
    aug_datalist = []
    aug_masklist = []
    aug_labellist = []

    if is_train:
        labels = load_gt(gt_file_path)

    # File names are "<integer>.npz"; sort numerically (not lexicographically) because the
    # augmentation step below pairs the file at position idx with labels[idx].
    all_files = sorted(
        glob(os.path.join(directory, "*.npz")),
        key=lambda path: int(os.path.basename(path).replace(".npz", "")),
    )

    if DEBUG:
        all_files = all_files[:100]
        if is_train:
            labels = labels[:100]

    split_name = "training" if is_train else "test"
    for file_name in tqdm(all_files, total=len(all_files),
                          desc="Loading {} data ..".format(split_name)):
        # We load the data into memory as provided in the example notebook of the challenge
        with np.load(file_name) as npz:
            datalist.append(npz["data"])
            masklist.append(npz["mask"])

    # For training data we pre-augment by adding randomly cropped, slightly noisy samples.
    # The cubes are already in memory, so they are reused instead of being read from disk again.
    # NOTE: an earlier fixed-size 11x11 crop stage iterated over range(0) and therefore never
    # produced a crop; only the square crop below was ever active, so only it is implemented.
    if is_train:
        for aug_pass in range(augment_constant):
            for idx, (data, mask) in tqdm(enumerate(zip(datalist, masklist)),
                                          total=len(datalist),
                                          desc="Loading augmentation {} ..".format(aug_pass + 1)):
                n_bands = data.shape[0]
                height, width = data.shape[1:]
                # Square crop whose side is the shorter spatial edge: the crop spans the short
                # axis completely and is placed at a random offset along the longer one.
                edge = min(height, width)
                peak = np.max(data, keepdims=True)  # noise is scaled by the cube's maximum

                x = np.random.randint(height + 1 - edge)
                y = np.random.randint(width + 1 - edge)
                window = (slice(None), slice(x, x + edge), slice(y, y + edge))

                aug_data = (data[window]
                            + np.random.uniform(-0.001, 0.001, (n_bands, edge, edge)) * peak)
                # randint's upper bound is exclusive, so this array is all zeros and the OR
                # leaves the mask values unchanged; it is kept so the augmented mask remains a
                # new array with the same dtype as before rather than a view of `mask`.
                aug_mask = mask[window] | np.random.randint(0, 1, (n_bands, edge, edge))

                aug_datalist.append(aug_data)
                aug_masklist.append(aug_mask)
                # Perturb each label by at most +-0.1 % of its value.
                aug_labellist.append(
                    labels[idx, :]
                    + labels[idx, :] * np.random.uniform(-0.001, 0.001, labels.shape[1])
                )

    # Augmented outputs exist only for training data.
    if is_train:
        return (datalist,
                masklist,
                labels,
                aug_datalist,
                aug_masklist,
                np.array(aug_labellist))
    else:
        return datalist, masklist


def load_gt(file_path: str):
    """Read the soil-parameter labels of the train set from the ground-truth CSV.

    Args:
        file_path (str): Location of the ground-truth .csv file.
    Returns:
        numpy.ndarray: 2D array with one row per CSV record and the columns
            P, K, Mg and pH (in that order), each divided element-wise by the
            matching entry of the module-level ``LABEL_MAXS``.
    """
    target_columns = ["P", "K", "Mg", "pH"]
    ground_truth = pd.read_csv(file_path)
    raw_values = ground_truth[target_columns].values

    # Scale every column by its LABEL_MAXS divisor.
    return raw_values / LABEL_MAXS

In [5]:
# Dataset locations: adjust these paths so they match the layout on your own system.
train_data_dir = '/kaggle/working/train_data/train_data'
test_data_dir = '/kaggle/working/test_data'
gt_data_path = '/kaggle/working/train_data/train_gt.csv'

# Raw training cubes, masks and labels, plus their augmented counterparts.
(X_train, M_train, y_train,
 X_aug_train, M_aug_train, y_aug_train) = load_data(
    train_data_dir,
    gt_data_path,
    is_train=True,
    augment_constant=AUGMENT_CONSTANT_KNN,
)

# Raw test cubes and masks (no labels, no augmentation).
X_test, M_test = load_data(test_data_dir, gt_file_path=None, is_train=False)

print(f"Train data size: {len(X_train)}")
print(f"Train aug data size: {len(X_aug_train)}")
print(f"Test data size: {len(X_test)}")

Loading training data ..:   0%|          | 0/1732 [00:00<?, ?it/s]

Loading augmentation 1 ..:   0%|          | 0/1732 [00:00<?, ?it/s]

Loading test data ..:   0%|          | 0/1154 [00:00<?, ?it/s]

Train data size: 1732
Train aug data size: 1732
Test data size: 1154


In [6]:
def preprocess(data_list, mask_list, is_for_KNN=False):
    """Extract hand-crafted features from the raw field cubes.

    Args:
        data_list: Sequence of data cubes, one per field, indexed as (band, row, column).
        mask_list: Sequence of masks matching ``data_list``; non-zero / True entries mark
            pixels that are excluded from the computations.
        is_for_KNN: Binary flag selecting the feature set: the KNN combination (TRUE)
            or the Random Forest combination (FALSE).
    Returns:
        tuple: ``(features, average_edge)`` as numpy arrays. ``features`` holds one feature
            vector per field; ``average_edge`` holds the mean of each field's two spatial
            edge lengths (measured before padding), intended for performance analysis.
    """

    def _shape_pad(data):
        # Pad the two spatial axes (by wrapping) so the field becomes square.
        # Not mandatory but eliminates the risk of calculation error in singular value
        # decomposition; padding by wrapping also improves the performance slightly.
        # The target size is taken from the argument itself, not from an enclosing variable.
        max_edge = np.max(data.shape[1:])
        pad_width = ((0, 0), (0, max_edge - data.shape[1]), (0, max_edge - data.shape[2]))
        return np.pad(data, pad_width, "wrap")

    def _wavelet_approximations(curve):
        # Four cascaded single-level DWTs per wavelet; only the approximation coefficients
        # are used. Returns (sym3 approximations, dmey approximations).
        # The "dmey" slices trim the coefficients at each level to fixed windows.
        cA0, _ = pywt.dwt(curve, wavelet=w2, mode="constant")
        cAx, _ = pywt.dwt(cA0[12:92], wavelet=w2, mode="constant")
        cAy, _ = pywt.dwt(cAx[15:55], wavelet=w2, mode="constant")
        cAz, _ = pywt.dwt(cAy[15:35], wavelet=w2, mode="constant")
        approx_w2 = np.concatenate((cA0[12:92], cAx[15:55], cAy[15:35], cAz[15:25]), -1)

        # The "sym3" cascade drops the first and last coefficient before each next level.
        cA0, _ = pywt.dwt(curve, wavelet=w1, mode="constant")
        cAx, _ = pywt.dwt(cA0[1:-1], wavelet=w1, mode="constant")
        cAy, _ = pywt.dwt(cAx[1:-1], wavelet=w1, mode="constant")
        cAz, _ = pywt.dwt(cAy[1:-1], wavelet=w1, mode="constant")
        approx_w1 = np.concatenate((cA0, cAx, cAy, cAz), -1)

        return approx_w1, approx_w2

    filtering = SpectralCurveFiltering()
    w1 = pywt.Wavelet("sym3")
    w2 = pywt.Wavelet("dmey")

    processed_data = []
    average_edge = []

    for data, mask in tqdm(
        zip(data_list, mask_list),
        total=len(data_list),
        position=0,
        leave=True,
        desc="INFO: Preprocessing data ...",
    ):
        data = data / 2210   # max-max=5419 mean-max=2210
        # Zero out the masked pixels before the singular value decomposition.
        image = data * (1 - mask.astype(int))

        average_edge.append((image.shape[1] + image.shape[2]) / 2)
        image = _shape_pad(image)

        # Singular values of every band image; the five leading ones become features.
        s = np.linalg.svd(image, full_matrices=False, compute_uv=False)
        s0 = s[:, 0]
        s1 = s[:, 1]
        s2 = s[:, 2]
        s3 = s[:, 3]
        s4 = s[:, 4]

        # Spectral curve: aggregate over the unmasked pixels of every band.
        arr = filtering(np.ma.MaskedArray(data, mask))

        # First, second and third derivative of the curve along the band axis.
        dXdl = np.gradient(arr, axis=0)
        d2Xdl2 = np.gradient(dXdl, axis=0)
        d3Xdl3 = np.gradient(d2Xdl2, axis=0)

        fft = np.fft.fft(arr)
        real = np.real(fft)
        imag = np.imag(fft)

        if is_for_KNN:
            # The best Feature combination for KNN based regression
            features = np.concatenate(
                [arr, dXdl, d2Xdl2, d3Xdl3, s0, s1, s2, s3, s4, real, imag],
                -1,
            )
        else:
            # Ratio of the two leading singular values; eps guards against division by zero.
            dXds1 = s0 / (s1 + np.finfo(float).eps)

            ffts = np.fft.fft(s0)
            reals = np.real(ffts)
            imags = np.imag(ffts)

            cAw1, cAw2 = _wavelet_approximations(arr)

            # The best Feature combination for Random Forest based regression
            features = np.concatenate(
                [
                    arr,
                    dXdl,
                    d2Xdl2,
                    d3Xdl3,
                    dXds1,
                    s0,
                    s1,
                    s2,
                    s3,
                    s4,
                    real,
                    imag,
                    reals,
                    imags,
                    cAw1,
                    cAw2,
                ],
                -1,
            )

        processed_data.append(features)

    return np.array(processed_data), np.array(average_edge)



class SpectralCurveFiltering: # Default class provided by the challenge organizers
    """
    Reduce a 3D cube to a spectral curve: every band (first axis) is collapsed
    to a single value by applying the merge_function over the two remaining
    axes. The result therefore has the shape [CHANNELS_COUNT].
    """

    def __init__(self, merge_function=np.mean):
        # Callable used as merge_function(sample, axis=(1, 2)).
        self.merge_function = merge_function

    def __call__(self, sample: np.ndarray):
        spatial_axes = (1, 2)
        aggregate = self.merge_function
        return aggregate(sample, axis=spatial_axes)
    

In [7]:
# Random Forest feature set for the raw training fields, the augmented fields and the test fields.
# Only the first len(X_train) * AUGMENT_CONSTANT_RF augmented samples are used.
X_tr_processed_RF, avg_edge_train = preprocess(X_train, M_train, is_for_KNN=False)
X_aug_processed_RF, avg_edge_train_aug_RF = preprocess(
    X_aug_train[:len(X_train) * AUGMENT_CONSTANT_RF],
    M_aug_train[:len(X_train) * AUGMENT_CONSTANT_RF],
    is_for_KNN=False,
)
X_te_processed_RF, avg_edge_test = preprocess(X_test, M_test, is_for_KNN=False)

# KNN feature set (currently disabled):
#X_tr_processed_KNN, avg_edge_train = preprocess(X_train, M_train, is_for_KNN=True)
#X_aug_processed_KNN, avg_edge_train_aug_KNN = preprocess(X_aug_train, M_aug_train,is_for_KNN=True)
#X_te_processed_KNN, avg_edge_test = preprocess(X_test, M_test, is_for_KNN=True)


INFO: Preprocessing data ...:   0%|          | 0/1732 [00:00<?, ?it/s]

INFO: Preprocessing data ...:   0%|          | 0/1732 [00:00<?, ?it/s]

INFO: Preprocessing data ...:   0%|          | 0/1154 [00:00<?, ?it/s]

In [20]:
# Keep only the label columns listed in COL_IX, for the original and the augmented targets.
# The augmented targets are limited to len(y_train_col) * AUGMENT_CONSTANT_RF rows.
y_train_col = y_train[:, COL_IX]
y_aug_train_col = y_aug_train[:len(y_train_col) * AUGMENT_CONSTANT_RF][:, COL_IX]
y_aug_train_col.shape

(1732, 4)

In [21]:
# Stack the original training samples on top of the augmented ones (features and targets alike).
concatente_var = np.concatenate([X_tr_processed_RF, X_aug_processed_RF], axis=0)
concatente_target = np.concatenate([y_train_col, y_aug_train_col], axis=0)
print(concatente_var.shape)
print(concatente_target.shape)
concatente_target

(3464, 2400)
(3464, 4)


array([[ 45.1       , 188.        , 179.        ,   7.2       ],
       [ 44.8       , 205.        , 188.        ,   7.        ],
       [ 44.4       , 207.        , 145.        ,   6.8       ],
       ...,
       [ 39.38728567, 179.98346239, 122.05096731,   6.495499  ],
       [ 37.26517913, 161.93779433, 127.06955715,   6.49893802],
       [ 29.49152673, 146.09600566, 132.90885945,   6.30248392]])

In [22]:
# Wrap the stacked targets in a DataFrame whose columns are the four soil parameters.
num_rows, num_cols = concatente_target.shape
dummy_columns = ["P", "K", "Mg", "pH"]
y = pd.DataFrame(data=concatente_target, columns=dummy_columns)
# 'y' now holds one row of targets per (original or augmented) training sample.
y

Unnamed: 0,P,K,Mg,pH
0,45.100000,188.000000,179.000000,7.200000
1,44.800000,205.000000,188.000000,7.000000
2,44.400000,207.000000,145.000000,6.800000
3,46.500000,204.000000,143.000000,6.800000
4,52.000000,212.000000,167.000000,6.700000
...,...,...,...,...
3459,40.819368,132.953408,132.096370,6.201042
3460,42.742518,192.121235,125.964054,6.504712
3461,39.387286,179.983462,122.050967,6.495499
3462,37.265179,161.937794,127.069557,6.498938


In [23]:
# Wrap the stacked feature matrix in a DataFrame with generic names feat1 ... featN.
num_rows, num_cols = concatente_var.shape
dummy_columns = ['feat' + str(col_no) for col_no in range(1, num_cols + 1)]
train = pd.DataFrame(data=concatente_var, columns=dummy_columns)
# 'train' now holds one row of features per (original or augmented) training sample.
train

Unnamed: 0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat2391,feat2392,feat2393,feat2394,feat2395,feat2396,feat2397,feat2398,feat2399,feat2400
0,0.203683,0.203992,0.202177,0.205636,0.210894,0.220505,0.226073,0.227012,0.228581,0.230678,...,0.805311,0.936756,1.290456,1.626534,1.777969,2.094892,2.382082,2.567942,2.697733,2.733829
1,0.250603,0.249785,0.246786,0.250632,0.255035,0.265170,0.270217,0.269648,0.269498,0.270211,...,0.992393,1.105412,1.483055,1.721859,1.699395,2.483675,4.277375,4.606743,4.764096,4.809254
2,0.191200,0.189831,0.187725,0.191629,0.196252,0.205140,0.209914,0.210256,0.210999,0.211794,...,0.756372,0.862336,1.140623,1.386935,1.473936,1.671600,1.852783,2.015047,2.058019,2.104777
3,0.275959,0.276271,0.273568,0.278322,0.284468,0.296392,0.303449,0.305312,0.308380,0.311888,...,1.090404,1.272584,1.779799,2.184225,2.368559,2.711463,2.792593,3.004298,3.125400,3.193575
4,0.182765,0.181889,0.179319,0.182695,0.186605,0.195498,0.200748,0.201549,0.202935,0.204745,...,0.721746,0.836694,1.217127,1.562283,1.710942,2.026881,2.311239,2.532482,2.615472,2.613160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3459,0.165413,0.165501,0.163484,0.167268,0.171699,0.180410,0.185604,0.186594,0.188374,0.190428,...,0.652480,0.777262,1.134233,1.490317,1.612101,1.845058,2.080594,2.260498,2.348053,2.364815
3460,0.205924,0.206554,0.204983,0.209049,0.214850,0.224855,0.231180,0.232533,0.234535,0.237170,...,0.812914,0.965804,1.380149,1.795328,1.943117,2.221792,2.517689,2.723448,2.864458,2.936748
3461,0.172906,0.173411,0.171933,0.175537,0.180699,0.189676,0.195400,0.196652,0.198448,0.200822,...,0.681913,0.818926,1.195245,1.568797,1.691405,1.928360,2.202616,2.378541,2.494439,2.529544
3462,0.161537,0.161873,0.160385,0.164104,0.168783,0.177496,0.182867,0.184015,0.185860,0.188004,...,0.636854,0.766575,1.115399,1.457031,1.573237,1.804493,2.087973,2.270355,2.377441,2.403942


In [24]:
# Features and targets side by side, plus the row position as an explicit 'sample_index' column.
train2 = pd.concat([train, y], axis=1).assign(sample_index=lambda frame: frame.index)
train2

Unnamed: 0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat2396,feat2397,feat2398,feat2399,feat2400,P,K,Mg,pH,sample_index
0,0.203683,0.203992,0.202177,0.205636,0.210894,0.220505,0.226073,0.227012,0.228581,0.230678,...,2.094892,2.382082,2.567942,2.697733,2.733829,45.100000,188.000000,179.000000,7.200000,0
1,0.250603,0.249785,0.246786,0.250632,0.255035,0.265170,0.270217,0.269648,0.269498,0.270211,...,2.483675,4.277375,4.606743,4.764096,4.809254,44.800000,205.000000,188.000000,7.000000,1
2,0.191200,0.189831,0.187725,0.191629,0.196252,0.205140,0.209914,0.210256,0.210999,0.211794,...,1.671600,1.852783,2.015047,2.058019,2.104777,44.400000,207.000000,145.000000,6.800000,2
3,0.275959,0.276271,0.273568,0.278322,0.284468,0.296392,0.303449,0.305312,0.308380,0.311888,...,2.711463,2.792593,3.004298,3.125400,3.193575,46.500000,204.000000,143.000000,6.800000,3
4,0.182765,0.181889,0.179319,0.182695,0.186605,0.195498,0.200748,0.201549,0.202935,0.204745,...,2.026881,2.311239,2.532482,2.615472,2.613160,52.000000,212.000000,167.000000,6.700000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3459,0.165413,0.165501,0.163484,0.167268,0.171699,0.180410,0.185604,0.186594,0.188374,0.190428,...,1.845058,2.080594,2.260498,2.348053,2.364815,40.819368,132.953408,132.096370,6.201042,3459
3460,0.205924,0.206554,0.204983,0.209049,0.214850,0.224855,0.231180,0.232533,0.234535,0.237170,...,2.221792,2.517689,2.723448,2.864458,2.936748,42.742518,192.121235,125.964054,6.504712,3460
3461,0.172906,0.173411,0.171933,0.175537,0.180699,0.189676,0.195400,0.196652,0.198448,0.200822,...,1.928360,2.202616,2.378541,2.494439,2.529544,39.387286,179.983462,122.050967,6.495499,3461
3462,0.161537,0.161873,0.160385,0.164104,0.168783,0.177496,0.182867,0.184015,0.185860,0.188004,...,1.804493,2.087973,2.270355,2.377441,2.403942,37.265179,161.937794,127.069557,6.498938,3462


In [25]:
# Disabled inspection of the ground-truth CSV: the statements are wrapped in a string
# literal, so the cell only displays the text and does not execute them.
"""gt_df = pd.read_csv(gt_data_path)
print(gt_df.shape)
gt_df.head()"""

'gt_df = pd.read_csv(gt_data_path)\nprint(gt_df.shape)\ngt_df.head()'

In [26]:
# One two-column frame (sample_index + target value) per soil parameter.
gt_p, gt_K, gt_Mg, gt_pH = (
    train2[['sample_index', target]] for target in ('P', 'K', 'Mg', 'pH')
)
gt_pH.head()

Unnamed: 0,sample_index,pH
0,0,7.2
1,1,7.0
2,2,6.8
3,3,6.8
4,4,6.7


In [27]:
def reshape_dataframe(dataframe, id_vars, var_name, value_name):
    """Melt a frame to long format and fold the identifier into the variable name.

    After melting, the ``var_name`` column is rewritten to ``"<id>_<variable>"``
    (for example ``"0_pH"``) and the identifier column(s) are dropped, leaving
    the columns ``[var_name, value_name]``.

    Args:
        dataframe: Wide DataFrame containing the identifier column(s) and the value columns.
        id_vars: Name, or list/tuple of names, of the identifier column(s). With several
            identifiers their values are joined with ``'_'`` in the given order.
        var_name: Name of the resulting key column.
        value_name: Name of the resulting value column.
    Returns:
        pandas.DataFrame: New long-format frame; ``dataframe`` itself is not modified.
    Raises:
        ValueError: If ``id_vars`` names no column.
    """
    if id_vars is None:
        id_columns = []
    elif isinstance(id_vars, str):
        id_columns = [id_vars]
    else:
        id_columns = list(id_vars)
    if not id_columns:
        raise ValueError("id_vars must name at least one identifier column")

    # Melt the DataFrame to reshape it
    melted = pd.melt(dataframe, id_vars=id_columns, var_name=var_name, value_name=value_name)

    # Build the "<id>_<variable>" key from the columns named by the arguments
    # (rather than from hard-coded 'sample_index' / 'Target' column names).
    key = melted[id_columns[0]].astype(str)
    for extra_id in id_columns[1:]:
        key = key + '_' + melted[extra_id].astype(str)
    melted[var_name] = key + '_' + melted[var_name].astype(str)

    # The identifier is now encoded in the key column, so drop the original column(s).
    return melted.drop(columns=id_columns)

# Build the long-format (Target, Value) frame for every soil parameter.
# The inputs are sliced from train2 here instead of reading the gt_* variables that this
# cell overwrites, so the cell gives the same result when it is run again.
gt_p = reshape_dataframe(train2[['sample_index', 'P']], id_vars=['sample_index'], var_name='Target', value_name='Value')
gt_K = reshape_dataframe(train2[['sample_index', 'K']], id_vars=['sample_index'], var_name='Target', value_name='Value')
gt_Mg = reshape_dataframe(train2[['sample_index', 'Mg']], id_vars=['sample_index'], var_name='Target', value_name='Value')
gt_pH = reshape_dataframe(train2[['sample_index', 'pH']], id_vars=['sample_index'], var_name='Target', value_name='Value')
print(gt_pH.shape)
gt_pH.head()

(3464, 2)


Unnamed: 0,Target,Value
0,0_pH,7.2
1,1_pH,7.0
2,2_pH,6.8
3,3_pH,6.8
4,4_pH,6.7


In [28]:
# Example usage
gt_p = pd.concat([gt_p,train2],axis=1)
gt_K = pd.concat([gt_K,train2],axis=1)
gt_Mg = pd.concat([gt_Mg,train2],axis=1)
gt_pH = pd.concat([gt_pH,train2],axis=1)
train = pd.concat([gt_p,gt_K,gt_Mg,gt_pH],axis=0)
train.to_csv('train.csv',index=False)
print(train.shape)
train.head()

(13856, 2407)


Unnamed: 0,Target,Value,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat2396,feat2397,feat2398,feat2399,feat2400,P,K,Mg,pH,sample_index
0,0_P,45.1,0.203683,0.203992,0.202177,0.205636,0.210894,0.220505,0.226073,0.227012,...,2.094892,2.382082,2.567942,2.697733,2.733829,45.1,188.0,179.0,7.2,0
1,1_P,44.8,0.250603,0.249785,0.246786,0.250632,0.255035,0.26517,0.270217,0.269648,...,2.483675,4.277375,4.606743,4.764096,4.809254,44.8,205.0,188.0,7.0,1
2,2_P,44.4,0.1912,0.189831,0.187725,0.191629,0.196252,0.20514,0.209914,0.210256,...,1.6716,1.852783,2.015047,2.058019,2.104777,44.4,207.0,145.0,6.8,2
3,3_P,46.5,0.275959,0.276271,0.273568,0.278322,0.284468,0.296392,0.303449,0.305312,...,2.711463,2.792593,3.004298,3.1254,3.193575,46.5,204.0,143.0,6.8,3
4,4_P,52.0,0.182765,0.181889,0.179319,0.182695,0.186605,0.195498,0.200748,0.201549,...,2.026881,2.311239,2.532482,2.615472,2.61316,52.0,212.0,167.0,6.7,4


In [17]:
# Shape of the test feature matrix.
X_te_processed_RF.shape

(1154, 2400)

In [18]:
# Shape of the (non-augmented) training feature matrix.
X_tr_processed_RF.shape

(1732, 2400)

In [19]:
# Wrap the test feature matrix in a DataFrame with generic names feat1 ... featN.
num_rows, num_cols = X_te_processed_RF.shape
dummy_columns = ['feat' + str(col_no) for col_no in range(1, num_cols + 1)]
test = pd.DataFrame(data=X_te_processed_RF, columns=dummy_columns)

# One copy of the feature table per soil parameter. Each copy gets a 'Target' key of the
# form "<row>_<parameter>" and a constant 'mean_norm' column.
# mean_norm values (P, K, Mg, pH): 70.3026558891455, 227.9885103926097, 159.28123556581986, 6.782719399538106
# NOTE(review): these look like per-parameter means of the training labels; confirm the source.
test_P = test.assign(
    Target=[f'{row}_P' for row in range(len(test))],
    mean_norm=70.3026558891455,
)
test_K = test.assign(
    Target=[f'{row}_K' for row in range(len(test))],
    mean_norm=227.9885103926097,
)
test_Mg = test.assign(
    Target=[f'{row}_Mg' for row in range(len(test))],
    mean_norm=159.28123556581986,
)
test_PH = test.assign(
    Target=[f'{row}_pH' for row in range(len(test))],
    mean_norm=6.782719399538106,
)

# Stack the four copies; 'Target' then runs 0_P, 1_P, ... followed by the K, Mg and pH keys.
test = pd.concat([test_P, test_K, test_Mg, test_PH], axis=0)

print(test.shape)
test.to_csv('test.csv', index=False)

# 'test' is now the long-format test table that was written to test.csv.
test

(4616, 2402)


Unnamed: 0,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,feat10,...,feat2393,feat2394,feat2395,feat2396,feat2397,feat2398,feat2399,feat2400,Target,mean_norm
0,0.173565,0.171952,0.168504,0.170999,0.173946,0.181525,0.185026,0.184056,0.183710,0.183698,...,1.174493,1.269823,1.064050,1.963784,4.521961,5.065660,5.246762,5.330273,0_P,70.302656
1,0.202796,0.201845,0.199020,0.202500,0.206132,0.214863,0.219335,0.218668,0.218889,0.219440,...,1.248479,1.495482,1.494265,2.381440,4.517328,4.982208,5.206562,5.262480,1_P,70.302656
2,0.164810,0.163459,0.161074,0.163151,0.166365,0.173361,0.176534,0.175363,0.174817,0.174266,...,1.169714,1.269090,0.967839,2.094396,5.501404,6.153006,6.340413,6.413906,2_P,70.302656
3,0.196746,0.195933,0.192789,0.195939,0.199449,0.207141,0.210958,0.209670,0.209044,0.208800,...,1.135058,1.346569,1.292224,2.551541,5.902475,6.501725,6.768157,6.804711,3_P,70.302656
4,0.265508,0.266124,0.264145,0.268839,0.274781,0.286389,0.293448,0.294926,0.297448,0.300688,...,1.668053,2.016291,2.163232,2.443648,2.528219,2.715599,2.822610,2.819261,4_P,70.302656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1149,0.201331,0.201670,0.199746,0.204116,0.209300,0.219236,0.225269,0.226503,0.228633,0.231030,...,1.355050,1.761454,1.890163,2.161619,2.482829,2.687889,2.808897,2.861287,1149_pH,6.782719
1150,0.187588,0.188071,0.186565,0.190504,0.195775,0.205280,0.211257,0.212602,0.214667,0.217173,...,1.292950,1.703421,1.837563,2.098486,2.399257,2.591536,2.715855,2.773716,1150_pH,6.782719
1151,0.246595,0.247139,0.245130,0.250370,0.256538,0.268103,0.275224,0.276892,0.279748,0.283008,...,1.646616,2.134732,2.322334,2.661394,2.926015,3.165880,3.297552,3.393916,1151_pH,6.782719
1152,0.195533,0.196011,0.194421,0.198574,0.203896,0.213583,0.219580,0.220892,0.222968,0.225395,...,1.307993,1.691528,1.829480,2.100164,2.407549,2.618703,2.759835,2.830244,1152_pH,6.782719
