In [38]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer

# data load

## Target def 

def data_loading():
    """
    Read the training and test CSV files and return placeholder arrays.

    The preprocessing and imputation steps are not implemented in this version (see the TODO
    below): the returned arrays are zero-filled and only mirror the shapes of the loaded tables.

    Parameters
    ----------
    Returns
    ----------
    X_train: zero-filled array shaped like the training table without the 'price_CHF' column
    y_train: zero-filled array shaped like the 'price_CHF' column of the training table
    X_test: zero-filled array shaped like the test table, dim = (100, ?)
    """
    # Training table: read it, then print its shape and a two-row preview
    train_table = pd.read_csv("train.csv")
    print("Training data:")
    print("Shape:", train_table.shape)
    print(train_table.head(2))
    print('\n')

    # Test table: same treatment
    test_table = pd.read_csv("test.csv")
    print("Test data:")
    print(test_table.shape)
    print(test_table.head(2))

    # Placeholders with the right shapes; 'price_CHF' is the label column
    feature_table = train_table.drop(columns=['price_CHF'])
    X_train = np.zeros_like(feature_table)
    y_train = np.zeros_like(train_table['price_CHF'])
    X_test = np.zeros_like(test_table)

    # TODO: Perform data preprocessing, imputation and extract X_train, y_train and X_test

    # Sanity checks: matching feature count, one label per training row, 100 test rows
    same_feature_count = X_train.shape[1] == X_test.shape[1]
    one_label_per_row = X_train.shape[0] == y_train.shape[0]
    hundred_test_rows = X_test.shape[0] == 100
    assert all((same_feature_count, one_label_per_row, hundred_test_rows)), "Invalid data shape"
    return X_train, y_train, X_test

## Test

 ##  load data

In [None]:
    # Load training data
    # Reads "train.csv" (relative path) into `train_df`, which the following cells use.
    train_df = pd.read_csv("train.csv")
    
    # Print a header, the table's shape and its first two rows for a quick visual check.
    print("Training data:")
    print("Shape:", train_df.shape)
    print(train_df.head(2))
    # Prints a newline character (on top of print's own line ending) as a visual spacer.
    print('\n')

In [None]:
    # Load test data
    # Reads "test.csv" (relative path) into `test_df`, which the following cells use.
    test_df = pd.read_csv("test.csv")

    # Print a header, the table's shape and its first two rows for a quick visual check.
    print("Test data:")
    print(test_df.shape)
    print(test_df.head(2))

In [None]:
    # Zero-filled placeholders that only mirror the shapes of the loaded tables:
    # features = training table without the 'price_CHF' label column, labels = that column.
    X_train = np.zeros_like(train_df.drop(columns=['price_CHF']))
    y_train = np.zeros_like(train_df.loc[:, 'price_CHF'])
    X_test = np.zeros_like(test_df)

In [None]:
    # Work on plain arrays from here on.
    train_arr = np.array(train_df)
    test_arr = np.array(test_df)

    # One-hot encode the first column. The encoder is fitted on the training data only and
    # then applied unchanged to the test data, so both sets share the same category columns.
    column_to_encode = 0
    train_to_encode = train_arr[:, column_to_encode].reshape(-1, 1)
    test_to_encode = test_arr[:, column_to_encode].reshape(-1, 1)
    encoder = OneHotEncoder()
    encoder.fit(train_to_encode)
    encoded_train_data = encoder.transform(train_to_encode)
    encoded_train_arr = np.concatenate((train_arr[:, :column_to_encode],
                            encoded_train_data.toarray(),
                            train_arr[:, column_to_encode+1:]), axis=1)

    encoded_test_data = encoder.transform(test_to_encode)
    encoded_test_arr = np.concatenate((test_arr[:, :column_to_encode],
                            encoded_test_data.toarray(),
                            test_arr[:, column_to_encode+1:]), axis=1)

    # Position of the label column inside the encoded training array: the single encoded
    # column was replaced by one column per category, which shifts every later column by
    # (number of categories - 1). This replaces a hard-coded index.
    # Assumes 'price_CHF' comes after the encoded column in train.csv.
    target_col = train_df.columns.get_loc('price_CHF') + encoded_train_data.shape[1] - 1

    # Impute the training table (label column included, as before).
    imputer = KNNImputer(n_neighbors=15, weights='uniform')
    imputed_train = imputer.fit_transform(encoded_train_arr)

    # Impute the test table with an imputer fitted on the TRAINING features. The test table
    # has no label column, so the training imputer above cannot be reused directly; fitting
    # on the test set itself (the previous behaviour) fills its gaps from test rows only and
    # makes the result depend on which rows happen to be in the test file.
    test_imputer = KNNImputer(n_neighbors=15, weights='uniform')
    test_imputer.fit(np.delete(encoded_train_arr, target_col, axis=1))
    imputed_test = test_imputer.transform(encoded_test_arr)

    # Keep only the training rows whose label was actually observed (not imputed).
    train_idx = [i for i, value in enumerate(encoded_train_arr[:, target_col])
                 if not np.isnan(value)]

    imputed_train_refined = imputed_train[train_idx]

    X_train = np.delete(imputed_train_refined, target_col, axis=1)
    y_train = imputed_train_refined[:, target_col]
    X_test = imputed_test

###  reference

In [None]:
def oneHot_encoding(data:pd.DataFrame) -> pd.DataFrame:
    """
    Replace the 'season' column of a table by four one-hot indicator columns.

    Rows are handled by position, so the table may carry any index (for example after
    filtering rows); the index of `data` is preserved in the result.

    Parameters
    ----------
    data: table with a 'season' column whose values are each one of
        'spring', 'summer', 'autumn', 'winter'

    Returns
    ----------
    A new table with the float columns 'spring', 'summer', 'autumn', 'winter' (1.0 in the
    column matching the row's season, 0.0 elsewhere) followed by all columns of `data`
    except 'season'. `data` itself is not modified.

    Raises
    ----------
    AssertionError: if a row's season is not one of the four known names
    """
    seasons = ['spring', 'summer', 'autumn', 'winter']

    # Compare every row against every season name at once: result has shape (N, 4).
    season_values = data['season'].to_numpy(dtype=object)
    matches = season_values[:, None] == np.array(seasons, dtype=object)[None, :]

    # The names are distinct, so a row matches at most one; require exactly one.
    if not matches.any(axis=1).all():
        raise AssertionError("every 'season' value must be one of %s" % seasons)

    # Reuse the input index so the column-wise concat below lines rows up correctly.
    season_encoding_df = pd.DataFrame(data=matches.astype(float), columns=seasons, index=data.index)
    price_df = data.drop(['season'],axis=1)
    encoded_data_df = pd.concat([season_encoding_df, price_df], axis=1)
    return encoded_data_df


In [None]:
    # One-hot encode the 'season' column of both tables with the helper defined above.
    encoded_train_df = oneHot_encoding(data=train_df)
    encoded_test_df = oneHot_encoding(data=test_df)

In [None]:
    # Row positions of the training samples whose 'price_CHF' label is present (not NaN)
    train_idx = [row for row, price in enumerate(encoded_train_df['price_CHF'])
                 if not np.isnan(price)]
    print('length: ', len(train_idx))

    # Split the imputed training array: column 5 holds the label, the rest are features
    y_train_raw = imputed_train[:, 5]
    X_train_raw = np.delete(imputed_train, 5, axis=1)

    # Keep only the labelled rows for training; the imputed test table is used as is
    X_train = np.take(X_train_raw, train_idx, axis=0)
    y_train = np.take(y_train_raw, train_idx, axis=0)
    X_test = imputed_test
    

In [None]:
# Inspect the shape of the imputed training features before the unlabelled rows are removed.
X_train_raw.shape

In [None]:
# Inspect the shape of the test column that was reshaped to one column for the encoder.
test_to_encode.shape

In [None]:
def data_loading():
    """
    This function loads the training and test data, one-hot encodes the first column, interpolates
    the missing data using KNN imputation and removes the training rows whose label is missing.

    "train.csv" and "test.csv" are read via relative paths. The training table must contain the
    label column 'price_CHF' somewhere after the first column; the test table is expected to hold
    the same feature columns in the same order, without the label.

    Parameters
    ----------
    Returns
    ----------
    X_train: matrix of floats, training input with features
    y_train: array of floats, training output with labels
    X_test: matrix of floats: dim = (100, ?), test input with features

    Raises
    ----------
    AssertionError: if the returned arrays have inconsistent shapes or the test set does not
        have exactly 100 rows
    """
    target_name = 'price_CHF'
    column_to_encode = 0
    n_neighbors = 15

    # Load training data
    train_df = pd.read_csv("train.csv")

    print("Training data:")
    print("Shape:", train_df.shape)
    print(train_df.head(2))
    print('\n')

    # Load test data
    test_df = pd.read_csv("test.csv")

    print("Test data:")
    print(test_df.shape)
    print(test_df.head(2))

    train_arr = np.array(train_df)
    test_arr = np.array(test_df)

    # One-hot encode the first column. The encoder is fitted on the training data only and
    # then applied unchanged to the test data, so both sets share the same category columns.
    train_to_encode = train_arr[:, column_to_encode].reshape(-1, 1)
    test_to_encode = test_arr[:, column_to_encode].reshape(-1, 1)
    encoder = OneHotEncoder()
    encoder.fit(train_to_encode)
    encoded_train_data = encoder.transform(train_to_encode)
    encoded_train_arr = np.concatenate((train_arr[:, :column_to_encode],
                            encoded_train_data.toarray(),
                            train_arr[:, column_to_encode+1:]), axis=1)

    encoded_test_data = encoder.transform(test_to_encode)
    encoded_test_arr = np.concatenate((test_arr[:, :column_to_encode],
                            encoded_test_data.toarray(),
                            test_arr[:, column_to_encode+1:]), axis=1)

    # Position of the label column inside the encoded training array: the single encoded
    # column was replaced by one column per category, which shifts every later column by
    # (number of categories - 1). This replaces a hard-coded index.
    target_col = train_df.columns.get_loc(target_name) + encoded_train_data.shape[1] - 1

    # KNN imputation of the training table (label column included, as before).
    imputer = KNNImputer(n_neighbors=n_neighbors, weights='uniform')
    imputed_train = imputer.fit_transform(encoded_train_arr)

    # KNN imputation of the test table with an imputer fitted on the TRAINING features.
    # The test table has no label column, so the imputer above cannot be reused directly;
    # re-fitting on the test set itself (the previous behaviour) fills its gaps from test
    # rows only and makes the result depend on which rows happen to be in the test file.
    test_imputer = KNNImputer(n_neighbors=n_neighbors, weights='uniform')
    test_imputer.fit(np.delete(encoded_train_arr, target_col, axis=1))
    imputed_test = test_imputer.transform(encoded_test_arr)

    # Keep only the training rows whose label was actually observed (not imputed).
    has_label = ~np.isnan(encoded_train_arr[:, target_col].astype(float))
    imputed_train_refined = imputed_train[has_label]

    X_train = np.delete(imputed_train_refined, target_col, axis=1)
    y_train = imputed_train_refined[:, target_col]

    X_test = imputed_test

    assert (X_train.shape[1] == X_test.shape[1]) and (X_train.shape[0] == y_train.shape[0]) and (X_test.shape[0] == 100), "Invalid data shape"
    return X_train, y_train, X_test