In [76]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer

## Read data

In [22]:
    # Load the training split; the path is relative to the notebook's working directory.
    train_df = pd.read_csv("train.csv")
    # Load the test split from the same location.
    test_df = pd.read_csv("test.csv")

# One-hot encoder

### Sklearn encoder

In [2]:
# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros for that feature (see the transform example below).
enc = OneHotEncoder(handle_unknown='ignore')

In [3]:
# Toy dataset with two categorical features per row (later named gender and group).
X = [['Male', 1], ['Female', 3], ['Female', 2]]

In [4]:
# Learn the set of categories of each column from the toy data.
enc.fit(X)

In [5]:
# One array of learned categories per input column.
enc.categories_

[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]

In [6]:
# Group 4 was not present during fit, so the second row gets no group column set.
enc.transform([['Female', 1], ['Male', 4]]).toarray()

array([[1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [7]:
# Decode one-hot rows back to categories; the all-zero gender block in the
# second row decodes to None.
enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])

array([['Male', 1],
       [None, 2]], dtype=object)

In [9]:
# Output column names, built as <input feature name>_<category>.
enc.get_feature_names_out(['gender', 'group'])

array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'],
      dtype=object)

In [50]:
# Small mixed-type example array. NOTE: because the rows mix ints and strings,
# NumPy stores every element as a string (dtype '<U21' in the output below).
data = np.array([[1, 'cat', 4],
                 [2, 'dog', 2],
                 [3, 'cat', 0],
                 [4, 'bird', 1]])

In [51]:
# Index of the categorical column in `data`.
column_to_encode = 1
# Keep the selected column two-dimensional, shape (n_rows, 1), which is the
# layout passed to the encoder in the following cells.
data_to_encode = data[:, column_to_encode, np.newaxis]

In [52]:
# Show the single-column 2-D array that will be passed to the encoder.
data_to_encode

array([['cat'],
       ['dog'],
       ['cat'],
       ['bird']], dtype='<U21')

In [53]:
# Fit a default encoder on the selected column, then encode that same column.
encoder = OneHotEncoder()
encoder.fit(data_to_encode)
encoded_data = encoder.transform(data_to_encode)

### Test the sklearn encoder on season data

In [27]:
# Separate encoder instance for the season feature, using default settings.
enc_season = OneHotEncoder()

In [35]:
# One-column frame listing each season exactly once.
season_data = dict(season=['spring', 'summer', 'autumn', 'winter'])
df_season = pd.DataFrame(data=season_data)

In [36]:
# Learn the season categories from the one-column frame.
enc_season.fit(df_season)

In [38]:
# Learned categories; note they are stored in sorted order, not input order.
enc_season.categories_

[array(['autumn', 'spring', 'summer', 'winter'], dtype=object)]

In [37]:
# Fitted on a DataFrame, so the output names use the column name "season" as prefix.
enc_season.get_feature_names_out()

array(['season_autumn', 'season_spring', 'season_summer', 'season_winter'],
      dtype=object)

In [40]:
# Rows follow df_season order (spring, summer, autumn, winter); columns follow
# the sorted categories (autumn, spring, summer, winter).
print(enc_season.transform(df_season).toarray())

[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]]


In [54]:
# Convert the training frame to a NumPy array; with a string season column next
# to float price columns the result is an object-dtype array (see outputs below).
train_arr = np.array(train_df)

In [64]:
# Peek at the first three entries of column 0, which holds the season labels.
train_arr[0:3,0]

array(['spring', 'summer', 'autumn'], dtype=object)

In [65]:
# The season labels sit in the first column of the training array.
column_to_encode = 0
# Keep the selected column two-dimensional, shape (n_rows, 1), for the encoder.
data_to_encode = train_arr[:, column_to_encode, np.newaxis]

In [66]:
# Fit a default encoder on the season column, then encode that same column.
# The result is converted with .toarray() before being stacked in the next cell.
encoder = OneHotEncoder()
encoder.fit(data_to_encode)
encoded_data = encoder.transform(data_to_encode)

In [67]:
# Rebuild the array with the categorical column replaced by its one-hot block.
new_data = np.concatenate(
    [
        train_arr[:, :column_to_encode],        # columns left of the encoded one
        encoded_data.toarray(),                 # dense one-hot columns
        train_arr[:, column_to_encode + 1:],    # columns right of the encoded one
    ],
    axis=1,
)

In [68]:
# Display the array in which the season column has been replaced by one-hot columns.
new_data

array([[0.0, 1.0, 0.0, ..., -3.931031226630509, nan, -3.238196806151894],
       [0.0, 0.0, 1.0, ..., nan, nan, -3.212894038068976],
       [1.0, 0.0, 0.0, ..., -4.07384968174626, nan, -3.1140608060213903],
       ...,
       [0.0, 0.0, 1.0, ..., -1.4993613445447886, 3.110638067512592,
        2.230252561735496],
       [1.0, 0.0, 0.0, ..., -1.5477160129737388, 3.105416529245648,
        1.989139721317721],
       [0.0, 0.0, 0.0, ..., nan, 3.272815718725681, 2.080666809994271]],
      dtype=object)

## Custom one-hot encoding function

In [12]:
def oneHot_encoding(data:pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the ``season`` column of ``data``.

    The ``season`` column is replaced by four indicator columns named
    ``spring``, ``summer``, ``autumn`` and ``winter`` (in that order, holding
    0.0 / 1.0), placed in front of the remaining columns. The input frame is
    not modified and its row index is carried over to the result.

    Args:
        data: Frame containing a ``season`` column whose values are all one
            of the four season names above.

    Returns:
        A new frame with the four indicator columns followed by every column
        of ``data`` except ``season``.

    Raises:
        KeyError: If ``data`` has no ``season`` column.
        ValueError: If any ``season`` value is missing or is not one of the
            four known season names.
    """
    seasons = ['spring', 'summer', 'autumn', 'winter']

    # Position of each row's season within `seasons`; -1 marks values that are
    # missing or not in the list. This is positional, so it does not depend on
    # the frame's index labels.
    season_codes = pd.Categorical(data['season'], categories=seasons).codes

    unknown_mask = season_codes < 0
    if unknown_mask.any():
        unknown_values = data['season'].loc[unknown_mask].unique().tolist()
        raise ValueError(
            f"Unexpected season value(s) {unknown_values}; expected one of {seasons}"
        )

    n_rows = len(data)
    season_encoding_ndarry = np.zeros((n_rows, len(seasons)))
    season_encoding_ndarry[np.arange(n_rows), season_codes] = 1

    # Reuse the input index so the column-wise concat lines rows up one-to-one
    # instead of aligning a fresh 0..N-1 index against the original labels.
    season_encoding_df = pd.DataFrame(
        data=season_encoding_ndarry, columns=seasons, index=data.index
    )
    price_df = data.drop(columns=['season'])
    encoded_data_df = pd.concat([season_encoding_df, price_df], axis=1)
    return encoded_data_df

### Test the custom function

In [18]:
  # Quick look at the raw training data before encoding: shape and first five rows.
  # All statements share one indentation level so the cell parses.
  print("Training data:")
  print("Shape:", train_df.shape)
  print(train_df.head(5))
  print('\n')

Training data:
Shape: (900, 11)
   season  price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA  \
0  spring        NaN   9.644028  -1.686248  -1.748076  -3.666005        NaN   
1  summer        NaN   7.246061  -2.132377  -2.054363  -3.295697  -4.104759   
2  autumn  -2.101937   7.620085  -1.910282        NaN  -3.388777        NaN   
3  winter  -2.098475   8.411894  -1.903834        NaN  -3.588235        NaN   
4  spring  -1.969687   8.926884  -1.697257  -1.331049        NaN  -3.911096   

   price_UK  price_ITA  price_POL  price_SVK  
0 -1.822720  -3.931031        NaN  -3.238197  
1 -1.826021        NaN        NaN  -3.212894  
2 -2.034409  -4.073850        NaN  -3.114061  
3 -2.214720  -4.018620  -2.330803        NaN  
4 -2.388092  -4.093946        NaN        NaN  




In [74]:
# Apply the same season encoding to the training and test splits.
encoded_train_df = oneHot_encoding(train_df)
encoded_test_df = oneHot_encoding(test_df)

In [17]:
# Inspect the encoded training frame: shape and first five rows.
print("Training data:")
print("Shape:", encoded_train_df.shape)
print(encoded_train_df.head(5))
print('\n')

Training data:
Shape: (900, 14)
   spring  summer  autumn  winter  price_AUS  price_CHF  price_CZE  price_GER  \
0     1.0     0.0     0.0     0.0        NaN   9.644028  -1.686248  -1.748076   
1     0.0     1.0     0.0     0.0        NaN   7.246061  -2.132377  -2.054363   
2     0.0     0.0     1.0     0.0  -2.101937   7.620085  -1.910282        NaN   
3     0.0     0.0     0.0     1.0  -2.098475   8.411894  -1.903834        NaN   
4     1.0     0.0     0.0     0.0  -1.969687   8.926884  -1.697257  -1.331049   

   price_ESP  price_FRA  price_UK  price_ITA  price_POL  price_SVK  
0  -3.666005        NaN -1.822720  -3.931031        NaN  -3.238197  
1  -3.295697  -4.104759 -1.826021        NaN        NaN  -3.212894  
2  -3.388777        NaN -2.034409  -4.073850        NaN  -3.114061  
3  -3.588235        NaN -2.214720  -4.018620  -2.330803        NaN  
4        NaN  -3.911096 -2.388092  -4.093946        NaN        NaN  




## imputation

In [77]:
    # Fill missing values from the 10 nearest rows, weighted equally.
    imputer = KNNImputer(n_neighbors=10, weights="uniform")

    # Fit on the training split only, then reuse the fitted imputer for the test
    # split so test rows are imputed from training neighbours rather than from a
    # second imputer re-fitted on the test data.
    # NOTE(review): assumes encoded_test_df has the same columns, in the same
    # order, as encoded_train_df - confirm against test.csv.
    imputed_train = imputer.fit_transform(encoded_train_df)
    imputed_test = imputer.transform(encoded_test_df)