In [1]:
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.preprocessing import OrdinalEncoder, RobustScaler, StandardScaler
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm, trange

from utils import GroupDataset

In [2]:
# Seed reused as `random_state` for the RandomForestClassifier fitted further down.
SEED = 755

In [7]:
# Training dataset, built by the project-local GroupDataset helper (imported from utils).
# NOTE(review): presumably train=True selects the training partition under ./data — confirm in utils.
train_dataset = GroupDataset('./data', train=True)

In [4]:
class FrequencyRankingEncoder:
    """Encode every value of a column by the dense rank of its frequency.

    For each column, the distinct occurrence counts observed during ``fit``
    are sorted ascending and numbered ``1..R``. A value whose count has rank
    ``r`` is encoded as ``r / R``: values that occur equally often share a
    code and the most frequent value(s) encode as ``1.0``. Values that were
    not seen during ``fit`` encode as ``0``.

    Attributes:
        encoding_tables: one ``{value: code}`` dict per fitted column.
        min_values: smallest code of each fitted column (``0.0`` for a column
            fitted on no rows). Recorded by ``fit``; ``transform`` does not
            read it.
    """

    def __init__(self):
        self.encoding_tables = []
        self.min_values = []

    def fit(self, x: np.ndarray):
        """Learn one frequency-rank table per column of ``x``.

        Any previously fitted state is discarded.

        Args:
            x: 2-D array indexed as ``x[:, column]``.
        """
        self.encoding_tables.clear()
        self.min_values.clear()
        for col_idx in range(x.shape[1]):
            values, counts = np.unique(x[:, col_idx], return_counts=True)
            counts = counts.tolist()

            # Dense ranking of the counts: equal counts share a rank, ranks start at 1.
            distinct_counts = sorted(set(counts))
            rank_of_count = {
                count: rank for rank, count in enumerate(distinct_counts, start=1)
            }
            no_of_ranking = len(distinct_counts)

            encoding_table = {
                value: rank_of_count[count] / no_of_ranking
                for value, count in zip(values, counts)
            }
            self.encoding_tables.append(encoding_table)
            # `default` covers a column without rows, whose table is empty.
            self.min_values.append(min(encoding_table.values(), default=0.0))

    def transform(self, x: np.ndarray):
        """Replace every value of ``x`` by its fitted frequency-rank code.

        Column ``i`` of ``x`` is looked up in the table fitted for column ``i``;
        values missing from that table become ``0``.

        NOTE(review): lookups go through a dict, so NaN entries (NaN != NaN)
        presumably never match their table key and encode as 0 — confirm if
        the data can contain NaN.

        Args:
            x: 2-D array with at most as many columns as were fitted.

        Returns:
            Float array with the same shape as ``x``.

        Raises:
            IndexError: if ``x`` has more columns than were fitted.
        """
        # Pre-allocating keeps the (n_rows, n_columns) shape even when there
        # are no columns at all.
        ret = np.empty((x.shape[0], x.shape[1]), dtype=float)
        for col_idx in range(x.shape[1]):
            encoding_table = self.encoding_tables[col_idx]
            ret[:, col_idx] = [
                encoding_table.get(value, 0) for value in x[:, col_idx]
            ]
        return ret

    def fit_transform(self, data):
        """Fit on ``data`` and return its encoded form."""
        self.fit(data)
        return self.transform(data)

In [8]:
# Unpack the training data: x_train is used as the feature matrix and y_train as the
# labels when the model is fitted; k_train is not referenced in the cells shown here.
# NOTE(review): the meaning of the third return value is defined in utils.GroupDataset — confirm.
x_train, y_train, k_train = train_dataset.get_xy()

In [9]:
# Counterpart of the training dataset, built with train=False; its features and labels
# are used further down for prediction and for the classification report.
valid_dataset = GroupDataset('./data', train=False)

In [10]:
# Unpack the validation data: x_valid is what the model predicts on and y_valid is the
# reference for the report; k_valid is not referenced in the cells shown here.
x_valid, y_valid, k_valid = valid_dataset.get_xy()

### Encoding phase: frequency-ranking features

In [202]:
# Fit the frequency-ranking tables on the training features only, then reuse the same
# tables for the validation features; values that did not occur in training encode as 0.
# (The variable is called `scaler`, but it holds a FrequencyRankingEncoder.)
scaler = FrequencyRankingEncoder()
x_train_fre = scaler.fit_transform(x_train)
x_valid_fre = scaler.transform(x_valid)

In [27]:
# Append the frequency-rank columns to the original feature columns (axis=1).
# NOTE: not idempotent — x_train / x_valid are overwritten with the widened arrays, so
# running this cell a second time appends the encoded columns again.
x_train = np.concatenate([x_train, x_train_fre], axis=1)
x_valid = np.concatenate([x_valid, x_valid_fre], axis=1)

### End of encoding phase

In [11]:
# Random forest with scikit-learn's default hyperparameters, apart from n_jobs=-1
# (run in parallel) and the fixed SEED as random_state.
model = RandomForestClassifier(n_jobs=-1, random_state=SEED)
model.fit(x_train, y_train)

RandomForestClassifier(n_jobs=-1, random_state=755)

In [12]:
# Predicted class labels for the validation features.
pred = model.predict(x_valid)

In [13]:
# Per-class precision / recall / F1 of the predictions against the validation labels,
# formatted with 4 decimal places.
print(classification_report(y_valid, pred, digits=4))

              precision    recall  f1-score   support

           0     0.9517    0.9678    0.9597       590
           1     0.9575    0.9365    0.9469       457

    accuracy                         0.9542      1047
   macro avg     0.9546    0.9522    0.9533      1047
weighted avg     0.9542    0.9542    0.9541      1047

