In [None]:
import cv2
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.special import digamma
from sklearn.neighbors import KDTree
from geneticalgorithm import geneticalgorithm as ga
import sys
sys.path.append('..')

from features.imtools import get_features as get_img_features
from features.txtools import get_features as get_txt_features
from features.aetools import get_features as get_ae_features
from features.laytools import get_layout_features

In [None]:
def get_radius_kneighbors(x, n_neighbors):
    """Per-row search radius derived from the k-th neighbour distance.

    A Chebyshev-metric KD-tree is built on ``x`` and queried with the same
    points for ``n_neighbors + 1`` neighbours; the largest returned distance
    of each row is kept.  The extra neighbour presumably accounts for every
    point matching itself at distance 0 (query set == tree set).

    Returns a 1-D array whose entries are those distances moved one
    floating-point step toward zero (``np.nextafter(..., 0)``).
    """
    tree = KDTree(x, metric="chebyshev")
    distances, _ = tree.query(x, k=n_neighbors + 1)
    kth_distance = distances[:, -1]

    return np.nextafter(kth_distance, 0)


def num_points_within_radius(x, radius):
    """Count, for each row of ``x``, the other rows lying within ``radius``.

    Distances use the Chebyshev metric.  ``radius`` is forwarded unchanged to
    ``KDTree.query_radius``.  One is subtracted from every count (presumably
    to exclude the query point itself, which is part of the tree), and the
    result is returned as a float array.
    """
    tree = KDTree(x, metric="chebyshev")
    counts = tree.query_radius(x, radius, count_only=True, return_distance=False)
    counts = np.array(counts)

    return counts - 1.0


def preprocess_data(x):
    """Convert ``x`` to a 2-D float64 array with each column rescaled.

    A 1-D input becomes a single column.  Every column is multiplied by the
    reciprocal of its mean absolute value; that mean is floored at 1e-100 so
    an all-zero column does not cause a division by zero.

    Raises:
        ValueError: if ``x`` is neither 1- nor 2-dimensional.
    """
    arr = np.array(x, dtype=np.float64)
    if arr.ndim not in (1, 2):
        raise ValueError(f'x.ndim = {arr.ndim}, should be 1 or 2')
    if arr.ndim == 1:
        arr = arr.reshape(-1, 1)

    column_scale = np.mean(np.abs(arr), axis=0)
    inverse_scale = 1 / np.maximum(1e-100, column_scale)

    return inverse_scale * arr


def compute_mi(x, y, n_neighbors=5):
    """Nearest-neighbour (Kraskov) estimate of the mutual information of x and y.

    Both inputs are rescaled with ``preprocess_data`` and stacked column-wise
    into a joint sample.  A per-sample radius is taken from the joint space
    (``get_radius_kneighbors``), then the number of points within that radius
    is counted separately in ``x`` and in ``y``.

    Samples whose radius is exactly 0 get their ``k`` replaced by the number
    of identical joint rows among those samples, minus one.

    Returns the digamma-based estimate, clipped below at 0.
    """
    n_samples = len(x)
    x = preprocess_data(x)
    y = preprocess_data(y)
    joint = np.hstack((x, y))

    k = np.full(n_samples, n_neighbors)
    radius = get_radius_kneighbors(joint, n_neighbors)

    # Zero radius: use the multiplicity of the duplicated joint row as k.
    zero_radius = (radius == 0)
    if zero_radius.any():
        _, inverse, counts = np.unique(
            joint[zero_radius], axis=0, return_inverse=True, return_counts=True
        )
        k[zero_radius] = counts[inverse] - 1

    nx = num_points_within_radius(x, radius)
    ny = num_points_within_radius(y, radius)

    estimate = (digamma(n_samples) + np.mean(digamma(k))
                - np.mean(digamma(nx + 1)) - np.mean(digamma(ny + 1)))
    return max(0, estimate)

def greedy(x, y, n_neighbors=5):
    """Greedy forward selection of columns of ``x`` by mutual information with ``y``.

    Starting from an empty selection, each round evaluates
    ``compute_mi(x[:, selected + [candidate]], y, n_neighbors)`` for every
    remaining column and adds the best candidate, as long as that strictly
    improves on the best score so far (initially 0).  The search stops at the
    first round without improvement or when no columns remain.

    Args:
        x: 2-D array, one column per candidate feature.
        y: target passed unchanged to ``compute_mi``.
        n_neighbors: neighbour count forwarded to ``compute_mi``.

    Returns:
        List of selected column indices of ``x``, in selection order.
    """
    idx = []
    rem = np.arange(0, x.shape[1])
    score = 0
    while len(rem) > 0:
        mi = np.array([compute_mi(x[:, idx + [i]], y, n_neighbors) for i in rem])
        best_pos = int(np.argmax(mi))
        best_mi = mi[best_pos]
        # `not >` (rather than `<=`) keeps the original stop-on-NaN behaviour.
        if not best_mi > score:
            break
        score = best_mi
        idx.append(rem[best_pos])
        # np.delete removes by POSITION; `rem` holds column ids, which stop
        # matching positions after the first removal, so delete `best_pos`.
        rem = np.delete(rem, best_pos)
    return idx

In [None]:
# Average survey scores; rows are re-labelled 1..100.
survey = pd.read_csv('../datasets/FinUI/100_avg_scores.csv')
survey = survey.set_index(np.arange(1, 101))

img_dir = '../datasets/FinUI/images'
csv_dir = '../datasets/FinUI/csv'
ocr_dir = '../datasets/FinUI/ocr'
block_dir = '../datasets/FinUI/block_csv'


def _listing_key(fn):
    """Sort key: numeric file stems in numeric order, all other names after them."""
    stem = os.path.splitext(fn)[0]
    try:
        return (0, int(stem), '')
    except ValueError:
        return (1, 0, stem)


# os.listdir returns entries in arbitrary order, while later cells pair
# images[i] / labels[i] / txt_labels[i] / block_labels[i] by position.
# Sorting every listing with the same key makes that pairing deterministic.
# NOTE(review): this assumes matching files share a stem across the four
# directories; cached feature CSVs built under another order should be
# regenerated.
images = sorted(
    (fn for fn in os.listdir(img_dir) if os.path.splitext(fn)[1] == '.png'),
    key=_listing_key,
)
labels = sorted(os.listdir(csv_dir), key=_listing_key)
txt_labels = sorted(os.listdir(ocr_dir), key=_listing_key)
block_labels = sorted(os.listdir(block_dir), key=_listing_key)

In [None]:
# Image features: load the cached CSV if present, otherwise compute and cache.
if os.path.exists('../datasets/FinUI/100_img_features.csv'):
    img_features = pd.read_csv('../datasets/FinUI/100_img_features.csv')
    # The cache stores no index; rows are re-labelled 1..100 by position.
    img_features = img_features.set_index(np.arange(1, 101))

else:
    img_frames = []

    for fn in tqdm(images):
        img_path = os.path.join(img_dir, fn)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f'cannot read image: {img_path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Rescale so the longer side becomes 1200 px. cv2.resize expects
        # (width, height), hence the reversed shape. The original divided
        # the shape tuple by a float, which raises TypeError.
        img_h, img_w = img.shape[:2]
        coef = max(img_w, img_h) / 1200
        sz = (int(img_w / coef), int(img_h / coef))
        img = cv2.resize(img, sz)

        f = get_img_features(img)
        img_frames.append(pd.DataFrame(f, index=[int(fn.split('.')[0])]))

    # Single concat instead of growing the frame inside the loop. Rows are
    # sorted by image id because the CSV is written without its index and
    # the load branch above assumes positional order 1..100.
    img_features = pd.concat(img_frames).sort_index() if img_frames else pd.DataFrame()

    img_features.to_csv('../datasets/FinUI/100_img_features.csv', index=False)

In [None]:
# Text features: load the cached CSV if present, otherwise compute and cache.
if os.path.exists('../datasets/FinUI/100_txt_features.csv'):
    txt_features = pd.read_csv('../datasets/FinUI/100_txt_features.csv')
    # The cache stores no index; rows are re-labelled 1..100 by position.
    txt_features = txt_features.set_index(np.arange(1, 101))

else:
    txt_frames = []

    for i in tqdm(range(100)):
        fn = images[i]
        img_path = os.path.join(img_dir, fn)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f'cannot read image: {img_path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # OCR table paired with the image by list position.
        tfn = txt_labels[i]
        txt = pd.read_csv(os.path.join(ocr_dir, tfn))

        f = get_txt_features(img, txt)
        txt_frames.append(pd.DataFrame(f, index=[int(fn.split('.')[0])]))

    # Single concat instead of growing the frame inside the loop. Rows are
    # sorted by image id because the CSV is written without its index and
    # the load branch above assumes positional order 1..100.
    txt_features = pd.concat(txt_frames).sort_index()

    txt_features.to_csv('../datasets/FinUI/100_txt_features.csv', index=False)

In [None]:
# AE features over element + OCR boxes: load the cache or compute and cache.
if os.path.exists('../datasets/FinUI/100_ae_features.csv'):
    ae_features = pd.read_csv('../datasets/FinUI/100_ae_features.csv')
    # The cache stores no index; rows are re-labelled 1..100 by position.
    ae_features = ae_features.set_index(np.arange(1, 101))

else:
    ae_frames = []

    for i in tqdm(range(100)):
        fn = images[i]
        img_path = os.path.join(img_dir, fn)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f'cannot read image: {img_path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # OCR and label tables paired with the image by list position.
        tfn = txt_labels[i]
        txt = pd.read_csv(os.path.join(ocr_dir, tfn))

        lfn = labels[i]
        lab = pd.read_csv(os.path.join(csv_dir, lfn))

        # Bounding boxes of both tables stacked into one frame.
        boxes = pd.concat([lab, txt], ignore_index=True)[['xmin', 'xmax', 'ymin', 'ymax']]

        f = get_ae_features(img, boxes)
        ae_frames.append(pd.DataFrame(f, index=[int(fn.split('.')[0])]))

    # Single concat instead of growing the frame inside the loop. Rows are
    # sorted by image id because the CSV is written without its index and
    # the load branch above assumes positional order 1..100.
    ae_features = pd.concat(ae_frames).sort_index()

    ae_features.to_csv('../datasets/FinUI/100_ae_features.csv', index=False)

In [None]:
# AE features over block boxes: load the cache or compute and cache.
if os.path.exists('../datasets/FinUI/100_aeb_features.csv'):
    aeb_features = pd.read_csv('../datasets/FinUI/100_aeb_features.csv')
    # The cache stores no index; rows are re-labelled 1..100 by position.
    aeb_features = aeb_features.set_index(np.arange(1, 101))

else:
    aeb_frames = []

    for i in tqdm(range(100)):
        fn = images[i]
        img_path = os.path.join(img_dir, fn)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f'cannot read image: {img_path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Block table paired with the image by list position.
        lfn = block_labels[i]
        lab = pd.read_csv(os.path.join(block_dir, lfn))

        boxes = lab[['xmin', 'xmax', 'ymin', 'ymax']]

        f = get_ae_features(img, boxes)
        aeb_frames.append(pd.DataFrame(f, index=[int(fn.split('.')[0])]))

    # Single concat instead of growing the frame inside the loop. Rows are
    # sorted by image id because the CSV is written without its index and
    # the load branch above assumes positional order 1..100.
    aeb_features = pd.concat(aeb_frames).sort_index()

    aeb_features.to_csv('../datasets/FinUI/100_aeb_features.csv', index=False)

In [None]:
# Layout features; recomputed on every run (this cell has no CSV cache).
lo_frames = []
c = None

for i in range(len(images)):
    fn = images[i]
    img_path = os.path.join(img_dir, fn)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f'cannot read image: {img_path}')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    [img_h, img_w] = img.shape[:2]

    # OCR and label tables paired with the image by list position.
    tfn = txt_labels[i]
    txt = pd.read_csv(os.path.join(ocr_dir, tfn))

    lfn = labels[i]
    lab = pd.read_csv(os.path.join(csv_dir, lfn))

    # f: feature values for this image, c: the matching column names.
    f, c = get_layout_features(img_h, img_w, lab, txt)
    lo_frames.append(pd.DataFrame([f], index=[int(fn.split('.')[0])]))

if lo_frames:
    # Single concat instead of growing the frame inside the loop; rows are
    # sorted by image id (presumably the order of the survey rows, which are
    # labelled 1..100).
    lo_features = pd.concat(lo_frames).sort_index()
    lo_features.columns = c
else:
    # No images: keep an empty frame instead of failing on an undefined `c`.
    lo_features = pd.DataFrame()

In [None]:
# Column names of the combined feature matrix: every group except the layout
# features gets a prefix identifying the frame it came from.
img_names = ['img_' + name for name in img_features.columns.to_list()]
txt_names = ['txt_' + name for name in txt_features.columns.to_list()]
ae_names = ['ae_' + name for name in ae_features.columns.to_list()]
aeb_names = ['aeb_' + name for name in aeb_features.columns.to_list()]
lo_names = lo_features.columns.to_list()

features = img_names + txt_names + ae_names + aeb_names + lo_names

# One entry per survey column.
questions = survey.columns.to_list()

In [None]:
# Feature matrix: the five feature frames side by side (missing values -> 0),
# in the same group order as the `features` name list.
# Rows are matched by position only, not by index.
X = np.concatenate(
    [
        img_features.fillna(0).values,
        txt_features.fillna(0).values,
        ae_features.fillna(0).values,
        aeb_features.fillna(0).values,
        lo_features.fillna(0).values,
    ],
    axis=1,
)

# Targets: the survey table as a plain array.
y = survey.values

In [None]:
# Restrict the analysis to the first 51 feature columns and their names
# (presumably the img/txt/ae groups -- TODO confirm against the group sizes).
idx = np.arange(51)
X = X[:, idx]
features = list(np.array(features)[idx])

# Positional split: first 70 rows for training, the rest for testing.
X_train, X_test = X[:70, :], X[70:, :]
y_train, y_test = y[:70, :], y[70:, :]


# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size = 0.3, random_state = 2
# )

# scaler = StandardScaler()
# scaler.fit(X_train_raw)
# X_train = scaler.transform(X_train_raw)
# X_test = scaler.transform(X_test_raw)

In [None]:
# scaler = StandardScaler()
# scaler.fit(X)
# X_scaled = scaler.transform(X)

In [None]:
# z[i, j] = mutual information between feature column i and survey column j
# (5 nearest neighbours). One column of z is produced per survey question.
mi_columns = []
for j in range(y.shape[1]):
    mi = np.array([compute_mi(X[:, i], y[:, j], 5) for i in range(X.shape[1])])
    mi_columns.append(mi.reshape(-1, 1))

z = np.hstack(mi_columns)

In [None]:
# Coordinate grids for plotting z: x runs over its columns, y over its rows.
n_rows, n_cols = z.shape[0], z.shape[1]
idx_x, idx_y = np.meshgrid(np.arange(n_cols), np.arange(n_rows))

In [None]:
# Filled contour map of the mutual-information matrix:
# questions along x, features along y.
fig, ax = plt.subplots(figsize=(16, 12))
filled = ax.contourf(idx_x, idx_y, z, 20, cmap='coolwarm')
cbar = fig.colorbar(filled, ax=ax)

ax.set_xticks(np.arange(len(questions)))
ax.set_xticklabels(questions, fontsize=16)
ax.set_yticks(np.arange(len(features)))
ax.set_yticklabels(features, fontsize=12)
# Alternative y ticks, one label per feature group (kept for reference):
# plt.yticks(
#    [0, 20, 35, 51, 67, 97, 127, 157, 187, 217, 247, 277, 307, 337],
#    ['Img', 'Txt', 'Ae', 'Ae_b', 'LO_all', 'LO_1', 'LO_2', 'LO_3', 'LO_4', 'LO_5', 'LO_6', 'LO_7', 'LO_8', 'LO_9'],
#    va='bottom', fontsize=14
# )

ax.set_xlabel('Questions', labelpad=10, fontsize=20)
ax.set_ylabel('Features', labelpad=10, fontsize=20)

cbar.set_label('Mutual Information', labelpad=10, fontsize=20)
cbar.ax.tick_params(labelsize=16)

plt.show()