In [None]:
import cv2
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sys
sys.path.append('..')

from features.imtools import get_features

In [None]:
# Expert score table, re-labelled with a 1-based index of exactly 100 entries
# (presumably the screenshot numbers -- confirm against the CSV).
scores_path = '../datasets/FinUI/100_avg_scores.csv'
df = pd.read_csv(scores_path).set_index(np.arange(1, 101))

In [None]:
# Locations of the screenshots, the element-box CSVs and the OCR CSVs.
img_dir = '../datasets/FinUI/images'
csv_dir = '../datasets/FinUI/csv'
ocr_dir = '../datasets/FinUI/ocr'

# os.listdir returns entries in arbitrary order, but later cells pair
# images[i] / labels[i] / txt_labels[i] by position, so every listing is sorted.
# NOTE(review): this assumes files belonging to the same screenshot sort into
# the same position in each directory (e.g. identical stems) -- confirm.
# splitext is used instead of split('.')[1], which raised IndexError on names
# without a dot and looked at the wrong part of names with several dots.
images = sorted(name for name in os.listdir(img_dir)
                if os.path.splitext(name)[1] == '.png')
labels = sorted(os.listdir(csv_dir))
txt_labels = sorted(os.listdir(ocr_dir))

# Image features

In [None]:
# Load the cached image features if present; otherwise compute and cache them.
img_features_path = '../datasets/FinUI/100_img_features.csv'

if os.path.exists(img_features_path):
    img_features = pd.read_csv(img_features_path)
    # The cache has no index column; rows are labelled 1..100 by position.
    img_features = img_features.set_index(np.arange(1, 101))

else:
    # Import under a private alias: later cells rebind the global
    # `get_features` to the txtools / aetools versions, so re-running this
    # cell after them would otherwise call the wrong extractor.
    from features.imtools import get_features as get_img_features

    rows = []
    for fn in tqdm(images):
        path = os.path.join(img_dir, fn)
        img = cv2.imread(path)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Rescale so that the longer side becomes 1200 px, keeping the aspect
        # ratio. cv2.resize expects (width, height), hence the swapped order.
        height, width = img.shape[:2]
        coef = max(height, width) / 1200
        sz = (int(width / coef), int(height / coef))
        img = cv2.resize(img, sz)

        f = get_img_features(img)
        rows.append(pd.DataFrame(f, index=[int(fn.split('.')[0])]))

    # Order rows by image number before saving: the CSV is written without an
    # index and re-labelled 1..100 by position on reload, and the rows are
    # compared positionally with the score table, so the arbitrary os.listdir
    # order must not leak into the row order.
    # Building the frame once also avoids quadratic concat-in-a-loop.
    img_features = pd.concat(rows).sort_index()
    img_features.to_csv(img_features_path, index=False)

In [None]:
# Pairwise correlation between the image features, drawn as an annotated heatmap.
corr = img_features.corr()
fig = plt.figure(figsize=(16, 14))

feature_names = corr.columns.values
heatmap_options = dict(
    xticklabels=feature_names,
    yticklabels=feature_names,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',
)
ax = sns.heatmap(corr, **heatmap_options)
plt.show()

In [None]:
from numpy.random import default_rng
from scipy.special import digamma
from sklearn.neighbors import KDTree


def get_radius_kneighbors(x, n_neighbors):
    """Per sample, return a radius just below the distance to its
    n_neighbors-th nearest other sample (Chebyshev / max-norm metric).

    :param x: ndarray, shape (n_samples, n_dim)
    :param n_neighbors: number of neighbors
    :returns: radius, shape (n_samples,)

    """
    tree = KDTree(x, metric="chebyshev")

    # Every query point is returned as its own nearest neighbor, so one extra
    # neighbor is requested to reach the n_neighbors-th *other* point.
    distances, _ = tree.query(x, k=n_neighbors + 1)
    kth_distance = distances[:, -1]

    # Step one representable float toward zero: the radius ends up marginally
    # smaller than the k-th neighbor distance (a zero distance stays zero).
    return np.nextafter(kth_distance, 0)


def num_points_within_radius(x, radius):
    """Count, for every sample, how many *other* samples lie within its radius
    (Chebyshev / max-norm metric).

    :param x: ndarray, shape (n_samples, n_dim)
    :param radius: radius, shape (n_samples,)
    :returns: float ndarray with the number of other points within radius

    """
    tree = KDTree(x, metric="chebyshev")
    counts = tree.query_radius(x, radius, count_only=True,
                               return_distance=False)

    # Each sample is counted inside its own ball; remove that self-match.
    return np.array(counts) - 1.0


def preprocess_data(x):
    """Return a float64 2-D copy of x whose columns are rescaled so that the
    mean absolute value of each column is one.

    :param x: ndarray, shape (n_samples,) or (n_samples, n_features)
    :returns: float ndarray, shape (n_samples, n_features)
    :raises ValueError: if x is neither 1- nor 2-dimensional

    """
    data = np.array(x, dtype=np.float64)

    if data.ndim not in (1, 2):
        raise ValueError(f'x.ndim = {data.ndim}, should be 1 or 2')
    if data.ndim == 1:
        # A single variable becomes one column.
        data = data.reshape(-1, 1)

    # Mean absolute amplitude per column, floored at 1e-100 so that all-zero
    # columns do not cause a division by zero.
    column_scale = np.maximum(1e-100, np.mean(np.abs(data), axis=0))

    return (1 / column_scale) * data


def add_noise(x, rng, noise_type='uniform', amplitude=1e-10):
    """Return a float64 copy of x with small random noise added, so that
    samples are probably unique.

    The input is never modified: the previous in-place ``+=`` mutated the
    caller's array and raised on integer arrays, although the function was
    documented as converting to float64.

    :param x: array-like of numbers
    :param rng: numpy random Generator (only ``random`` / ``normal`` are used)
    :param noise_type: 'uniform' (noise in [-amplitude/2, amplitude/2)) or
        'normal' (standard normal scaled by amplitude)
    :param amplitude: scale of the added noise
    :returns: float64 ndarray with the same shape as x
    :raises ValueError: if noise_type is not 'uniform' or 'normal'

    """
    x = np.asarray(x, dtype=np.float64)

    if noise_type == 'uniform':
        noise = rng.random(x.shape) - 0.5
    elif noise_type == 'normal':
        noise = rng.normal(size=x.shape)
    else:
        raise ValueError('Invalid noise type')

    # Out-of-place addition: always allocates a new array.
    return x + amplitude * noise


def compute_mi(x, y, n_neighbors=3, noise_type=None, seed=None):
    """Compute mutual information between two continuous variables with a
    k-nearest-neighbor estimator (the formula has the form of the
    Kraskov et al. estimator).

    :param x: real ndarray, shape (n_samples,) or (n_samples, n_features)
    :param y: real ndarray, shape (n_samples,) or (n_samples, n_features)
    :param n_neighbors: Number of nearest neighbors
    :param noise_type: add noise of given type (uniform, normal)
    :param seed: forwarded to numpy's default_rng when noise is added; pass an
        int (or an existing Generator) to make noisy estimates reproducible.
        The default None keeps the previous unseeded behavior.
    :returns: non-negative estimate of mutual information

    """
    n_samples = len(x)
    x, y = [preprocess_data(t) for t in [x, y]]

    if noise_type:
        rng = default_rng(seed)
        x, y = [add_noise(t, rng, noise_type) for t in [x, y]]

    xy = np.hstack((x, y))
    k = np.full(n_samples, n_neighbors)
    radius = get_radius_kneighbors(xy, n_neighbors)

    if noise_type is None:
        # A zero radius means the point has at least n_neighbors exact
        # duplicates in the joint space; use its true multiplicity as k.
        mask = (radius == 0)
        if mask.any():
            _, ix, counts = np.unique(xy[mask], axis=0, return_inverse=True,
                                      return_counts=True)
            # Flatten the inverse: NumPy 2.0.0 returned it with an extra
            # dimension when axis is given, which breaks the assignment.
            k[mask] = counts[np.ravel(ix)] - 1

    # Neighbor counts in each marginal space, using the joint-space radius.
    nx = num_points_within_radius(x, radius)
    ny = num_points_within_radius(y, radius)

    # The estimator can come out slightly negative; clip at zero.
    mi = max(0, digamma(n_samples) + np.mean(digamma(k))
             - np.mean(digamma(nx + 1)) - np.mean(digamma(ny + 1)))
    return mi


def compute_cmi(x, y, z, n_neighbors=3, noise_type=None, seed=None):
    """Compute conditional mutual information I(x;y|z) with a
    k-nearest-neighbor estimator.

    :param x: real ndarray, shape (n_samples,) or (n_samples, n_features)
    :param y: real ndarray, shape (n_samples,) or (n_samples, n_features)
    :param z: real ndarray, shape (n_samples,) or (n_samples, n_features)
    :param n_neighbors: Number of nearest neighbors
    :param noise_type: add noise of given type (uniform, normal)
    :param seed: forwarded to numpy's default_rng when noise is added; pass an
        int (or an existing Generator) to make noisy estimates reproducible.
        The default None keeps the previous unseeded behavior.
    :returns: non-negative estimate of conditional mutual information

    """
    n_samples = len(x)
    x, y, z = [preprocess_data(t) for t in [x, y, z]]

    if noise_type:
        rng = default_rng(seed)
        x, y, z = [add_noise(t, rng, noise_type) for t in [x, y, z]]

    xyz = np.hstack((x, y, z))
    k = np.full(n_samples, n_neighbors)
    radius = get_radius_kneighbors(xyz, n_neighbors)

    if noise_type is None:
        # A zero radius means the point has at least n_neighbors exact
        # duplicates in the joint space; use its true multiplicity as k.
        mask = (radius == 0)
        if mask.any():
            _, ix, counts = np.unique(xyz[mask], axis=0,
                                      return_inverse=True,
                                      return_counts=True)
            # Flatten the inverse: NumPy 2.0.0 returned it with an extra
            # dimension when axis is given, which breaks the assignment.
            k[mask] = counts[np.ravel(ix)] - 1

    # Neighbor counts in the (x,z), (y,z) and z subspaces, all using the
    # radius found in the full joint space.
    nxz = num_points_within_radius(np.hstack((x, z)), radius)
    nyz = num_points_within_radius(np.hstack((y, z)), radius)
    nz = num_points_within_radius(z, radius)

    # The estimator can come out slightly negative; clip at zero.
    cmi = max(0, np.mean(digamma(k)) - np.mean(digamma(nxz + 1))
              - np.mean(digamma(nyz + 1)) + np.mean(digamma(nz + 1)))
    return cmi


def compute_batch_mi(x, y, n_neighbors=3, noise_type=None, batch_size=500):
    """Average compute_mi over consecutive, non-overlapping batches.

    Samples beyond the last complete batch are ignored. When there are fewer
    than ``batch_size`` samples, all of them are used as a single batch
    (previously no batch was formed and the result was NaN).

    :param x: real ndarray, shape (n_samples,) or (n_samples, n_features)
    :param y: real ndarray, shape (n_samples,) or (n_samples, n_features)
    :param n_neighbors: Number of nearest neighbors
    :param noise_type: add noise of given type (uniform, normal)
    :param batch_size: number of samples per batch (default 500, as before)
    :returns: mean of the per-batch mutual information estimates
    :raises ValueError: if batch_size is not positive

    """
    if batch_size <= 0:
        raise ValueError('batch_size must be positive')

    N = len(x)
    if N >= batch_size:
        last_start = (N // batch_size) * batch_size
        bounds = [(i0, i0 + batch_size)
                  for i0 in range(0, last_start, batch_size)]
    else:
        # Not enough data for one full batch: fall back to a single batch.
        bounds = [(0, N)]

    mi = np.array([compute_mi(x[i0:i1], y[i0:i1], n_neighbors, noise_type)
                   for i0, i1 in bounds], dtype=np.float64)
    return mi.mean()

In [None]:
# Mutual information of every image feature with expert score column '3_1'.
target = df['3_1'].values
mi = [compute_mi(img_features[c].values, target)
      for c in img_features.columns]

In [None]:
# Bar chart of the per-feature MI values currently held in `mi`.
fig = plt.figure(figsize=(14, 8))
plt.bar(img_features.columns, mi)
plt.xticks(rotation=90)
plt.title('COLOR_SCHEME')
plt.show()

In [None]:
# Mutual information of every image feature with expert score column '3_2',
# shown as a bar chart.
mi=[]
for c in img_features.columns:
    mi.append(compute_mi(img_features[c].values, df['3_2'].values))

fig = plt.figure(figsize=(14,8))
plt.bar(img_features.columns, mi)
# Title typo fixed ('INFORMATIOIN' -> 'INFORMATION').
plt.title('INFORMATION')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Mutual information of every image feature with expert score column '3_3',
# shown as a bar chart.
target = df['3_3'].values
mi = [compute_mi(img_features[c].values, target)
      for c in img_features.columns]

fig = plt.figure(figsize=(14, 8))
plt.bar(img_features.columns, mi)
plt.xticks(rotation=90)
plt.title('OTHER')
plt.show()

In [None]:
# Keep the image features whose MI with score '3_1' exceeds 0.25.
# NOTE(review): MI is computed over all rows, including the rows that the
# following regression cell treats as out-of-sample.
target = df['3_1'].values
mi = [compute_mi(img_features[c].values, target)
      for c in img_features.columns]
is_selected = np.array(mi) > 0.25
dff = img_features[img_features.columns[is_selected]]

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the first 70 rows and score on the remaining rows.
X = dff.values
y = df['3_1'].values
X_train, y_train = X[:70, :], y[:70]
X_test, y_test = X[70:, :], y[70:]

reg = LinearRegression().fit(X_train, y_train)
oos_corr = np.corrcoef(reg.predict(X_test), y_test)[0, 1]
print(f'Out of sample correlation: {oos_corr:.3f}')

In [None]:
# Display the fitted linear-regression coefficients (one per column of dff).
reg.coef_

In [None]:
from sklearn.linear_model import LinearRegression
from numpy.polynomial.polynomial import polyfit

# Refit on all rows and compare the in-sample predictions with the scores.
X = dff.values
y = df['3_1'].values
reg = LinearRegression().fit(X, y)
prediction = reg.predict(X)

# polyfit returns coefficients lowest degree first: intercept b, then slope m.
b, m = polyfit(y, prediction, 1)
trend_line = b + m * y

plt.scatter(y, prediction)
plt.plot(y, trend_line, '-', c='b')
plt.xlabel('Expert Image Scores')
plt.ylabel('Image Score Predictions')
plt.show()

In [None]:
# In-sample correlation between the fitted predictions and the expert scores.
corr_matrix = np.corrcoef(prediction, y)
corr_matrix[0, 1]

# Text features

In [None]:
# Load the first listed screenshot (as RGB) and the first listed OCR table.
fn = images[0]
tfn = txt_labels[0]
img = cv2.cvtColor(cv2.imread(os.path.join(img_dir, fn)), cv2.COLOR_BGR2RGB)
txt = pd.read_csv(os.path.join(ocr_dir, tfn))

In [None]:
# NOTE: this rebinds the global `get_features` (bound to
# features.imtools.get_features by the first cell) to the text-feature
# extractor called as get_features(img, txt) below.
from features.txtools import get_features

In [None]:
# Load the cached text features if present; otherwise compute and cache them.
txt_features_path = '../datasets/FinUI/100_txt_features.csv'

if os.path.exists(txt_features_path):
    txt_features = pd.read_csv(txt_features_path)
    # The cache has no index column; rows are labelled 1..100 by position.
    txt_features = txt_features.set_index(np.arange(1, 101))

else:
    # Private alias so this cell does not depend on which module the global
    # `get_features` was last imported from.
    from features.txtools import get_features as get_txt_features

    # os.listdir order is arbitrary; sort both listings so that position i
    # refers to the same screenshot in each.
    # NOTE(review): assumes matching files sort identically -- confirm.
    image_files = sorted(images)
    ocr_files = sorted(txt_labels)

    rows = []
    for i in tqdm(range(100)):
        fn = image_files[i]
        path = os.path.join(img_dir, fn)
        img = cv2.imread(path)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        tfn = ocr_files[i]
        txt = pd.read_csv(os.path.join(ocr_dir, tfn))

        f = get_txt_features(img, txt)
        rows.append(pd.DataFrame(f, index=[int(fn.split('.')[0])]))

    # Order rows by image number before saving: the CSV is written without an
    # index and re-labelled 1..100 by position on reload.
    txt_features = pd.concat(rows).sort_index()
    txt_features.to_csv(txt_features_path, index=False)

In [None]:
# Pairwise correlation between the text features, drawn as an annotated heatmap.
corr = txt_features.corr()
fig = plt.figure(figsize=(16, 14))

feature_names = corr.columns.values
heatmap_options = dict(
    xticklabels=feature_names,
    yticklabels=feature_names,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',
)
ax = sns.heatmap(corr, **heatmap_options)
plt.show()

In [None]:
# Replace missing text features with 0, then compute the MI of every text
# feature with expert score column '2_1'.
tf = txt_features.fillna(0)
target = df['2_1'].values
mi = [compute_mi(tf[c].values, target) for c in txt_features.columns]

In [None]:
# Bar chart of the per-feature MI values currently held in `mi`.
fig = plt.figure(figsize=(14, 8))
plt.bar(tf.columns, mi)
plt.xticks(rotation=90)
plt.title('READABILITY')
plt.show()

In [None]:
# Mutual information of every text feature with expert score column '2_2',
# shown as a bar chart.
target = df['2_2'].values
mi = [compute_mi(tf[c].values, target) for c in txt_features.columns]

fig = plt.figure(figsize=(14, 8))
plt.bar(tf.columns, mi)
plt.xticks(rotation=90)
plt.title('INFORMATION')
plt.show()

In [None]:
# Mutual information of every text feature with expert score column '2_3',
# shown as a bar chart.
target = df['2_3'].values
mi = [compute_mi(tf[c].values, target) for c in txt_features.columns]

fig = plt.figure(figsize=(14, 8))
plt.bar(tf.columns, mi)
plt.xticks(rotation=90)
plt.title('OTHER')
plt.show()

In [None]:
# Keep the text features whose MI with score '2_2' exceeds 0.05.
# NOTE(review): MI is computed over all rows, including the rows that the
# following regression cell treats as out-of-sample.
target = df['2_2'].values
mi = [compute_mi(tf[c].values, target) for c in txt_features.columns]
is_selected = np.array(mi) > 0.05
dff = tf[tf.columns[is_selected]]
dff.columns

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the first 70 rows and score on the remaining rows.
X = dff.values
y = df['2_2'].values
X_train, y_train = X[:70, :], y[:70]
X_test, y_test = X[70:, :], y[70:]

reg = LinearRegression().fit(X_train, y_train)
oos_corr = np.corrcoef(reg.predict(X_test), y_test)[0, 1]
print(f'Out of sample correlation: {oos_corr:.3f}')

In [None]:
from sklearn.linear_model import LinearRegression
from numpy.polynomial.polynomial import polyfit

# Refit on all rows and compare the in-sample predictions with the scores.
# Here y stays a pandas Series (no .values), unlike the hold-out cell.
X = dff.values
y = df['2_2']
reg = LinearRegression().fit(X, y)
prediction = reg.predict(X)

# polyfit returns coefficients lowest degree first: intercept b, then slope m.
b, m = polyfit(y, prediction, 1)
trend_line = b + m * y

plt.scatter(y, prediction)
plt.plot(y, trend_line, '-', c='b')
plt.xlabel('Expert Image Scores')
plt.ylabel('Image Score Predictions')
plt.show()

In [None]:
# In-sample correlation between the fitted predictions and the expert scores.
corr_matrix = np.corrcoef(prediction, y)
corr_matrix[0, 1]

In [None]:
# Keep the text features (missing values replaced by 0) whose MI with score
# '2_1' exceeds 0.1.
# NOTE(review): MI is computed over all rows, including the rows that the
# following regression cell treats as out-of-sample.
tf = txt_features.fillna(0)
target = df['2_1'].values
mi = [compute_mi(tf[c].values, target) for c in txt_features.columns]
is_selected = np.array(mi) > 0.1
dff = tf[tf.columns[is_selected]]
dff.columns

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the first 70 rows and score on the remaining rows.
X = dff.values
y = df['2_1'].values
X_train, y_train = X[:70, :], y[:70]
X_test, y_test = X[70:, :], y[70:]

reg = LinearRegression().fit(X_train, y_train)
oos_corr = np.corrcoef(reg.predict(X_test), y_test)[0, 1]
print(f'Out of sample correlation: {oos_corr:.3f}')

# Aesthetic Features

In [None]:
# NOTE: this rebinds the global `get_features` once more, now to the
# aesthetic-feature extractor called as get_features(img, boxes) below.
from features.aetools import get_features

In [None]:
# Load the cached aesthetic features if present; otherwise compute and cache.
ae_features_path = '../datasets/FinUI/100_ae_features.csv'

if os.path.exists(ae_features_path):
    ae_features = pd.read_csv(ae_features_path)
    # The cache has no index column; rows are labelled 1..100 by position.
    ae_features = ae_features.set_index(np.arange(1, 101))

else:
    # Private alias so this cell does not depend on which module the global
    # `get_features` was last imported from.
    from features.aetools import get_features as get_ae_features

    # os.listdir order is arbitrary; sort all listings so that position i
    # refers to the same screenshot in each.
    # NOTE(review): assumes matching files sort identically -- confirm.
    image_files = sorted(images)
    ocr_files = sorted(txt_labels)
    box_files = sorted(labels)

    rows = []
    for i in tqdm(range(100)):
        fn = image_files[i]
        path = os.path.join(img_dir, fn)
        img = cv2.imread(path)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        tfn = ocr_files[i]
        txt = pd.read_csv(os.path.join(ocr_dir, tfn))

        lfn = box_files[i]
        lab = pd.read_csv(os.path.join(csv_dir, lfn))

        # Stack the labelled boxes and the OCR boxes into one table.
        boxes = pd.concat([lab, txt], ignore_index=True)[['xmin', 'xmax', 'ymin', 'ymax']]

        f = get_ae_features(img, boxes)
        rows.append(pd.DataFrame(f, index=[int(fn.split('.')[0])]))

    # Order rows by image number before saving: the CSV is written without an
    # index and re-labelled 1..100 by position on reload.
    ae_features = pd.concat(rows).sort_index()
    ae_features.to_csv(ae_features_path, index=False)

In [None]:
# Correlation heatmap of the aesthetic features, with enlarged labels.
corr = ae_features.corr()
fig = plt.figure(figsize=(16, 14))

feature_names = corr.columns.values
ax = sns.heatmap(corr, xticklabels=feature_names, yticklabels=feature_names,
                 cmap='coolwarm', vmin=-1, vmax=1, annot=True, fmt='.2f')

# The heatmap's colorbar hangs off the first collection drawn on the axes.
colorbar = ax.collections[0].colorbar
colorbar.set_label("Correlation Coefficient", labelpad=10, fontsize=20)

plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
for set_axis_label in (plt.xlabel, plt.ylabel):
    set_axis_label('Aesthetic Features', labelpad=10, fontsize=20)
plt.show()

In [None]:
# Replace missing aesthetic features with 0, then compute the MI of every
# aesthetic feature with expert score column '1_1'.
aef = ae_features.fillna(0)
target = df['1_1'].values
mi = [compute_mi(aef[c].values, target) for c in ae_features.columns]

In [None]:
# Bar chart of the per-feature MI values currently held in `mi`.
fig = plt.figure(figsize=(14, 8))
plt.bar(aef.columns, mi)
plt.xticks(rotation=90)
plt.title('DENSITY')
plt.show()

In [None]:
# Mutual information of every aesthetic feature with expert score column
# '1_2', shown as a bar chart.
target = df['1_2'].values
mi = [compute_mi(aef[c].values, target) for c in ae_features.columns]

fig = plt.figure(figsize=(14, 8))
plt.bar(aef.columns, mi)
plt.xticks(rotation=90)
plt.title('COLOR_SCHEME')
plt.show()

In [None]:
# Mutual information of every aesthetic feature with expert score column
# '1_3', shown as a bar chart.
target = df['1_3'].values
mi = [compute_mi(aef[c].values, target) for c in ae_features.columns]

fig = plt.figure(figsize=(14, 8))
plt.bar(aef.columns, mi)
plt.xticks(rotation=90)
plt.title('ORDER')
plt.show()

In [None]:
# Mutual information of every aesthetic feature with expert score column
# '1_4', shown as a bar chart.
target = df['1_4'].values
mi = [compute_mi(aef[c].values, target) for c in ae_features.columns]

fig = plt.figure(figsize=(14, 8))
plt.bar(aef.columns, mi)
plt.title('BALANCE')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Mutual information of every aesthetic feature with expert score column
# '1_5', shown as a bar chart.
target = df['1_5'].values
mi = [compute_mi(aef[c].values, target) for c in ae_features.columns]

fig = plt.figure(figsize=(14, 8))
plt.bar(aef.columns, mi)
plt.title('OTHER')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Keep the aesthetic features whose MI with score '1_5' exceeds 0.05.
# NOTE(review): MI is computed over all rows, including the rows that the
# following regression cell treats as out-of-sample.
target = df['1_5'].values
mi = [compute_mi(aef[c].values, target) for c in ae_features.columns]

is_selected = np.array(mi) > 0.05
dff = aef[aef.columns[is_selected]]
dff.columns

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the first 70 rows and score on the remaining rows.
X = dff.values
y = df['1_5'].values
X_train, y_train = X[:70, :], y[:70]
X_test, y_test = X[70:, :], y[70:]

reg = LinearRegression().fit(X_train, y_train)
oos_corr = np.corrcoef(reg.predict(X_test), y_test)[0, 1]
print(f'Out of sample correlation: {oos_corr:.3f}')

In [None]:
from sklearn.linear_model import LinearRegression
from numpy.polynomial.polynomial import polyfit

# Refit on all rows and compare the in-sample predictions with the scores.
# Here y stays a pandas Series (no .values), unlike the hold-out cell.
X = dff.values
y = df['1_5']
reg = LinearRegression().fit(X, y)
prediction = reg.predict(X)

# polyfit returns coefficients lowest degree first: intercept b, then slope m.
b, m = polyfit(y, prediction, 1)
trend_line = b + m * y

plt.scatter(y, prediction)
plt.plot(y, trend_line, '-', c='b')
plt.xlabel('Expert Layout Scores')
plt.ylabel('AE Score Predictions')
plt.show()

In [None]:
# In-sample correlation between the fitted predictions and the expert scores.
corr_matrix = np.corrcoef(prediction, y)
corr_matrix[0, 1]

# Image Features relationship with layout color scheme

In [None]:
mi=[]
for c in img_features.columns:
    mi.append(compute_mi(img_features[c].values, df['1_2'].values))
    
fig = plt.figure(figsize=(14,8))
plt.bar(img_features.columns, mi)
plt.title('COLOR_SCHEME')
plt.xticks(rotation=90)
plt.show()

# Block aesthetics

In [None]:
# Directory holding the block-level box CSVs.
block_dir = '../datasets/FinUI/block_csv'
# os.listdir returns entries in arbitrary order, but block_labels[i] is later
# paired with images[i] by position, so the listing is sorted.
# NOTE(review): assumes block files sort in the same order as the
# screenshots they belong to -- confirm.
block_labels = sorted(os.listdir(block_dir))

In [None]:
# Load the cached block-level aesthetic features if present; otherwise
# compute and cache them.
aeb_features_path = '../datasets/FinUI/100_aeb_features.csv'

if os.path.exists(aeb_features_path):
    aeb_features = pd.read_csv(aeb_features_path)
    # The cache has no index column; rows are labelled 1..100 by position.
    aeb_features = aeb_features.set_index(np.arange(1, 101))

else:
    # In notebook order the global `get_features` was last imported from
    # features.aetools; import it under a private alias so this cell does not
    # depend on execution order.
    from features.aetools import get_features as get_ae_features

    # os.listdir order is arbitrary; sort both listings so that position i
    # refers to the same screenshot in each.
    # NOTE(review): assumes matching files sort identically -- confirm.
    image_files = sorted(images)
    block_files = sorted(block_labels)

    rows = []
    for i in tqdm(range(100)):
        fn = image_files[i]
        path = os.path.join(img_dir, fn)
        img = cv2.imread(path)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        lfn = block_files[i]
        lab = pd.read_csv(os.path.join(block_dir, lfn))

        boxes = lab[['xmin', 'xmax', 'ymin', 'ymax']]

        f = get_ae_features(img, boxes)
        rows.append(pd.DataFrame(f, index=[int(fn.split('.')[0])]))

    # Order rows by image number before saving: the CSV is written without an
    # index and re-labelled 1..100 by position on reload.
    aeb_features = pd.concat(rows).sort_index()
    aeb_features.to_csv(aeb_features_path, index=False)

In [None]:
# Copy of the block-level features with missing values replaced by 0.
aebf = aeb_features.fillna(0)

In [None]:
# Mutual information of every block-level feature with expert score column
# '1_4', shown as a bar chart.
target = df['1_4'].values
mi = [compute_mi(aebf[c].values, target) for c in aeb_features.columns]

fig = plt.figure(figsize=(14, 8))
plt.bar(aebf.columns, mi)
plt.title('BALANCE')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Keep the block-level features whose MI with score '1_4' exceeds 0.05.
# NOTE(review): MI is computed over all rows, including the rows that the
# following regression cell treats as out-of-sample.
target = df['1_4'].values
mi = [compute_mi(aebf[c].values, target) for c in aeb_features.columns]

is_selected = np.array(mi) > 0.05
dff = aebf[aebf.columns[is_selected]]
dff.columns

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the first 70 rows and score on the remaining rows.
X = dff.values
y = df['1_4'].values
X_train, y_train = X[:70, :], y[:70]
X_test, y_test = X[70:, :], y[70:]

reg = LinearRegression().fit(X_train, y_train)
oos_corr = np.corrcoef(reg.predict(X_test), y_test)[0, 1]
print(f'Out of sample correlation: {oos_corr:.3f}')

In [None]:
from sklearn.linear_model import LinearRegression
from numpy.polynomial.polynomial import polyfit

# Refit on all rows and compare the in-sample predictions with the scores.
# Here y stays a pandas Series (no .values), unlike the hold-out cell.
X = dff.values
y = df['1_4']
reg = LinearRegression().fit(X, y)
prediction = reg.predict(X)

# polyfit returns coefficients lowest degree first: intercept b, then slope m.
b, m = polyfit(y, prediction, 1)
trend_line = b + m * y

plt.scatter(y, prediction)
plt.plot(y, trend_line, '-', c='b')
plt.xlabel('Expert Layout Scores')
plt.ylabel('AE Score Predictions')
plt.show()

In [None]:
# In-sample correlation between the fitted predictions and the expert scores.
corr_matrix = np.corrcoef(prediction, y)
corr_matrix[0, 1]