In [None]:
import cv2
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.special import digamma
from sklearn.neighbors import KDTree
from geneticalgorithm import geneticalgorithm as ga
from numpy.polynomial.polynomial import polyfit
import sys
sys.path.append('..')

from features.imtools import get_features as get_img_features
from features.txtools import get_features as get_txt_features
from features.aetools import get_features as get_ae_features
from features.laytools import get_layout_features

In [None]:
def get_radius_kneighbors(x, n_neighbors):
    """Return a per-point radius lying just below the k-th neighbour distance.

    A Chebyshev-metric KD-tree is built on ``x`` and queried with the same
    points, asking for ``n_neighbors + 1`` neighbours; the extra slot leaves
    room for each point matching itself.

    Parameters
    ----------
    x : 2-D array-like of points (one row per point).
    n_neighbors : number of neighbours that defines the radius.

    Returns
    -------
    1-D array with, for every row, the largest returned neighbour distance
    moved one floating-point step toward zero.
    """
    tree = KDTree(x, metric="chebyshev")
    distances, _ = tree.query(x, k=n_neighbors + 1)
    farthest = distances[:, -1]

    # Step toward 0 so a later "within radius" query excludes the k-th
    # neighbour itself.
    return np.nextafter(farthest, 0)


def num_points_within_radius(x, radius):
    """Count, for each row of ``x``, the rows within ``radius`` minus one.

    Distances use the Chebyshev metric.  ``radius`` is passed straight to
    ``KDTree.query_radius``.  One is subtracted from every count (the query
    points are the tree's own points), and the result is a float array.
    """
    tree = KDTree(x, metric="chebyshev")
    counts = tree.query_radius(
        x, radius, count_only=True, return_distance=False
    )
    return np.array(counts) - 1.0


def preprocess_data(x):
    """Convert ``x`` to a 2-D float64 array with each column scaled by its mean magnitude.

    Parameters
    ----------
    x : array-like, 1-D (treated as a single column) or 2-D.

    Returns
    -------
    2-D ``float64`` array in which every column has been multiplied by
    ``1 / mean(|column|)``; the mean is floored at ``1e-100`` so an all-zero
    column does not cause a division by zero.

    Raises
    ------
    ValueError
        If ``x`` has a number of dimensions other than 1 or 2.
    """
    arr = np.array(x, dtype=np.float64)
    if arr.ndim not in (1, 2):
        raise ValueError(f'x.ndim = {arr.ndim}, should be 1 or 2')
    if arr.ndim == 1:
        arr = arr.reshape(-1, 1)

    column_scale = np.mean(np.abs(arr), axis=0)
    column_scale = np.maximum(1e-100, column_scale)

    return (1 / column_scale) * arr


def compute_mi(x, y, n_neighbors=5):
    """Estimate the mutual information between ``x`` and ``y``.

    k-nearest-neighbour estimator (labelled "Kraskov" in the original code).
    Both inputs are rescaled with ``preprocess_data``, stacked into a joint
    space, and a per-point radius is taken from the ``n_neighbors``-th
    neighbour in that joint space.  Neighbour counts inside that radius are
    then taken separately in ``x`` and in ``y``.

    Parameters
    ----------
    x, y : array-like, 1-D or 2-D, with the same number of samples.
    n_neighbors : neighbours used to set the radius (default 5).

    Returns
    -------
    The estimate, clipped below at 0.
    """
    sample_count = len(x)
    x = preprocess_data(x)
    y = preprocess_data(y)
    joint = np.hstack((x, y))

    radius = get_radius_kneighbors(joint, n_neighbors)
    k_per_point = np.full(sample_count, n_neighbors)

    # Points whose radius collapsed to zero: use the number of exact
    # duplicates of that joint point (minus the point itself) as their k.
    tied = (radius == 0)
    if tied.sum() > 0:
        _, inverse, multiplicity = np.unique(
            joint[tied], axis=0, return_inverse=True, return_counts=True
        )
        k_per_point[tied] = multiplicity[inverse] - 1

    neighbours_x = num_points_within_radius(x, radius)
    neighbours_y = num_points_within_radius(y, radius)

    estimate = (
        digamma(sample_count)
        + np.mean(digamma(k_per_point))
        - np.mean(digamma(neighbours_x + 1))
        - np.mean(digamma(neighbours_y + 1))
    )
    return max(0, estimate)

def greedy(x, y, n_neighbors=5, score_fn=None):
    """Greedy forward selection of feature columns.

    Starting from an empty set, each round scores every remaining column
    together with the already selected ones and adds the column giving the
    highest score.  Selection stops as soon as the best candidate does not
    strictly improve on the best score so far (initially 0).

    Parameters
    ----------
    x : 2-D numpy array, one column per candidate feature.
    y : target passed unchanged to the scoring function.
    n_neighbors : forwarded to the scoring function (default 5).
    score_fn : optional callable ``score_fn(x_subset, y, n_neighbors)``.
        Defaults to ``compute_mi``.

    Returns
    -------
    list of int : selected column indices, in the order they were added.
    """
    scorer = compute_mi if score_fn is None else score_fn

    selected = []
    remaining = np.arange(0, x.shape[1])
    best_score = 0
    while len(remaining) > 0:
        scores = np.array([
            scorer(x[:, selected + [int(col)]], y, n_neighbors)
            for col in remaining
        ])
        best_pos = int(np.argmax(scores))
        # `not (a > b)` rather than `a <= b` so a NaN score also stops the search.
        if not scores[best_pos] > best_score:
            break
        best_score = scores[best_pos]
        selected.append(int(remaining[best_pos]))
        # Remove by POSITION in `remaining`; the column value stops matching
        # its position as soon as one candidate has been removed.
        remaining = np.delete(remaining, best_pos)
    return selected

In [None]:
# Survey scores; rows are relabelled 1..N by position.
survey = pd.read_csv('100/100_avg_scores.csv')
survey = survey.set_index(np.arange(1, len(survey) + 1))

# os.path.join instead of hard-coded '\\' so the paths also work off Windows.
img_dir = os.path.join('100', 'images')
csv_dir = os.path.join('100', 'csv')
ocr_dir = os.path.join('100', 'ocr')
block_dir = os.path.join('100', 'block_csv')


def _numeric_name_key(name):
    """Sort key: leading integer of the file name first, then the name itself."""
    digits = name[:len(name) - len(name.lstrip('0123456789'))]
    return (int(digits) if digits else float('inf'), name)


# os.listdir returns entries in arbitrary order, while the feature cells pair
# images / labels / OCR / block files by list position.  Sorting every listing
# with the same numeric-aware key makes that pairing deterministic.
# NOTE(review): this assumes matching files share the same leading number
# across the four directories - confirm against the data layout.
images = sorted(
    (fn for fn in os.listdir(img_dir)
     if os.path.splitext(fn)[1].lower() == '.png'),
    key=_numeric_name_key,
)
labels = sorted(os.listdir(csv_dir), key=_numeric_name_key)
txt_labels = sorted(os.listdir(ocr_dir), key=_numeric_name_key)
block_labels = sorted(os.listdir(block_dir), key=_numeric_name_key)

In [None]:
# Image features: loaded from the CSV cache when present, otherwise computed
# from every image and written to the cache.
img_cache_path = '100/100_img_features.csv'

if os.path.exists(img_cache_path):
    img_features = pd.read_csv(img_cache_path)
    # The cache stores no index; rows are relabelled 1..N by position.
    img_features = img_features.set_index(np.arange(1, len(img_features) + 1))

else:
    img_frames = []

    for fn in tqdm(images):
        image_path = os.path.join(img_dir, fn)
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f'cv2.imread could not read {image_path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Rescale so the longer side becomes 1200 px (aspect ratio kept,
        # sizes truncated to int); cv2.resize expects (width, height).
        height, width = img.shape[:2]
        coef = max(width, height) / 1200
        sz = (int(width / coef), int(height / coef))
        img = cv2.resize(img, sz)

        f = get_img_features(img)
        img_frames.append(pd.DataFrame(f, index=[int(fn.split('.')[0])]))

    # Concatenate once (not inside the loop) and order rows by image number,
    # because the cache is written without an index and is relabelled by
    # position when it is read back.
    img_features = pd.concat(img_frames).sort_index() if img_frames else pd.DataFrame()
    img_features.to_csv(img_cache_path, index=False)

In [None]:
# Text features: loaded from the CSV cache when present, otherwise computed
# from each image together with its OCR table and written to the cache.
txt_cache_path = '100/100_txt_features.csv'

if os.path.exists(txt_cache_path):
    txt_features = pd.read_csv(txt_cache_path)
    # The cache stores no index; rows are relabelled 1..N by position.
    txt_features = txt_features.set_index(np.arange(1, len(txt_features) + 1))

else:
    txt_frames = []

    # Images and OCR files are paired by list position.
    for i, fn in enumerate(tqdm(images)):
        image_path = os.path.join(img_dir, fn)
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f'cv2.imread could not read {image_path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        tfn = txt_labels[i]
        txt = pd.read_csv(os.path.join(ocr_dir, tfn))

        f = get_txt_features(img, txt)
        txt_frames.append(pd.DataFrame(f, index=[int(fn.split('.')[0])]))

    # Concatenate once and order rows by image number, because the cache is
    # written without an index and is relabelled by position on reload.
    txt_features = pd.concat(txt_frames).sort_index() if txt_frames else pd.DataFrame()
    txt_features.to_csv(txt_cache_path, index=False)

In [None]:
# "ae" features: computed from each image plus the bounding boxes of its
# label CSV and OCR CSV combined; cached as CSV.
ae_cache_path = '100/100_ae_features.csv'

if os.path.exists(ae_cache_path):
    ae_features = pd.read_csv(ae_cache_path)
    # The cache stores no index; rows are relabelled 1..N by position.
    ae_features = ae_features.set_index(np.arange(1, len(ae_features) + 1))

else:
    ae_frames = []

    # Images, OCR files and label files are paired by list position.
    for i, fn in enumerate(tqdm(images)):
        image_path = os.path.join(img_dir, fn)
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f'cv2.imread could not read {image_path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        tfn = txt_labels[i]
        txt = pd.read_csv(os.path.join(ocr_dir, tfn))

        lfn = labels[i]
        lab = pd.read_csv(os.path.join(csv_dir, lfn))

        # Label boxes first, then OCR boxes, keeping only the coordinates.
        boxes = pd.concat([lab, txt], ignore_index=True)[['xmin', 'xmax', 'ymin', 'ymax']]

        f = get_ae_features(img, boxes)
        ae_frames.append(pd.DataFrame(f, index=[int(fn.split('.')[0])]))

    # Concatenate once and order rows by image number, because the cache is
    # written without an index and is relabelled by position on reload.
    ae_features = pd.concat(ae_frames).sort_index() if ae_frames else pd.DataFrame()
    ae_features.to_csv(ae_cache_path, index=False)

In [None]:
# "aeb" features: same extractor as the "ae" features, but fed the boxes
# from the block-level CSV files; cached as CSV.
aeb_cache_path = '100/100_aeb_features.csv'

if os.path.exists(aeb_cache_path):
    aeb_features = pd.read_csv(aeb_cache_path)
    # The cache stores no index; rows are relabelled 1..N by position.
    aeb_features = aeb_features.set_index(np.arange(1, len(aeb_features) + 1))

else:
    aeb_frames = []

    # Images and block files are paired by list position.
    for i, fn in enumerate(tqdm(images)):
        image_path = os.path.join(img_dir, fn)
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f'cv2.imread could not read {image_path}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        lfn = block_labels[i]
        lab = pd.read_csv(os.path.join(block_dir, lfn))

        boxes = lab[['xmin', 'xmax', 'ymin', 'ymax']]

        f = get_ae_features(img, boxes)
        aeb_frames.append(pd.DataFrame(f, index=[int(fn.split('.')[0])]))

    # Concatenate once and order rows by image number, because the cache is
    # written without an index and is relabelled by position on reload.
    aeb_features = pd.concat(aeb_frames).sort_index() if aeb_frames else pd.DataFrame()
    aeb_features.to_csv(aeb_cache_path, index=False)

In [None]:
# Flat list of feature names: every column of each feature table, prefixed
# with the table it came from, in the order img -> txt -> ae -> aeb.
features = []
for prefix, table in (
    ('img_', img_features),
    ('txt_', txt_features),
    ('ae_', ae_features),
    ('aeb_', aeb_features),
):
    features.extend(prefix + column for column in table.columns.to_list())

# One entry per survey column.
questions = survey.columns.to_list()

In [None]:
# Design matrix: the four feature tables side by side (img, txt, ae, aeb),
# with missing values replaced by 0.  Column order matches `features`.
X = np.concatenate(
    (
        img_features.fillna(0).values,
        txt_features.fillna(0).values,
        ae_features.fillna(0).values,
        aeb_features.fillna(0).values,
    ),
    axis=1,
)

# Targets: the survey table as a plain array (one column per question).
y = survey.values

In [None]:
# Keep only the first 51 feature columns (and their names).
idx = np.arange(51)
X = X[:, idx]
features = list(np.array(features)[idx])

# Fixed positional split: the first 70 rows train, the remaining rows test.
X_train, X_test = X[:70, :], X[70:, :]
y_train, y_test = y[:70, :], y[70:, :]

# Alternatives left disabled by the author: a random 70/30 split via
# train_test_split(X, y, test_size=0.3, random_state=2) and StandardScaler
# standardisation fitted on the training rows.

In [None]:
import h2o
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators import H2OGradientBoostingEstimator

In [None]:
# Initialise the H2O connection; the H2OFrame and estimator calls in the
# following cells depend on it.
h2o.init()

In [None]:
# Survey question to model and the minimum mutual information a feature
# needs (strictly greater than) to be kept.
answer = '3_1'
thresh = 0.05

# Look the target column up once instead of on every use.
answer_col = questions.index(answer)
target_train = y_train[:, answer_col]
target_test = y_test[:, answer_col]

# MI between each feature column and the training target.  Columns are
# addressed by position, which avoids a linear name search per feature.
mi = [compute_mi(X_train[:, col], target_train) for col in range(len(features))]

# Boolean mask over the feature columns.
idx = np.array(mi) > thresh
selected_names = np.array(features)[idx]

# H2O frames hold the selected features followed by a final 'target' column.
out = pd.DataFrame(columns=selected_names, data=X_train[:, idx])
out['target'] = target_train
train = h2o.H2OFrame(out)
out = pd.DataFrame(columns=selected_names, data=X_test[:, idx])
out['target'] = target_test
valid = h2o.H2OFrame(out)

In [None]:
# Every column of the training frame except the last is a predictor; the
# last column is the 'target' that was appended when the frame was built.
predictors = train.columns[:-1]
response = 'target'

In [None]:
# Single GBM fit with fixed hyper-parameters, 5-fold cross-validation on the
# training frame, and `valid` as validation frame.
gbm_settings = dict(
    nfolds=5,
    seed=1111,
    keep_cross_validation_predictions=True,
    learn_rate=0.002,
    ntrees=5000,
    max_depth=20,
)
gbm = H2OGradientBoostingEstimator(**gbm_settings)
gbm.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

# Report: R^2 on training data, R^2 on the validation frame, and the
# correlation between validation predictions and the true test scores.
train_r2 = gbm.r2()
valid_r2 = gbm.r2(valid=True)
valid_pred = gbm.predict(valid).as_data_frame().predict
pred_corr = np.corrcoef(valid_pred, y_test[:, questions.index(answer)])[0, 1]

print(train_r2, valid_r2, pred_corr)

In [None]:
# Scatter of predicted vs. true scores for the selected question, with a
# least-squares line through the points.
prediction = gbm.predict(valid).as_data_frame().predict

# Named `y_true`, not `y`: `y` holds the full survey matrix that the
# train/test split cell slices, and must not be overwritten here.
y_true = y_test[:, questions.index(answer)]

# numpy.polynomial.polynomial.polyfit returns coefficients from lowest to
# highest degree: intercept first, then slope.
b, m = polyfit(y_true, prediction, 1)

fig, ax = plt.subplots()
ax.scatter(y_true, prediction)
ax.plot(y_true, b + m * y_true, '-', c='b')
ax.set_xlabel('Expert Layout Scores')
ax.set_ylabel('Score Predictions')
plt.show()

In [None]:
# Hyper-parameter grid explored for every (question, MI threshold) pair.
gbm_params = {
    'learn_rate': [0.05, 0.01, 0.005, 0.002, 0.001],
    'max_depth': [5, 20],
    'sample_rate': [0.9, 1.0],
    'ntrees': [100, 500, 1000, 2000]
}

response = 'target'

# One dict per (question, threshold) run.
results = []

for q_col, question in enumerate(questions):
    # Only the listed question(s) are processed.
    if question not in ['2_3']:
        continue

    target_train = y_train[:, q_col]
    target_test = y_test[:, q_col]

    # MI of every feature column with this question's training target,
    # computed once and reused for all thresholds.
    mi = np.array(
        [compute_mi(X_train[:, col], target_train) for col in range(len(features))]
    )

    for thresh in [0, 0.05, 0.1, 0.15]:
        # Clear the H2O cluster so frames and the fixed grid id can be reused.
        h2o.remove_all()

        idx = mi > thresh
        if not idx.any():
            # No predictor left to train on at this threshold.
            print(f'question {question}: no feature with MI > {thresh}, skipping')
            continue

        res = {'question': question, 'thresh': thresh}
        selected_names = np.array(features)[idx]

        out = pd.DataFrame(columns=selected_names, data=X_train[:, idx])
        out['target'] = target_train
        train = h2o.H2OFrame(out)

        # NOTE(review): the validation frame is built from the test rows, so
        # the grid is ranked on the same data the metrics below report.
        out = pd.DataFrame(columns=selected_names, data=X_test[:, idx])
        out['target'] = target_test
        valid = h2o.H2OFrame(out)

        predictors = train.columns[:-1]

        gbm_grid = H2OGridSearch(
            model=H2OGradientBoostingEstimator,
            grid_id='gbm_grid',
            hyper_params=gbm_params
        )

        gbm_grid.train(
            x=predictors,
            y=response,
            training_frame=train,
            validation_frame=valid,
            seed=1111
        )

        # First model as returned by the grid.
        # NOTE(review): confirm H2O's default ordering puts the best
        # validation model first; otherwise sort explicitly via get_grid().
        best_model = gbm_grid.models[0]
        res['params'] = [
            best_model.parms[x]['input_value']
            for x in ['learn_rate', 'max_depth', 'sample_rate', 'ntrees']
        ]
        res['metrics'] = [
            best_model.r2(valid=True),
            np.corrcoef(best_model.predict(valid).as_data_frame().predict, target_test)[0, 1]
        ]
        results.append(res)
        print(res)


In [None]:
# Same grid search as for single questions, but the target is the mean of
# all question scores per row ("total").  Uses `gbm_params` and `response`
# defined in the per-question grid-search cell.
y_train_total = y_train.mean(axis=1)
y_test_total = y_test.mean(axis=1)

# MI of every feature column with the total training target, computed once.
mi = np.array(
    [compute_mi(X_train[:, col], y_train_total) for col in range(len(features))]
)

# One dict per threshold, kept so the runs are not only printed.
total_results = []

for thresh in [0, 0.05, 0.1, 0.15, 0.2, 0.25]:
    # Clear the H2O cluster so frames and the fixed grid id can be reused.
    h2o.remove_all()

    idx = mi > thresh
    if not idx.any():
        # No predictor left to train on at this threshold.
        print(f'total: no feature with MI > {thresh}, skipping')
        continue

    res = {'question': 'total', 'thresh': thresh}
    selected_names = np.array(features)[idx]

    out = pd.DataFrame(columns=selected_names, data=X_train[:, idx])
    out['target'] = y_train_total
    train = h2o.H2OFrame(out)

    # NOTE(review): the validation frame is built from the test rows, so the
    # grid is ranked on the same data the metrics below report.
    out = pd.DataFrame(columns=selected_names, data=X_test[:, idx])
    out['target'] = y_test_total
    valid = h2o.H2OFrame(out)

    predictors = train.columns[:-1]

    gbm_grid = H2OGridSearch(
        model=H2OGradientBoostingEstimator,
        grid_id='gbm_grid',
        hyper_params=gbm_params
    )

    gbm_grid.train(
        x=predictors,
        y=response,
        training_frame=train,
        validation_frame=valid,
        seed=1111
    )

    # First model as returned by the grid.
    # NOTE(review): confirm H2O's default ordering puts the best validation
    # model first; otherwise sort explicitly via get_grid().
    best_model = gbm_grid.models[0]
    res['params'] = [
        best_model.parms[x]['input_value']
        for x in ['learn_rate', 'max_depth', 'sample_rate', 'ntrees']
    ]
    res['metrics'] = [
        best_model.r2(valid=True),
        np.corrcoef(best_model.predict(valid).as_data_frame().predict, y_test_total)[0, 1]
    ]
    total_results.append(res)
    print(res)

In [None]:
# Per-question model settings read by the next cell, which uses the columns
# question, features, threshold, learn_rate, n_trees, max_depth and
# sampling_rate.
params = pd.read_csv('gbm.csv')

In [None]:
# Refit one GBM per row of `params` with its stored settings, collect the
# scaled variable importances (one column per question) in `df`, and the
# prediction/true-score correlations on the test rows in `cc`.
response = 'target'
df = pd.DataFrame({'variable': features}).set_index('variable')
cc = []

# Column positions of each feature group inside X (hard-coded ranges).
group_columns = {
    'txt': np.arange(20, 35),
    'ae': np.arange(35, 51),
    'img': np.arange(20),
}

for question in params.question:

    # Settings for this question: first matching row of `params`.
    settings = params.loc[params.question == question].iloc[0]

    # 'total' means the mean score across all questions.
    if question == 'total':
        y_tr, y_te = y_train.mean(axis=1), y_test.mean(axis=1)
    else:
        q_col = questions.index(question)
        y_tr, y_te = y_train[:, q_col], y_test[:, q_col]

    # Restrict to one feature group, or use every feature for any other value.
    idx = group_columns.get(settings['features'])
    if idx is None:
        idx = np.arange(len(features))

    # MI filter inside the chosen group.
    mi = [
        compute_mi(X_train[:, features.index(name)], y_tr)
        for name in np.array(features)[idx]
    ]
    keep = np.array(mi) > settings['threshold']
    idx = idx[keep]

    selected_names = np.array(features)[idx]

    out = pd.DataFrame(columns=selected_names, data=X_train[:, idx])
    out['target'] = y_tr
    train = h2o.H2OFrame(out)
    out = pd.DataFrame(columns=selected_names, data=X_test[:, idx])
    out['target'] = y_te
    valid = h2o.H2OFrame(out)

    predictors = train.columns[:-1]

    gbm = H2OGradientBoostingEstimator(
        seed=1111,
        learn_rate=settings['learn_rate'],
        ntrees=int(settings['n_trees']),
        max_depth=int(settings['max_depth']),
        sample_rate=settings['sampling_rate'],
    )
    gbm.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

    # One importance column per question; features the model did not use
    # stay NaN after the join.
    varimp = gbm.varimp(use_pandas=True)[['variable', 'scaled_importance']]
    varimp.columns = ['variable', question]
    df = df.join(varimp.set_index('variable'))

    test_pred = gbm.predict(valid).as_data_frame().predict
    cc.append(np.corrcoef(test_pred, y_te)[0, 1])

print(cc)

In [None]:
# Heatmap of scaled GBM variable importance: features on the rows, questions
# on the columns.  Missing importances are shown as 0.
df = df.fillna(0)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    df.values,
    xticklabels=df.columns,
    yticklabels=features,
    linecolor='#ededed',
    linewidths=0.1,
    cmap='Blues',
    vmin=0,
    vmax=1,
    ax=ax,
)
ax.tick_params(axis='y', colors='black')
ax.set_xlabel('Questions', labelpad=10, fontsize=16)
ax.set_title('GBM Feature Importance', pad=10, fontsize=18)
plt.show()