In [None]:
import cv2
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from scipy.special import digamma
from sklearn.neighbors import KDTree
from numpy.polynomial.polynomial import polyfit
from scipy import stats
from math import sqrt

In [None]:
def get_radius_kneighbors(x, n_neighbors):
    """Return, for every row of ``x``, a radius just below its k-th neighbour distance.

    A Chebyshev-metric KD-tree is built on ``x`` and queried with ``x`` itself
    for ``n_neighbors + 1`` neighbours per row. Because the query points are
    the indexed points, each row also matches itself at distance zero, which
    is why one extra neighbour is requested.

    Parameters
    ----------
    x : 2-D array-like accepted by ``KDTree``.
    n_neighbors : int
        Number of neighbours (excluding the point itself) to reach.

    Returns
    -------
    numpy.ndarray
        One value per row: the largest returned distance, moved one
        floating-point step towards zero (a distance of 0 stays 0).
    """
    tree = KDTree(x, metric="chebyshev")
    distances, _ = tree.query(x, k=n_neighbors + 1)
    farthest = distances[:, -1]

    # Step towards 0 so a later "within radius" query excludes the
    # k-th neighbour itself.
    return np.nextafter(farthest, 0)


def num_points_within_radius(x, radius):
    """Count, for every row of ``x``, the other rows lying within ``radius`` of it.

    Distances use the Chebyshev metric. ``radius`` is forwarded unchanged to
    ``KDTree.query_radius``.

    Returns
    -------
    numpy.ndarray
        Float array of counts with 1 subtracted from each, removing the
        query point's match with itself.
    """
    neighbour_counts = KDTree(x, metric="chebyshev").query_radius(
        x, radius, count_only=True, return_distance=False
    )

    return np.array(neighbour_counts) - 1.0


def preprocess_data(x):
    """Convert ``x`` to a 2-D float64 array and rescale each column.

    A 1-D input becomes a single column. Every column is then multiplied by
    the reciprocal of its mean absolute value; that mean is floored at
    ``1e-100`` so an all-zero column does not cause a division by zero.

    Parameters
    ----------
    x : array-like, 1-D or 2-D.

    Returns
    -------
    numpy.ndarray
        2-D float64 array with the same number of rows as ``x``.

    Raises
    ------
    ValueError
        If ``x`` is neither 1-D nor 2-D.
    """
    x = np.array(x, dtype=np.float64)
    if x.ndim not in (1, 2):
        raise ValueError(f'x.ndim = {x.ndim}, should be 1 or 2')

    columns = x.reshape(-1, 1) if x.ndim == 1 else x

    column_scale = np.mean(np.abs(columns), axis=0)
    column_scale = np.maximum(1e-100, column_scale)

    return (1 / column_scale) * columns


def compute_mi(x, y, n_neighbors=5):
    """Estimate the mutual information between ``x`` and ``y`` (Kraskov k-NN estimator).

    Both inputs are passed through ``preprocess_data`` and stacked
    column-wise into a joint sample. For each joint point, the radius to its
    ``n_neighbors``-th neighbour is used to count the neighbours in the ``x``
    and ``y`` marginals, and the counts are combined through digamma terms.

    Parameters
    ----------
    x, y : array-like, 1-D or 2-D, with one row per sample.
    n_neighbors : int, default 5
        Number of neighbours used for the joint-space radius.

    Returns
    -------
    The estimate, clipped from below at 0.
    """
    # Kraskov
    n_samples = len(x)
    x = preprocess_data(x)
    y = preprocess_data(y)
    joint = np.hstack((x, y))
    radius = get_radius_kneighbors(joint, n_neighbors)

    # Per-sample neighbour count; only altered for points with a zero radius.
    k = np.full(n_samples, n_neighbors)
    tied = (radius == 0)
    if tied.sum() > 0:
        # A zero radius means the point coincides with its neighbours, so use
        # the number of identical joint rows (minus the point itself) as k.
        _, group_of_row, group_sizes = np.unique(
            joint[tied], axis=0, return_inverse=True, return_counts=True
        )
        k[tied] = group_sizes[group_of_row] - 1

    nx = num_points_within_radius(x, radius)
    ny = num_points_within_radius(y, radius)

    estimate = (
        digamma(n_samples) + np.mean(digamma(k))
        - np.mean(digamma(nx + 1)) - np.mean(digamma(ny + 1))
    )
    return max(0, estimate)

def greedy(x, y, n_neighbors=5):
    """Greedy forward selection of columns of ``x`` by mutual information with ``y``.

    Starting from an empty selection, each round scores every remaining
    column with ``compute_mi`` on (already selected columns + that column)
    versus ``y``. The best candidate is kept only if its score is strictly
    greater than the best score so far; otherwise the search stops.

    Parameters
    ----------
    x : 2-D array whose columns are the candidate features.
    y : target values, forwarded to ``compute_mi``.
    n_neighbors : int, default 5
        Forwarded to ``compute_mi``.

    Returns
    -------
    list
        Selected column indices of ``x`` in the order they were chosen.
    """
    idx = []
    rem = np.arange(0, x.shape[1])
    score = 0
    while len(rem) > 0:
        mi = np.array([compute_mi(x[:, idx + [i]], y, n_neighbors) for i in rem])
        best_pos = int(np.argmax(mi))
        best_mi = mi[best_pos]
        # Written as "not >" so a NaN score also stops the search.
        if not best_mi > score:
            break
        score = best_mi
        idx.append(rem[best_pos])
        # np.delete removes by position, so pass the argmax position rather
        # than the column id stored there; the two differ once any column
        # has been removed from ``rem``.
        rem = np.delete(rem, best_pos)
    return idx

In [None]:
def _read_indexed_csv(csv_path):
    """Read ``csv_path`` and relabel its rows with the integers 1..100."""
    return pd.read_csv(csv_path).set_index(np.arange(1, 101))


# Survey scores (one column per question) and the per-sample feature tables.
survey = _read_indexed_csv('../datasets/FinUI/100_avg_scores.csv')
img_features = _read_indexed_csv('../datasets/FinUI/100_img_features.csv')
txt_features = _read_indexed_csv('../datasets/FinUI/100_txt_features.csv')
ae_features = _read_indexed_csv('../datasets/FinUI/100_ae_features.csv')
aeb_features = _read_indexed_csv('../datasets/FinUI/100_aeb_features.csv')

# Feature names, prefixed by source table, in the order img -> txt -> ae.
features = [
    prefix + column
    for prefix, table in (
        ('img_', img_features),
        ('txt_', txt_features),
        ('ae_', ae_features),
    )
    for column in table.columns.to_list()
]

questions = survey.columns.to_list()

In [None]:
# Design matrix: img, txt and ae feature columns side by side (same order as
# `features`), with missing values replaced by 0.
img_block = img_features.fillna(0).values
txt_block = txt_features.fillna(0).values
ae_block = ae_features.fillna(0).values
X = np.append(np.append(img_block, txt_block, axis=1), ae_block, axis=1)
y = survey.values

# Fixed, order-based split: rows [0, 60) train, [60, 80) validation, [80, end) test.
n_train_end, n_val_end = 60, 80
X_train, y_train = X[:n_train_end, :], y[:n_train_end, :]
X_val, y_val = X[n_train_end:n_val_end, :], y[n_train_end:n_val_end, :]
X_test, y_test = X[n_val_end:, :], y[n_val_end:, :]

# Alternative random split, kept for reference:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size = 0.3, random_state = 2
# )

# Standardise using statistics of the training rows only, then apply the
# same transform to all three splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Pick the mutual-information threshold whose selected features give the best
# validation correlation for one survey question, then report the test
# correlation of the model refit at that threshold.
th = np.arange(0.05, 0.23, 0.01)
results = {}
question = '1_1'

# Resolve the question's column once instead of on every use.
q_col = questions.index(question)
y_train_q = y_train[:, q_col]
y_val_q = y_val[:, q_col]
y_test_q = y_test[:, q_col]

# MI of every feature column with the target. `ft` must list all columns of
# X in order, because the boolean mask built from `mi` indexes X's columns.
ft = features
mi = [compute_mi(X_train[:, col], y_train_q, 5) for col in range(len(ft))]
mi_scores = np.array(mi)

res_loc = []
for thresh in th:
    idx = mi_scores > thresh
    if not idx.any():
        # No feature survives this threshold; fitting on zero columns would
        # raise, so record "no score" and move on.
        res_loc.append(np.nan)
        continue
    reg = LinearRegression().fit(X_train[:, idx], y_train_q)
    cc = np.corrcoef(reg.predict(X_val[:, idx]), y_val_q)[0, 1]
    res_loc.append(cc)

if np.all(np.isnan(res_loc)):
    raise ValueError('no threshold in `th` produced a valid validation correlation')

# nanargmax ignores NaN entries (empty selections, constant predictions),
# whereas argmax would return the position of the first NaN.
best = int(np.nanargmax(res_loc))
thresh = th[best]
print(thresh, res_loc[best])

idx = mi_scores > thresh
reg = LinearRegression().fit(X_train[:, idx], y_train_q)
cc = np.corrcoef(reg.predict(X_test[:, idx]), y_test_q)[0, 1]
print(cc)

In [None]:
# Scatter of test-set predictions against the survey scores for `question`,
# with a least-squares line and the Pearson correlation annotated.
prediction = reg.predict(X_test[:, idx])
z = y_test[:, questions.index(question)]

# numpy.polynomial's polyfit returns coefficients lowest degree first:
# intercept, then slope.
b, m = polyfit(z, prediction, 1)

# Correlation of exactly the points being plotted, so the annotation cannot
# go stale when the data, question or threshold change.
rho = np.corrcoef(prediction, z)[0, 1]

fig, ax = plt.subplots()
ax.scatter(z, prediction)
ax.plot(z, b + m * z, '-', c='b')
ax.set_xlabel('Expert Scores', labelpad=10, fontsize=14)
ax.set_ylabel('Score Predictions', labelpad=10, fontsize=14)
ax.set_title(f'Question {question}', pad=10, fontsize=14)

props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

ax.text(0.05, 0.95, f'ρ = {rho:.2f}', transform=ax.transAxes, fontsize=14,
        verticalalignment='top', bbox=props)

plt.show()