In [1]:
import numpy as np
import scipy.stats as st
from scipy.optimize import fmin, differential_evolution
import pandas as pd
import pickle
import math

def clean_data_specific(file_path):
    """
    Load one participant's CSV and arrange the estimates in a fixed query order.

    The file is read with pandas and must provide a 'querydetail' column
    (query text, including its leading space) and an 'estimate' column.
    Estimates are divided by 100 (presumably they are recorded as percentages).

    Inputs: file_path --> path of the raw data file of one participant
    Outputs: <1> subjective estimates est[j,k] --> p_data in main
                 j-th query (rows follow queryOrder), and k-th repetition
             <2> queryOrder --> the only order we clean data and generate model predictions
    Raises: IndexError when a query has fewer than 3 estimates in the file
            (only the first 3 estimates of each query are used).
    """
    print('Processing Participant File:', file_path)
    print('________________________________________________________')

    n_reps = 3
    neg, land, lor, lg = ' not', ' and', ' or', ' given'
    eventAs = [' cold', ' windy', ' warm']
    eventBs = [' rainy', ' cloudy', ' snowy']

    # 20 queries per (A, B) event pair: 4 marginals, 4 conjunctions,
    # 4 disjunctions and 8 conditionals, always in this order.
    queryOrder = []
    for A, B in zip(eventAs, eventBs):
        notA, notB = neg + A, neg + B
        queryOrder.extend([
            A, B, notA, notB,
            A + land + B, B + land + notA, A + land + notB, notA + land + notB,
            A + lor + B, B + lor + notA, A + lor + notB, notA + lor + notB,
            A + lg + B, notA + lg + B, A + lg + notB, notA + lg + notB,
            B + lg + A, notB + lg + A, B + lg + notA, notB + lg + notA,
        ])

    df = pd.read_csv(file_path)
    est = np.zeros(shape=(len(queryOrder), n_reps))
    for j, q in enumerate(queryOrder):
        values = df.loc[df['querydetail'] == q, 'estimate'].values
        if len(values) < n_reps:
            # Same exception type numpy would raise on the out-of-range index,
            # but naming the offending query so the data problem can be located.
            raise IndexError(
                f"Query {q!r} has {len(values)} estimate(s) in {file_path}; "
                f"expected at least {n_reps}"
            )
        est[j, :] = values[:n_reps] / 100

    return est, queryOrder

def get_truePr_RF(a, b, c, d):
    """
    Convert four cell weights of one event pair into 20 probabilities.

    Read off the formulas, the weights stand for the cells
    a: A and B, b: not A and B, c: A and not B, d: not A and not B.

    Inputs: a, b, c, d --> the four (unnormalised) cell weights
    Outputs: list of 20 values, in order:
             4 marginals (A, B, not A, not B),
             4 conjunctions (each cell divided by the total),
             4 disjunctions (total minus one cell, divided by the total),
             4 conditionals given B / not B, 4 conditionals given A / not A
    """
    total = a + b + c + d

    # Marginal weights, used both as numerators and as conditioning denominators.
    w_A, w_B = a + c, a + b
    w_notA, w_notB = b + d, c + d

    return [
        # marginals
        w_A / total,
        w_B / total,
        w_notA / total,
        w_notB / total,
        # conjunctions
        a / total,
        b / total,
        c / total,
        d / total,
        # disjunctions
        (a + b + c) / total,
        (a + b + d) / total,
        (a + c + d) / total,
        (b + c + d) / total,
        # conditioned on B, then on not B
        a / w_B,
        b / w_B,
        c / w_notB,
        d / w_notB,
        # conditioned on A, then on not A
        a / w_A,
        c / w_A,
        b / w_notA,
        d / w_notA,
    ]

def generativeModel_RF(params):
    """
    Build the 40 model predictions for two event pairs from 8 cell weights.

    Inputs: params --> 8 values (a, b, c, d) for the first event pair
                       followed by (a, b, c, d) for the second
    Outputs: <1> allpredmeans --> array of 40 predictions; entries 0-19 come
                 from the first weight set, 20-39 from the second
             <2> MSE --> penalty term: for each set, (sum of weights / 100 - 1)^2 / 2,
                 i.e. zero when each set of weights sums to 100
    """
    a0, b0, c0, d0, a1, b1, c1, d1 = params
    weight_sets = ((a0, b0, c0, d0), (a1, b1, c1, d1))

    allpredmeans = np.zeros((40,))
    MSE = 0

    for pair_idx, (a, b, c, d) in enumerate(weight_sets):
        weight_total = a + b + c + d
        MSE += (weight_total / 100 - 1) ** 2 / 2

        start = pair_idx * 20
        allpredmeans[start:start + 20] = get_truePr_RF(a, b, c, d)

    return allpredmeans, MSE

def MSE_RF(params):
    """
    Objective function: weight-sum penalty plus mean squared prediction error.

    Reads the module-level `testdata` (set by init_fit_RF_specific); row i of
    it is compared against the i-th model prediction.

    Inputs: params --> the 8 cell weights passed on to generativeModel_RF
    Outputs: penalty from generativeModel_RF plus, for every prediction, the
             mean squared difference to that row of testdata divided by 40
    """
    predictions, loss = generativeModel_RF(params)
    for row, predicted in enumerate(predictions):
        observed = testdata[row, :].flatten()
        squared_error = (predicted - observed) ** 2
        loss += np.mean(squared_error) / 40
    return loss

def init_fit_RF_specific(pData):
    """
    Fit the 8 cell weights to one participant's estimates.

    Stores pData in the module-level `testdata` (read by MSE_RF), minimises
    MSE_RF with differential evolution, and prints the optimum.

    Inputs: pData --> participant estimates, indexed as [query, repetition]
    Outputs: <1> minimal objective value found
             <2> second fitted parameter (b of the first event pair)
             <3> model prediction at index 4 (A and B for the first event pair)
    """
    global testdata
    testdata = pData

    # Each of the 8 weights is searched within [0, 100].
    bnds = [(0.0, 100)] * 8

    # `polish` is documented as a boolean flag. The previous `polish=fmin` only
    # worked because a function object is truthy, so the default L-BFGS-B
    # polish ran and fmin was never used. NOTE(review): SciPy releases that
    # accept a callable here expect a minimize-like signature, which fmin does
    # not have -- confirm against the installed version. `True` keeps the
    # behaviour that actually ran.
    fit_all_data = differential_evolution(
        MSE_RF, bounds=bnds, popsize=30, disp=False, polish=True, tol=1e-5
    )
    print("Optimized Parameters:", fit_all_data.x)
    print("Minimal MSE:", fit_all_data.fun)

    allpredmeans, _ = generativeModel_RF(fit_all_data.x)

    first_b = fit_all_data.x[1]
    A1_and_B1 = allpredmeans[4]

    return fit_all_data.fun, first_b, A1_and_B1

# Driver: clean one participant's file, fit the model, and report the results.
# NOTE(review): the path is hard-coded and looks like a mounted Google Drive
# folder -- confirm it exists in the environment where this runs.
data_file = '/content/drive/MyDrive/all_data/PrEstExp_811_111418_122039.csv'
# participant_data holds the estimates scaled by 1/100; query_order is the
# matching list of query strings (not used further in this cell).
participant_data, query_order = clean_data_specific(data_file)
# The fit is a differential-evolution search with no seed passed, so repeated
# runs may give slightly different numbers.
MSE, first_b_estimate, A1_and_B1_estimate = init_fit_RF_specific(participant_data)

print("Results:")
print(f"MSE: {MSE:.5f}")
print(f"First index of vector 'b': {first_b_estimate:.5f}")
print(f"Estimate for 'A1 and B1' (cold and rainy): {A1_and_B1_estimate:.5f}")


Processing Participant File: /content/drive/MyDrive/all_data/PrEstExp_811_111418_122039.csv
________________________________________________________
Optimized Parameters: [36.38427409 19.09066396 25.73234486 18.78181226 37.11831024 20.04718619
 24.2561333  18.59914332]
Minimal MSE: 0.03867973335554067
Results:
MSE: 0.03868
First index of vector 'b': 19.09066
Estimate for 'A1 and B1' (cold and rainy): 0.36388
