In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from math import sqrt, exp, pi
from sklearn.naive_bayes import GaussianNB
from scipy.stats import weibull_min, weibull_max, genextreme
import itertools

In [6]:
nb_training = 5

all_frames_uniform = [1, 2, 4, 5, 6, 36, 38, 40, 42, 45, 47, 50, 52, 55, 57, 60, 62, 65, 67, 70,
                      72, 74, 76, 78, 80, 84, 86, 87, 100, 142]

all_frames_enriched = [1, 2, 4, 5, 6, 36, 38, 40, 42, 45, 47, 50, 52, 55, 57, 60, 62, 65, 67, 70,
                      72, 74, 76, 78, 80, 86, 101, 111, 121, 132]

all_frames_long = [1, 2, 4, 5, 6, 36, 38, 40, 42, 45, 47, 50, 52, 55, 57, 60, 62, 65, 67, 70,
                   72, 74, 76, 78, 80, 140, 141, 142]

human_test_ind = [36, 38, 40, 42, 45, 47, 50, 52, 55, 57, 60, 62, 65, 67, 70, 72, 74, 76, 78, 80,]

human_uniform_prob = [0.074786, 0.057692, 0.066239, 0.104701, 0.123932,
                      0.188034, 0.258547, 0.388889, 0.485043, 0.602564,
                      0.690171, 0.782051, 0.856838, 0.884615, 0.899573,
                      0.931624, 0.938034, 0.955128, 0.950855, 0.972222]
human_enrich_prob = [0.119048, 0.062907, 0.075922, 0.097614, 0.132321,
                     0.201735, 0.288503, 0.392625, 0.496746, 0.613883,
                     0.722343, 0.800434, 0.872017, 0.924078, 0.952278,
                     0.952278, 0.958696, 0.976087, 0.973913, 0.980435]

human_long_prob = [0.06572769953051644, 0.018823529411764704, 0.009411764705882352,
                   0.011764705882352941, 0.02358490566037736, 0.030660377358490566,
                   0.04245283018867924, 0.0589622641509434, 0.09669811320754718,
                   0.12028301886792453, 0.18632075471698112, 0.2169811320754717,
                   0.2783018867924528, 0.33490566037735847, 0.42924528301886794,
                   0.5047169811320755, 0.5825471698113207, 0.6279620853080569,
                   0.7156398104265402, 0.7535545023696683]

In [38]:
# Calculate the mean of a list of numbers
def mean(numbers):
    return sum(numbers)/float(len(numbers))




# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
    return sqrt(variance)


In [52]:
def evt_predict(data,
                shape,
                loc,
                scale,
                distance_choice,
                distribution):
    """

    :param data:
    :param shape:
    :param loc:
    :param scale:
    :return:
    """
    if distance_choice == "to_A":
        # Whether to flip with maximum value
        if distribution == "weibull":
            sample_w_frame = data[["frame", "distance_to_A"]]
            frames = sample_w_frame['frame'].unique()
        if distribution == "reversed_weibull":
            sample_w_frame = data[["frame", "distance_to_A"]]
            
            max_value = data["distance_to_A"].max()
            sample_w_frame["distance_to_A"] = max_value - sample_w_frame["distance_to_A"]
            
            frames = sample_w_frame['frame'].unique()
        
    else:
        sample_w_frame = data[["frame", "distance_to_B"]]
        frames = sample_w_frame['frame'].unique()


    prob_dict = {}
    prob_one_frame = []

    for one_frame in frames:
        frames = sample_w_frame.loc[sample_w_frame['frame'] == one_frame]
        
        if distance_choice == "to_A":
            dist = frames["distance_to_A"].tolist()
        else:
            dist = frames["distance_to_B"].tolist()

        for one_dist in dist:
            if distribution == "weibull":
                prob = weibull_min.cdf(one_dist, shape, loc, scale)
            elif distribution == "reversed_weibull":
                prob = weibull_max.cdf(one_dist, shape, loc, scale)
            else:
                raise Exception('Distribution not implemented.')

            prob_one_frame.append(prob)

        prob_dict[one_frame] = mean(prob_one_frame)

    return prob_dict

In [41]:
def evt_model(training_data_path,
              test_data_path,
              tail_type,
              tail_ratio,
              distribution,
              distance_choice,
              sampling,
              params=None,
              fit_curve=False):
    """

    :return:
    """

    # Load training data for class A and B, and testing data
    training_data = pd.read_csv(training_data_path)
    test_data = pd.read_csv(test_data_path)  

    """
    We can choose to fit a distribution, currently choose between "weibull" and "reversed_weibull".
    This can give estimates on the parameters
    But we can also grid search and tune these parameters instead.
    Currently we choose to do parameter search.
    
    weibull: weibull_min. When data is bounded from below.
    reversed weibull:weibull_max. When data is bounded from above.
    
    fit func returns shape, location and scale parameters
    """
    if fit_curve:
        # We are not using this currently.
        
        pass

    else:
        """
        Use the parameters for our testing samples.
        Probability for each sample is produced by CDF func.
        """
        shape_b = params[0]
        scale_b = params[1]
        loc_b = params[2]
        

        # EVT Uniform
        if sampling == "uniform":
            prob_dict_b = evt_predict(data=training_data,
                                      shape=shape_b,
                                      loc=loc_b,
                                      scale=scale_b,
                                      distribution=distribution,
                                      distance_choice=distance_choice)

        
        # EVT enriched tail
        elif sampling == "enrich_tail":
            # First, calculate how many samples we will use based on tail size
            nb_sample = int(tail_ratio * training_data.shape[0])
            print("nb_sample for B: ", nb_sample)
            
            # Then, use tail type to decide what samples we are taking
            data_for_b_left = training_data[:int(0.5*nb_sample)]
            data_for_b_right = training_data[-int(0.5*nb_sample):]
            frames = [data_for_b_left, data_for_b_right]
            data_for_b = pd.concat(frames)
                
            print(data_for_b.shape)
            prob_dict_b = evt_predict(data=data_for_b,
                                      shape=shape_b,
                                      loc=loc_b,
                                      scale=scale_b,
                                      distance_choice=distance_choice,
                                      distribution=distribution)
            
        # EVT long tail (new)
        elif sampling == "long_tail":
            # First, calculate how many samples we will use based on tail size
            nb_sample = int(tail_ratio * training_data.shape[0])
            print("nb_sample for B: ", nb_sample)
            
            data_for_b = training_data[-nb_sample:]
            
            prob_dict_b = evt_predict(data=data_for_b,
                                      shape=shape_b,
                                      loc=loc_b,
                                      scale=scale_b,
                                      distribution=distribution,
                                      distance_choice=distance_choice)
            
        else:
            print("Invalid sampling method.")
            
        
        prob_dict_test = evt_predict(data=test_data,
                                     shape=shape_b,
                                     loc=loc_b,
                                     scale=scale_b,
                                     distribution=distribution,
                                     distance_choice="to_B")

        prob_dict = {**prob_dict_b, **prob_dict_test}
        prob_dict = dict(sorted(prob_dict.items()))

    return prob_dict

In [42]:
class_b_uniform_path = "/Users/kiyoshi/Desktop/jov_everything/face_morph_v4_5_sets_modeling_with_3_samplings/sampled_data_0227/EVT/sample_b_uniform_use_for_both_ab.csv"

class_b_enrich_dist_to_a_path = "/Users/kiyoshi/Desktop/jov_everything/face_morph_v4_5_sets_modeling_with_3_samplings/sampled_data_0227/EVT/sample_b_enrich_sort_by_distance_to_a.csv"
class_b_enrich_dist_to_b_path = "/Users/kiyoshi/Desktop/jov_everything/face_morph_v4_5_sets_modeling_with_3_samplings/sampled_data_0227/EVT/sample_b_enrich_sort_by_distance_to_b.csv"

class_b_long_dist_to_a_path = "/Users/kiyoshi/Desktop/jov_everything/face_morph_v4_5_sets_modeling_with_3_samplings/sampled_data_0227/EVT/sample_b_long_sort_by_distance_to_a.csv"
class_b_long_dist_to_b_path = "/Users/kiyoshi/Desktop/jov_everything/face_morph_v4_5_sets_modeling_with_3_samplings/sampled_data_0227/EVT/sample_b_long_sort_by_distance_to_b.csv"

test_sample_path = "/Users/kiyoshi/Desktop/jov_everything/face_morph_v4_5_sets_modeling_with_3_samplings/sampled_data_0227/EVT/test_samples.csv"

In [43]:
# EVT 
shape_uniform = 9.0
scale_uniform = 0.2
location_uniform = 40

shape_enrich = 100.0
scale_enrich = 0.1
location_enrich = 39

tail_type = "left"
tail_size = 0.5

In [74]:
"""
Probability of inclusion and exclusion:

- Inclusion: always use distance to B + weibull

- Exclusion option 1: use distance to A + weibull
- Exclusion option 2: use (max - distance to A) + reverse weibull

"""

# Uniform: inclusion -- distance to B + weibull
evt_uniform_inclusion = evt_model(training_data_path=class_b_uniform_path,
                                  test_data_path=test_sample_path,
                                  tail_type=None,
                                  tail_ratio=None,
                                  distribution="weibull",
                                  distance_choice="to_B",
                                  sampling="uniform",
                                  params=[12.0, 0.9, 55])
print(evt_uniform_inclusion)

{36: 0.8497524165363479, 38: 0.8363586260461582, 40: 0.806802351960078, 42: 0.7884350985692787, 45: 0.7709321829117264, 47: 0.7555983817322449, 50: 0.7413066371579164, 52: 0.7251562940059751, 55: 0.7061445059752591, 57: 0.6888688658325036, 60: 0.6718200628439596, 62: 0.6556642414919324, 65: 0.6346714219269255, 67: 0.6142351871164352, 70: 0.592603448445238, 72: 0.5719381160261828, 74: 0.5514942257079615, 76: 0.5307612462024598, 78: 0.5109007639088556, 80: 0.4925755240898183, 86: 0.011884094967247918, 101: 0.0, 111: 0.0, 121: 0.0, 132: 0.009507275973798334}


In [45]:
# Uniform: inclusion -- distance to B + reverse weibull
evt_uniform_inclusion = evt_model(training_data_path=class_b_uniform_path,
                                  test_data_path=test_sample_path,
                                  tail_type=None,
                                  tail_ratio=None,
                                  distribution="reversed_weibull",
                                  distance_choice="to_B",
                                  sampling="uniform",
                                  params=[shape_uniform, scale_uniform, location_uniform])
print(evt_uniform_inclusion)

{36: 1.0, 38: 1.0, 40: 1.0, 42: 1.0, 45: 1.0, 47: 1.0, 50: 0.9999999899387183, 52: 0.9986111023074896, 55: 0.9975308563720895, 57: 0.9966666596237694, 60: 0.9949499446082786, 62: 0.9925930047798109, 65: 0.989577300267964, 67: 0.9855598740583474, 70: 0.9798558824544576, 72: 0.9734494178133716, 74: 0.9632463721418867, 76: 0.9510906879155706, 78: 0.9378753749355188, 80: 0.925074199062105, 86: 0.18827158054776594, 101: 0.14942528735632185, 111: 0.06, 121: 0.07602339181286549, 132: 0.15061726443821274}


In [78]:
# Uniform: exclusion option 1
evt_uniform_exclusion_weibull = evt_model(training_data_path=class_b_uniform_path,
                                          test_data_path=test_sample_path,
                                          tail_ratio=None,
                                          tail_type=None,
                                          distribution="weibull",
                                          distance_choice="to_A",
                                          sampling="uniform",
                                          params=[1.0, 0.9, 55])
print(evt_uniform_exclusion_weibull)

{36: 0.8675910131318305, 38: 0.8474495429988709, 40: 0.8208830390789809, 42: 0.7986502870968871, 45: 0.7796615203686061, 47: 0.7630762527815564, 50: 0.7461464008222137, 52: 0.7291664853845181, 55: 0.711166627024748, 57: 0.6933169064635695, 60: 0.6758079963632462, 62: 0.6585638067624442, 65: 0.6376646776770586, 67: 0.6168067246709029, 70: 0.5949400630968812, 72: 0.5738221508706861, 74: 0.5535556305514985, 76: 0.5329947497661232, 78: 0.5131359758752422, 80: 0.4941275146107459, 86: 0.8028434104272109, 101: 0.7830630362321067, 111: 0.8813592993963361, 121: 0.8775696126103536, 132: 0.8419859586997256}


In [84]:
# Uniform: exclusion option 2
evt_uniform_exclusion_reverse_weibull = evt_model(training_data_path=class_b_uniform_path,
                                                  test_data_path=test_sample_path,
                                                  tail_ratio=None,
                                                  tail_type=None,
                                                  distribution="reversed_weibull",
                                                  distance_choice="to_A",
                                                  sampling="uniform",
                                                  params=[5.0, 0.5, 30])
print(evt_uniform_exclusion_reverse_weibull)

{36: 1.0, 38: 1.0, 40: 1.0, 42: 1.0, 45: 1.0, 47: 1.0, 50: 1.0, 52: 1.0, 55: 1.0, 57: 1.0, 60: 1.0, 62: 1.0, 65: 1.0, 67: 1.0, 70: 0.99950381226205, 72: 0.9988403795512274, 74: 0.9982549977475688, 76: 0.997734658366531, 78: 0.9972690915519182, 80: 0.9965414012633446, 86: 0.3255752999276181, 101: 0.35925396254801395, 111: 0.23797720820395088, 121: 0.24265668976885954, 132: 0.2644521889481431}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_w_frame["distance_to_A"] = max_value - sample_w_frame["distance_to_A"]


In [None]:
# Enrich tail inclusion