In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import pickle
from sklearn.metrics import jaccard_score
import itertools
from scipy import stats

In [8]:
MODULE_PATH = Path(os.getcwd()).parents[2].resolve()

combined_path = "combined_ba.pickle"

path_combined = 'combined_dataset.csv'
path_sts = 'sts.csv'
path_qqp = 'qqp.csv'
path_sample_qqp = 'sample_qqp.csv'

PATH_DATA = MODULE_PATH / 'data'
PATH_COMBINED = PATH_DATA / 'datasets' / path_combined
PATH_STS = PATH_DATA / 'datasets' / path_sts
PATH_SAMPLE_QQP = PATH_DATA / 'datasets' / path_sample_qqp

# with open(MODULE_PATH / 'data' / 'bad_annotators' / combined_path, 'rb') as f:
#     combined_ba = pickle.load(f)

In [46]:
filtering_heuristics = list(combined_ba.keys())[:-1]
filtering_heuristics

['duration',
 'random_honeypot',
 'low_std',
 'high_random',
 'unpopular',
 'sentiment_inconsistent']

# EDA on the various Filtering methods

What we want to explore:

1) check number and percentage of unique annotators per condition, and percentage of data filtered<br>
2) Jaccard Score of the various categories (how much overlap do they have) <br>
3) Each individual scores impact on the inner-dataset (and intra-dataset) baseline and RF Increase <br>
4) What combinations provide the best increase on baseline and increases <br>
5) Sidestep filtering: removing the condition with the highest improvement, than the second highest if the improvement crosees an improvement threshold, than the 3rd, 4th etc.

In [45]:
df = pd.read_csv(PATH_COMBINED, index_col=0)

num_rows_total = df.shape[0]
num_annotator_total = df.annotator.nunique()
pair_id_agg = df.groupby('pair_id').size()
pair_id_agg = pair_id_agg[pair_id_agg > 2].index
pair_id_count_total = len(pair_id_agg)

print(f"Total Dataset has:\nSize:\t{num_rows_total} \
        Num Annotators:\t{num_annotator_total} \
        Num of unique pairs:\t{pair_id_count_total}")

Total Dataset has:
Size:	35912         Num Annotators:	460         Num of unique pairs:	11960


In [43]:
# 1) check number and percentage of unique annotators per condition, and percentage of data filtered
for key in filtering_heuristics:
    df_filt = df[~df.annotator.isin(combined_ba[key])]

    num_rows_filt = df_filt.shape[0]
    percent_rows_kept = np.round(num_rows_filt/num_rows_total * 100,2)
    num_annotator_filt = df_filt.annotator.nunique()
    percent_annotator_filt = np.round(((num_annotator_filt / num_annotator_total)*100),2)
    pair_id_agg = df_filt.groupby('pair_id').size()
    pair_id_agg = pair_id_agg[pair_id_agg > 2].index
    pair_id_count_filt = len(pair_id_agg)

    print(f"Filtered Dataset on __{key}__ has:\nSize:\t{num_rows_filt} \
            Percentage Rows kept:\t{percent_rows_kept} \
            Num Annotators:\t{num_annotator_filt} \
            Percent Annotator Kept:\t{percent_annotator_filt} \
            Num of unique pairs:\t{pair_id_count_filt}\n")

Filtered Dataset on __duration__ has:
Size:	33768             Percentage Rows kept:	94.03             Num Annotators:	428             Percent Annotator Kept:	93.04             Num of unique pairs:	9958

Filtered Dataset on __random_honeypot__ has:
Size:	31622             Percentage Rows kept:	88.05             Num Annotators:	442             Percent Annotator Kept:	96.09             Num of unique pairs:	8127

Filtered Dataset on __low_std__ has:
Size:	23441             Percentage Rows kept:	65.27             Num Annotators:	364             Percent Annotator Kept:	79.13             Num of unique pairs:	3286

Filtered Dataset on __high_random__ has:
Size:	30382             Percentage Rows kept:	84.6             Num Annotators:	437             Percent Annotator Kept:	95.0             Num of unique pairs:	7209

Filtered Dataset on __unpopular__ has:
Size:	35901             Percentage Rows kept:	99.97             Num Annotators:	458             Percent Annotator Kept:	99.57             Num 

In [84]:
def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / min(len(list1),len(list2))

In [85]:
comb2 = itertools.combinations(filtering_heuristics,2)
for filtA, filtB in comb2:
    jac_score = jaccard_similarity(combined_ba[filtA],combined_ba[filtB])
    print("%-20s %-25s %-10s" % (filtA,filtB, np.round(jac_score,2)))

duration             random_honeypot           0.17      
duration             low_std                   0.22      
duration             high_random               0.13      
duration             unpopular                 0.0       
duration             sentiment_inconsistent    0.12      
random_honeypot      low_std                   1.0       
random_honeypot      high_random               1.0       
random_honeypot      unpopular                 0.0       
random_honeypot      sentiment_inconsistent    0.11      
low_std              high_random               0.78      
low_std              unpopular                 0.5       
low_std              sentiment_inconsistent    0.08      
high_random          unpopular                 0.0       
high_random          sentiment_inconsistent    0.17      
unpopular            sentiment_inconsistent    0.0       
