# Prepare Datasets

In [1]:
import numpy as np
import pandas as pd
# Study 1c: Study on 50 U.S. states. state - name of the state, city - name of the city asked
# Every state is asked with it's true capital
study_1c = pd.read_csv('~/DATA_1030/Final_Project/crowd_wisdom_data/study1c.csv')
# Study 2: Trivia. qname - the topic of the trivia question (39 participants, 80 unique qnames)
study_2 = pd.read_csv('~/DATA_1030/Final_Project/crowd_wisdom_data/study2.csv')
# Study 3: Dermatologists diagnosing lesions as malignant or benign
study_3 = pd.read_csv('~/DATA_1030/Final_Project/crowd_wisdom_data/study3.csv')

# Group the datasets based on questions

# Calculating REAL minority values / cutoffs

In [2]:
# study_1c, for 33 participants
grouped_1c = study_1c.groupby('state').sum()
grouped_1c["minority_response"] = 0
count = study_1c.groupby('state').count()
for ind, row in grouped_1c.iterrows():
    size = count.loc[ind][0]
    if grouped_1c.loc[ind]["own"] > size//2:
        study_1c.loc[study_1c["state"]==ind, "minority_response"] = 0
        study_1c.loc[study_1c["state"]==ind, "minority_number"] = size-grouped_1c.loc[ind]["own"]
    else:
        study_1c.loc[study_1c["state"]==ind, "minority_response"] = 1
        study_1c.loc[study_1c["state"]==ind, "minority_number"] = grouped_1c.loc[ind]["own"]


# study_2, for 39 participants
grouped_2 = study_2.groupby('qtext').sum()
grouped_2["minority_response"] = 0
count = study_2.groupby('qtext').count()
for ind, row in grouped_2.iterrows():
    size = count.loc[ind][0]
    if grouped_2.loc[ind]["own"] > size//2:
        study_2.loc[study_2["qtext"]==ind, "minority_response"] = 0
        study_2.loc[study_2["qtext"]==ind, "minority_number"] = size-grouped_2.loc[ind]["own"]
    else:
        study_2.loc[study_2["qtext"]==ind, "minority_response"] = 1
        study_2.loc[study_2["qtext"]==ind, "minority_number"] = grouped_2.loc[ind]["own"]
        
# study_3, for 25 participants
grouped_3 = study_3.groupby('image').sum()
grouped_3["minority_response"] = 0
count = study_3.groupby('image').count()
for ind, row in grouped_3.iterrows():
    size = count.loc[ind][0]
    if grouped_3.loc[ind]["own"] > size//2:
        study_3.loc[study_3["image"]==ind, "minority_response"] = 0
        study_3.loc[study_3["image"]==ind, "minority_number"] = size-grouped_3.loc[ind]["own"]
    else:
        study_3.loc[study_3["image"]==ind, "minority_response"] = 1
        study_3.loc[study_3["image"]==ind, "minority_number"] = grouped_3.loc[ind]["own"]

In [3]:
avg = []
for index, row in study_1c.iterrows():
    avg.append(grouped_1c.loc[row["state"]]["own"])  

In [4]:
avg = []
for index, row in study_2.iterrows():
    avg.append(grouped_2.loc[row["qtext"]]["own"])  

In [5]:
avg = []
for index, row in study_3.iterrows():
    avg.append(grouped_3.loc[row["image"]]["own"])

# Minority classification

In [6]:
study_1c["in_minority"] = np.where(study_1c['own'] == study_1c['minority_response'], 'yes', 'no') #so that 0,0 doesn't count as an expert
study_1c["true_guess"] = np.where(study_1c['own'] == study_1c['actual'], 'yes', 'no ')
study_1c["expert"] = np.where(study_1c['in_minority'] == study_1c['true_guess'], 'yes', 'no')

In [7]:
study_2["in_minority"] = np.where(study_2['own'] == study_2['minority_response'], 'yes', 'no') #so that 0,0 doesn't count as an expert
study_2["true_guess"] = np.where(study_2['own'] == study_2['actual'], 'yes', 'no ')
study_2["expert"] = np.where(study_2['in_minority'] == study_2['true_guess'], 'yes', 'no')

In [8]:
study_3["in_minority"] = np.where(study_3['own'] == study_3['minority_response'], 'yes', 'no') #so that 0,0 doesn't count as an expert
study_3["true_guess"] = np.where(study_3['own'] == study_3['actual'], 'yes', 'no ')
study_3["expert"] = np.where(study_3['in_minority'] == study_3['true_guess'], 'yes', 'no')

# Create self-consensus

In [9]:
study_1c["self_consensus"] = np.where(study_1c['own'] == 0, 1-study_1c['meta'], study_1c['meta'])
study_2["self_consensus"] = np.where(study_2['own'] == 0, 1-study_2['meta'], study_2['meta'])
study_3["self_consensus"] = np.where(study_3['own'] == 0, 1-study_3['meta'], study_3['meta'])

# Create Self-Concensus difference 

In [10]:
study_1c["c-sc"] = study_1c["confidence"]-study_1c["self_consensus"]
study_2["c-sc"] = study_2["confidence"]-study_2["self_consensus"]
study_3["c-sc"] = study_3["confidence"]-study_3["self_consensus"]

# Expert to Non-Expert Ratio:

In [11]:
experts1 = study_1c[study_1c["expert"] == 'yes']
non_experts1 = study_1c[study_1c["expert"] == 'no']
print("Expert to Non-Expert Ratio study_1c:", experts1.shape[0]/non_experts1.shape[0])

Expert to Non-Expert Ratio study_1c: 0.17857142857142858


In [12]:
experts2 = study_2[study_2["expert"] == 'yes']
non_experts2 = study_2[study_2["expert"] == 'no']
print("Expert to Non-Expert Ratio study_2:", experts2.shape[0]/non_experts2.shape[0])

Expert to Non-Expert Ratio study_2: 0.18143939393939393


In [13]:
experts3 = study_3[study_3["expert"] == 'yes']
non_experts3 = study_3[study_3["expert"] == 'no']
print("Expert to Non-Expert Ratio study_3:", experts3.shape[0]/non_experts3.shape[0])

Expert to Non-Expert Ratio study_3: 0.07913669064748201


In [14]:
experts = pd.concat([experts1, experts2, experts3])
non_experts = pd.concat([non_experts1, non_experts2, non_experts3])
print("Expert to Non-Expert Ratio total:", experts.shape[0]/non_experts.shape[0])

Expert to Non-Expert Ratio total: 0.15451197053406998


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


# Krueger Statictics calculator

In [15]:
def krueger_statictic(dta, predicting, cut):
    from scipy.stats import ttest_ind
    from statistics import stdev 
    import statistics
    
    if predicting == False:
        experts = dta[dta["expert"] == 'yes']
        non_experts = dta[dta["expert"] == 'no']
    else:
#         import pdb; pdb.set_trace()
        question_title = dta.index
        dta.reset_index(inplace=True, drop=True)
        experts = dta[dta["in_minority"] == "yes"][:cut]
        non_experts = dta.drop(experts.index)
        experts["question"] = question_title[0]
        experts.index = experts["question"]
        experts.drop(["question"], axis=1)
        
        non_experts["question"] = question_title[0]
        non_experts.index = non_experts["question"]
        non_experts.drop(["question"], axis=1)
            
    ''' p values less than 0.05, 95% confidence interval
    '''
    if experts.shape[0] == 0:
        average_std = stdev(non_experts["c-sc"])
        mean_non_experts_sc = non_experts["c-sc"].mean()
        krueger_test_statistic = (0-mean_non_experts_sc)/average_std
        return krueger_test_statistic
    if experts.shape[0] == 1:
        average_std = stdev(dta["c-sc"])
        mean_non_experts_sc = non_experts["c-sc"].mean()
        mean_experts_sc = experts["c-sc"].mean()
        krueger_test_statistic = (mean_experts_sc-mean_non_experts_sc)/average_std
        return krueger_test_statistic
    if experts.shape[0] > 1:
        if predicting:
            try:
                determine = stdev(experts["c-sc"])/stdev(non_experts["c-sc"]) < 2 or stdev(experts["c-sc"])/stdev(non_experts["c-sc"]) > 0.5
            except (ZeroDivisionError, statistics.StatisticsError):
#                 import pdb; pdb.set_trace()
                determine = False
            if determine:
                average_std = (stdev(experts["c-sc"])+stdev(non_experts["c-sc"]))/2
                mean_experts_sc = experts["c-sc"].mean()
                mean_non_experts_sc = non_experts["c-sc"].mean()
                krueger_test_statistic = (mean_experts_sc-mean_non_experts_sc)/average_std
                return krueger_test_statistic
            else:
                return "Sd more than double away"
        else:
            t_test = ttest_ind(experts["c-sc"], non_experts["c-sc"])
            t_test_p_value = t_test[1]
            if t_test_p_value < 0.05:
                try:
                    determine = stdev(experts["c-sc"])/stdev(non_experts["c-sc"]) < 2 or stdev(experts["c-sc"])/stdev(non_experts["c-sc"]) > 0.5
                except ZeroDivisionError:
                    determine = False
                if determine:
                    average_std = (stdev(experts["c-sc"])+stdev(non_experts["c-sc"]))/2
                    mean_experts_sc = experts["c-sc"].mean()
                    mean_non_experts_sc = non_experts["c-sc"].mean()
                    krueger_test_statistic = (mean_experts_sc-mean_non_experts_sc)/average_std
                    return krueger_test_statistic
                else:
                    return "Sd more than double away"
            else:
                return f"t-test not significant: {t_test_p_value}"

### Unique Questions in each dataset

In [16]:
states = study_1c["state"].unique()
study_1c = study_1c.sort_values(['state', 'c-sc'], ascending=False)

In [17]:
questions = study_2["qtext"].unique()
study_2 = study_2.sort_values(['qtext', 'c-sc'], ascending=False)

In [18]:
images = study_3["image"].unique()
study_3 = study_3.sort_values(['image', 'c-sc'], ascending=False)

# Calculating K-Stats for all datasets

In [19]:
from scipy.stats import ttest_ind
from statistics import stdev
import warnings
warnings.filterwarnings('ignore')

In [20]:
study_1c = study_1c.drop(['expt city', 'subject', 'own', 'meta', 'confidence', 'actual',
               'minority_response', 'true_guess', 'self_consensus'], axis=1)

study_2 = study_2.drop(['subject', 'qname', 'own', 'meta', 'actual', 'confidence',
                        'minority_response','true_guess','self_consensus'], axis=1)

study_3 = study_3.drop(['subject', 'own', 'actual', 'meta', 'confidence',
                        'minority_response', 'true_guess', 'self_consensus'], axis=1)

In [21]:
study_1c.columns = ['question', 'minority_number', 'in_minority','expert','c-sc']
study_2.columns = ['question', 'minority_number', 'in_minority','expert','c-sc']
study_3.columns = ['question', 'minority_number', 'in_minority','expert','c-sc']

In [22]:
df_questions = pd.concat([study_1c, study_2, study_3])
df_questions = df_questions.reset_index()
df_questions = df_questions.drop(["index"], axis=1)

In [23]:
# mark expert empty datas
df_questions["has_at_least_one_expert"] = 0
df_questions["real_expert_number"] = 0
for q in df_questions["question"].unique():
    expert_number = df_questions[df_questions["question"] == q]["expert"].str.count("yes").sum()
    if expert_number > 0:
        df_questions.loc[df_questions["question"] == q, "has_at_least_one_expert"] = 1
    df_questions.loc[df_questions["question"] == q, "real_expert_number"] = expert_number

In [24]:
real_expert_df = df_questions.groupby(["question"]).mean().drop(['minority_number', 'c-sc',
       'has_at_least_one_expert'], axis=1)

In [25]:
def MSE(real_values, predicted_values):
    return ((real_values - predicted_values) ** 2).mean() ** .5
'''
Random refers to which rows will be selected as expert when the selections is being done.
Default, non random value is to sort the dataframe in ascending order on c-sc, and then
select k amount of rows based on the cutoff percentage. 
'''

def predict_expert_numbers(df_question, random, X_train):
    df_question["best_krueger_stat_for_the_prediction"] = 0
    df_question["diff_krueger_stat_real_prediction"] = 0
    df_question["predicted_expert_number"] = 0
    df_question["predicted_expert_number_PERCENTAGE"] = 0
    best_score_list = []
    questions_list = df_question.index.unique()
    
    for index in range(len(questions_list)):
        q_id = questions_list[index]
        df = df_question.loc[q_id]
        
        score_list = []
        
        
        if random == False:
            df = df.sort_values(['c-sc'], ascending=False)
            cutoffs = list(range(1, 101))
            cutoffs_l = [round(x/100*(int(df["minority_number"].mean()))) for x in cutoffs]
            cutoff_dict= dict(zip(cutoffs, cutoffs_l))

            for perc, cut in cutoff_dict.items():
#                 import pdb; pdb.set_trace()
                k_stat = krueger_statictic(df, True, cut)
                percentage_of_total_population = cut/len(df)
                best_d = best_distance(k_stat, X_train)
                threshold_used = best_d[0][1]
                score_list.append([q_id,k_stat, best_d[0][0], threshold_used,
                                   cut,percentage_of_total_population])
        else:
            for x in range(0,10):
#                 df = df.sample(frac=1, random_state=int(x**3)).reset_index(drop=True)
                df = df.sample(frac=1, random_state=int(x**3))
                cutoffs = list(range(1, 101))
    
                try:
                    cutoffs_l = [round(x/100*(int(df["minority_number"].mean()))) for x in cutoffs]
                except ValueError:
                    import pdb; pdb.set_trace()

                cutoffs_l = [round(x/100*(int(df["minority_number"].mean()))) for x in cutoffs]
                cutoff_dict= dict(zip(cutoffs, cutoffs_l))

                for perc, cut in cutoff_dict.items():
                    k_stat = krueger_statictic(df, True, cut)
                    percentage_of_total_population = cut/len(df)
                    best_d = best_distance(k_stat, X_train)
                    threshold_used = best_d[0][1]
                    score_list.append([q_id,k_stat, best_d[0][0], threshold_used,
                                       cut,percentage_of_total_population])
                                        
        score_list.sort(key=lambda tup: tup[2],reverse=False)
        best_score_list.append(score_list[0])
        
    '''
    Checks for the datasets which have a minority grou however, the minority group is wrong. 
    This means there are 0 experts in the dataset as per our definition of an expert (actual==own, in_minority=1).
    '''
        
    for i in range(len(best_score_list)):
        q = best_score_list[i][0]
        df_question.loc[q, "best_krueger_stat_for_the_prediction"] = best_score_list[i][3]
        df_question.loc[q, "diff_krueger_stat_real_prediction"] = best_score_list[i][2]
        if best_score_list[i][3] < 0: #if the threshold used is closer to the threshold of 0s
            expert_number = 0
            expert_perc = 0
        else:
            expert_number = best_score_list[i][4]
            expert_perc = best_score_list[i][5]
        df_question.loc[q, "predicted_expert_number"] = expert_number
        df_question.loc[q, "predicted_expert_number_PERCENTAGE"] = expert_perc
        
def best_distance(k_stat, X_train):
    thresholds = X_train["k_stat_thresh"].unique()
    best_distance = []
    comp1 = abs(float(thresholds[0]-k_stat))
    comp2 = abs(float(thresholds[1]-k_stat))
    if comp1 < comp2:
        best_distance.append([comp1,thresholds[0]])
    else:
        best_distance.append([comp2,thresholds[1]])
    return best_distance

def create_test_train(df_questions):
    df_questions = df_questions.set_index("question")
    size = 210
    
    qs = df_questions.index.unique()
    
    dt1 = np.random.choice(qs, size//5, replace=False)
    dt1 = df_questions.loc[dt1]
    df_questions = df_questions.drop(dt1.index)

    qs = df_questions.index.unique()
    dt2 = np.random.choice(qs, size//5, replace=False)
    dt2 = df_questions.loc[dt2]
    df_questions = df_questions.drop(dt2.index)

    qs = df_questions.index.unique()
    dt3 = np.random.choice(qs, size//5, replace=False)
    dt3 = df_questions.loc[dt3]
    df_questions = df_questions.drop(dt3.index)

    qs = df_questions.index.unique()
    dt4 = np.random.choice(qs, size//5, replace=False)
    dt4 = df_questions.loc[dt4]
    df_questions = df_questions.drop(dt4.index)

    qs = df_questions.index.unique()
    dt5 = np.random.choice(qs, len(qs), replace=False)
    dt5 = df_questions.loc[dt5]
    df_questions = df_questions.drop(dt5.index) 
    
    datasets = [dt1, dt2, dt3, dt4, dt5]
    
    return datasets

In [26]:
for q in df_questions["question"].unique():
    df_questions.loc[df_questions["question"] == q, "minority_percntge"] = (df_questions.loc[df_questions["question"] == q]["minority_number"].mean()/len(df_questions.loc[df_questions["question"] == q]))

# Calculating

In [27]:
'''
Create test and train dataset
Get an average Krueger score (of the 5) 
    AND Run predictions 5 times, see which is more accurate
'''
no_minority_situations = df_questions[df_questions["minority_number"] == 0.0]
df_questions = df_questions.drop(no_minority_situations.index)

random = [True,False]
trainKStat_expert_list = []
trainKStat_noExpert_list = []

data_sets = create_test_train(df_questions)

for i in range(len(data_sets)):
    print(f"Round {i} started.")
    test_data = data_sets[i]
    X_test = test_data.drop(["real_expert_number"], axis=1)
    y_test = test_data["real_expert_number"]
    
    train_data = []
    for index in range(len(data_sets)):
        if index != i:
            train_data.append(data_sets[index])

    X_train = pd.concat(train_data)    
    X_train["k_stat_thresh"] = 0
    
    trainKStat_expert = krueger_statictic(X_train[X_train["has_at_least_one_expert"] == 1], False, 0)
    print("Krueger train works.")
    trainKStat_noExpert = krueger_statictic(X_train[X_train["has_at_least_one_expert"] == 0], False, 0)
    trainKStat_expert_list.append(trainKStat_expert)
    trainKStat_noExpert_list.append(trainKStat_noExpert)
    
    X_train.loc[X_train["has_at_least_one_expert"] == 1, "k_stat_thresh"] = trainKStat_expert
    X_train.loc[X_train["has_at_least_one_expert"] == 0, "k_stat_thresh"] = trainKStat_noExpert

    for rand in random:
        predict_expert_numbers(X_test, rand, X_train)
        if rand:
            comparing_data = X_test.groupby('question').mean()
            comparing_data = comparing_data.drop(['minority_number', 
                                              'c-sc','has_at_least_one_expert',
                                              'best_krueger_stat_for_the_prediction',
                                              'diff_krueger_stat_real_prediction',
                                              'predicted_expert_number_PERCENTAGE'], axis=1)
        else:
            comparing_data["predict_random_F"] = X_test.groupby('question').mean()["predicted_expert_number"]
    print(f"Round {i} prediction ended.")
            
    comparing_data["real_expert_number"] = 0
    comparing_data.update(real_expert_df)
    comparing_data.columns = ["minority_percentage", "real_expert_number", "P_expert_number_rT", "P_expert_number_rF"]
    no_minority_situations = no_minority_situations.groupby("question").mean()
    if i == 0:
        final_data_sets = comparing_data
    else:
        final_data_sets = pd.concat([final_data_sets, comparing_data])
        
for ind, row in no_minority_situations.iterrows():
    final_data_sets.loc[ind] = [0,0,0,0]

Round 0 started.
Krueger train works.
Round 0 prediction ended.
Round 1 started.
Krueger train works.
Round 1 prediction ended.
Round 2 started.
Krueger train works.
Round 2 prediction ended.
Round 3 started.
Krueger train works.
Round 3 prediction ended.
Round 4 started.
Krueger train works.
Round 4 prediction ended.


In [28]:
k_expert_mean = sum(trainKStat_expert_list)/5
k_nonexpert_mean = sum(trainKStat_noExpert_list)/5

In [29]:
mse = MSE(final_data_sets["real_expert_number"], final_data_sets["P_expert_number_rT"])
mse_rF = MSE(final_data_sets["real_expert_number"], final_data_sets["P_expert_number_rF"])

In [42]:
final_data_sets.to_csv("krueger+minority.csv")

In [32]:
'''
Do it with their means
'''

data_sets = create_test_train(df_questions)
X_test = pd.concat(data_sets) 
X_train = pd.concat(data_sets) 
X_train["k_stat_thresh"] = 0

X_train.loc[X_train["has_at_least_one_expert"] == 1, "k_stat_thresh"] = k_expert_mean
X_train.loc[X_train["has_at_least_one_expert"] == 0, "k_stat_thresh"] = k_nonexpert_mean

for rand in random:
    predict_expert_numbers(X_test, rand, X_train)
    if rand:
        comparing_data = X_test.groupby('question').mean()
        comparing_data = comparing_data.drop(['minority_number', 
                                          'c-sc','has_at_least_one_expert',
                                          'best_krueger_stat_for_the_prediction',
                                          'diff_krueger_stat_real_prediction',
                                          'predicted_expert_number_PERCENTAGE'], axis=1)
    else:
        comparing_data["predict_random_F"] = X_test.groupby('question').mean()["predicted_expert_number"]

comparing_data["real_expert_number"] = 0
comparing_data.update(real_expert_df)
comparing_data.columns = ["real_expert_number", "minority_percentage", "P_expert_number_rT", "P_expert_number_rF"]
no_minority_situations = no_minority_situations.groupby("question").mean()

for ind, row in no_minority_situations.iterrows():
    comparing_data.loc[ind] = [0,0,0,0]

In [67]:
comparing_data.columns = ["real_expert_number", "minority_percentage", "MEAN_P_expert_rT", "MEAN_P_expert_rF"]
comparing_data.sort_index(inplace=True)
final_data_sets.sort_index(inplace=True)

In [68]:
mse = MSE(final_data_sets["real_expert_number"], final_data_sets["P_expert_number_rT"])
mse_rF = MSE(final_data_sets["real_expert_number"], final_data_sets["P_expert_number_rF"])
mse_m = MSE(final_data_sets["real_expert_number"], final_data_sets["MEAN_P_expert_rT"])
mse_rF_m = MSE(final_data_sets["real_expert_number"], final_data_sets["MEAN_P_expert_rF"])
mse, mse_rF, mse_m, mse_rF_m

(5.233591137113807, 5.848890818738525, 3.0237157840738176, 5.333630944077313)

In [69]:
final_data_sets["MEAN_P_expert_rT"] = comparing_data["MEAN_P_expert_rT"]
final_data_sets["MEAN_P_expert_rF"] = comparing_data["MEAN_P_expert_rF"]

In [70]:
final_data_sets.to_csv("krueger+minority.csv")