# Analysis of Science Cliques output to find missing runs (there should be 7200 entries, 5 are missing)

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
datafile_path = "../data/output.csv"
full_data_df = pd.read_csv(datafile_path)

In [3]:
full_data_df

Unnamed: 0,num_individuals,num_neighbors,num_facts,starting_knowledge,investigation_probability,philosophy,truth_mean,truth_total,false_mean,false_total
0,8,8,16,15,0.0,skeptical,0.691540,53,0.308460,25
1,8,8,16,15,0.0,skeptical,0.452304,34,0.547696,44
2,8,8,16,15,0.0,skeptical,0.676573,53,0.323427,29
3,8,8,16,15,0.0,skeptical,0.497285,36,0.502715,37
4,8,8,16,15,0.0,skeptical,0.489836,40,0.510164,43
...,...,...,...,...,...,...,...,...,...,...
7190,100,8,1500,15,0.8,indirect,0.571538,50981,0.428462,38215
7191,100,8,1500,15,0.8,indirect,0.620185,53868,0.379815,33013
7192,100,8,1500,15,0.8,indirect,0.577887,49945,0.422113,36488
7193,100,8,1500,15,0.8,indirect,0.665077,56524,0.334923,28454


The .6 investigation probability for some reason ran as 0.6000000000000001, so I'm going to replace all instances of 0.6000000000000001 with .6 to make it easier to query.

In [4]:
full_data_df = full_data_df.replace(to_replace=0.6000000000000001, value=.6)
full_data_df

Unnamed: 0,num_individuals,num_neighbors,num_facts,starting_knowledge,investigation_probability,philosophy,truth_mean,truth_total,false_mean,false_total
0,8,8,16,15,0.0,skeptical,0.691540,53,0.308460,25
1,8,8,16,15,0.0,skeptical,0.452304,34,0.547696,44
2,8,8,16,15,0.0,skeptical,0.676573,53,0.323427,29
3,8,8,16,15,0.0,skeptical,0.497285,36,0.502715,37
4,8,8,16,15,0.0,skeptical,0.489836,40,0.510164,43
...,...,...,...,...,...,...,...,...,...,...
7190,100,8,1500,15,0.8,indirect,0.571538,50981,0.428462,38215
7191,100,8,1500,15,0.8,indirect,0.620185,53868,0.379815,33013
7192,100,8,1500,15,0.8,indirect,0.577887,49945,0.422113,36488
7193,100,8,1500,15,0.8,indirect,0.665077,56524,0.334923,28454


In [5]:
full_data_df.groupby(['num_individuals']).size()

num_individuals
8      1200
20     1197
40     1200
60     1198
80     1200
100    1200
dtype: int64

We need to figure out what is missing from the 20 individuals set and the 60 individuals set

## 20 individuals

In [6]:
num_individuals_20 = full_data_df[full_data_df['num_individuals'] == 20]
num_individuals_20

Unnamed: 0,num_individuals,num_neighbors,num_facts,starting_knowledge,investigation_probability,philosophy,truth_mean,truth_total,false_mean,false_total
1200,20,8,16,15,0.0,skeptical,0.616616,127,0.383384,80
1201,20,8,16,15,0.0,skeptical,0.657134,127,0.342866,68
1202,20,8,16,15,0.0,skeptical,0.486187,103,0.513813,107
1203,20,8,16,15,0.0,skeptical,0.622789,122,0.377211,75
1204,20,8,16,15,0.0,skeptical,0.628977,134,0.371023,78
...,...,...,...,...,...,...,...,...,...,...
2392,20,8,1500,15,0.8,indirect,0.714079,3956,0.285921,1584
2393,20,8,1500,15,0.8,indirect,0.595665,3216,0.404335,2183
2394,20,8,1500,15,0.8,indirect,0.685036,3630,0.314964,1669
2395,20,8,1500,15,0.8,indirect,0.629368,3386,0.370632,1994


In [7]:
num_individuals_20.groupby(['num_facts']).size()

num_facts
16      200
300     199
600     200
900     199
1200    199
1500    200
dtype: int64

One each from 300, 900, and 1200

In [8]:
num_individuals_20_num_facts_300 = num_individuals_20[num_individuals_20['num_facts']==300]
num_individuals_20_num_facts_900 = num_individuals_20[num_individuals_20['num_facts']==900]
num_individuals_20_num_facts_1200 = num_individuals_20[num_individuals_20['num_facts']==1200]

In [9]:
num_individuals_20_num_facts_300.groupby(['investigation_probability']).size()

investigation_probability
0.0    40
0.2    40
0.4    40
0.6    39
0.8    40
dtype: int64

In [10]:
num_individuals_20_num_facts_900.groupby(['investigation_probability']).size()

investigation_probability
0.0    39
0.2    40
0.4    40
0.6    40
0.8    40
dtype: int64

In [11]:
num_individuals_20_num_facts_1200.groupby(['investigation_probability']).size()

investigation_probability
0.0    40
0.2    40
0.4    40
0.6    40
0.8    39
dtype: int64

We are missing a .6 investigation probability from 300, a 0 from 900, and a .8 from 1200. Lets find philosophies.

In [14]:
num_individuals_20_num_facts_300_invest_prob_6 = num_individuals_20_num_facts_300[num_individuals_20_num_facts_300['investigation_probability']==0.6]
num_individuals_20_num_facts_300_invest_prob_6.groupby(['philosophy']).size()

philosophy
direct       10
indirect     10
reid          9
skeptical    10
dtype: int64

***Missing one 20 individuals, 300 facts, investigation probability .6, philosophy reid.***


In [16]:
num_individuals_20_num_facts_900_invest_prob_0 = num_individuals_20_num_facts_900[num_individuals_20_num_facts_900['investigation_probability']==0]
num_individuals_20_num_facts_900_invest_prob_0.groupby(['philosophy']).size()

philosophy
direct       10
indirect     10
reid          9
skeptical    10
dtype: int64

***Missing one 20 individuals, 900 facts, investigation probability 0, philosophy reid.***


In [17]:
num_individuals_20_num_facts_1200_invest_prob_8 = num_individuals_20_num_facts_1200[num_individuals_20_num_facts_1200['investigation_probability']==.8]
num_individuals_20_num_facts_1200_invest_prob_8.groupby(['philosophy']).size()

philosophy
direct       10
indirect     10
reid         10
skeptical     9
dtype: int64

***Missing one 20 individuals, 1200 facts, investigation probability .8, philosophy skeptical.***


### CONCLUSION

Missing:

20 individuals, 300 facts, .6 investigation probability, reid philosophy

20 individuals, 900 facts, 0 investigation probability, reid philosophy

20 individuals, 1200 facts, .8 investigation probability, skeptical philosophy

## 60 Individuals

In [18]:
num_individuals_60 = full_data_df[full_data_df['num_individuals'] == 60]
num_individuals_60

Unnamed: 0,num_individuals,num_neighbors,num_facts,starting_knowledge,investigation_probability,philosophy,truth_mean,truth_total,false_mean,false_total
3597,60,8,16,15,0.0,skeptical,0.672121,396,0.327879,194
3598,60,8,16,15,0.0,skeptical,0.599098,366,0.400902,238
3599,60,8,16,15,0.0,skeptical,0.587032,355,0.412968,243
3600,60,8,16,15,0.0,skeptical,0.614323,362,0.385677,223
3601,60,8,16,15,0.0,skeptical,0.604806,361,0.395194,232
...,...,...,...,...,...,...,...,...,...,...
4790,60,8,1500,15,0.8,indirect,0.591159,23252,0.408841,16080
4791,60,8,1500,15,0.8,indirect,0.593683,23999,0.406317,16423
4792,60,8,1500,15,0.8,indirect,0.618626,24550,0.381374,15134
4793,60,8,1500,15,0.8,indirect,0.651369,26294,0.348631,14075


In [19]:
num_individuals_60.groupby(['num_facts']).size()

num_facts
16      198
300     200
600     200
900     200
1200    200
1500    200
dtype: int64

Only missing from num_facts 16

In [20]:
num_individuals_60_num_facts_16 = num_individuals_60[num_individuals_60['num_facts']==16]
num_individuals_60_num_facts_16.groupby(['investigation_probability']).size()

investigation_probability
0.0    38
0.2    40
0.4    40
0.6    40
0.8    40
dtype: int64

Only from 0 investigation probability

In [22]:
num_individuals_60_num_facts_16_inv_prob_0 = num_individuals_60_num_facts_16[num_individuals_60_num_facts_16['investigation_probability']==0]
num_individuals_60_num_facts_16_inv_prob_0.groupby('philosophy').size()

philosophy
direct       10
indirect     10
reid         10
skeptical     8
dtype: int64

## CONCLUSION
Missing:

2 runs, 60 individuals, 16 facts, 0 investigation probability, skeptical philosophy

# Overall Conclusion

Missing 5 runs with parameters as follows:

num_individuals	num_neighbors	num_facts	starting_knowledge	investigation_probability	philosophy

20, 8, 300, 15, .6, reid

20, 8, 900, 15, 0, reid

20, 8, 1200, 15, .8, skeptical

60, 8, 16, 0, 15, 0, skeptical

60, 8, 16, 0, 15, 0, skeptical


# Ran the model again with above parameters, lets make sure it worked

In [23]:
new_full_data_df = pd.read_csv(datafile_path)
new_full_data_df

Unnamed: 0,num_individuals,num_neighbors,num_facts,starting_knowledge,investigation_probability,philosophy,truth_mean,truth_total,false_mean,false_total
0,8,8,16,15,0.0,skeptical,0.691540,53,0.308460,25
1,8,8,16,15,0.0,skeptical,0.452304,34,0.547696,44
2,8,8,16,15,0.0,skeptical,0.676573,53,0.323427,29
3,8,8,16,15,0.0,skeptical,0.497285,36,0.502715,37
4,8,8,16,15,0.0,skeptical,0.489836,40,0.510164,43
...,...,...,...,...,...,...,...,...,...,...
7195,20,8,300,15,0.6,reid,0.427249,1615,0.572751,2165
7196,20,8,900,15,0.0,reid,0.716798,3627,0.283202,1433
7197,20,8,1200,15,0.0,skeptical,0.608571,181,0.391429,115
7198,60,8,16,15,0.0,skeptical,0.519184,307,0.480816,278


In [24]:
new_full_data_df.groupby(['num_individuals']).size()

num_individuals
8      1200
20     1200
40     1200
60     1200
80     1200
100    1200
dtype: int64

In [25]:
new_full_data_df.groupby(['num_facts']).size()

num_facts
16      1200
300     1200
600     1200
900     1200
1200    1200
1500    1200
dtype: int64

In [26]:
new_full_data_df.groupby(['investigation_probability']).size()

investigation_probability
0.0    1441
0.2    1440
0.4    1440
0.6       1
0.6    1439
0.8    1439
dtype: int64

The .6's are fine (one is .6 and the other is 0.6000000000000001,I'm fine combining those. There is something weird happening with the 0 and the .8 though. I'll look into that in a second

In [27]:
new_full_data_df.groupby(['philosophy']).size()

philosophy
direct       1800
indirect     1800
reid         1800
skeptical    1800
dtype: int64

Ok, lets see whats happening with the .8 and the 0

In [30]:
new_full_data_df.groupby(['investigation_probability', 'philosophy']).size()

investigation_probability  philosophy
0.0                        direct        360
                           indirect      360
                           reid          360
                           skeptical     361
0.2                        direct        360
                           indirect      360
                           reid          360
                           skeptical     360
0.4                        direct        360
                           indirect      360
                           reid          360
                           skeptical     360
0.6                        reid            1
                           direct        360
                           indirect      360
                           reid          359
                           skeptical     360
0.8                        direct        360
                           indirect      360
                           reid          360
                           skeptical     359
dtype: int64

We have an extra skeptical investigation probability 0.0 and are missing a skeptical investigation probability 0.8. Checked the data I accidentally ran the 20 individual .8 probability skeptical with .0. Deleting that entry and running it correctly now.

# One more time

In [32]:
newer_full_data_df = pd.read_csv(datafile_path)
newer_full_data_df

Unnamed: 0,num_individuals,num_neighbors,num_facts,starting_knowledge,investigation_probability,philosophy,truth_mean,truth_total,false_mean,false_total
0,8,8,16,15,0.0,skeptical,0.691540,53,0.308460,25
1,8,8,16,15,0.0,skeptical,0.452304,34,0.547696,44
2,8,8,16,15,0.0,skeptical,0.676573,53,0.323427,29
3,8,8,16,15,0.0,skeptical,0.497285,36,0.502715,37
4,8,8,16,15,0.0,skeptical,0.489836,40,0.510164,43
...,...,...,...,...,...,...,...,...,...,...
7195,20,8,300,15,0.6,reid,0.427249,1615,0.572751,2165
7196,20,8,900,15,0.0,reid,0.716798,3627,0.283202,1433
7197,60,8,16,15,0.0,skeptical,0.519184,307,0.480816,278
7198,60,8,16,15,0.0,skeptical,0.633720,386,0.366280,227


In [33]:
newer_full_data_df.groupby(['num_individuals']).size()

num_individuals
8      1200
20     1200
40     1200
60     1200
80     1200
100    1200
dtype: int64

In [34]:
newer_full_data_df.groupby(['num_facts']).size()

num_facts
16      1200
300     1200
600     1200
900     1200
1200    1200
1500    1200
dtype: int64

In [35]:
newer_full_data_df.groupby(['investigation_probability']).size()

investigation_probability
0.0    1440
0.2    1440
0.4    1440
0.6       1
0.6    1439
0.8    1440
dtype: int64

In [36]:
newer_full_data_df.groupby(['philosophy']).size()

philosophy
direct       1800
indirect     1800
reid         1800
skeptical    1800
dtype: int64

***Fine!***