In [1]:
import pandas as pd
import os
from start import data_path
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

  from collections import Sequence


In [2]:
hand_data = pd.read_csv(os.path.join(data_path, 'validation.csv'))
hand_data = hand_data[['title', 'gold_standard']]
print('Length of hand coded validation set: ', len(hand_data))
hand_data.sample(5)

Length of hand coded validation set:  30


Unnamed: 0,title,gold_standard
23,Miami ISD,"[25.0811, 25.0812, 11.251, 28.004, 21.352, 21...."
25,Dew ISD,"[25.0811, 25.081, 21.401, 21.003]"
20,River Road ISD,"[25.0811, 25.0812, 21.003, 21.055, 21.057, 25...."
21,Jayton-Girard ISD,"[25.0811, 25.0812, 21.003, 21.053, 21.057, 25...."
0,Evant ISD,"[21.401, 25.081, 25.081, 21.003, 25.0811, 21.003]"


In [3]:
final_data = pd.read_csv(os.path.join(data_path, 'doi_exemptions_list.csv'))
final_data = final_data[['title', 'possible_laws']]

In [4]:
validation = hand_data.merge(final_data, on = 'title', how = 'left')
print("Full Validation Data")
validation

Full Validation Data


Unnamed: 0,title,gold_standard,possible_laws
0,Evant ISD,"[21.401, 25.081, 25.081, 21.003, 25.0811, 21.003]","[25.0811, 21.003, 21.401, 25.081]"
1,Buna ISD,"[25.0811, 25.0812, 25.112, 21.003, 21.053, 21....","[25.0811, 21.003, 21.057, 21.451, 21.458, 25.1..."
2,Anson ISD,"[21.003, 21.057, 25.081, 25.0811, 25.092, 21.4...","[25.0811, 11.253, 28.0216, 21.003, 21.057, 21...."
3,White Deer ISD,"[25.0811, 21.003, 21.053, 21.057, 21.401, 25.0...","[37.008, 25.082, 25.001, 21.003, 21.053, 21.40..."
4,Bryan ISD,"[25.0811, 21.003, 25.092, 25.111, 25.112, 25.113]","[25.0811, 28.0216, 21.0031, 21.003, 25.113, 21..."
5,La Vega ISD,"[11.251, 11.252, 11.253, 21.003, 21.0031, 21.0...","[11.251, 11.252, 11.253, 21.003, 21.0031, 21.0..."
6,Sidney ISD,"[25.0811, 21.003, 25.081, 25.082, 21.102]","[25.081, 21.102, 21.003, 25.082, 25.0811]"
7,Valley View ISD,"[11.251, 11.252, 11.253, 21.003, 21.102, 21.40...","11.251, 11.252, 11.253, 21.003, 21.102, 21.401..."
8,Burkburnett ISD,"[25.092, 25.081, 25.082, 37.0012, 25.0811, 25....","[37.0012, 25.082, 11.251, 11.252, 25.081, 25.0..."
9,Community ISD,"[21.003, 21.057, 21.102, 21.351, 21.352, 21.35...","[37.0012, 21.003, 44.902, 25.036, 21.102, 21.4..."


In [5]:
def make_list(string):
    new_list = [i.replace('[','').replace(']','').replace('\n', '') for i in string.split(', ')]
    return new_list

In [6]:
gold_laws = []
for l in validation.gold_standard:
    for item in make_list(l):
        if item not in gold_laws:
            gold_laws.append(item)
print("All laws in hand-coded validation set. Lenghth: ", len(gold_laws))
gold_laws

All laws in hand-coded validation set. Lenghth:  47


['21.401',
 '25.081',
 '21.003',
 '25.0811',
 '25.0812',
 '25.112',
 '21.053',
 '21.055',
 '21.057',
 '21.102',
 '21.451',
 '21.458',
 '25.092',
 '21.404',
 '11.253',
 '25.082',
 '37.008',
 '25.036',
 '25.111',
 '25.113',
 '11.251',
 '11.252',
 '21.0031',
 '21.158',
 '25.083',
 '37.0012',
 '44.031',
 '44.0331',
 '44.0352',
 '44.042',
 '44.043',
 '44.047',
 '28.004',
 '37.0081',
 '37.0082',
 '21.351',
 '21.352',
 '21.3541',
 '28.025',
 '29.151',
 '44.902',
 '25.114',
 '22.011',
 '21.203',
 '21.353',
 '21.002',
 '21.044']

# Create gold standard dataframe


In [7]:
gold_df = validation[['title', 'gold_standard']]
gold_df['gold_standard'] = gold_df['gold_standard'].apply(make_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [8]:
mlb = MultiLabelBinarizer()
gold_df = gold_df.join(pd.DataFrame(mlb.fit_transform(gold_df.pop('gold_standard')),
                          columns=mlb.classes_,
                          index=gold_df.index))
gold_df.head(10)

Unnamed: 0,title,11.251,11.252,11.253,21.002,21.003,21.0031,21.044,21.053,21.055,...,37.008,37.0081,37.0082,44.031,44.0331,44.0352,44.042,44.043,44.047,44.902
0,Evant ISD,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Buna ISD,0,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,Anson ISD,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,White Deer ISD,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,Bryan ISD,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,La Vega ISD,1,1,1,0,1,1,0,1,1,...,0,0,0,1,1,1,1,1,1,0
6,Sidney ISD,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Valley View ISD,1,1,1,0,1,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
8,Burkburnett ISD,1,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,Community ISD,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
gold_stats = gold_df.describe()
gold_stats = pd.DataFrame(gold_stats.loc['mean'])
gold_stats['gold_count'] = gold_stats['mean'] * len(gold_df)
gold_stats = gold_stats.sort_values(by = 'mean', ascending = False)
gold_stats = pd.DataFrame(gold_stats['gold_count'])
top_gold = gold_stats.head(20)
top_gold = list(top_gold.index)
gold_stats

Unnamed: 0,gold_count
21.003,30.0
25.0811,30.0
21.102,19.0
25.112,16.0
21.057,16.0
25.081,15.0
25.113,14.0
25.082,13.0
21.401,12.0
21.053,11.0


In [10]:
print("Only calculating precision and recall for top-twenty laws, ",
     gold_stats.head(20).gold_count.sum(), " in total")


Only calculating precision and recall for top-twenty laws,  243.0  in total


In [11]:
top_gold

['21.003',
 '25.0811',
 '21.102',
 '25.112',
 '21.057',
 '25.081',
 '25.113',
 '25.082',
 '21.401',
 '21.053',
 '25.092',
 '25.0812',
 '37.0012',
 '21.352',
 '25.036',
 '11.251',
 '25.111',
 '21.458',
 '11.252',
 '21.3541']

# Create automated dataframe

In [12]:
auto_df = validation[['title', 'possible_laws']]
auto_df.loc[:,'possible_laws'] = np.where((auto_df.possible_laws.isnull()), '[]', auto_df.possible_laws)
auto_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Unnamed: 0,title,possible_laws
0,Evant ISD,"[25.0811, 21.003, 21.401, 25.081]"
1,Buna ISD,"[25.0811, 21.003, 21.057, 21.451, 21.458, 25.1..."
2,Anson ISD,"[25.0811, 11.253, 28.0216, 21.003, 21.057, 21...."
3,White Deer ISD,"[37.008, 25.082, 25.001, 21.003, 21.053, 21.40..."
4,Bryan ISD,"[25.0811, 28.0216, 21.0031, 21.003, 25.113, 21..."
5,La Vega ISD,"[11.251, 11.252, 11.253, 21.003, 21.0031, 21.0..."
6,Sidney ISD,"[25.081, 21.102, 21.003, 25.082, 25.0811]"
7,Valley View ISD,"11.251, 11.252, 11.253, 21.003, 21.102, 21.401..."
8,Burkburnett ISD,"[37.0012, 25.082, 11.251, 11.252, 25.081, 25.0..."
9,Community ISD,"[37.0012, 21.003, 44.902, 25.036, 21.102, 21.4..."


In [13]:
auto_df = auto_df[['title', 'possible_laws']]
auto_df['extracted'] = auto_df['possible_laws'].apply(make_list)
auto_df

Unnamed: 0,title,possible_laws,extracted
0,Evant ISD,"[25.0811, 21.003, 21.401, 25.081]","[25.0811, 21.003, 21.401, 25.081]"
1,Buna ISD,"[25.0811, 21.003, 21.057, 21.451, 21.458, 25.1...","[25.0811, 21.003, 21.057, 21.451, 21.458, 25.1..."
2,Anson ISD,"[25.0811, 11.253, 28.0216, 21.003, 21.057, 21....","[25.0811, 11.253, 28.0216, 21.003, 21.057, 21...."
3,White Deer ISD,"[37.008, 25.082, 25.001, 21.003, 21.053, 21.40...","[37.008, 25.082, 25.001, 21.003, 21.053, 21.40..."
4,Bryan ISD,"[25.0811, 28.0216, 21.0031, 21.003, 25.113, 21...","[25.0811, 28.0216, 21.0031, 21.003, 25.113, 21..."
5,La Vega ISD,"[11.251, 11.252, 11.253, 21.003, 21.0031, 21.0...","[11.251, 11.252, 11.253, 21.003, 21.0031, 21.0..."
6,Sidney ISD,"[25.081, 21.102, 21.003, 25.082, 25.0811]","[25.081, 21.102, 21.003, 25.082, 25.0811]"
7,Valley View ISD,"11.251, 11.252, 11.253, 21.003, 21.102, 21.401...","[11.251, 11.252, 11.253, 21.003, 21.102, 21.40..."
8,Burkburnett ISD,"[37.0012, 25.082, 11.251, 11.252, 25.081, 25.0...","[37.0012, 25.082, 11.251, 11.252, 25.081, 25.0..."
9,Community ISD,"[37.0012, 21.003, 44.902, 25.036, 21.102, 21.4...","[37.0012, 21.003, 44.902, 25.036, 21.102, 21.4..."


In [14]:
mlb = MultiLabelBinarizer()
auto_df = auto_df.join(pd.DataFrame(mlb.fit_transform(auto_df.pop('extracted')),
                          columns=mlb.classes_,
                          index=auto_df.index))
auto_df.head(10)

Unnamed: 0,title,possible_laws,11.251,11.252,11.253,21.002,21.003,21.0031,21.044,21.053,...,37.008,37.0081,37.0082,44.031,44.0331,44.0352,44.042,44.043,44.047,44.902
0,Evant ISD,"[25.0811, 21.003, 21.401, 25.081]",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Buna ISD,"[25.0811, 21.003, 21.057, 21.451, 21.458, 25.1...",0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Anson ISD,"[25.0811, 11.253, 28.0216, 21.003, 21.057, 21....",0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,White Deer ISD,"[37.008, 25.082, 25.001, 21.003, 21.053, 21.40...",0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
4,Bryan ISD,"[25.0811, 28.0216, 21.0031, 21.003, 25.113, 21...",0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,La Vega ISD,"[11.251, 11.252, 11.253, 21.003, 21.0031, 21.0...",1,1,1,0,1,1,0,1,...,0,0,0,1,1,1,1,1,1,0
6,Sidney ISD,"[25.081, 21.102, 21.003, 25.082, 25.0811]",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Valley View ISD,"11.251, 11.252, 11.253, 21.003, 21.102, 21.401...",1,1,1,0,1,0,0,0,...,1,1,1,0,0,0,0,0,0,0
8,Burkburnett ISD,"[37.0012, 25.082, 11.251, 11.252, 25.081, 25.0...",1,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9,Community ISD,"[37.0012, 21.003, 44.902, 25.036, 21.102, 21.4...",0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [15]:
auto_stats = auto_df.describe()
auto_stats = pd.DataFrame(auto_stats.loc['mean'])
auto_stats['count_auto'] = auto_stats['mean'] * len(auto_df)
auto_stats = auto_stats.sort_values(by = 'mean', ascending = False)
auto_stats = pd.DataFrame(auto_stats['count_auto'])
top_auto = auto_stats.head(20)
top_auto = list(top_auto.index)
auto_stats

Unnamed: 0,count_auto
25.0811,30.0
21.003,30.0
21.102,20.0
21.057,17.0
25.081,15.0
25.113,14.0
25.112,14.0
25.082,13.0
21.401,12.0
21.053,11.0


# Merge

In [16]:
validate_table = gold_stats.merge(auto_stats, how = 'outer', left_index = True, right_index = True)
validate_table = validate_table.sort_values(by = ['gold_count'], ascending = False)
validate_table

Unnamed: 0,gold_count,count_auto
21.003,30.0,30.0
25.0811,30.0,30.0
21.102,19.0,20.0
25.112,16.0,14.0
21.057,16.0,17.0
25.081,15.0,15.0
25.113,14.0,14.0
25.082,13.0,13.0
21.401,12.0,12.0
21.053,11.0,11.0


# Explore

In [17]:
top = set(top_auto + top_gold)
top

{'11.251',
 '11.252',
 '21.003',
 '21.053',
 '21.057',
 '21.102',
 '21.352',
 '21.3541',
 '21.401',
 '21.458',
 '25.036',
 '25.081',
 '25.0811',
 '25.0812',
 '25.082',
 '25.092',
 '25.111',
 '25.112',
 '25.113',
 '28.0216',
 '37.0012'}

# False Negatives

In [18]:
indices = []
laws = []
count_fn = 0
for index, row in validation.set_index('title').iterrows():
    indices.append(index)
    gold_laws = make_list(row['gold_standard'])
    possible_laws = make_list(row['possible_laws'])
    fn = [law for law in gold_laws if law in top and law not in possible_laws]
    laws.append(fn)
    count_fn = count_fn + len(fn)

print('false negatives', count_fn)
list(zip(indices, laws))

false negatives 4


[('Evant ISD', []),
 ('Buna ISD', []),
 ('Anson ISD', []),
 ('White Deer ISD', []),
 ('Bryan ISD', []),
 ('La Vega ISD', []),
 ('Sidney ISD', []),
 ('Valley View ISD', []),
 ('Burkburnett ISD', []),
 ('Community ISD', []),
 ('Bridgeport ISD', []),
 ('De Leon ISD', []),
 ('White Oak ISD', []),
 ('Carlisle ISD', []),
 ('Blanco ISD', []),
 ('Dublin ISD', []),
 ('Ricardo ISD', []),
 ('Comanche ISD', []),
 ('Hardin ISD', []),
 ('Burkeville ISD', []),
 ('River Road ISD', ['25.112', '25.113']),
 ('Jayton-Girard ISD', []),
 ('La Porte ISD', ['25.112', '25.113']),
 ('Miami ISD', []),
 ('Eanes ISD', []),
 ('Dew ISD', []),
 ('Morton ISD', []),
 ('Mexia ISD', []),
 ('Comfort ISD', []),
 ('Mesquite ISD', [])]

# False Positives

In [19]:
indices = []
laws = []
count_fp = 0
for index, row in validation.set_index('title').iterrows():
    indices.append(index)
    gold_laws = make_list(row['gold_standard'])
    possible_laws = make_list(row['possible_laws'])
    fp = [law for law in possible_laws if  law in top and law not in gold_laws]
    laws.append(fp)
    count_fp = count_fp + len(fp)

print('false positives', count_fp)
list(zip(indices, laws))

false positives 10


[('Evant ISD', []),
 ('Buna ISD', ['25.113']),
 ('Anson ISD', ['28.0216']),
 ('White Deer ISD', []),
 ('Bryan ISD', ['28.0216', '21.057']),
 ('La Vega ISD', []),
 ('Sidney ISD', []),
 ('Valley View ISD', []),
 ('Burkburnett ISD', []),
 ('Community ISD', []),
 ('Bridgeport ISD', ['28.0216']),
 ('De Leon ISD', []),
 ('White Oak ISD', []),
 ('Carlisle ISD', []),
 ('Blanco ISD', []),
 ('Dublin ISD', []),
 ('Ricardo ISD', ['25.113']),
 ('Comanche ISD', []),
 ('Hardin ISD', []),
 ('Burkeville ISD', ['25.0812']),
 ('River Road ISD', ['28.0216']),
 ('Jayton-Girard ISD', []),
 ('La Porte ISD', ['21.102']),
 ('Miami ISD', []),
 ('Eanes ISD', []),
 ('Dew ISD', []),
 ('Morton ISD', []),
 ('Mexia ISD', ['28.0216']),
 ('Comfort ISD', []),
 ('Mesquite ISD', [])]

# True Positives

In [20]:
indices = []
true_positives = []
count_tp = 0
for index, row in validation.set_index('title').iterrows():
    indices.append(index)
    gold_laws = make_list(row['gold_standard'])
    possible_laws = make_list(row['possible_laws'])
    tp = [law for law in possible_laws if law in top and law in gold_laws]
    true_positives.append(tp)
    count_tp = count_tp + len(tp)

print(count_tp)

239


In [21]:
# Precision
precision = count_tp/(count_tp+count_fp)
print(precision)

0.9598393574297188


In [22]:
# Recall
recall = count_tp/(count_tp+count_fn)
print(recall)

0.9835390946502057
