In [1]:
import scipy
import pandas as pd

from helpers import sql

# pandas display options: render up to 100 rows and 100 columns, and up to
# 200 characters per cell, before pandas truncates the output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
# Fully-qualified name, consistent with the two options above. The bare
# 'max_colwidth' only resolves through pandas' pattern matching of option
# names, which would break if another option matching that pattern is added.
pd.set_option('display.max_colwidth', 200)

In [2]:
# Load every row and column of the csas2_csasrequest table through the
# project-local `sql` helper. The cells visible in this notebook only read the
# 'status' column of the result.
# NOTE(review): presumably `sql` returns a pandas DataFrame (it is indexed as
# df['status'] and .value_counts() is called on the result) — confirm.
df = sql("SELECT * FROM csas2_csasrequest;")

In [3]:
"""
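2. Collapse the long tail until k ≈ N_eff
(k = number of distinct status categories; N_eff = 2**entropy, the effective number of categories.)
Sort vc in descending order.

Starting from the smallest counts, iteratively merge each rare status into a parent (or “Other”) and recompute k.

Stop when k ≤ c × N_eff for some factor c between 1 and 1.5.
The buffer of up to 1.5 keeps a bit of nuance without re-inflating the list.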
"""

def do_stats(counts_dataframe):
    """Compute entropy, effective category count and redundancy of a distribution.

    Parameters
    ----------
    counts_dataframe : array-like
        Counts or proportions, one entry per category. Despite the name, the
        callers in this notebook pass a pandas Series produced by
        ``value_counts(normalize=True)``. Per SciPy's documentation,
        ``scipy.stats.entropy`` normalises its input if it does not sum to 1.

    Returns
    -------
    tuple
        ``(entropy, n_eff, redundancy)`` where

        * ``entropy`` is the Shannon entropy in bits (``base=2``),
        * ``n_eff`` is ``2 ** entropy``, the effective number of categories,
        * ``redundancy`` is ``len(counts_dataframe) / n_eff``, i.e. how many
          actual categories exist per effective category.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.stats` is loaded on every SciPy version.
    from scipy import stats

    # Compute the entropy once and derive the other statistics from it.
    entropy = stats.entropy(counts_dataframe, base=2)
    n_eff = 2**entropy
    redundancy = len(counts_dataframe) / n_eff

    return entropy, n_eff, redundancy


def print_stats(counts_dataframe):
    """Print the summary statistics of a distribution, then render it as a table.

    The three values returned by ``do_stats`` (entropy, n_eff, redundancy) are
    printed rounded to two decimals, one per line, followed by the input
    wrapped in a ``pd.DataFrame`` and shown with ``display``.

    NOTE(review): ``display`` is not imported in this file; presumably it is
    the name IPython injects into notebook sessions — confirm.
    """
    labels = ('\nentropy:\t', 'n_eff:\t\t', 'redundancy:\t')
    for label, stat in zip(labels, do_stats(counts_dataframe)):
        print(label, round(float(stat), 2))

    print()
    display(pd.DataFrame(counts_dataframe))
    print()


In [4]:
# Base case: the status distribution exactly as stored, with no categories merged.
# The string below records the numeric code of each status for reference.
"""

Status List:

    DRAFT = 10
    UNDER_CLIENT_APPROVAL = 20
    AWAITING_CHANGES = 25
    READY_FOR_CSAS_REVIEW = 30
    UNDER_CSAS_REVIEW = 40
    SCREENED_IN = 41
    FLAGGED = 42
    RESCOPING = 43
    REFER_TO_PROCESS = 70
    FULFILLED = 80
    WITHDRAWN = 99

"""

# normalize=True yields each status' share of all rows rather than raw counts.
# `vc` is the baseline that the following cells copy before merging categories.
vc = df['status'].value_counts(normalize=True)
print_stats(vc)


entropy:	 2.74
n_eff:		 6.66
redundancy:	 1.65



Unnamed: 0_level_0,proportion
status,Unnamed: 1_level_1
99,0.275715
80,0.217894
70,0.17468
41,0.127815
30,0.065733
10,0.059038
40,0.041388
42,0.021302
43,0.010956
20,0.003652





In [5]:
# Scenario: fold status 25 (AWAITING_CHANGES) into status 20 (UNDER_CLIENT_APPROVAL).
"""

New Status List:

    DRAFT
    UNDER_CLIENT_APPROVAL
    READY_FOR_CSAS_REVIEW
    UNDER_CSAS_REVIEW
    SCREENED_IN
    FLAGGED
    RESCOPING
    REFER_TO_PROCESS
    FULFILLED
    WITHDRAWN

"""

# Work on a copy so the baseline distribution `vc` stays untouched.
vc2 = vc.copy()
# Read 20's share, remove 25 from the series, and store the combined share under 20.
vc2[20] = vc2[20] + vc2.pop(25)
print_stats(vc2)


entropy:	 2.73
n_eff:		 6.64
redundancy:	 1.51



Unnamed: 0_level_0,proportion
status,Unnamed: 1_level_1
99,0.275715
80,0.217894
70,0.17468
41,0.127815
30,0.065733
10,0.059038
40,0.041388
42,0.021302
43,0.010956
20,0.005478





In [6]:
# 30, 40, 41, 42, and 43 are all 40 (UNDER_CSAS_REVIEW)
# NOTE(review): this description used to list only 30, 42 and 43 and kept
# SCREENED_IN (41) as its own status, but the code merges 41 as well and the
# recorded statistics (7 categories) reflect that. The description now matches
# the code that was run — confirm that merging SCREENED_IN is intended.
"""

New Status List:

    DRAFT
    UNDER_CLIENT_APPROVAL
    AWAITING_CHANGES
    UNDER_CSAS_REVIEW
    REFER_TO_PROCESS
    FULFILLED
    WITHDRAWN

"""

# Work on a copy so the baseline distribution `vc` stays untouched.
vc3 = vc.copy()
# Fold each review-stage status into 40, in the same order as before so the
# floating-point sum is unchanged.
for merged_status in (30, 41, 42, 43):
    vc3[40] += vc3.pop(merged_status)
print_stats(vc3)


entropy:	 2.23
n_eff:		 4.68
redundancy:	 1.5



Unnamed: 0_level_0,proportion
status,Unnamed: 1_level_1
99,0.275715
80,0.217894
70,0.17468
10,0.059038
40,0.267194
20,0.003652
25,0.001826





In [7]:
# Both merges combined: 25 -> 20 (UNDER_CLIENT_APPROVAL) and
# 30, 41, 42, 43 -> 40 (UNDER_CSAS_REVIEW).
# NOTE(review): the status list below used to keep SCREENED_IN (41) as its own
# status, but the code merges 41 into 40 and the recorded statistics
# (6 categories) reflect that. The list now matches the code that was run —
# confirm that merging SCREENED_IN is intended.
"""

New Status List:

    DRAFT
    UNDER_CLIENT_APPROVAL
    UNDER_CSAS_REVIEW
    REFER_TO_PROCESS
    FULFILLED
    WITHDRAWN

"""

# Work on a copy so the baseline distribution `vc` stays untouched.
vc4 = vc.copy()
vc4[20] += vc4.pop(25)
# Fold each review-stage status into 40, in the same order as before so the
# floating-point sum is unchanged.
for merged_status in (30, 41, 42, 43):
    vc4[40] += vc4.pop(merged_status)
print_stats(vc4)


entropy:	 2.22
n_eff:		 4.67
redundancy:	 1.29



Unnamed: 0_level_0,proportion
status,Unnamed: 1_level_1
99,0.275715
80,0.217894
70,0.17468
10,0.059038
40,0.267194
20,0.005478



