# Association analysis for cluster's sematincs

## Transaction encoding of the dataset

In [None]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.compose import make_column_selector, ColumnTransformer

from mlxtend.frequent_patterns import fpgrowth, association_rules

In [None]:
with open('./data/kmeans_pca.jak', 'rb') as outp:  # Overwrites any existing file.
    kmean_feature_norm_pca = pickle.load(outp)
    
with open('./data/agg_ward_kpca.jak', 'rb') as outp:  # Overwrites any existing file.
    agg_ward_kpca = pickle.load(outp)

with open('./data/dbscan_kpca.jak', 'rb') as outp:  # Overwrites any existing file.
    dbscan_kpca = pickle.load(outp)

In [None]:
association_rules_dataset = pd.read_csv('./data/association_dataset.csv')
association_rules_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 742 entries, 0 to 741
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   borough                      742 non-null    object 
 1   location_category            742 non-null    object 
 2   development_type             742 non-null    object 
 3   is_only_water_sewer_charges  742 non-null    bool   
 4   service_start_date           742 non-null    object 
 5   is_federal                   742 non-null    bool   
 6   is_meter_amr                 742 non-null    bool   
 7   #_days                       742 non-null    int64  
 8   consumption_(hcf)            742 non-null    int64  
 9   current_charges              742 non-null    float64
 10  service_end_date             742 non-null    object 
 11  estimated_score              742 non-null    float64
 12  bill_analyzed_score          742 non-null    float64
dtypes: bool(3), float64(

Apply quantile cut, which is equal frequency binning.

In [None]:
def multi_qcutter(df: pd.DataFrame, m): 
    return pd.concat([pd.qcut(df[s], q=m, duplicates='drop') for s in df], axis='columns')

K3_qcutter = FunctionTransformer(
    lambda df, m: 
        OneHotEncoder(sparse_output=False) \
            .set_output(transform='pandas') \
            .fit_transform(multi_qcutter(df, m))
).set_output(transform='pandas').set_params(kw_args={'m':3})

K5_qcutter = FunctionTransformer(
    lambda df, m: 
        OneHotEncoder(sparse_output=False) \
            .set_output(transform='pandas') \
            .fit_transform(multi_qcutter(df, m))
).set_output(transform='pandas').set_params(kw_args={'m':5})

Apply classical pandas binnin, equal width, using a rule outler-sentive like Friedman Diaconis rule to determine the edges of the bins.

In [None]:
def multi_cutter(df: pd.DataFrame):
    cuts = []

    for s in df:
        bins = np.histogram_bin_edges(df[s], bins='fd')
        
        cuts.append(pd.cut(df[s], bins=bins, include_lowest=True))
        
    return pd.concat(cuts, axis='columns')

K_bin_encoder = FunctionTransformer(
    lambda df: 
        OneHotEncoder(sparse_output=False) \
            .set_output(transform='pandas') \
            .fit_transform(multi_cutter(df))
).set_output(transform='pandas')

Define a function to add the opposite of each boolean column.

In [None]:
def create_opposite(df: pd.DataFrame, new_name):
    df.loc[:, new_name] = ~df

    return df

is_not_federal_creator = FunctionTransformer(func=create_opposite, kw_args={'new_name': 'is_not_federal'})
is_not_meter_amr_creator = FunctionTransformer(func=create_opposite, kw_args={'new_name': 'is_not_meter_amr'})
is_not_only_water_sewer_charges_creator = FunctionTransformer(func=create_opposite, kw_args={'new_name': 'is_not_only_water_sewer_charges'})

Transform the dataset.

In [None]:
discretizer = ColumnTransformer([
    ('ohe', OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=object)),
    ('3bin', K3_qcutter, ['#_days',]),
    ('5bin', K5_qcutter, ['consumption_(hcf)', 'current_charges']),
    ('ew', K5_qcutter, ['estimated_score', 'bill_analyzed_score']),
    ('is_not_federal', is_not_federal_creator, ['is_federal']),
    ('is_not_meter_amr', is_not_meter_amr_creator, ['is_meter_amr']),
    ('is_not_only_basic_sewer', is_not_only_water_sewer_charges_creator, ['is_only_water_sewer_charges']),
], remainder='passthrough').set_output(transform='pandas')

te_dataset = discretizer.fit_transform(association_rules_dataset).rename(columns=lambda x: x.split('__')[-1])

## 3-Means with PCA clustering

In [None]:
clusters = pd.Series(kmean_feature_norm_pca['K3'])
clusters.value_counts()

0    295
2    228
1    219
Name: count, dtype: int64

In [None]:
cluster_0_pca_kmeans = fpgrowth(te_dataset[clusters == 0].astype(bool), use_colnames=True, min_support=0.5)
cluster_1_pca_kmeans = fpgrowth(te_dataset[clusters == 1].astype(bool), use_colnames=True, min_support=0.4)
cluster_2_pca_kmeans = fpgrowth(te_dataset[clusters == 2].astype(bool), use_colnames=True, min_support=0.4)

### C0

In [None]:
ar_c0 = association_rules(cluster_0_pca_kmeans, num_itemsets=te_dataset.shape[0], min_threshold=0.7)[
            ['antecedents', 'consequents', 'confidence', 'support', 'lift']
        ]

ar_c0.sort_values('antecedents', ascending=True).head(10)

Unnamed: 0,antecedents,consequents,confidence,support,lift
0,(is_only_water_sewer_charges),(is_federal),0.888031,0.779661,0.99608
87,(is_meter_amr),"(#_days_(5.999, 980.0], location_category_UNSP...",0.734127,0.627119,0.993429
118,"(is_meter_amr, location_category_UNSPECIFIED_T...","(#_days_(5.999, 980.0], is_federal)",0.784689,0.555932,1.006449
119,"(is_federal, location_category_UNSPECIFIED_TYPE)","(#_days_(5.999, 980.0], is_meter_amr)",0.766355,0.555932,1.051511
93,(is_meter_amr),"(is_only_water_sewer_charges, location_categor...",0.734127,0.627119,0.988893
121,"(is_only_water_sewer_charges, is_meter_amr, lo...",(is_federal),0.854054,0.535593,0.957969
58,"(#_days_(5.999, 980.0])","(is_only_water_sewer_charges, location_categor...",0.771654,0.664407,1.039442
122,"(is_only_water_sewer_charges, is_federal, loca...",(is_meter_amr),0.831579,0.535593,0.973475
98,(is_meter_amr),"(is_federal, location_category_UNSPECIFIED_TYPE)",0.714286,0.610169,0.984646
52,(is_meter_amr),(location_category_UNSPECIFIED_TYPE),0.829365,0.708475,0.998623


### C1

In [None]:
ar_c1 = association_rules(cluster_1_pca_kmeans, num_itemsets=te_dataset.shape[0], min_threshold=0.6)[
            ['antecedents', 'consequents', 'confidence', 'support', 'lift']
        ]

ar_c1.sort_values('confidence', ascending=False).head(15)

Unnamed: 0,antecedents,consequents,confidence,support,lift
93,(development_type_FHA),(is_federal),1.0,0.415525,1.111675
86,(borough_QUEENS),(is_federal),1.0,0.479452,1.111675
95,"(development_type_FHA, borough_QUEENS)",(is_federal),1.0,0.415525,1.111675
89,"(#_days_(980.0, 2077.0], borough_QUEENS)",(is_federal),1.0,0.429224,1.111675
97,(development_type_FHA),"(is_federal, borough_QUEENS)",1.0,0.415525,2.085714
94,"(is_federal, development_type_FHA)",(borough_QUEENS),1.0,0.415525,2.085714
91,(development_type_FHA),(borough_QUEENS),1.0,0.415525,2.085714
80,"(is_federal, estimated_score_(-0.001, 0.0197])",(is_meter_amr),0.98913,0.415525,1.353872
71,"(estimated_score_(-0.001, 0.0197])",(is_meter_amr),0.980952,0.47032,1.342679
76,"(estimated_score_(-0.001, 0.0197], #_days_(980...",(is_meter_amr),0.978261,0.410959,1.338995


### C2

In [None]:
ar_c2 = association_rules(cluster_2_pca_kmeans, num_itemsets=te_dataset.shape[0], min_threshold=0.7)[
            ['antecedents', 'consequents', 'confidence', 'support', 'lift']
        ]

ar_c2.sort_values('confidence', ascending=False).sort_values('antecedents')

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


Unnamed: 0,antecedents,consequents,confidence,support,lift
3,(is_federal),"(#_days_(2077.0, 4057.0])",1.000000,0.903509,1.000000
186,(is_only_water_sewer_charges),"(#_days_(2077.0, 4057.0], is_federal)",0.856115,0.521930,0.947545
286,(borough_QUEENS),"(#_days_(2077.0, 4057.0], is_federal)",1.000000,0.640351,1.106796
431,"(is_meter_amr, borough_QUEENS)","(#_days_(2077.0, 4057.0], is_federal, estimate...",0.849624,0.495614,1.274436
381,"(is_meter_amr, borough_QUEENS)","(is_federal, estimated_score_(-0.001, 0.0197])",0.849624,0.495614,1.274436
...,...,...,...,...,...
1153,"(#_days_(2077.0, 4057.0], is_meter_amr, servic...","(is_federal, estimated_score_(-0.001, 0.0197])",0.933962,0.434211,1.400943
264,"(is_only_water_sewer_charges, is_meter_amr, #_...",(service_end_date_10/22/2019),0.969388,0.416667,1.221107
1209,"(#_days_(2077.0, 4057.0], development_type_FHA...","(is_meter_amr, service_end_date_10/22/2019, bo...",0.980198,0.434211,1.878026
1271,"(service_end_date_10/22/2019, development_type...",(is_federal),1.000000,0.434211,1.106796


### Comments

Basically every cluster contains very similar associations rules. The only serious difference is the number of days the meter has been registering water consumptions. It's very difficult to understand a semantics for these groups. We will try to understand the better clustering we have peformed.

Basically information that we can derive by any of the clusters: 
- The AMR meter are most of the times positioned on generic locations (the ones we don't have any specific info on);
- When the estimated score is almost 0, we will likely find an AMR meter

Specific clusters like the **second** one adds on these informations, some rules on the most frequent borough, that is QUEENS. Here we discover that  in this borough meter are likely located on FHA buildings (it was predictable).

## Agglomerative ward linkage with Kernel PCA

In [None]:
clusters_ward = pd.Series(agg_ward_kpca['K4'])
clusters_ward.value_counts()

3    581
1     61
0     51
2     49
Name: count, dtype: int64

In [None]:
cluster_0_kpca_agg = fpgrowth(te_dataset[clusters_ward == 0].astype(bool), use_colnames=True, min_support=0.3, max_len=3)
cluster_1_kpca_agg = fpgrowth(te_dataset[clusters_ward == 1].astype(bool), use_colnames=True, min_support=0.3, max_len=3)
cluster_2_kpca_agg = fpgrowth(te_dataset[clusters_ward == 2].astype(bool), use_colnames=True, min_support=0.3, max_len=3)
cluster_3_kpca_agg = fpgrowth(te_dataset[clusters_ward == 3].astype(bool), use_colnames=True, min_support=0.2, max_len=4)

### C0

In [None]:
ar_c0_kpca = association_rules(cluster_0_kpca_agg, num_itemsets=te_dataset.shape[0], min_threshold=0.7)[
            ['antecedents', 'consequents', 'confidence', 'support', 'lift']
        ]

ar_c0_kpca.sort_values('confidence', ascending=False).sort_values('antecedents')

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


Unnamed: 0,antecedents,consequents,confidence,support,lift
814,"(service_end_date_10/22/2019, current_charges_...","(estimated_score_(-0.001, 0.0197])",1.000000,0.333333,1.243902
663,(is_not_only_water_sewer_charges),"(estimated_score_(-0.001, 0.0197])",0.869565,0.392157,1.081654
693,(is_not_only_water_sewer_charges),"(#_days_(2077.0, 4057.0], estimated_score_(-0....",0.869565,0.392157,1.081654
12,(is_meter_amr),"(#_days_(2077.0, 4057.0], service_end_date_10/...",0.897959,0.862745,1.040816
2,(is_meter_amr),(service_end_date_10/22/2019),0.897959,0.862745,1.040816
...,...,...,...,...,...
78,(service_start_date_12/13/2012),(is_federal),0.975000,0.764706,1.035937
128,"(#_days_(2077.0, 4057.0], service_start_date_1...",(is_federal),0.975000,0.764706,1.035937
134,"(service_start_date_12/13/2012, is_meter_amr)",(is_federal),0.975000,0.764706,1.035937
326,"(is_federal, development_type_FHA)",(borough_QUEENS),1.000000,0.647059,1.545455


### C1

In [None]:
ar_c1_kpca = association_rules(cluster_1_kpca_agg,num_itemsets=te_dataset.shape[0], min_threshold=0.7)[
            ['antecedents', 'consequents', 'confidence', 'support', 'lift']
        ]

ar_c1_kpca.sort_values('confidence', ascending=False).sort_values('antecedents')

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


Unnamed: 0,antecedents,consequents,confidence,support,lift
1003,(location_category_STREET),"(#_days_(2077.0, 4057.0], service_start_date_1...",1.000000,0.327869,1.000000
544,"(consumption_(hcf)_(242.8, 1061.6])","(service_start_date_11/14/2012, borough_QUEENS)",0.928571,0.426230,1.317276
21,"(#_days_(2077.0, 4057.0])","(service_start_date_11/14/2012, service_end_da...",0.901639,0.901639,1.000000
667,"(current_charges_(3040.262, 9610.33])","(consumption_(hcf)_(242.8, 1061.6], is_federal)",0.923077,0.393443,2.010989
291,"(#_days_(2077.0, 4057.0])","(service_start_date_11/14/2012, is_federal)",0.901639,0.901639,1.000000
...,...,...,...,...,...
594,"(current_charges_(3040.262, 9610.33])",(is_meter_amr),1.000000,0.426230,1.016667
593,"(current_charges_(3040.262, 9610.33])",(is_federal),1.000000,0.426230,1.109091
302,"(is_meter_amr, is_federal)","(#_days_(2077.0, 4057.0])",1.000000,0.885246,1.000000
341,"(#_days_(2077.0, 4057.0], borough_QUEENS)",(is_federal),1.000000,0.704918,1.109091


### C2

In [None]:
ar_c2_kpca = association_rules(cluster_2_kpca_agg, num_itemsets=te_dataset.shape[0], min_threshold=0.7)[
            ['antecedents', 'consequents', 'confidence', 'support', 'lift']
        ]

ar_c2_kpca.sort_values('confidence', ascending=False).sort_values('antecedents')

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


Unnamed: 0,antecedents,consequents,confidence,support,lift
964,"(bill_analyzed_score_(0.953, 0.969])","(#_days_(2077.0, 4057.0], is_meter_amr)",1.000000,0.306122,1.020833
694,(is_not_only_water_sewer_charges),"(is_federal, estimated_score_(-0.001, 0.0197])",0.826087,0.387755,1.124396
26,"(#_days_(2077.0, 4057.0], service_start_date_0...",(service_end_date_10/22/2019),0.895833,0.877551,0.997633
674,(is_not_only_water_sewer_charges),"(service_end_date_10/22/2019, development_type...",0.826087,0.387755,1.264946
682,(is_not_only_water_sewer_charges),"(service_end_date_10/22/2019, estimated_score_...",0.826087,0.387755,1.037904
...,...,...,...,...,...
429,"(service_start_date_01/14/2013, development_ty...","(#_days_(2077.0, 4057.0])",1.000000,0.734694,1.000000
428,"(#_days_(2077.0, 4057.0], development_type_FHA)",(service_start_date_01/14/2013),1.000000,0.734694,1.020833
423,"(development_type_FHA, estimated_score_(-0.001...",(is_federal),1.000000,0.612245,1.088889
411,"(is_meter_amr, development_type_FHA)",(is_federal),1.000000,0.714286,1.088889


### C3

In [None]:
ar_c3_kpca = association_rules(cluster_3_kpca_agg, num_itemsets=te_dataset.shape[0], min_threshold=0.52) [
            ['antecedents', 'consequents', 'confidence', 'support', 'lift']
        ]

ar_c3_kpca.sort_values('confidence', ascending=False).sort_values('antecedents')

Unnamed: 0,antecedents,consequents,confidence,support,lift
678,"(is_only_water_sewer_charges, development_type...","(is_federal, borough_QUEENS)",1.000000,0.222031,2.371429
346,"(bill_analyzed_score_(0.988, 1.0], location_ca...","(is_meter_amr, development_type_UNSPECIFIED_CA...",0.582524,0.206540,1.221829
106,"(development_type_UNSPECIFIED_CATEGORY, locati...","(is_only_water_sewer_charges, is_federal)",0.707547,0.387263,0.942855
34,(is_federal),"(is_meter_amr, location_category_UNSPECIFIED_T...",0.575290,0.512909,0.941530
313,"(bill_analyzed_score_(0.988, 1.0])","(development_type_UNSPECIFIED_CATEGORY, locati...",0.584559,0.273666,1.068015
...,...,...,...,...,...
431,"(is_federal, estimated_score_(0.75, 1.0])",(is_only_water_sewer_charges),0.867647,0.203098,1.024599
123,"(is_only_water_sewer_charges, is_federal, deve...",(is_meter_amr),0.835498,0.332186,1.050702
64,"(development_type_UNSPECIFIED_CATEGORY, locati...",(is_meter_amr),0.858491,0.469880,1.079617
478,"(is_only_water_sewer_charges, is_meter_amr, #_...",(location_category_UNSPECIFIED_TYPE),0.863874,0.283993,1.200744


### Comments

In this case, the three most "linear shaped" cluster, contains informations about meter located in QUEENS. Every cluster follows a specific period of time, but the same interval of days in which the meter has been registering water consumptions.

The more dense cluster otherwise, contains rules related to brooklyn AND queens. Brooklyn has a stronger association with AMR meters (82%) than Queens (70%), but Queens represents a larger share of the cluster. This might suggest differences in AMR rollout strategies or infrastructure across boroughs.

The really strange thing is the following: the consumptions and charges detected in the denser cluster looks higher than the ones detected in the linear shaped ones that are related to meters that register consumptions for a longer period of time.

## Conclusion

- For each cluster we have identified how for certain consumption range is associated with a certain charges one.
- AMR Meters Are Predominant, but Distribution Varies by Borough
- Groups in the data look like they are formed considering a specific time period or number of day in which the meter Is active
- While in the linear shaped cluster are characterized by a constant #_days interval, the dense cluster is characterized by groups of different and smaller #_days interval but meter in this cluster appear to register high levels of water consumption



## DBSCAN

In [None]:
clusters_dbscan = pd.Series(dbscan_kpca['eps0.9'])
clusters_dbscan.value_counts()

 0    578
 3     61
 2     49
 1     48
-1      6
Name: count, dtype: int64

In [None]:
cluster_0_dbscan = fpgrowth(te_dataset[clusters_dbscan == 0].astype(bool), use_colnames=True, min_support=0.3, max_len=3)
cluster_1_dbscan = fpgrowth(te_dataset[clusters_dbscan == 1].astype(bool), use_colnames=True, min_support=0.3, max_len=3)
cluster_2_dbscan = fpgrowth(te_dataset[clusters_dbscan == 2].astype(bool), use_colnames=True, min_support=0.3, max_len=3)
cluster_3_dbscan = fpgrowth(te_dataset[clusters_dbscan == 3].astype(bool), use_colnames=True, min_support=0.2, max_len=3)

In [None]:
ar_c0_dbscan = association_rules(cluster_0_dbscan, num_itemsets=te_dataset.shape[0], min_threshold=0.7)[
            ['antecedents', 'consequents', 'confidence', 'support', 'lift']
        ]

ar_c0_dbscan.sort_values('confidence', ascending=False).sort_values('antecedents')

Unnamed: 0,antecedents,consequents,confidence,support,lift
111,(borough_QUEENS),(is_only_water_sewer_charges),0.811475,0.342561,0.959167
97,"(#_days_(5.999, 980.0])","(is_only_water_sewer_charges, location_categor...",0.771654,0.339100,1.215302
49,(development_type_UNSPECIFIED_CATEGORY),"(is_only_water_sewer_charges, is_federal)",0.706790,0.396194,0.941301
4,(is_only_water_sewer_charges),(is_meter_amr),0.791411,0.669550,0.996592
46,(development_type_UNSPECIFIED_CATEGORY),"(is_federal, location_category_UNSPECIFIED_TYPE)",0.793210,0.444637,1.291480
...,...,...,...,...,...
62,"(is_only_water_sewer_charges, estimated_score_...",(is_federal),0.878327,0.399654,0.983862
17,"(is_federal, location_category_UNSPECIFIED_TYPE)",(is_only_water_sewer_charges),0.881690,0.541522,1.042161
53,"(estimated_score_(-0.001, 0.0197])",(is_federal),0.883495,0.472318,0.989652
67,"(estimated_score_(-0.001, 0.0197])","(is_meter_amr, location_category_UNSPECIFIED_T...",0.705502,0.377163,1.158466


In [72]:
ar_c1_dbscan = association_rules(cluster_1_dbscan,num_itemsets=te_dataset.shape[0], min_threshold=0.7)[
            ['antecedents', 'consequents', 'confidence', 'support', 'lift']
        ]

ar_c1_dbscan.sort_values('confidence', ascending=False).sort_values('antecedents')

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


Unnamed: 0,antecedents,consequents,confidence,support,lift
862,"(service_start_date_12/13/2012, current_charge...",(is_federal),1.000000,0.354167,1.043478
639,(location_category_STREET),"(service_end_date_10/22/2019, development_type...",0.894737,0.354167,1.480944
449,"(#_days_(2077.0, 4057.0], consumption_(hcf)_(1...",(service_end_date_10/22/2019),0.892857,0.520833,0.974026
636,(location_category_STREET),"(service_start_date_12/13/2012, development_ty...",0.894737,0.354167,1.533835
71,(is_meter_amr),(service_start_date_12/13/2012),0.829787,0.812500,1.021277
...,...,...,...,...,...
330,"(service_start_date_12/13/2012, development_ty...","(estimated_score_(-0.001, 0.0197])",0.964286,0.562500,1.128920
335,"(development_type_FHA, estimated_score_(-0.001...",(service_end_date_10/22/2019),0.964286,0.562500,1.051948
41,"(#_days_(2077.0, 4057.0], estimated_score_(-0....",(service_end_date_10/22/2019),0.975610,0.833333,1.064302
321,"(development_type_FHA, estimated_score_(-0.001...",(is_meter_amr),1.000000,0.583333,1.021277


In [73]:
ar_c2_dbscan = association_rules(cluster_2_dbscan,num_itemsets=te_dataset.shape[0], min_threshold=0.7)[
            ['antecedents', 'consequents', 'confidence', 'support', 'lift']
        ]

ar_c2_dbscan.sort_values('confidence', ascending=False).sort_values('antecedents')

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


Unnamed: 0,antecedents,consequents,confidence,support,lift
964,"(bill_analyzed_score_(0.953, 0.969])","(#_days_(2077.0, 4057.0], is_meter_amr)",1.000000,0.306122,1.020833
694,(is_not_only_water_sewer_charges),"(is_federal, estimated_score_(-0.001, 0.0197])",0.826087,0.387755,1.124396
26,"(#_days_(2077.0, 4057.0], service_start_date_0...",(service_end_date_10/22/2019),0.895833,0.877551,0.997633
674,(is_not_only_water_sewer_charges),"(service_end_date_10/22/2019, development_type...",0.826087,0.387755,1.264946
682,(is_not_only_water_sewer_charges),"(service_end_date_10/22/2019, estimated_score_...",0.826087,0.387755,1.037904
...,...,...,...,...,...
429,"(service_start_date_01/14/2013, development_ty...","(#_days_(2077.0, 4057.0])",1.000000,0.734694,1.000000
428,"(#_days_(2077.0, 4057.0], development_type_FHA)",(service_start_date_01/14/2013),1.000000,0.734694,1.020833
423,"(development_type_FHA, estimated_score_(-0.001...",(is_federal),1.000000,0.612245,1.088889
411,"(is_meter_amr, development_type_FHA)",(is_federal),1.000000,0.714286,1.088889


In [74]:
ar_c3_dbscan = association_rules(cluster_3_dbscan,num_itemsets=te_dataset.shape[0], min_threshold=0.7)[
            ['antecedents', 'consequents', 'confidence', 'support', 'lift']
        ]

ar_c3_dbscan.sort_values('confidence', ascending=False).sort_values('antecedents')

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


Unnamed: 0,antecedents,consequents,confidence,support,lift
65,"(estimated_score_(-0.001, 0.0197])",(is_federal),0.905660,0.786885,1.004460
48,(service_end_date_10/22/2019),"(#_days_(2077.0, 4057.0], is_federal)",0.909091,0.819672,1.008264
715,"(consumption_(hcf)_(242.8, 1061.6], estimated_...",(development_type_FHA),0.913043,0.344262,1.326087
149,(is_only_water_sewer_charges),"(is_meter_amr, service_end_date_10/22/2019)",0.916667,0.540984,1.035494
42,(service_end_date_10/22/2019),"(service_start_date_11/14/2012, is_federal)",0.909091,0.819672,1.008264
...,...,...,...,...,...
766,"(borough_QUEENS, current_charges_(3040.262, 96...",(development_type_FHA),1.000000,0.409836,1.452381
769,"(borough_QUEENS, current_charges_(3040.262, 96...",(service_start_date_11/14/2012),1.000000,0.409836,1.000000
772,"(borough_QUEENS, current_charges_(3040.262, 96...","(#_days_(2077.0, 4057.0])",1.000000,0.409836,1.000000
750,"(service_start_date_11/14/2012, current_charge...",(is_federal),1.000000,0.426230,1.109091
