## PBC4cip - Python Implementation

In [1]:
import os
import pickle
import re
import sys
import time

import pandas as pd
from tqdm import tqdm

import weka.core.jvm as jvm
import weka.core.packages as packages
from weka.core.classes import complete_classname
from weka.core.converters import Loader
from weka.classifiers import Classifier
import weka.plot.graph as graph  # NB: pygraphviz and PIL are required
from weka.core.classes import Random, from_commandline
import weka.core.serialization as serialization
from weka.filters import Filter

In [2]:
# User specified - Dany
# NOTE(review): these are absolute, machine-specific paths; every other user has
# to edit them (or load them from the pickle file) before running the notebook.

# Weka home directory handed to the JVM as the package location.
wekafiles_path = "/Users/dannygc/wekafiles"

# Local zip of the PBC4cip Weka package, installed on first run.
PBC4CIP_zip_path = (
    "/Users/dannygc/Google Drive File Stream/My Drive/"
    "MCCNotes/MCC 3/Adv_ML/PBC_HW4/PBC4cip-1.0-weka.zip"
)

# Folder (with trailing slash) and file name of the ARFF dataset; the two are
# joined by plain string concatenation when the data is loaded.
data_dir = (
    "/Volumes/GoogleDrive/My Drive/MCCNotes/Jlab projects/"
    "GITHUB_repositories/DRAE_repositories/Exped_Visualizations/"
)
arff_file = "weka_dataset_clean_.arff"

In [3]:
# User specified - Michael
# Alternative (Windows) settings for a second user. They are left commented out so
# the values assigned in the previous cell stay in effect; uncomment to override them.
#wekafiles_path = "C:/Users/mzenk/wekafiles"
#PBC4CIP_zip_path = "C:/Users/mzenk/Google Drive/ITESM/Maestría/Semestre 3/ML2/Assignment4/PBC4cip-1.0-weka.zip"
#data_dir = "C:/Users/mzenk/Google Drive/ITESM/Maestría/Semestre 3/ML2/Assignment4/Exped_Visualizations/"
#arff_file = "weka_dataset_clean_.arff"

In [4]:
# Criteria for Patterns

# Selects which textual layout the pattern-parsing cell expects from PBC4cip
# (may be overwritten when the settings are loaded from the pickle file).
updated_PBC4CIP = False

# Exclusive lower bounds a pattern has to exceed to be kept:
#   support_c1 - support_c0, num_c1 - num_c0, and num_c1 / (num_c0 + num_c1).
min_support_diff, min_count_diff, min_ratio = 0.3, 30, 0.6

# Number of top patterns (by support difference) retained after filtering.
max_patterns = 15

In [5]:
# Parameters interpolated into the PBC4cip command line:
#   trees         -> -numTrees
#   maxDepth      -> -maxDepth
#   objectsByLeaf -> value that follows -minimalObjByLeaf
trees, maxDepth, objectsByLeaf = 150, 5, 35

In [6]:

# Nominal attribute to analyse and which of its one-hot columns (position in the
# nom_pos list) is used as the class; together they also name the output files.
attname = 'Forma del procedimiento'
attnum = 0
filename = f"{attname}{attnum}"

# Nominal attributes in dataset order, each mapped to the indexes of its one-hot
# columns. The key order matters: a key's position is used as its attribute index.
# Only edit this when an attribute is moved or removed in the dataset.
# NOTE(review): 18 and 19 are listed under both 'Caracter del procedimiento' and
# 'Articulo' - confirm against the ARFF header.
nom_pos = {
    'Caracter del procedimiento': [18, 19],
    'Forma del procedimiento': [7, 8, 9],
    'Operador': [],
    'Correo electronico': [],
    'Entidad federativa': [10, 11, 12, 13, 14, 15],
    'Tipo de contratación': [21, 22, 23, 24, 25],
    'Articulo': [18, 19, 20],
    'Plantilla': [26, 27, 28, 29, 30, 31],
}

# Class attribute index assigned to the filtered dataset: number of nominal
# attributes minus one.
class_index = len(nom_pos) - 1

In [7]:
# File used to store / restore the user-specific settings, and the switches that
# enable the saving and loading cells respectively.
pickle_file = 'dany.pickle'
load_pickle = save_pickle = True

In [8]:
# Store the user-specific settings as one pickled list. The element order is
# significant: the loading cell unpacks the list positionally.
if save_pickle:
    with open(pickle_file, 'wb') as pickle_out:
        pickle.dump(
            [wekafiles_path, PBC4CIP_zip_path, data_dir, arff_file, updated_PBC4CIP],
            pickle_out,
        )

In [9]:
# Restore the user-specific settings, overwriting whatever the configuration cells
# assigned. The values come back in the order in which they were dumped.
# SECURITY NOTE: pickle.load can execute arbitrary code - only load a pickle file
# that you created yourself.
if load_pickle:
    with open(pickle_file, 'rb') as pickle_in:
        (
            wekafiles_path,
            PBC4CIP_zip_path,
            data_dir,
            arff_file,
            updated_PBC4CIP,
        ) = pickle.load(pickle_in)

In [10]:
# Remember when the run started, then bring up the JVM with a 12 GB heap and the
# user's Weka home directory as the package location.
# (Per the original author's note, packages=True would instead search for packages
# in the Weka installation; smaller heaps such as "512m" or "4g" are also possible.)
start_time = time.time()
jvm.start(max_heap_size="12g", packages=wekafiles_path)

DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=[&#39;/Users/dannygc/anaconda3/lib/python3.7/site-packages/javabridge/jars/rhino-1.7R4.jar&#39;, &#39;/Users/dannygc/anaconda3/lib/python3.7/site-packages/javabridge/jars/runnablequeue.jar&#39;, &#39;/Users/dannygc/anaconda3/lib/python3.7/site-packages/javabridge/jars/cpython.jar&#39;, &#39;/Users/dannygc/anaconda3/lib/python3.7/site-packages/weka/lib/python-weka-wrapper.jar&#39;, &#39;/Users/dannygc/anaconda3/lib/python3.7/site-packages/weka/lib/weka.jar&#39;]
DEBUG:weka.core.jvm:MaxHeapSize=12g
DEBUG:weka.core.jvm:Using alternative Weka home directory: /Users/dannygc/wekafiles
DEBUG:weka.core.jvm:Using alternative Weka home directory: /Users/dannygc/wekafiles


In [11]:
pkg = "PBC4cip"
# Show the fully qualified class name that Weka resolves for the package.
print(complete_classname("." + pkg))

if packages.is_installed(pkg):
    print(pkg + " is already installed.")
else:
    # First run on this machine: install the package from the local zip, then stop
    # the JVM and exit so that the notebook is re-run with the package available.
    print("Installing %s..." % pkg)
    packages.install_package(PBC4CIP_zip_path)
    print("Installed %s, please re-run script!" % pkg)
    jvm.stop()
    sys.exit(0)

weka.classifiers.trees.PBC4cip
PBC4cip is already installed.


In [12]:
# Visually separate the run from the JVM start-up log.
print("\n\n\n\n\n")
print(">>> Start...")

# Read the full ARFF dataset; data_dir is expected to end with a path separator
# because the two parts are simply concatenated.
arff_path = data_dir + arff_file
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(arff_path)

cmdline = []  # not used by any cell shown in this notebook







>>> Start...


# Preprocess
Here the one-hot attributes that are not used are removed.
The nominal attribute from which the class being evaluated is derived is removed as well.

In [13]:
# Base keep-mask over the 43 attribute positions (0-based): True = attribute is
# kept in every filtered dataset. Positions 0-7 are addressed through the key order
# of nom_pos (the nominal attributes); positions 33-42 are always kept as well.
# The mask is never modified below - every dataset works on its own copy.
pos_vector = [False] * 43
pos_vector[0:8] = [True] * 8
pos_vector[33:43] = [True] * 10

# Dict that will contain the filtered datasets: one list per nominal attribute,
# holding one dataset for each of its one-hot columns (same order as in nom_pos).
filtered_data = {
    'Caracter del procedimiento' : [],
    'Forma del procedimiento' : [],
    'Entidad federativa' : [],
    'Tipo de contratación' : [],
    'Articulo' : [],
    'Plantilla' : []
}

for nominal_att_index, (key, value) in enumerate(nom_pos.items()):
    for onehot_att in value:
        # Start from a fresh copy of the base mask. The previous approach edited
        # the shared mask and then "restored" it with hard-coded values, which
        # permanently cleared any always-kept position that a one-hot index
        # happened to point at and silently dropped it from later datasets.
        keep_mask = list(pos_vector)
        keep_mask[nominal_att_index] = False  # drop the nominal source attribute
        keep_mask[onehot_att - 1] = True      # keep only this one-hot column (1-based index)

        # Weka attribute ranges are 1-based; "-V" inverts the Remove filter so that
        # the listed attributes are the ones that are kept.
        kept_indices = [i + 1 for i, keep in enumerate(keep_mask) if keep]
        remove = Filter(
            classname="weka.filters.unsupervised.attribute.Remove",
            options=["-R", ",".join(map(str, kept_indices)), "-V"],
        )
        remove.inputformat(data)
        # setdefault keeps the loop working if one-hot indexes are later added to a
        # nominal attribute that has no entry in filtered_data yet.
        filtered_data.setdefault(key, []).append(remove.filter(data))
            

# Criteria

In [14]:
# Pick the filtered dataset for the attribute / one-hot column chosen in the
# configuration cell and tell Weka which attribute is the class.
# NOTE: this rebinds `data`, which the preprocessing cell reads as its input, so
# re-running that cell after this one would filter the already-filtered dataset.
data = filtered_data[attname][attnum]
data.class_index = class_index

# PBC4cip command line with trees, maxDepth and objectsByLeaf interpolated.
# The -miner, -builder and -distributionEvaluator values are nested option strings,
# which is why each nesting level carries a deeper layer of backslash-escaped quotes.
input_config = f'weka.classifiers.trees.PBC4cip -S 1 -miner \"PRFramework.Core.SupervisedClassifiers.EmergingPatterns.Miners.RandomForestMinerWithoutFiltering -bagSizePercent 100 -numFeatures -1 -numTrees {trees} -builder \\\"PRFramework.Core.SupervisedClassifiers.DecisionTrees.Builder.DecisionTreeBuilder -distributionEvaluator \\\\\\\"PRFramework.Core.SupervisedClassifiers.DecisionTrees.DistributionEvaluators.QuinlanGain \\\\\\\" -maxDepth {maxDepth} \\\\\\\"-minimalObjByLeaf \\\\\\\" {objectsByLeaf} -minimalSplitGain 1.0E-30\\\"\"'

# Instantiate the classifier object described by the command line above.
classifier = from_commandline(input_config, classname="weka.classifiers.Classifier")

# Train on the selected dataset; start_time_1 is reused as the stopwatch of each step.
print(">>> Building classifier...")
start_time_1 = time.time()
classifier.build_classifier(data)
print(f">>> [Done] Bulding classifier. Time: {(time.time() - start_time_1)} seconds ---")

# Persist the trained model as "<filename>.model" (relative to the working directory).
print(">>> Serializing model...")
start_time_1 = time.time()
classifier.serialize(f"{filename}.model")
print(f">>> [Done] Serializing model. Time: {(time.time() - start_time_1)} seconds ---")

# Despite its name, big_string is a list: the classifier's text form cut at every
# "]" that is followed by a newline. The parsing cell further down walks these pieces.
print(">>> Generating big string...")
start_time_1 = time.time()
big_string = str(classifier).split("]\n")
print(f">>> [Done] Generating big string. Time: {(time.time() - start_time_1)} seconds ---")

>>> Building classifier...
>>> [Done] Bulding classifier. Time: 288.4211390018463 seconds ---
>>> Serializing model...
>>> [Done] Serializing model. Time: 1.1611230373382568 seconds ---
>>> Generating big string...
>>> [Done] Generating big string. Time: 1.1709182262420654 seconds ---


In [15]:
attributes = []  # not used by any cell shown in this notebook

# Count the instances per class: a class value equal to 0 counts towards c0,
# every other value towards c1.
c0_count = c1_count = 0
for row in data:
    if row.values[class_index] != 0:
        c1_count += 1
        continue
    c0_count += 1

for label, amount in (
    ('C0_count:', c0_count),
    ('C1_count:', c1_count),
    ('Total:', c0_count + c1_count),
):
    print(label, amount)

C0_count: 68902
C1_count: 27220
Total: 96122


In [15]:
# Parallel lists with one entry per parsed pattern.
list_fields = []
list_support_c0, list_support_c1 = [], []
list_num_c0, list_num_c1 = [], []
list_diff = []  # not filled by this cell

for chunk in tqdm(big_string):
    # Put back the "]" that split("]\n") consumed, then cut the text at every "[":
    # the first piece is the pattern description, the rest are bracketed number groups.
    pieces = (chunk + "]").split("[")
    fields = pieces[0]
    if len(pieces) <= 1:
        continue  # no bracketed group, so this chunk is not a pattern

    # [:-1] drops the last character of a group (presumably the closing bracket)
    # before the whitespace-separated numbers are read.
    if updated_PBC4CIP:
        # Updated layout: first group = per-class counts, second group = supports.
        class_nums = pieces[1][:-1].split()
        supports = pieces[2][:-1].split()
    else:
        # Original layout: a single group that holds the supports only.
        class_nums = None
        supports = pieces[1][:-1].split()

    list_fields.append(fields.strip())
    list_support_c0.append(float(supports[0]))
    list_support_c1.append(float(supports[1]))
    if updated_PBC4CIP:
        list_num_c0.append(float(class_nums[0]))
        list_num_c1.append(float(class_nums[1]))
    else:
        # No counts reported: derive them as support times class size.
        list_num_c0.append(float(supports[0])*c0_count)
        list_num_c1.append(float(supports[1])*c1_count)

100%|██████████| 2200/2200 [00:00<00:00, 245483.22it/s]


In [16]:
# Collect the parsed lists into one table: one row per pattern, one column per list.
pattern_columns = {
    'fields': list_fields,
    'support_c0': list_support_c0,
    'support_c1': list_support_c1,
    'num_c0': list_num_c0,
    'num_c1': list_num_c1,
}
df = pd.DataFrame(columns=list(pattern_columns))
for column_name, column_values in pattern_columns.items():
    df[column_name] = column_values
df.head()

Unnamed: 0,fields,support_c0,support_c1,num_c0,num_c1
0,Entidad federativa = 'centro' AND Tipo de cont...,0.0,0.0,0.0,0.0
1,Articulo = '43' AND Correo electronico = 'espa...,0.0,0.0,0.0,0.0
2,Correo electronico = 'diconsa' AND Operador = ...,0.0,0.0,0.0,0.0
3,Correo electronico = 'diconsa' AND Most_used_U...,0.0,0.0,0.0,0.0
4,Caracter del procedimiento != 'nacional' AND T...,0.0,0.0,0.0,0.0


In [17]:
# Score every pattern: how much more it supports / covers class 1 than class 0,
# and which share of the objects it covers belongs to class 1.
scored = df.assign(
    s_diff=df.support_c1 - df.support_c0,
    n_diff=df.num_c1 - df.num_c0,
    ratio=df.num_c1 / (df.num_c0 + df.num_c1),
)

# A pattern is kept only if it exceeds all three thresholds.
passes_criteria = (
    (scored.s_diff > min_support_diff)
    & (scored.n_diff > min_count_diff)
    & (scored.ratio > min_ratio)
)

# Best patterns first (largest support difference), limited to max_patterns rows.
df = scored[passes_criteria].sort_values(by=['s_diff'], ascending=False)[:max_patterns]

df.head()

Unnamed: 0,fields,support_c0,support_c1,num_c0,num_c1,s_diff,n_diff,ratio
2198,Most_used_description_word_adquisicion = '0' A...,0.17,0.9,11713.34,24498.0,0.73,12784.66,0.676528
2197,Caracter del procedimiento = 'nacional' AND Co...,0.16,0.88,11024.32,23953.6,0.72,12929.28,0.684821
2195,Tipo de contratacion = 'servicios' AND Caracte...,0.14,0.82,9646.28,22320.4,0.68,12674.12,0.69824
2189,Tipo de contratacion = 'servicios' AND Articul...,0.14,0.81,9646.28,22048.2,0.67,12401.92,0.695648
2192,Tipo de contratacion = 'servicios' AND Articul...,0.14,0.81,9646.28,22048.2,0.67,12401.92,0.695648


## Export to Patterns Obtained

In [None]:
# Write the selected patterns to Patterns_Obtained/<filename>.csv.
# to_csv does not create missing folders, so make sure the output directory exists
# first; otherwise the export fails on a fresh checkout after the long training run.
os.makedirs("Patterns_Obtained", exist_ok=True)
df.to_csv(f"Patterns_Obtained/{filename}.csv", index=False)

# Shut the JVM down only after the export has succeeded, so a failed export can
# be retried without restarting the kernel.
jvm.stop()