In [25]:
import sys
import weka.core.jvm as jvm
import weka.core.packages as packages
from weka.core.classes import complete_classname
from weka.core.converters import Loader
from weka.classifiers import Classifier
import weka.plot.graph as graph  # NB: pygraphviz and PIL are required
from weka.core.classes import Random, from_commandline
import weka.core.serialization as serialization
from weka.filters import Filter
import re
from tqdm import tqdm
import time
import pandas as pd

In [29]:
# Machine-specific locations. Everything below is derived from two roots so
# that moving the project only requires editing these two lines.
_user_home = "C:/Users/mzenk"
_assignment_dir = _user_home + "/Google Drive/ITESM/Maestría/Semestre 3/ML2/Assignment4"

# Folder handed to jvm.start(packages=...) when the JVM is launched.
wekafiles_path = _user_home + "/wekafiles"
# Local zip archive used to install the PBC4cip package.
PBC4CIP_zip_path = _assignment_dir + "/PBC4cip-1.0-weka.zip"
# Directory (trailing slash included) and file name of the ARFF dataset.
data_dir = _assignment_dir + "/Exped_Visualizations/"
arff_file = "weka_dataset_clean.arff"

In [26]:
# Reference timestamp for the total run time reported at the end of the notebook.
start_time = time.time()

# Launch the JVM with a 12 GB heap, pointing it at the local wekafiles folder.
# Alternative: packages=True (lets the wrapper look for packages in the Weka
# installation); smaller heaps such as "512m" or "4g" are also valid values.
jvm.start(max_heap_size='12g', packages=wekafiles_path)

INFO:weka.core.jvm:JVM already running, call jvm.stop() first


In [27]:
pkg = "PBC4cip"
# Show which fully qualified class the short name resolves to.
print(complete_classname(f".{pkg}"))

if packages.is_installed(pkg):
    print(f"{pkg} is already installed.")
else:
    # First run on this machine: install from the local zip, then stop the
    # JVM and exit so the next run starts with the package available.
    print(f"Installing {pkg}...")
    packages.install_package(PBC4CIP_zip_path)
    print(f"Installed {pkg}, please re-run script!")
    jvm.stop()
    sys.exit(0)

weka.classifiers.trees.PBC4cip
PBC4cip is already installed.


In [30]:
print("\n\n\n\n\n")
print(">>> Start...")

# Experiment labels; the selected one is used as the stem of every output
# file written by the cells further down (.model, .csv, .txt).
filename_array = [
    "NumberOfTrees",
    "RandomFeatures",
    "DepthOfTrees",
    "MinimumObjectByLeaf",
    "RandomSubSpace",
    "InfoGainAttributeEval",
    "PrincipalComponents",
    "GainRatioAttributeEval",
    "WrapperSubsetEval",
    "CorrelationAttributeEval",
]
filename_index = 9
filename = str(filename_array[filename_index])

# Read the ARFF dataset and use its last attribute as the class.
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(data_dir + arff_file)
data.class_is_last()

cmdline = []







&gt;&gt;&gt; Start...


# Preprocess
Here the unused one-hot attributes are removed.
The nominal attribute from which the class being evaluated is derived is removed as well.

(&#39;d&#39;, &#39;e&#39;)


In [41]:
# Base keep-mask over 0-based attribute positions (True = keep the attribute).
# Positions 0-5 and 31-40 are always kept; every one-hot column starts as
# dropped and is switched on one at a time in the loop below.
# NOTE(review): assumes the dataset has exactly 41 attributes - confirm
# against data.num_attributes.
pos_vector = [False] * 41
pos_vector[0:6] = [True] * 6
pos_vector[31:41] = [True] * 10

# Nominal attributes, in dataset order, mapped to the positions of their
# one-hot columns. The position of a key in this dict is used as the position
# of the nominal attribute itself.
# NOTE(review): position 31 is listed under 'Plantilla' but is also inside the
# always-kept 31-40 range, and position 6 appears in neither - verify.
nom_pos = {
    'Carácter del procedimiento': [16, 17],
    'Forma del procedimiento': [7, 8, 9],
    'Entidad federativa': [10, 11, 12, 13, 14, 15],
    'Tipo de contratación': [21, 22, 23, 24, 25],
    'Artículo': [18, 19, 20],
    'Plantilla': [26, 27, 28, 29, 30, 31],
}

# One list of filtered datasets per nominal attribute (one per one-hot column).
filtered_data = {key: [] for key in nom_pos}


def _mask_to_weka_range(mask):
    """Turn a boolean keep-mask into a Weka attribute range string.

    Weka ranges are 1-based and comma separated, so a mask that is True at
    0-based positions 0 and 2 becomes "1,3".
    """
    return ",".join(str(pos + 1) for pos, keep in enumerate(mask) if keep)


for nominal_att_index, (key, onehot_positions) in enumerate(nom_pos.items()):
    for onehot_att in onehot_positions:
        # Start from a fresh copy so flags set for one column never leak
        # into the filter built for the next one.
        keep_mask = list(pos_vector)
        # Drop the nominal attribute the one-hot column was derived from...
        keep_mask[nominal_att_index] = False
        # ...and keep only the one-hot column being evaluated.
        keep_mask[onehot_att] = True

        # "-R <range>" selects attributes; "-V" inverts the selection so the
        # listed attributes are the ones that are kept.
        remove = Filter(
            classname="weka.filters.unsupervised.attribute.Remove",
            options=["-R", _mask_to_weka_range(keep_mask), "-V"],
        )
        remove.inputformat(data)
        # TODO: the class attribute of each filtered dataset is not reassigned
        # here; set it to the retained one-hot column before training on it.
        filtered_data[key].append(remove.filter(data))
            

[True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

# Criteria

In [None]:
# Hyper-parameters interpolated into the PBC4cip command line below.
trees = 150
maxDepth = 10
objectsByLeaf = 35

# Full Weka command line for the classifier. The escaping is nested three
# levels deep (miner -> builder -> distribution evaluator), so this string is
# kept exactly as written.
input_config = f'weka.classifiers.trees.PBC4cip -S 1 -miner \"PRFramework.Core.SupervisedClassifiers.EmergingPatterns.Miners.RandomForestMinerWithoutFiltering -bagSizePercent 100 -numFeatures -1 -numTrees {trees} -builder \\\"PRFramework.Core.SupervisedClassifiers.DecisionTrees.Builder.DecisionTreeBuilder -distributionEvaluator \\\\\\\"PRFramework.Core.SupervisedClassifiers.DecisionTrees.DistributionEvaluators.QuinlanGain \\\\\\\" -maxDepth {maxDepth} \\\\\\\"-minimalObjByLeaf \\\\\\\" {objectsByLeaf} -minimalSplitGain 1.0E-30\\\"\"'

classifier = from_commandline(input_config, classname="weka.classifiers.Classifier")

# Stage 1: train on the full dataset.
print(">>> Building classifier...")
start_time_1 = time.time()
classifier.build_classifier(data)
elapsed = time.time() - start_time_1
print(f">>> [Done] Bulding classifier. Time: {elapsed} seconds ---")

# Stage 2: persist the trained model next to the notebook.
print(">>> Serializing model...")
start_time_1 = time.time()
classifier.serialize(filename + ".model")
elapsed = time.time() - start_time_1
print(f">>> [Done] Serializing model. Time: {elapsed} seconds ---")

# Stage 3: render the model as text and cut it after every "]"; the resulting
# list of chunks is what the following cells parse and write out.
print(">>> Generating big string...")
start_time_1 = time.time()
model_text = str(classifier)
big_string = model_text.split("]")
elapsed = time.time() - start_time_1
print(f">>> [Done] Generating big string. Time: {elapsed} seconds ---")

In [None]:
# Parallel lists, one entry per parsed chunk of the model text.
list_fields = []
list_supports = []
list_top100 = []
list_top200 = []
list_diff = []

for item in tqdm(big_string):
    # Each chunk was cut right after a "]", so restore it and separate the
    # text before the first "[" from the bracketed part.
    text = (item + "]").split("[")
    if len(text) < 2:
        # No bracketed part in this chunk: nothing to record.
        continue

    fields = text[0]
    # Drop the trailing character (the restored "]") and tokenize on whitespace.
    supports_split = text[1][:-1].split()
    # The first two tokens are read as numbers; their difference is stored too.
    top100 = float(supports_split[0])
    top200 = float(supports_split[1])

    list_fields.append(fields.strip())
    list_supports.append(supports_split)
    list_top100.append(top100)
    list_top200.append(top200)
    list_diff.append(top100 - top200)

In [None]:
# Assemble the parsed lists into a table (column order matters for the CSV)
# and write it next to the notebook without the index column.
df = pd.DataFrame(
    {
        'fields': list_fields,
        'supports': list_supports,
        'top100': list_top100,
        'top200': list_top200,
        'diff': list_diff,
    },
    columns=['fields', 'supports', 'top100', 'top200', 'diff'],
)
df.to_csv(filename + ".csv", index=False)

In [None]:
# Write a plain-text report: the command line given, the command line Weka
# reports back, then every chunk of the model text on its own line.
with tqdm(total=len(big_string), file=sys.stdout) as pbar, open(f'{filename}.txt', 'w') as f:
    f.write("input: " + input_config + "\n")
    f.write("output: " + classifier.to_commandline() + "\n")
    # Header followed by one empty line.
    f.write("model:\n" + "\n")
    for item in big_string:
        # Restore the "]" removed by the earlier split, then trim whitespace.
        chunk = (item + "]").strip()
        f.write(chunk + "\n")
        pbar.update(1)

# Total wall-clock time since the JVM start cell.
total_elapsed = time.time() - start_time
print(f"--- {total_elapsed} seconds ---")

jvm.stop()