Skip to content

Commit

Permalink
Added support for flagging mode. Implements a separate and conquer ev…
Browse files Browse the repository at this point in the history
…olution search. Note: the dataset has to be annotated in this way: positive examples, all example is a match, negative examples, all the example is an unmatch.
  • Loading branch information
ftarlao committed Jul 21, 2015
1 parent 19536ea commit bac59a1
Show file tree
Hide file tree
Showing 10 changed files with 720 additions and 50 deletions.
Expand Up @@ -38,6 +38,7 @@
import it.units.inginf.male.tree.NodeFactory;
import it.units.inginf.male.tree.RegexRange;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
Expand Down Expand Up @@ -80,16 +81,26 @@ public Configuration() {
"\\}","\\{","\\(","\\)","\\[","\\]","<",">",
"@","#"," "," ");
this.ranges = new LinkedList<>();
this.operators = Arrays.asList("it.units.inginf.male.tree.operator.Group",
this.operators = new ArrayList<>(Arrays.asList("it.units.inginf.male.tree.operator.Group",
"it.units.inginf.male.tree.operator.NonCapturingGroup",
"it.units.inginf.male.tree.operator.ListMatch",
"it.units.inginf.male.tree.operator.ListNotMatch",
"it.units.inginf.male.tree.operator.MatchOneOrMore",
"it.units.inginf.male.tree.operator.MatchZeroOrMore",
"it.units.inginf.male.tree.operator.MatchZeroOrOne",
"it.units.inginf.male.tree.operator.MatchMinMax");
"it.units.inginf.male.tree.operator.MatchMinMax"));
//Add context wise operators (lookaround)
this.operators.addAll(
Arrays.asList("it.units.inginf.male.tree.operator.PositiveLookbehind","it.units.inginf.male.tree.operator.NegativeLookbehind",
"it.units.inginf.male.tree.operator.PositiveLookahead", "it.units.inginf.male.tree.operator.NegativeLookahead"));


this.initNodeFactory(); //initNodeFactory also instantiate the NodeFactory object, this decouples the terminalset between threads
List<Leaf> terminalSet = this.nodeFactory.getTerminalSet();
//Add default ranges
terminalSet.add(new RegexRange("A-Z"));
terminalSet.add(new RegexRange("a-z"));
terminalSet.add(new RegexRange("A-Za-z"));

this.evaluator = new CachedTreeEvaluator();
this.evaluator.setup(Collections.EMPTY_MAP);
Expand All @@ -100,14 +111,14 @@ public Configuration() {
this.strategyParameters.put("runStrategy","it.units.inginf.male.strategy.impl.SeparateAndConquerStrategy");

this.strategyParameters.put("runStrategy2","it.units.inginf.male.strategy.impl.DiversityElitarismStrategy");
//this.strategyParameters.put("terminationCriteria","true"); //this is already enabled in dto buildConifg

this.strategyParameters.put("objective2","it.units.inginf.male.objective.CharmaskMatchLengthObjective");


this.strategyParameters.put("threads","2");
this.strategy = new CombinedMultithreadStrategy(); //MultithreadStrategy();

//TerminalSet and population builder setup is performed later
this.terminalSetBuilderParameters = new HashMap<>();
this.terminalSetBuilderParameters.put("tokenThreashold","80.0");
this.terminalSetBuilder = new TokenizedContextTerminalSetBuilder();
Expand Down Expand Up @@ -160,6 +171,7 @@ public Configuration(Configuration cc) {
this.constants = cc.constants;
this.ranges = cc.ranges;
this.operators = cc.operators;
this.isFlagging = cc.isIsFlagging();
this.initNodeFactory(); //initialized nodeFacory after introducing constants and operators
}

Expand Down Expand Up @@ -188,6 +200,7 @@ public Configuration(Configuration cc) {
private List<String> operators;
private transient BestSelector bestSelector;
private Map<String, String> bestSelectorParameters;
private boolean isFlagging = false;

public NodeFactory getNodeFactory() {
return nodeFactory;
Expand Down Expand Up @@ -353,12 +366,21 @@ public BestSelector getBestSelector() {

public void setBestSelector(BestSelector bestSelector) {
this.bestSelector = bestSelector;
this.bestSelector.setup(Collections.EMPTY_MAP);
}

public Map<String, String> getBestSelectorParameters() {
return bestSelectorParameters;
}

public boolean isIsFlagging() {
return isFlagging;
}

public void setIsFlagging(boolean isFlagging) {
this.isFlagging = isFlagging;
}

public void setBestSelectorParameters(Map<String, String> bestSelectorParameters) {
this.bestSelectorParameters = bestSelectorParameters;
}
Expand Down
@@ -0,0 +1,203 @@
/*
* Copyright (C) 2015 Machine Learning Lab - University of Trieste,
* Italy (http://machinelearning.inginf.units.it/)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package it.units.inginf.male.generations;

import it.units.inginf.male.configuration.Configuration;
import it.units.inginf.male.inputs.Context;
import it.units.inginf.male.inputs.DataSet;
import it.units.inginf.male.inputs.DataSet.Example;
import it.units.inginf.male.tree.Constant;
import it.units.inginf.male.tree.Node;
import it.units.inginf.male.tree.RegexRange;
import it.units.inginf.male.tree.operator.Concatenator;
import it.units.inginf.male.tree.operator.ListMatch;
import it.units.inginf.male.tree.operator.MatchOneOrMore;
import it.units.inginf.male.utils.Utils;
import java.util.*;

/**
* Create a population, 3 individuals for each positive example.
* @author andrea + fabiano
*/
public class FlaggingNaivePopulationBuilder implements InitialPopulationBuilder {

private List<Node> population = new LinkedList<>();
private final boolean useDottification;
private final boolean useWordClasses;

/**
* Initialises a population from examples by replacing charcters with "\\w"
* and digits with "\\d"
*/
public FlaggingNaivePopulationBuilder() {
this.useDottification = true;
this.useWordClasses = true;
}

/**
* When dottification is true and useWordClasses is true, instances are
* initialized using character classes "[A-Za-z]" "\\d" When dottification is
* true and useWordClasses is false, instances are initialized by replacing
* characters and digits with "."
*
* @param useDottification
* @param useWordClasses
*/
public FlaggingNaivePopulationBuilder(boolean useDottification, boolean useWordClasses) {
this.useDottification = useDottification;
this.useWordClasses = useWordClasses;
}

@Override
public List<Node> init() {
return new ArrayList<>(population);
}

@Override
public void setup(Configuration configuration) {
DataSet trainingDataset = configuration.getDatasetContainer().getTrainingDataset();
this.population.addAll(this.setup(configuration, trainingDataset));
}

private List<Node> setup(Configuration configuration, DataSet usedTrainingDataset) {
Set<String> phrases = new HashSet<>();
List<Node> newPopulation = new LinkedList<>();
DataSet dataSet = usedTrainingDataset;


for (Example example : dataSet.getExamples()) {
if(!example.match.isEmpty()){
//is positive example
phrases.add(example.getString());
}
}

int examples = Math.min(configuration.getEvolutionParameters().getPopulationSize() / 3, phrases.size());

List<String> uniquePhrases = new ArrayList<>(phrases);

int counter = 0;
for (String node : uniquePhrases) {
if (this.useDottification) {
newPopulation.add(createByExample(node, true, false));
newPopulation.add(createByExample(node, true, true));
}
newPopulation.add(createByExample(node, false, false));
counter++;
if (counter >= examples) {
break;
}
}
return newPopulation;
}

private Node createByExample(String example, boolean replace, boolean compact) {
Deque<Node> nodes = new LinkedList<>();
Deque<Node> tmp = new LinkedList<>();

//String w = this.useWordClasses ? "\\w" : ".";
String d = this.useWordClasses ? "\\d" : ".";
Node letters;
if(useWordClasses){
letters = new ListMatch();
letters.getChildrens().add(new RegexRange("A-Za-z"));
} else {
letters = new Constant(".");
}

for (char c : example.toCharArray()) {
if (replace) {
if (Character.isLetter(c)) {
nodes.add(letters.cloneTree());
} else if (Character.isDigit(c)) {
nodes.add(new Constant(d));
} else {
nodes.add(new Constant(Utils.escape(c)));
}
} else {
nodes.add(new Constant(Utils.escape(c)));
}
}

//when two adiacent nodes are equal symbols/tokens, a quantifier is used to compact.
// /w/w/w is converted to /w++
if(compact){
Deque<Node> newNodes = new LinkedList<>();
String nodeValue;
String nextValue;
//do compact

while (nodes.size()>0) {
Node node = nodes.pollFirst();
nodeValue = node.toString();
boolean isRepeat = false;
while (nodes.size()>0){
Node next = nodes.peek();
nextValue = next.toString();

if(nodeValue.equals(nextValue)){
isRepeat = true;
//Consume and drop the repetition
nodes.pollFirst();
} else {
//They are different, get out
break;
}
}
if(isRepeat){
Node finalNode = new MatchOneOrMore();
finalNode.getChildrens().add(node);
node = finalNode;
}
newNodes.add(node);
}
nodes = newNodes;
}

while (nodes.size() > 1) {

while (nodes.size() > 0) {
Node first = nodes.pollFirst();
Node second = nodes.pollFirst();

if (second != null) {
Node conc = new Concatenator();
conc.getChildrens().add(first);
conc.getChildrens().add(second);
first.setParent(conc);
second.setParent(conc);
tmp.addLast(conc);
} else {
tmp.addLast(first);
}
}

nodes = tmp;
tmp = new LinkedList<>();

}


return nodes.getFirst();
}

@Override
public List<Node> init(Context context) {
return setup(context.getConfiguration(), context.getCurrentDataSet());
}
}

0 comments on commit bac59a1

Please sign in to comment.