Skip to content

Commit

Permalink
Implemented Multiple-Word-Search by splitting the original searchterm…
Browse files Browse the repository at this point in the history
… and perform search for each loose word.
  • Loading branch information
ronaldmansveld committed Mar 19, 2012
1 parent 66840a6 commit b34715b
Showing 1 changed file with 79 additions and 73 deletions.
152 changes: 79 additions & 73 deletions classes/search.php
Expand Up @@ -15,55 +15,55 @@
namespace Search;

class Search {

/** @var array|null The lower-cased words of the search term; filled by the constructor */
protected $term = null;

/** @var array|null The entries (arrays or objects) to search through; set by in() */
protected $data = null;

/** @var array Names of the entry fields to look into; extended by by() */
protected $fields = array();

/** @var int Required relevance as a percentage; relevance() keeps it between 1 and 100 */
protected $relevance = 75;

/** @var int|null Maximum number of results, or null when no limit was set */
protected $limit = null;

/** @var int|null Number of results to skip, or null when no offset was set */
protected $offset = null;

/**
 * Word index built by get_words(): word => list of array('key' => entry key).
 * Declared exactly once; declaring the same property twice is a fatal error.
 *
 * @var array
 */
private $_words = array();

/**
 * Creates a new search instance for the given search term.
 *
 * Late static binding is used, so a subclass calling find() gets an
 * instance of itself.
 *
 * @param string $search_term The term to search for
 * @return static
 */
public static function find($search_term = '') {
    $search = new static($search_term);

    return $search;
}

/**
 * Stores the search term as a list of lower-cased words.
 *
 * @param string $search_term One or more words, separated by single spaces
 * @throws \InvalidArgumentException When the search term is not a string
 */
public function __construct($search_term = '') {
    // The leading backslash matters: inside `namespace Search` a bare
    // InvalidArgumentException is looked up as Search\InvalidArgumentException
    // rather than PHP's built-in exception class.
    if (!is_string($search_term)) {
        throw new \InvalidArgumentException('The search term must be a string');
    }

    // Every word of the term is searched for separately, so the term is kept
    // as an array of words instead of one string.
    $this->term = explode(' ', strtolower($search_term));
}


/**
* The data you want to search in. Accepts an array of arrays and an array
* of objects.
*
* @param mixed $data The data to search through. An array of arrays, or an array of objects
* @return $this
*/
public function in($data) {
    // Fully qualified on purpose: inside `namespace Search` a bare
    // InvalidArgumentException would be looked up as Search\InvalidArgumentException
    // instead of PHP's built-in exception class.
    if (!is_array($data)) {
        throw new \InvalidArgumentException('Search only accepts an array of arrays, or an array of objects');
    }

    // Make sure every entry is something we can search through, before we
    // waste resources on trying
    foreach ($data as $value) {
        if (!is_object($value) && !is_array($value)) {
            throw new \InvalidArgumentException('Search only accepts an array of arrays, or an array of objects');
        }
    }

    // The data is stored exactly as given. Normalising it up front with
    // \Format::forge($data)->to_array() is deliberately avoided, because with
    // big datasets that could turn out to be pretty memory intensive.
    $this->data = $data;

    return $this;
}

/**
* The fields of the data entries you want to look into for the search term
*
Expand All @@ -72,35 +72,35 @@ public function in($data) {
*/
public function by($fields = null) {
    // Accept either one array of field names, or the field names as separate arguments
    if (!is_array($fields)) {
        $fields = func_get_args();
    }

    // array_unique() returns the de-duplicated array and leaves its argument
    // untouched, so the result has to be assigned. array_values() re-indexes
    // the list, because array_unique() keeps the original keys (leaving gaps).
    $this->fields = array_values(array_unique(array_merge($this->fields, $fields)));

    return $this;
}

/**
* Limit the amount of the results you'll get back
* @param int $limit the limit of results you want to get back
* @return $this
*/
public function limit($limit) {
    // Store the limit as an integer, whatever type it was passed in as
    $this->limit = intval($limit);

    return $this;
}

/**
* offset the results you'll get back
* @param int $offset the offset from which you want results back
* @return $this
*/
public function offset($offset) {
    // Store the offset as an integer, whatever type it was passed in as
    $this->offset = intval($offset);

    return $this;
}

/**
* Determine how well the results should relate to the search term, as a percentage.
* With a relevance of 50% using an 8-letter search term, results are included that
Expand All @@ -111,16 +111,16 @@ public function offset($offset) {
*/
public function relevance($relevance) {
    // Clamp the percentage into the 1..100 range: anything above 100 becomes
    // 100, anything below 1 becomes 1
    $this->relevance = max(1, min(100, (int) $relevance));

    return $this;
}


/**
* Executes the search that's been built. It retrieves the words out of the
* specified fields from the data. It then generates a score for each one of them,
Expand All @@ -132,46 +132,49 @@ public function execute() {
if (empty($this->data)) { // if there is no data to work with
return array(); // searching in nothing leads to nothing
}

$search_term = $this->term;
$search_term_length = strlen($search_term);

$relevance = $this->relevance;

$cost_limit = (int) ceil( ($search_term_length * (100-$relevance) ) / 100);
$search_term_minlen = $search_term_length - $cost_limit; // anything that's the cost limit longer that search term, will score too low for sure
$search_term_maxlen = $search_term_length + $cost_limit; // anything that's the cost limit longer that search term, will score too high for sure

$this->get_words($search_term_minlen, $search_term_maxlen);

$min_len = $cost_limit;
$max_len = 0;

foreach ($this->term as $term) {
$min_len = min($min_len, strlen($term) - $cost_limit); // anything that's the cost limit longer that search term, will score too low for sure
$max_len = max($max_len, strlen($term) + $cost_limit); // anything that's the cost limit longer that search term, will score too high for sure
}

$this->get_words($min_len, $max_len);
$entry_scores = $this->get_entry_scores($cost_limit);

$results = array();

uasort($entry_scores, function($a, $b){ // sort the entries by score
return ($a < $b) ? -1 : 1;
});

$i = 0;
$j = 1;
\Debug::dump($this->offset);

This comment has been minimized.

Copy link
@JaapRood

JaapRood Mar 20, 2012

Owner

Debug still here, woops!

foreach ($entry_scores as $entry_key => $score) {
$i++;
if (is_int($this->offset) && $this->offset >= $i) continue;


$results[$entry_key] = $this->data[$entry_key];


if (is_int($this->limit) && $j >= $this->limit) {
break;
} else {
$j++;
}
}

return $results;
}


/**
* Gets all the words from the specified fields in the data. They will be set
* with a reference to the data entry they belong to, so they can be linked back
Expand All @@ -182,46 +185,44 @@ public function execute() {
*/
protected function get_words($min_length, $max_length) {
    // Always start from an empty index, also when returning early below, so
    // words collected during an earlier run can never leak into this one
    $this->_words = array();

    // Nothing to collect without data or fields. The empty array is returned
    // for backward compatibility; the collected words live in $this->_words.
    if (empty($this->data) || empty($this->fields)) return array();

    foreach ($this->data as $entry_key => $entry) {
        $entry_is_array = is_array($entry);

        // if this is neither an array nor an object, we won't be able to work with it
        if (!$entry_is_array && !is_object($entry)) continue;

        $words_in_entry = array(); // words already handled for this entry

        foreach ($this->fields as $field) {
            if ($entry_is_array) {
                // if the field is not set in this entry, there is not much to find!
                if (!array_key_exists($field, $entry)) continue;

                $field_contents = $entry[$field];
            } else {
                // because of the check above, anything that is not an array must be an object
                if (!isset($entry->$field)) continue;

                $field_contents = $entry->$field;
            }

            // only values that can be represented as text can contain words;
            // nested arrays, null and plain objects are skipped instead of
            // being passed to explode()
            $is_stringable_object = is_object($field_contents) && method_exists($field_contents, '__toString');
            if (!is_scalar($field_contents) && !$is_stringable_object) continue;

            foreach (explode(' ', (string) $field_contents) as $word) {
                $word = strtolower($word);

                // if we already found this word in this entry, we don't need it again
                if (isset($words_in_entry[$word])) continue;

                $words_in_entry[$word] = true; // mark this word as done for this entry

                // words outside the length window cannot score within the cost
                // limit, so they are not indexed at all
                $word_length = strlen($word);

                if ($word_length >= $min_length && $word_length <= $max_length) {
                    // appending creates the list for a new word automatically
                    $this->_words[$word][] = array('key' => $entry_key);
                }
            }
        }
    }
}

/**
* Determine the scores per entry. With multiple matching words, the best score goes
*
Expand All @@ -231,8 +232,13 @@ protected function get_words($min_length, $max_length) {
protected function get_entry_scores($cost_limit = 0) {
    $results = array(); // entry_key => best (lowest) score found for that entry

    foreach ($this->_words as $word => $entries) {
        // numeric words end up as integer array keys; compare them as strings again
        $word = (string) $word;

        // Run the score for each word in the search term, lowest score wins.
        // Starting one above the cost limit means "no match" until one of the
        // search words scores better.
        $score = $cost_limit + 1;
        foreach ((array) $this->term as $term) {
            $score = min(static::get_word_score($word, $term, $cost_limit + 1), $score);
        }

        // this word does not score within our cost limit for any search word
        if ($score > $cost_limit) continue;

        foreach ($entries as $entry) {
            $entry_key = $entry['key'];

            // keep the score when the entry has none yet, or when this one is better (lower)
            if (!isset($results[$entry_key]) || $results[$entry_key] > $score) {
                $results[$entry_key] = $score;
            }
        }
    }

    return $results;
}

/**
* Calculate the distance between $word and $search_term based on the Damerau-Levenshtein algorithm.
*
* Credits for this algorithm implementation go out to Ronald Mansveld, who worked hard to make it efficient
* while creating useful results
* @author Ronald Mansveld
*
*
* @param string $word The word to check
* @param string $search_term The searchterm to check word against
* @param int $cost_limit The maximum cost we are looking for (so we can break early on words with higher costs)
Expand All @@ -262,13 +268,13 @@ protected function get_entry_scores($cost_limit = 0) {
*/
protected static function get_word_score($word, $search_term, $cost_limit) {
if ($word == $search_term) return 0;

$len1 = strlen($word);
$len2 = strlen($search_term);

if ($len1 == 0) return $len2;
if ($len2 == 0) return $len1;

//strip common prefix
$i = 0;
do {
Expand All @@ -277,12 +283,12 @@ protected static function get_word_score($word, $search_term, $cost_limit) {
$len1--;
$len2--;
} while ($len1 > 0 && $len2 > 0);

if ($i > 0) {
$word = substr($word, $i);
$search_term = substr($search_term, $i);
}

//strip common suffix
$i = 0;
do {
Expand All @@ -295,10 +301,10 @@ protected static function get_word_score($word, $search_term, $cost_limit) {
$word = substr($word, 0, $len1);
$search_term = substr($search_term, 0, $len2);
}

if ($len1 == 0) return $len2;
if ($len2 == 0) return $len1;

$matrix = array();
for ($i = 0; $i <= $len1; $i++) {
$matrix[$i] = array();
Expand All @@ -307,7 +313,7 @@ protected static function get_word_score($word, $search_term, $cost_limit) {
for ($i = 0; $i <= $len2; $i++) {
$matrix[0][$i] = $i;
}

for ($i = 1; $i <= $len1; $i++) {
$best = $cost_limit;
for ($j = 1; $j <= $len2; $j++) {
Expand Down

3 comments on commit b34715b

@JaapRood
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sweet. This is a good quick improvement on the current results. I'm curious about how this performs, but it should still do pretty well. However, we do need a bit more context sensitivity in our results.

With this approach, when searching for "snow fun" an entry containing both "snow" and "fun" won't score any better than ones that only contain one of the words. Somehow, we will have to add the scores for words together to make entries containing both words come out on top.

PS: we need to sort out our line endings :P

@ronaldmansveld
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Locally I have a new version that incorporates more context sensitivity by adding scores per word.

Is there any news on datasets for testing? I'd like to test this new version before pushing ;)

@JaapRood
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Push it to "origin develop" so a development branch gets created, I can test it on my local testdata (and so could anyone else!)

Please sign in to comment.