Implemented multiple-word search by splitting the original search term…

… and performing a search for each individual word.
JaapRood · Mar 19, 2012 · b34715b · JaapRood · Mar 20, 2012 · b34715b
1 parent 66840a6
commit b34715b
Showing 1 changed file with 79 additions and 73 deletions.
diff --git a/classes/search.php b/classes/search.php
@@ -15,55 +15,55 @@
 namespace Search;
 
 class Search {
-	
+
 	protected $term = null;
 	protected $data = null;
 	protected $fields = array();
 	protected $relevance = 75;
 	protected $limit = null;
 	protected $offset = null;
-	
-	private $_words = array(); 
-	
+
+	private $_words = array();
+
 	public static function find($search_term = '') {
 		return new static($search_term);
 	}
-	
+
 	public function __construct($search_term = '') {
 		if (!is_string($search_term)) throw new InvalidArgumentException('The search term must be a string');
-		
-		$this->term = strtolower($search_term);
-		
+
+		$this->term = explode(' ', strtolower($search_term));
+
 		return $this;
 	}
-	
-	
+
+
 	/**
 	 * The data you want to search in. Accepts an array of arrays or an array
 	 * of objects.
 	 *
 	 * @param	mixed	$data	The data to search through. An array of arrays, or an array of objects
-	 * @return 	$this	
+	 * @return 	$this
 	 */
 	public function in($data) {
 		if (!is_array($data)) {
 			throw new InvalidArgumentException('Search only accepts an array of arrays, or an array of objects');
 		}
-		
+
 		// let's make sure this is data we can search through, before we waste resources on trying
 		foreach ($data as $key => $value) {
 			if (!is_object($value) && !is_array($value)) {
 				throw new InvalidArgumentException('Search only accepts an array of arrays, or an array of objects');
 			}
 		}
-		
+
 		$this->data = $data;
 		// let's try to avoid the next step, because with big datasets it could turn out to be pretty memory intensive
 		//$this->_normalized_data = \Format::forge($data)->to_array();
-		
+
 		return $this;
 	}
-	
+
 	/**
 	 * The fields of the data entries you want to look into for the search term
 	 *
@@ -72,35 +72,35 @@ public function in($data) {
 	 */
 	public function by($fields = null) {
 		if (!is_array($fields)) $fields = func_get_args();
-		
+
 		$this->fields = array_merge($this->fields, $fields);
 		array_unique($this->fields);
-		
+
 		return $this;
 	}
-	
+
 	/**
 	 * Limit the number of results you'll get back
 	 * @param 	int		$limit	the limit of results you want to get back
 	 * @return 	$this
 	 */
 	public function limit($limit) {
 		$this->limit = (int) $limit;
-		
+
 		return $this;
 	}
-	
+
 	/**
 	 * Offset the results you'll get back
 	 * @param 	int		$offset	the offset from which you want results back
 	 * @return 	$this
 	 */
 	public function offset($offset) {
 		$this->offset = (int) $offset;
-		
+
 		return $this;
 	}
-	
+
 	/**
 	 * Determine how well the results should relate to the search term, as a percentage.
 	 * With a relevance of 50% and an 8-letter search term, results are included that
@@ -111,16 +111,16 @@ public function offset($offset) {
 	 */
 	public function relevance($relevance) {
 		$relevance = (int) $relevance;
-		
+
 		if ($relevance > 100) $relevance = 100;
 		if ($relevance < 1) $relevance = 1;
-		
+
 		$this->relevance = $relevance;
-		
+
 		return $this;
 	}
-	
-	
+
+
 	/**
 	 * Executes the search that's been built. It retrieves the words out of the
 	 * specified fields from the data. It then generates a score for each one of them,
@@ -132,46 +132,49 @@ public function execute() {
 		if (empty($this->data)) { // if there is no data to work with
 			return array(); // searching in nothing leads to nothing
 		}
-
-		$search_term = $this->term;
-		$search_term_length = strlen($search_term);
+
 		$relevance = $this->relevance;
-
 		$cost_limit = (int) ceil( ($search_term_length * (100-$relevance) ) / 100);
-		$search_term_minlen = $search_term_length - $cost_limit; // anything that's the cost limit longer that search term, will score too low for sure
-		$search_term_maxlen = $search_term_length + $cost_limit; // anything that's the cost limit longer that search term, will score too high for sure
-
-		$this->get_words($search_term_minlen, $search_term_maxlen);
+
+		$min_len = $cost_limit;
+		$max_len = 0;
+
+		foreach ($this->term as $term) {
+			$min_len = min($min_len, strlen($term) - $cost_limit); // anything that's the cost limit longer that search term, will score too low for sure
+			$max_len = max($max_len, strlen($term) + $cost_limit); // anything that's the cost limit longer that search term, will score too high for sure
+		}
+
+		$this->get_words($min_len, $max_len);
 		$entry_scores = $this->get_entry_scores($cost_limit);
-		
+
 		$results = array();
-		
+
 		uasort($entry_scores, function($a, $b){ // sort the entries by score
 			return ($a < $b) ? -1 : 1;
 		});
-		
+
 		$i = 0;
 		$j = 1;
 		\Debug::dump($this->offset);
 		foreach ($entry_scores as $entry_key => $score) {
 			$i++;
 			if (is_int($this->offset) && $this->offset >= $i) continue;
-			
-			
+
+
 			$results[$entry_key] = $this->data[$entry_key];
-			
-			
+
+
 			if (is_int($this->limit) && $j >= $this->limit) {
 				break;
 			} else {
 				$j++;
 			}
 		}
-		
+
 		return $results;
 	}
-	
-	
+
+
 	/**
 	 * Gets all the words from the specified fields in the data. They will be stored
 	 * with a reference to the entry they belong to so they can be linked back
@@ -182,46 +185,44 @@ public function execute() {
 	 */
 	protected function get_words($min_length, $max_length) {
 		if (empty($this->data) || empty($this->fields)) return array();
-		
+
 		$this->_words = array();
-		
+
 		foreach ($this->data as $entry_key => $entry) {
 			if (!is_object($entry) && !is_array($entry)) continue; // if this is not either an array or an object, we won't be able to work with it
 			$words_in_entry = array();
-			
+
 			foreach ($this->fields as $field) {
 				if (is_array($entry)) { // if this entry is an array
 					if (!array_key_exists($field, $entry)) continue; // if the field is not set in this entry, there is not much to find!
-					
+
 					$field_contents = $entry[$field];
 				} else { // because of the if earlier statement, if it's not an array, it must be an object
 					if (!isset($entry->$field)) continue; // if the field is not set in this entry, there is not much to find!
-					
+
 					$field_contents = $entry->$field;
 				}
-				
+
 				$field_words = explode(' ', $field_contents);
-				
+
 				foreach ($field_words as $word) {
 					$word = strtolower($word);
-					
+
 					if (isset($words_in_entry[$word])) continue; // if we already found this word in this entry, we don't need it again
-					
+
 					$word_length = strlen($word);
-					
+
 					if ($word_length >= $min_length && $word_length <= $max_length) {
 						if (!isset($words[$word])) $words[$word] = array(); // add the word if it doesnt exist yet
-						
+
 						$this->_words[$word][] = array('key' => $entry_key);
 					}
-
+					$words_in_entry[$word] = true; // mark this word as done for this entry
 				}
-
-				$words_in_entry[$word] = true; // mark this word as done for this entry
 			}
 		}
 	}
-	
+
 	/**
 	 * Determine the scores per entry. With multiple matching words, the best score goes
 	 *
@@ -231,8 +232,13 @@ protected function get_words($min_length, $max_length) {
 	protected function get_entry_scores($cost_limit = 0) {
 		$results = array(); // array of entry_key => score
 		foreach ($this->_words as $word => $entries) {
-			$score = static::get_word_score($word, $this->term, $cost_limit + 1);
-
+
+			//run score for each word in searchterm, lowest score wins.
+			$score = $cost_limit + 1; //init score
+			foreach ($terms as $term) {
+				$score = min(static::get_word_score($word, $term, $cost_limit + 1), $score);
+			}
+
 			if ($score <= $cost_limit) { // if this word scores within our cost limit
 				foreach ($entries as $entry) {
 					if (!isset($results[$entry['key']]) || (isset($results[$entry['key']]) && $results[$entry['key']] > $score)) { // if his entries score improved
@@ -241,17 +247,17 @@ protected function get_entry_scores($cost_limit = 0) {
 				}
 			}
 		}
-		
+
 		return $results;
 	}
-	
+
 	/**
     * Calculate the distance between $word and $search_term based on the Damerau-Levenshtein algorithm.
     *
     * Credits for this algorithm implementation go out to Ronald Mansveld, who worked hard to make it efficient
     * while creating useful results
     * @author 	Ronald Mansveld
-    * 
+    *
     * @param	string	$word			The word to check
     * @param	string	$search_term		The search term to check the word against
     * @param	int		$cost_limit		The maximum cost we are looking for (so we can break early on words with higher costs)
@@ -262,13 +268,13 @@ protected function get_entry_scores($cost_limit = 0) {
     */
 	protected static function get_word_score($word, $search_term, $cost_limit) {
 		if ($word == $search_term) return 0;
-      
+
 		$len1 = strlen($word);
 		$len2 = strlen($search_term);
-      
+
 		if ($len1 == 0) return $len2;
 		if ($len2 == 0) return $len1;
-      
+
 		//strip common prefix
 		$i = 0;
 		do {
@@ -277,12 +283,12 @@ protected static function get_word_score($word, $search_term, $cost_limit) {
 			$len1--;
 			$len2--;
 		} while ($len1 > 0 && $len2 > 0);
-		
+
 		if ($i > 0) {
 		   $word = substr($word, $i);
 		   $search_term = substr($search_term, $i);
 		}
-		 
+
 		//strip common suffix
 		$i = 0;
 		do {
@@ -295,10 +301,10 @@ protected static function get_word_score($word, $search_term, $cost_limit) {
 		   $word = substr($word, 0, $len1);
 		   $search_term = substr($search_term, 0, $len2);
 		}
-		 
+
 		if ($len1 == 0) return $len2;
 		if ($len2 == 0) return $len1;
-		 
+
 		$matrix = array();
 		for ($i = 0; $i <= $len1; $i++) {
 		   $matrix[$i] = array();
@@ -307,7 +313,7 @@ protected static function get_word_score($word, $search_term, $cost_limit) {
 		for ($i = 0; $i <= $len2; $i++) {
 		   $matrix[0][$i] = $i;
 		}
-		 
+
 		for ($i = 1; $i <= $len1; $i++) {
 			$best = $cost_limit;
 			for ($j = 1; $j <= $len2; $j++) {