Update search files (#215)

* add results to `search_file()` and `search_files()`
* update dict schema from {path:count} -> {path:{keyword:count}}
* remove duplicate line printing
* update type hint
* add class for keyword search results
* use dataclass instead of TypedDict
MrPowers · Feb 27, 2024 · ac07479 · ac07479
1 parent a661769
commit ac07479
Show file tree

Hide file tree

Showing 2 changed files with 41 additions and 14 deletions.
diff --git a/quinn/keyword_finder.py b/quinn/keyword_finder.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import os
+from dataclasses import dataclass
 from glob import iglob
 
 default_keywords = [
@@ -40,42 +41,59 @@
     "sparkContext",
 ]
 
+@dataclass
+class SearchResult:
+    """Class to hold the results of a file search.
+    file_path: The path to the file that was searched.
+    word_count: A dictionary mapping each keyword to the number of lines in the file that contain it.
+    """
+
+    file_path: str
+    word_count: dict[str, int]
+
 
-def search_file(path: str, keywords: list[str] = default_keywords) -> None:
+def search_file(path: str, keywords: list[str] = default_keywords) -> SearchResult:
     """Searches a file for keywords and prints the line number and line containing the keyword.
 
     :param path: The path to the file to search.
     :type path: str
     :param keywords: The list of keywords to search for.
     :type keywords: list[str]
-    :returns: None
-    :rtype: None
+    :returns: A `SearchResult` holding the file path and, for each keyword in `keywords`, the number of lines containing it.
+    :rtype: SearchResult
 
     """
+    match_results = SearchResult(file_path=path, word_count={keyword: 0 for keyword in keywords})
+
     print(f"\nSearching: {path}")
     with open(path) as f:
         for line_number, line in enumerate(f, 1):
+            line_printed = False
             for keyword in keywords:
                 if keyword in line:
-                    print(f"{line_number}: {keyword_format(line)}", end="")
-                    break
+                    match_results.word_count[keyword] += 1
+
+                    if not line_printed:
+                        print(f"{line_number}: {keyword_format(line)}", end="")
+                        line_printed = True
+
+    return match_results
 
 
-def search_files(path: str, keywords: list[str] = default_keywords) -> None:
+def search_files(path: str, keywords: list[str] = default_keywords) -> list[SearchResult]:
     """Searches all files in a directory for keywords.
 
     :param path: The path to the directory to search.
     :type path: str
     :param keywords: The list of keywords to search for.
     :type keywords: list[str]
-    :returns: None
-    :rtype: None
+    :returns: A list of `SearchResult` objects, one per file found, each holding the file path and the number of lines containing each keyword in `keywords`.
+    :rtype: list[SearchResult]
 
     """
     rootdir_glob = f"{path}/**/*"
     file_list = [f for f in iglob(rootdir_glob, recursive=True) if os.path.isfile(f)]
-    for f in file_list:
-        search_file(f, keywords)
+    return [search_file(f, keywords) for f in file_list]
 
 
 def keyword_format(input: str, keywords: list[str] = default_keywords) -> str:

diff --git a/tests/test_keyword_finder.py b/tests/test_keyword_finder.py
@@ -2,11 +2,22 @@
 
 
 def test_search_file():
-    search_file("tests/test_files/some_pyspark.py")
+    file_path = "tests/test_files/some_pyspark.py"
+    results = search_file(file_path)
+
+    assert results.word_count["rdd"] == 5
+    assert results.word_count["sparkContext"] == 2
 
 
 def test_search_files():
-    search_files("tests/test_files")
+    results = search_files("tests/test_files")
+
+    pyspark_file = [result for result in results if result.file_path == "tests/test_files/some_pyspark.py"][0]
+    csv_file = [result for result in results if result.file_path == "tests/test_files/good_schema1.csv"][0]
+
+    assert pyspark_file.word_count["rdd"] == 5
+    assert pyspark_file.word_count["sparkContext"] == 2
+    assert csv_file.word_count["rdd"] == 0
 
 
 def test_keyword_format():
@@ -21,5 +32,3 @@ def test_surround_substring():
     assert "spark **rdd|| stuff" == surround_substring("spark rdd stuff", "rdd", "**", "||")
     assert "spark **rdd|| stuff with **rdd||" == surround_substring("spark rdd stuff with rdd", "rdd", "**", "||")
     assert "spark **rdd||dd stuff" == surround_substring("spark rdddd stuff", "rdd", "**", "||")
-
-