diff --git a/quinn/keyword_finder.py b/quinn/keyword_finder.py index 79bcd3f..926f9e5 100644 --- a/quinn/keyword_finder.py +++ b/quinn/keyword_finder.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +from dataclasses import dataclass from glob import iglob default_keywords = [ @@ -40,42 +41,59 @@ "sparkContext", ] +@dataclass +class SearchResult: + """Class to hold the results of a file search. + file_path: The path to the file that was searched. + word_count: A dictionary mapping each keyword to the number of lines in the file that contain it. + """ + + file_path: str + word_count: dict[str, int] + -def search_file(path: str, keywords: list[str] = default_keywords) -> None: +def search_file(path: str, keywords: list[str] = default_keywords) -> SearchResult: """Searches a file for keywords and prints the line number and line containing the keyword. :param path: The path to the file to search. :type path: str :param keywords: The list of keywords to search for. :type keywords: list[str] - :returns: None - :rtype: None + :returns: A SearchResult containing the file path and the number of lines containing each keyword in `keywords`. + :rtype: SearchResult """ + match_results = SearchResult(file_path=path, word_count={keyword: 0 for keyword in keywords}) + print(f"\nSearching: {path}") with open(path) as f: for line_number, line in enumerate(f, 1): + line_printed = False for keyword in keywords: if keyword in line: - print(f"{line_number}: {keyword_format(line)}", end="") - break + match_results.word_count[keyword] += 1 + + if not line_printed: + print(f"{line_number}: {keyword_format(line)}", end="") + line_printed = True + + return match_results -def search_files(path: str, keywords: list[str] = default_keywords) -> None: +def search_files(path: str, keywords: list[str] = default_keywords) -> list[SearchResult]: """Searches all files in a directory for keywords. :param path: The path to the directory to search. 
:type path: str :param keywords: The list of keywords to search for. :type keywords: list[str] - :returns: None - :rtype: None + :returns: A list of SearchResult objects containing file paths and the number of lines containing a keyword in `keywords`. + :rtype: list[SearchResult] """ rootdir_glob = f"{path}/**/*" file_list = [f for f in iglob(rootdir_glob, recursive=True) if os.path.isfile(f)] - for f in file_list: - search_file(f, keywords) + return [search_file(f, keywords) for f in file_list] def keyword_format(input: str, keywords: list[str] = default_keywords) -> str: diff --git a/tests/test_keyword_finder.py b/tests/test_keyword_finder.py index f433afa..9263217 100644 --- a/tests/test_keyword_finder.py +++ b/tests/test_keyword_finder.py @@ -2,11 +2,22 @@ def test_search_file(): - search_file("tests/test_files/some_pyspark.py") + file_path = "tests/test_files/some_pyspark.py" + results = search_file(file_path) + + assert results.word_count["rdd"] == 5 + assert results.word_count["sparkContext"] == 2 def test_search_files(): - search_files("tests/test_files") + results = search_files("tests/test_files") + + pyspark_file = [result for result in results if result.file_path == "tests/test_files/some_pyspark.py"][0] + csv_file = [result for result in results if result.file_path == "tests/test_files/good_schema1.csv"][0] + + assert pyspark_file.word_count["rdd"] == 5 + assert pyspark_file.word_count["sparkContext"] == 2 + assert csv_file.word_count["rdd"] == 0 def test_keyword_format(): @@ -21,5 +32,3 @@ def test_surround_substring(): assert "spark **rdd|| stuff" == surround_substring("spark rdd stuff", "rdd", "**", "||") assert "spark **rdd|| stuff with **rdd||" == surround_substring("spark rdd stuff with rdd", "rdd", "**", "||") assert "spark **rdd||dd stuff" == surround_substring("spark rdddd stuff", "rdd", "**", "||") - -