In [10]:
import os
import json
import re
from docx import Document

def get_feature_from_filename(filename, feature_patterns):
    """Return the first feature whose regex matches the file name.

    Args:
        filename: File name to classify; it is lower-cased before matching.
        feature_patterns: Mapping of feature name -> regex pattern. Patterns
            are tried in the mapping's iteration order.

    Returns:
        The name of the first matching feature, or "Unknown" when no pattern
        matches.
    """
    normalized = filename.lower()  # patterns are matched against lower case
    matching_features = (
        name
        for name, regex in feature_patterns.items()
        if re.search(regex, normalized)
    )
    # The generator is lazy, so matching stops at the first hit.
    return next(matching_features, "Unknown")

def get_file_type(file_path):
    """Classify a file by its extension and by keywords in its path.

    The checks run from the most specific signal to the least specific:

    1. Extension: ``.docx`` -> 'SRS Document', ``.sql`` -> 'Database Schema'.
       This runs first so that a keyword in a folder name (for example a
       schema folder called "Db Model") cannot relabel a schema or document.
    2. Keywords in the file name itself.
    3. Keywords anywhere in the path (for example a ``services/`` folder).

    All comparisons are case-insensitive, and both ``/`` and ``\\`` are
    treated as path separators.

    Args:
        file_path: Path (or bare name) of the file to classify.

    Returns:
        One of 'SRS Document', 'Database Schema', 'Controller', 'Model',
        'Service', 'Repository' or 'Unknown'.
    """
    path_lower = file_path.lower()

    if path_lower.endswith('.docx'):
        return 'SRS Document'
    if path_lower.endswith('.sql'):
        return 'Database Schema'

    # Order matters: the first keyword found wins.
    keyword_types = (
        ('controller', 'Controller'),
        ('model', 'Model'),
        ('service', 'Service'),
        ('repository', 'Repository'),
    )

    # Split on both separators so Windows paths also work on POSIX hosts.
    base_name = path_lower.replace('\\', '/').rsplit('/', 1)[-1]

    # The file name is a stronger hint than a folder name, so try it first.
    for candidate in (base_name, path_lower):
        for keyword, file_type in keyword_types:
            if keyword in candidate:
                return file_type

    return 'Unknown'

def read_file_content(file_path):
    """Return the text content of ``file_path``, or "" if it cannot be read.

    ``.docx`` files are opened with python-docx's ``Document`` and the text
    of each entry in ``doc.paragraphs`` is joined with newlines. The
    extension is matched case-insensitively, in line with ``get_file_type``,
    so ``Spec.DOCX`` is parsed as a Word document rather than as plain text.

    Every other file is read as text: first as UTF-8 (``utf-8-sig``, which
    also drops a leading BOM), then as UTF-16 if that raises
    ``UnicodeDecodeError``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's text. Any exception is reported with ``print`` and turned
        into an empty string, so a single unreadable file does not abort the
        caller.
    """
    try:
        if file_path.lower().endswith('.docx'):
            doc = Document(file_path)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs)

        try:
            with open(file_path, 'r', encoding='utf-8-sig') as file:
                return file.read()
        except UnicodeDecodeError:
            # Not valid UTF-8: retry as UTF-16.
            with open(file_path, 'r', encoding='utf-16') as file:
                return file.read()
    except Exception as e:
        # Deliberately best-effort: report the problem and carry on.
        print(f"Error reading file {file_path}: {e}")
        return ""

def categorize_files_by_feature(repo_path, srs_path, schema_path, output_file, feature_patterns):
    """Walk the three source trees and write one JSON record per matched file.

    Each file under ``repo_path``, ``srs_path`` and ``schema_path`` (walked in
    that order) is matched against ``feature_patterns`` by file name. Files
    with no matching feature, or with empty/unreadable content, are skipped.
    Every remaining file becomes one line of JSON in ``output_file`` with the
    keys ``feature_name``, ``file_type``, ``file_name`` and ``file_content``.

    Args:
        repo_path: Root directory of the source code.
        srs_path: Directory containing the SRS documents.
        schema_path: Directory containing the database schema files.
        output_file: Path of the JSON Lines file to (over)write.
        feature_patterns: Mapping of feature name -> regex applied to the
            file name.

    Returns:
        None. The dataset is written to ``output_file`` and a confirmation
        line is printed.
    """
    # If the output file sits inside a scanned tree, it must not be read back
    # while it is still being written.
    output_key = os.path.normcase(os.path.abspath(output_file))

    with open(output_file, 'w', encoding='utf-8') as f:
        for path in [repo_path, srs_path, schema_path]:
            for root, _, files in os.walk(path):
                for file in files:
                    # Check the feature first: files with no feature are
                    # dropped anyway, so there is no need to read them.
                    feature_name = get_feature_from_filename(file, feature_patterns)
                    if feature_name == "Unknown":
                        continue

                    file_path = os.path.join(root, file)
                    if os.path.normcase(os.path.abspath(file_path)) == output_key:
                        continue

                    file_content = read_file_content(file_path)
                    if not file_content:
                        continue

                    record = {
                        "feature_name": feature_name,
                        "file_type": get_file_type(file_path),
                        "file_name": file,
                        "file_content": file_content
                    }

                    f.write(json.dumps(record) + '\n')

    print(f"Data categorized and saved to {output_file}")

# Input locations and output target (example values; adjust to your machine).
repo_path = 'C:\\Project'  # Absolute path to the project root
srs_path = 'C:\\SRS Documents'  # Folder holding the SRS documents
schema_path = 'C:\\Db Model'  # Folder holding the database schema files
output_file = 'feature_dataset.jsonl'

# Feature name -> regex searched in the lower-cased file name.
# Entries are tried top to bottom, and the first match decides the feature.
feature_patterns = {
    "Order Processing": r"order",
    "Import Processing": r"import",
    "Events": r"event",
    "UID Management": r"unique[_\s]*identifiable[_\s]*marker|unique[_\s]*identifier"
}

# Build the JSONL dataset from the three locations.
categorize_files_by_feature(
    repo_path=repo_path,
    srs_path=srs_path,
    schema_path=schema_path,
    output_file=output_file,
    feature_patterns=feature_patterns,
)


Data categorized and saved to feature_dataset.jsonl


In [None]:
# python-docx provides the `docx` module imported by the cell above; on a
# fresh environment run this cell before that one. `%pip` installs into the
# running kernel's environment and does not depend on automagic being enabled.
%pip install python-docx