# 1.0 - Extract documentation from repositories

In [2]:
import pandas as pd
import numpy as np
import os
import sys


## Find all text and Python files in the repositories

In [22]:
import ast
import warnings

def extract_docstrings(file_path):
    """
    Extract all docstrings from a Python file.

    The file is parsed with ``ast``; it is never imported or executed.
    Entries are emitted in ``ast.walk`` order (breadth-first), so they do
    not necessarily follow source order.

    Args:
        file_path: Path to the Python source file; it is read as UTF-8.

    Returns:
        A list of strings. The module docstring (if any) comes first and is
        unlabelled. Every other entry is the cleaned docstring prefixed with
        ``Class <name>:``, ``Method <class>.<name>:`` (functions defined
        directly in a class body) or ``Function <name>:`` (everything else,
        including nested functions), and ends with a newline.
        An empty list is returned when the file has a syntax error (silently)
        or when any other exception occurs (after printing the error).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            source_code = f.read()

        # Suppress SyntaxWarning for invalid escape sequences in docstrings
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=SyntaxWarning)
            tree = ast.parse(source_code, filename=file_path)

        # Materialise the traversal once; it is used for both passes below.
        nodes = list(ast.walk(tree))

        # Map every direct child of a class body to that class's name, so the
        # owner of a method is a dict lookup instead of a full tree re-walk
        # per function (which made the original quadratic in the node count).
        owner_class = {}
        for node in nodes:
            if isinstance(node, ast.ClassDef):
                for child in node.body:
                    owner_class[child] = node.name

        docstrings = []

        # Module-level docstring goes first, without a label.
        module_doc = ast.get_docstring(tree)
        if module_doc:
            docstrings.append(module_doc)

        for node in nodes:
            if isinstance(node, ast.ClassDef):
                docstring = ast.get_docstring(node)
                if docstring:
                    docstrings.append(f"Class {node.name}:\n{docstring}\n")

            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                docstring = ast.get_docstring(node)
                if not docstring:
                    continue
                parent_class = owner_class.get(node)
                if parent_class:
                    docstrings.append(f"Method {parent_class}.{node.name}:\n{docstring}\n")
                else:
                    docstrings.append(f"Function {node.name}:\n{docstring}\n")

    except SyntaxError:
        # Skip files with syntax errors
        return []
    except Exception as e:
        # Best-effort extraction: report unreadable/undecodable files and move on.
        print(f"Error processing {file_path}: {e}")
        return []

    return docstrings

In [79]:
# Root directory that is scanned recursively. Uncomment the first line to
# restrict the scan to a single repository instead of all of them.
#REPO_PATH = os.path.join(os.getcwd(), '..', 'repositories', 'orange3-doc-visual-programming')
REPO_PATH = os.path.join(os.getcwd(), '..', 'repositories')
FILE_EXTENSIONS = ["txt", "md", "rst"]
# Dotted suffixes: matching on the bare extension would also accept names
# that merely end in those letters (e.g. "cmd" or "build.cmd" for "md").
TEXT_SUFFIXES = tuple(f".{ext}" for ext in FILE_EXTENSIONS)

# File names excluded from both the text-file and the Python-file lists.
SKIP = ["negative_words_Slolex.txt", "positive_words_Slolex.txt"]

# A single walk fills both lists; each list keeps os.walk order.
text_files = []
python_files = []
for root, dirs, files in os.walk(REPO_PATH):
    for file in files:
        if file in SKIP:
            continue
        if file.endswith(TEXT_SUFFIXES):
            text_files.append(os.path.join(root, file))
        # Any file whose name contains "test" is left out of the Python list.
        elif file.endswith('.py') and 'test' not in file:
            python_files.append(os.path.join(root, file))

print(f"Found {len(text_files)} text files")
print(f"Found {len(python_files)} Python files")

Found 378 text files
Found 902 Python files


## Extract documentation and save it to a single combined text file

In [86]:
total_characters = 0
all_docstrings = []

# Collect every discovered text file as one chunk, headed by its path
# relative to REPO_PATH. Files that cannot be read are reported and skipped.
for file_path in text_files:
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw = handle.read()
        # Replace the channel URL with a neutral placeholder before counting.
        content = raw.replace("http://youtube.com/orangedatamining", "<link>")
        total_characters += len(content)
        all_docstrings.append(f"=== {os.path.relpath(file_path, REPO_PATH)} ===\n{content}")
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

print(f"Total characters in text: {total_characters:,}")
print(f"Total text chunks: {len(all_docstrings)}")

Total characters in text: 1,137,579
Total text chunks: 378


In [87]:
# Write all collected chunks, separated by blank lines, to ../data/all_documentation.txt.
output_path = os.path.join(os.getcwd(), '..', 'data', 'all_documentation.txt')
with open(output_path, 'w', encoding='utf-8') as out_file:
    out_file.write("\n\n".join(all_docstrings))
