In [11]:
import os
import json
import re
from collections import Counter

class JupyterPipelineAnalyzer:
    """Extract code lines from Jupyter notebooks and tally imports and calls.

    On construction the analyzer walks ``folder_path`` for ``*.ipynb`` files.
    ``process_notebooks()`` then reads every notebook, stores its code lines in
    ``notebooks_data``, updates ``import_counts`` / ``function_calls`` and
    writes ``notebooks_data`` to ``output_file`` as JSON.

    Attributes:
        folder_path: Root folder that is searched recursively.
        output_file: Path of the JSON file written by ``process_notebooks()``.
        notebook_files: Paths of the notebooks found at construction time.
        notebooks_data: ``{key: {"file_path": ..., "code": [...]}}`` for every
            notebook that yielded at least one code line.
        import_counts: Counter of module names seen in import statements.
        function_calls: Counter of identifiers directly followed by ``(``.
    """

    # Jupyter's autosave folder; its files are copies of real notebooks and
    # would otherwise be counted as additional notebooks.
    _CHECKPOINT_DIR = ".ipynb_checkpoints"

    _IMPORT_RE = re.compile(r'^\s*(import|from)\s+([\w\.]+)')
    _MODULE_RE = re.compile(r'\s*([\w\.]+)')
    _CALL_RE = re.compile(r'\b(\w+)\s*\(')

    def __init__(self, folder_path, output_file="extracted_code.json"):
        """Record the paths and discover the notebooks under ``folder_path``.

        Args:
            folder_path: Folder to search recursively for ``.ipynb`` files.
            output_file: Destination of the JSON dump of the extracted code.
        """
        self.folder_path = folder_path
        self.output_file = output_file
        self.notebook_files = self._find_notebooks()
        self.notebooks_data = {}  # Dictionary to store extracted code per notebook
        self.import_counts = Counter()
        self.function_calls = Counter()

    def _find_notebooks(self):
        """Recursively find all Jupyter notebooks in the given folder.

        Returns:
            List of paths (``folder_path`` joined with the relative location)
            of every ``.ipynb`` file, skipping ``.ipynb_checkpoints`` folders.
        """
        notebooks = []
        for root, dirs, files in os.walk(self.folder_path):
            # Pruning ``dirs`` in place stops os.walk from descending into them.
            dirs[:] = [d for d in dirs if d != self._CHECKPOINT_DIR]
            for file in files:
                if file.endswith(".ipynb"):
                    notebooks.append(os.path.join(root, file))
        return notebooks

    def _extract_code_cells(self, notebook_path):
        """Extract all code-cell lines from a Jupyter notebook.

        Every line is stripped of surrounding whitespace, so indentation is
        not preserved and blank lines come back as empty strings.

        Args:
            notebook_path: Path of the ``.ipynb`` file to read.

        Returns:
            List of stripped code lines in notebook order. An empty list is
            returned (and the error printed) when the file cannot be read or
            does not have the expected ``cells`` / ``cell_type`` / ``source``
            structure.
        """
        try:
            with open(notebook_path, 'r', encoding='utf-8') as f:
                notebook_data = json.load(f)

            code_lines = []
            for cell in notebook_data["cells"]:
                if cell["cell_type"] != "code":
                    continue
                source = cell["source"]
                # ``source`` may be one multi-line string instead of a list of
                # lines; iterating a string would yield single characters.
                if isinstance(source, str):
                    source = source.splitlines()
                code_lines.extend(line.strip() for line in source)
            return code_lines
        except Exception as e:
            # Best effort: one unreadable notebook must not abort the whole run.
            print(f"Error reading {notebook_path}: {e}")
            return []

    def _analyze_imports_and_functions(self, lines):
        """Identify imported libraries and function calls within a list of lines.

        ``from x import y`` counts ``x`` once. ``import a, b as c`` counts both
        ``a`` and ``b``. The call pattern is purely textual: any word directly
        followed by ``(`` is counted, which includes ``def name(`` headers and
        keywords written as ``if (``.

        Args:
            lines: Iterable of source-code lines.
        """
        for line in lines:
            import_match = self._IMPORT_RE.match(line)
            if import_match:
                if import_match.group(1) == "import":
                    # Drop a trailing comment, then count every
                    # comma-separated module of the statement.
                    clauses = line[import_match.end(1):].split("#", 1)[0].split(",")
                    for clause in clauses:
                        module_match = self._MODULE_RE.match(clause)
                        if module_match:
                            self.import_counts[module_match.group(1)] += 1
                else:
                    self.import_counts[import_match.group(2)] += 1

            for func in self._CALL_RE.findall(line):
                self.function_calls[func] += 1

    def process_notebooks(self):
        """Parse all notebooks, rebuild the statistics and save the JSON file.

        Results of a previous call are discarded first, so running this method
        (or the notebook cell calling it) again does not double the counts.
        Notebooks without any code line are skipped.
        """
        self.notebooks_data.clear()
        self.import_counts.clear()
        self.function_calls.clear()

        for notebook in self.notebook_files:
            code_lines = self._extract_code_cells(notebook)
            if not code_lines:
                continue

            notebook_name = os.path.basename(notebook)  # Get only the file name
            if notebook_name in self.notebooks_data:
                # Same file name in another subfolder: key it by its path
                # relative to the root instead of overwriting the first one.
                notebook_name = os.path.relpath(notebook, self.folder_path)

            self.notebooks_data[notebook_name] = {
                "file_path": notebook,
                "code": code_lines
            }
            self._analyze_imports_and_functions(code_lines)

        self._save_code_json()

    def _save_code_json(self):
        """Save all extracted code lines to ``output_file`` as JSON."""
        with open(self.output_file, "w", encoding="utf-8") as f:
            json.dump(self.notebooks_data, f, indent=4)
        # Report how many notebooks actually yielded code, not only how many
        # files were found.
        print(
            f"✅ Extracted code from {len(self.notebooks_data)} of "
            f"{len(self.notebook_files)} notebooks and saved to {self.output_file}"
        )

    def print_summary(self):
        """Print notebook counts plus the 20 most common imports and calls."""
        print("\n📊 Summary Statistics:")
        print(f"📂 Found {len(self.notebook_files)} notebooks.")
        print(f"📜 Extracted code from {len(self.notebooks_data)} notebooks.")

        print("\n🔍 Top Imported Libraries:")
        for lib, count in self.import_counts.most_common(20):
            print(f"  - {lib}: {count} times")

        print("\n🔧 Most Used Functions:")
        for func, count in self.function_calls.most_common(20):
            print(f"  - {func}(): {count} times")

# # ==================== USAGE ====================
# if __name__ == "__main__":
#     folder_path = "path/to/kaggle/notebooks"  # Replace with your actual path
#     analyzer = JupyterPipelineAnalyzer(folder_path)
#     analyzer.process_notebooks()
#     analyzer.print_summary()


In [None]:
# Driver cell: discover the notebooks under `folder_path`, extract their code
# lines into the analyzer's output file (the constructor default is
# "extracted_code.json"), then print the import / call tallies.
# `analyzer` is module-level notebook state: a later cell calls
# `analyzer.print_summary()` again, so this cell has to run first.
folder_path = "../kaggle_notebooks/notebooks"  # Update this with the actual path
analyzer = JupyterPipelineAnalyzer(folder_path)
# Reads every discovered notebook and writes the JSON file as a side effect.
analyzer.process_notebooks()
analyzer.print_summary()


✅ Extracted code from 19 notebooks and saved to extracted_code.json

📊 Summary Statistics:
📂 Found 19 notebooks.
📜 Extracted code from 16 notebooks.

🔍 Top Imported Libraries:
  - sklearn.preprocessing: 28 times
  - pandas: 26 times
  - numpy: 24 times
  - sklearn.metrics: 23 times
  - sklearn.model_selection: 22 times
  - sklearn.ensemble: 16 times
  - matplotlib.pyplot: 15 times
  - seaborn: 15 times
  - xgboost: 14 times
  - sklearn.linear_model: 14 times
  - os: 10 times
  - lightgbm: 7 times
  - catboost: 7 times
  - sklearn.neighbors: 7 times
  - sklearn.tree: 7 times
  - sklearn.impute: 6 times
  - sklearn.svm: 6 times
  - sklearn.compose: 4 times
  - tensorflow: 3 times

🔧 Most Used Functions:
  - print(): 342 times
  - fillna(): 107 times
  - fit(): 99 times
  - predict(): 94 times
  - drop(): 71 times
  - show(): 67 times
  - len(): 57 times
  - sum(): 56 times
  - fit_transform(): 56 times
  - mean_squared_error(): 52 times
  - mean(): 50 times
  - head(): 49 times
  - isnul

In [14]:
import json

# Load the categorized JSON file.
# NOTE(review): presumably maps notebook name -> {category: [code lines]};
# this file is produced outside the cells shown here — confirm its schema.
categorized_file_path = "categorized_code.json"  # Update this path if needed
with open(categorized_file_path, "r", encoding="utf-8") as f:
    categorized_code = json.load(f)

# Single source of truth for the category labels; the order here is the order
# in which the categories are printed.
CATEGORIES = (
    "Feature Augmentation",
    "Feature Reduction",
    "Feature Engineering",
    "Other",
)

# Initialize summary statistics
summary_stats = {
    "total_notebooks": len(categorized_code),
    "overall_counts": {category: 0 for category in CATEGORIES},
    "notebook_counts": {}
}

# Process each notebook
for notebook, data in categorized_code.items():
    # A notebook entry that lacks a category contributes zero lines for it
    # instead of aborting the whole summary with a KeyError.
    category_counts = {
        category: len(data.get(category, [])) for category in CATEGORIES
    }

    # Store per-notebook statistics
    summary_stats["notebook_counts"][notebook] = category_counts

    # Aggregate overall statistics
    for category, count in category_counts.items():
        summary_stats["overall_counts"][category] += count

# Print summary statistics
print("\n📊 **Summary Statistics**")
print(f"Total Notebooks Analyzed: {summary_stats['total_notebooks']}\n")

print("🔹 **Overall Code Line Counts Across All Notebooks:**")
for category, count in summary_stats["overall_counts"].items():
    print(f"  - {category}: {count} lines")

print("\n📂 **Per-Notebook Breakdown:**")
for notebook, counts in summary_stats["notebook_counts"].items():
    print(f"\n📘 {notebook}:")
    for category, count in counts.items():
        print(f"  - {category}: {count} lines")


# Re-prints the import / function-call summary. This relies on `analyzer`
# created in an earlier cell, so it fails on a fresh kernel unless that cell
# has been run first.
analyzer.print_summary()



📊 **Summary Statistics**
Total Notebooks Analyzed: 16

🔹 **Overall Code Line Counts Across All Notebooks:**
  - Feature Augmentation: 65 lines
  - Feature Reduction: 24 lines
  - Feature Engineering: 111 lines
  - Other: 3906 lines

📂 **Per-Notebook Breakdown:**

📘 simple-house-prices-prediction-with-explanation.ipynb:
  - Feature Augmentation: 0 lines
  - Feature Reduction: 1 lines
  - Feature Engineering: 4 lines
  - Other: 115 lines

📘 reduce-complexity-for-house-prices-predictions.ipynb:
  - Feature Augmentation: 1 lines
  - Feature Reduction: 8 lines
  - Feature Engineering: 6 lines
  - Other: 436 lines

📘 spaceship-titanic-eda-predictions.ipynb:
  - Feature Augmentation: 0 lines
  - Feature Reduction: 0 lines
  - Feature Engineering: 5 lines
  - Other: 630 lines

📘 spaceship-titanic-competition-with-ensemble-models.ipynb:
  - Feature Augmentation: 0 lines
  - Feature Reduction: 0 lines
  - Feature Engineering: 5 lines
  - Other: 133 lines

📘 house-prices-prediction.ipynb:
  - Fe