# Cricket Analytics RAG System - Setup & Data Exploration

## Project Overview
Welcome to the **Cricket Analytics RAG System** - an Agentic AI system that enhances cricket strategy insights by combining structured statistics with unstructured commentary using RAG + LangChain.

### 🏏 Goals
- **Enhanced Cricket Strategy**: Combine structured stats with unstructured commentary
- **Multimodal Analysis**: Process both numerical data and text-based insights  
- **Tactical Intelligence**: Answer complex questions like "How does Virat Kohli perform against left-arm pacers in death overs?"

### 🛠️ Tech Stack
- **LangChain & LangGraph**: For RAG and multi-hop reasoning
- **Vector Databases**: ChromaDB/FAISS for embeddings
- **Data Sources**: Cricinfo commentary, Kaggle datasets, match reports
- **API Layer**: FastAPI for serving insights

### 📋 Development Timeline
- **Week 5-7**: Data foundation & exploration ✅ (This notebook)
- **Week 9**: Baseline RAG implementation 🔄  
- **Week 11**: Advanced LangGraph reasoning 📅

## 1. Install Essential Python Packages

First, let's install and import all the essential packages for our cricket analytics system.

In [None]:
# Install essential packages (run this cell if packages are not installed)
import subprocess
import sys

def install_package(package):
    """Run ``pip install <package>`` with the current interpreter and report the result.

    Args:
        package: Requirement specifier handed to pip as a single argument.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    # pip signals failure through a non-zero exit status
    exit_status = subprocess.call(pip_command)
    if exit_status == 0:
        print(f"✅ Successfully installed {package}")
    else:
        print(f"❌ Failed to install {package}")

# Core packages for cricket analytics, grouped by the job they do
packages = [
    # data wrangling
    "pandas", "numpy",
    # plotting
    "matplotlib", "seaborn", "plotly",
    # RAG stack
    "langchain", "chromadb", "sentence-transformers", "openai",
    # API layer
    "fastapi", "uvicorn",
]

print("Installing essential packages for Cricket Analytics RAG System...")
# One pip invocation per package; install_package reports each outcome itself
for package in packages:
    install_package(package)

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
import json
import os
from pathlib import Path
import warnings
# Silence every warning category for the rest of the notebook session
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib defaults plus seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

print("✅ All essential libraries imported successfully!")
# Version banner for the three core libraries
for banner, library in (("📊 Pandas", pd), ("🔢 NumPy", np), ("📈 Matplotlib", mpl)):
    print(f"{banner} version: {library.__version__}")

# Sanity check: the working directory path should mention the project folder name
current_dir = Path.cwd()
project_name = "Project"
if project_name not in str(current_dir):
    print(f"⚠️ Current directory: {current_dir}")
    print("📂 Make sure you're running from the project root directory")
else:
    print(f"✅ Working in project directory: {current_dir}")

## 2. Configure Development Environment

Let's set up environment variables and verify our Python installation.

In [None]:
# Configure development environment
import sys
import platform
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Display system information
system_info = {
    "Platform": f"{platform.system()} {platform.release()}",
    "Python Version": sys.version,
    "Python Executable": sys.executable,
}
print("🖥️ System Information:")
for label, value in system_info.items():
    print(f"   {label}: {value}")

# Resolve the project root: step up one level when launched from notebooks/
working_dir = Path.cwd()
project_root = working_dir.parent if working_dir.name == "notebooks" else working_dir
data_dir = project_root / "data"
src_dir = project_root / "src"
config_dir = project_root / "config"

print("\n📁 Project Structure:")
for label, location in (
    ("Project Root", project_root),
    ("Data Directory", data_dir),
    ("Source Directory", src_dir),
    ("Config Directory", config_dir),
):
    print(f"   {label}: {location}")

# Make modules under src/ importable from the notebook
src_entry = str(src_dir)
if src_entry not in sys.path:
    sys.path.insert(0, src_entry)
    print(f"✅ Added {src_dir} to Python path")

# Cricket Analytics defaults; values already present in the environment win
cricket_defaults = [
    ('CRICKET_DATA_DIR', str(data_dir)),
    ('CRICKET_LOG_LEVEL', 'INFO'),
    ('CRICKET_CHUNK_SIZE', '1000'),
    ('CRICKET_TOP_K', '5'),
]
for variable, fallback in cricket_defaults:
    os.environ.setdefault(variable, fallback)

print("\n🏏 Cricket Analytics Environment:")
for label, variable in (
    ("Data Directory", 'CRICKET_DATA_DIR'),
    ("Log Level", 'CRICKET_LOG_LEVEL'),
    ("Chunk Size", 'CRICKET_CHUNK_SIZE'),
    ("Top K Retrieval", 'CRICKET_TOP_K'),
):
    print(f"   {label}: {os.environ.get(variable)}")

print("\n✅ Development environment configured successfully!")

## 3. Set Up Project Structure & Verify Files

Let's verify our project structure and ensure all necessary directories and files exist.

In [None]:
# Verify and create project structure
def verify_project_structure(root=None):
    """Check the expected project layout and create any missing directories.

    Prints a ✅/❌ line for every required directory and file. Missing
    directories are created; missing files are only reported. Finally a
    shallow tree of the project is printed (at most three files per folder).

    Args:
        root: Directory to verify. When omitted, the notebook-level
            ``project_root`` is used.
    """
    base = Path(project_root if root is None else root)

    required_dirs = [
        "src",
        "src/data",
        "src/rag",
        "src/langgraph",
        "src/api",
        "src/evaluation",
        "data",
        "data/raw",
        "data/processed",
        "notebooks",
        "config",
        "tests",
        "docs"
    ]

    required_files = [
        "requirements.txt",
        "README.md",
        ".env.example",
        "src/__init__.py",
        "src/data/__init__.py",
        "src/rag/__init__.py",
        "src/langgraph/__init__.py",
        "src/api/__init__.py"
    ]

    # Folders that would bury the tree listing under thousands of entries
    skipped_dirs = {".git", "venv", "env", "ENV", "__pycache__", ".ipynb_checkpoints"}

    print("📂 Checking Project Structure:")
    print("=" * 50)

    # Check directories
    for dir_path in required_dirs:
        full_path = base / dir_path
        present = full_path.exists()
        print(f"{'✅' if present else '❌'} {dir_path}/")

        # Create missing directories
        if not present:
            full_path.mkdir(parents=True, exist_ok=True)
            print(f"   📁 Created directory: {dir_path}")

    print("\n📄 Checking Essential Files:")
    print("=" * 50)

    # Check files (reported only, never created)
    for file_path in required_files:
        full_path = base / file_path
        if full_path.exists():
            status = "✅"
            size = f"({full_path.stat().st_size} bytes)"
        else:
            status = "❌"
            size = ""
        print(f"{status} {file_path} {size}")

    print("\n🏗️ Project structure verification complete!")

    # Display project tree
    print("\n🌳 Project Tree:")
    print("=" * 50)
    for current, dirs, files in os.walk(base):
        # Prune in place so os.walk never descends into the skipped folders;
        # sorting makes the listing reproducible between runs.
        dirs[:] = sorted(d for d in dirs if d not in skipped_dirs)
        files = sorted(files)

        # Depth relative to the root, independent of how the path is spelled
        level = len(Path(current).relative_to(base).parts)
        indent = ' ' * 2 * level
        print(f"{indent}{os.path.basename(current)}/")
        subindent = ' ' * 2 * (level + 1)
        for file in files[:3]:  # Show only first 3 files per directory
            print(f"{subindent}{file}")
        if len(files) > 3:
            print(f"{subindent}... and {len(files) - 3} more files")

# Run the check now; any missing required directory is created as a side effect.
verify_project_structure()

## 4. Initialize Version Control

Let's set up Git for version control and create essential Git files.

In [None]:
# Initialize Git repository and create .gitignore
import subprocess

def run_git_command(command, description, cwd=None):
    """Run a git command and print a one-line status for it.

    The command is executed without a shell, so its arguments reach the
    program verbatim and no shell metacharacters are interpreted.

    Args:
        command: Command line as a string (tokenised with ``shlex.split``)
            or an already-split argument list.
        description: Human-readable label used in the status messages.
        cwd: Working directory for the command. Defaults to the
            notebook-level ``project_root``.

    Returns:
        True when the command exits with status 0, False otherwise
        (including when the executable cannot be started).
    """
    import shlex  # local import: only this helper needs it

    args = shlex.split(command) if isinstance(command, str) else list(command)
    if not args:
        print(f"❌ Failed to {description.lower()}: empty command")
        return False

    workdir = project_root if cwd is None else cwd
    try:
        result = subprocess.run(args, cwd=workdir, capture_output=True, text=True)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # Typically FileNotFoundError when git is not installed or not on PATH
        print(f"❌ Failed to {description.lower()}: {e}")
        return False

    if result.returncode != 0:
        print(f"⚠️ {description} - {result.stderr.strip()}")
        return False

    print(f"✅ {description}")
    if result.stdout.strip():
        print(f"   Output: {result.stdout.strip()}")
    return True

# Section banner. Git availability is not probed separately: if git is missing,
# the first run_git_command call below reports the failure.
print("🔧 Setting up Version Control:")
print("=" * 50)

# Initialize Git repository
run_git_command("git init", "Initialize Git repository")

# Default ignore rules for a Python / Jupyter / vector-store project
gitignore_content = """
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
env/
ENV/

# IDE
.vscode/
.idea/
*.swp
*.swo

# Environment variables
.env
.env.local

# Data files
data/raw/*.csv
data/raw/*.json
data/processed/*.pkl
*.db
*.sqlite

# Model files
models/
*.model
*.pkl

# Logs
logs/
*.log

# Jupyter Notebook
.ipynb_checkpoints

# Vector Database
data/vectordb/

# OS generated files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
"""

gitignore_path = project_root / ".gitignore"
if gitignore_path.exists():
    # Re-running this cell must not wipe rules that were added by hand
    print(f"ℹ️ Keeping existing .gitignore file at {gitignore_path}")
else:
    # Explicit encoding and a trailing newline keep the file portable
    gitignore_path.write_text(gitignore_content.strip() + "\n", encoding="utf-8")
    print(f"✅ Created .gitignore file at {gitignore_path}")

# Stage the project files (the ignore rules above are already in place)
run_git_command("git add .", "Add all files to Git")

# Check Git status
run_git_command("git status", "Check Git status")

print("\n📋 Next steps for Git:")
print("   1. Set up your Git user config: git config --global user.name 'Your Name'")
print("   2. Set up your Git email: git config --global user.email 'your.email@example.com'")
print("   3. Make initial commit: git commit -m 'Initial commit: Cricket Analytics RAG Project setup'")
print("   4. Connect to remote repository if needed")

## 5. Configure IDE Settings

We'll set up Visual Studio Code workspace recommendations and debugging configuration for the project.

In [None]:
# Configure VS Code workspace settings
import json
from pathlib import Path

# All workspace files live under <project_root>/.vscode
vscode_dir = project_root / ".vscode"
settings_path = vscode_dir / "settings.json"
extensions_path = vscode_dir / "extensions.json"
launch_path = vscode_dir / "launch.json"

vscode_dir.mkdir(parents=True, exist_ok=True)

# Settings configuration
settings = {
    # "python.pythonPath" is no longer read by the Python extension; the
    # interpreter is selected through "python.defaultInterpreterPath".
    "python.defaultInterpreterPath": sys.executable,
    "python.analysis.extraPaths": [str(src_dir)],
    "python.formatting.provider": "black",
    "editor.formatOnSave": True,
    "python.linting.enabled": True,
    "python.linting.mypyEnabled": True,
    "python.testing.pytestEnabled": True,
    "python.testing.pytestArgs": ["tests"],
}

# Recommended extensions
extensions = {
    "recommendations": [
        "ms-python.python",
        "ms-toolsai.jupyter",
        "ms-azuretools.vscode-docker",
        "ms-python.vscode-pylance",
        "charliermarsh.ruff",
        "tamasfe.even-better-toml"
    ]
}

# Launch configurations: the API through uvicorn, and the test suite
launch_config = {
    "version": "0.2.0",
    "configurations": [
        {
            "name": "FastAPI: Run API",
            "type": "python",
            "request": "launch",
            "module": "uvicorn",
            "args": ["src.api.__init__:app", "--reload"],
            "jinja": True,
            "envFile": "${workspaceFolder}/.env"
        },
        {
            "name": "Pytest",
            "type": "python",
            "request": "launch",
            "module": "pytest",
            "args": ["-q"],
            "console": "integratedTerminal"
        }
    ]
}

# Write with an explicit encoding so the result does not depend on the locale
settings_path.write_text(json.dumps(settings, indent=4), encoding="utf-8")
extensions_path.write_text(json.dumps(extensions, indent=4), encoding="utf-8")
launch_path.write_text(json.dumps(launch_config, indent=4), encoding="utf-8")

print(f"✅ VS Code settings configured at {settings_path}")
print(f"✅ Extension recommendations saved to {extensions_path}")
print(f"✅ Launch configurations saved to {launch_path}")

## 6. Test Environment Setup

Run a quick diagnostic to ensure the environment, data directories, and core modules are working correctly.

In [None]:
# Environment diagnostics
import importlib

# Importable module names (not pip distribution names) to probe
modules_to_check = [
    "pandas",
    "numpy",
    "matplotlib",
    "seaborn",
    "plotly",
    "langchain",
    "chromadb",
    "fastapi",
    "uvicorn"
]

print("🔍 Verifying installed modules...")
module_status = {}
import_errors = {}  # module -> exception for packages that are installed but fail to load
for module in modules_to_check:
    try:
        importlib.import_module(module)
    except ImportError:
        module_status[module] = "❌"
    except Exception as exc:
        # A package can be present yet raise something other than ImportError
        # while initialising; record it instead of aborting the diagnostic.
        module_status[module] = "⚠️"
        import_errors[module] = exc
    else:
        module_status[module] = "✅"

for module, status in module_status.items():
    detail = f" ({import_errors[module]})" if module in import_errors else ""
    print(f"   {status} {module}{detail}")

# Run compile check for src directory
print("\n🧪 Running compile check on src directory...")
# -q limits compileall's output to error messages
result = subprocess.run(
    [sys.executable, "-m", "compileall", "-q", str(src_dir)],
    capture_output=True,
    text=True,
)
if result.returncode == 0:
    print("✅ Source files compiled successfully")
else:
    print("⚠️ Compile errors detected:")
    # compileall reports failures on stdout; stderr is shown too in case the
    # interpreter itself complained.
    if result.stdout.strip():
        print(result.stdout)
    if result.stderr.strip():
        print(result.stderr)

# Summary
missing_modules = [m for m, status in module_status.items() if status == "❌"]
if missing_modules:
    print("\n⚠️ Missing modules detected:")
    for module in missing_modules:
        print(f"   - {module}")
    print("\nPlease install the missing modules using pip:")
    print("pip install " + " ".join(missing_modules))
else:
    print("\n✅ All essential modules are installed!")

if import_errors:
    print("\n⚠️ Installed modules that failed to import:")
    for module in import_errors:
        print(f"   - {module}")

## 7. Configure Environment Settings

Let's create the main configuration file for our RAG system and set up the essential configurations.

In [None]:
# Create configuration file for RAG system
import yaml

# Configuration for RAG system
rag_config = {
    "system": {
        "name": "Cricket Analytics RAG",
        "version": "0.1.0",
        "description": "Agentic AI system for cricket match analytics using RAG + LangChain"
    },
    "data": {
        "raw_dir": "data/raw",
        "processed_dir": "data/processed",
        "supported_formats": ["csv", "json", "txt"],
        "default_encoding": "utf-8"
    },
    "rag": {
        "embeddings": {
            "model": "sentence-transformers/all-MiniLM-L6-v2",
            "dimension": 384,
            "normalize": True
        },
        "vectorstore": {
            "type": "chroma",
            "persist_directory": "data/vectordb",
            "collection_name": "cricket_analytics"
        },
        "text_splitter": {
            "chunk_size": 1000,
            "chunk_overlap": 200,
            "separators": ["\n\n", "\n", ". ", " ", ""]
        },
        "retrieval": {
            "top_k": 5,
            "search_type": "similarity",
            "score_threshold": 0.7
        }
    },
    "llm": {
        "provider": "openai",
        "model_name": "gpt-3.5-turbo",
        "temperature": 0.3,
        "max_tokens": 1000
    },
    "langgraph": {
        "nodes": [
            "data_retrieval",
            "player_analysis",
            "match_analysis",
            "tactical_reasoning",
            "synthesis"
        ],
        "reasoning_depth": 2
    },
    "api": {
        "host": "0.0.0.0",
        "port": 8000,
        "reload": True,
        "docs_url": "/docs",
        "redoc_url": "/redoc"
    },
    "logging": {
        "level": "INFO",
        "file": "logs/cricket_analytics.log",
        "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    }
}

# Serialise once; the same text is written to disk and previewed below.
# sort_keys=False keeps the sections in the order declared above.
config_yaml = yaml.dump(rag_config, default_flow_style=False, sort_keys=False)

# Save configuration to YAML file; create config/ first so this cell also
# works when the structure-verification cell has not been run.
config_dir.mkdir(parents=True, exist_ok=True)
config_path = config_dir / "rag_config.yaml"
config_path.write_text(config_yaml, encoding="utf-8")

print(f"✅ Created configuration file at {config_path}")

# Display the first 500 characters of the configuration
print("\n📋 Configuration Preview:")
print("=" * 50)
print(config_yaml[:500] + "...")

## 8. Install Dependencies and Fix Errors

Let's install the required dependencies to fix the import errors in our notebook and source files.

In [None]:
# Install dependencies to fix import errors
import sys
import subprocess

def install_packages_from_requirements():
    """Install every dependency listed in the project's requirements.txt.

    Prints an error and does nothing when the file is absent; otherwise runs
    ``pip install -r`` with the current interpreter and reports the outcome.
    """
    req_path = project_root / "requirements.txt"

    if req_path.exists():
        print(f"📦 Installing packages from {req_path}")
        print("=" * 50)

        pip_args = [sys.executable, "-m", "pip", "install", "-r", str(req_path)]
        try:
            # pip output is captured so that only a summary line is shown
            outcome = subprocess.run(pip_args, capture_output=True, text=True)

            if outcome.returncode != 0:
                print(f"❌ Failed to install packages: {outcome.stderr}")
            else:
                print("✅ Successfully installed packages from requirements.txt")

        except Exception as e:
            print(f"❌ Error during installation: {e}")
    else:
        print(f"❌ Requirements file not found at {req_path}")

# Pinned packages that resolve the import errors seen so far
error_packages = [
    "plotly==5.18.0",
    "python-dotenv==1.0.0",
    "fastapi==0.109.0",
    "langchain==0.1.0",
    "pyyaml==6.0.1"
]

print("🔧 Installing packages to fix import errors:")
print("=" * 50)

for package in error_packages:
    pip_command = [sys.executable, "-m", "pip", "install", package]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError as e:
        # Report the failure and carry on with the remaining packages
        print(f"❌ Failed to install {package}: {e}")
    else:
        print(f"✅ Successfully installed {package}")

# Point the user at the optional full install in the next cell
print("\n⚠️ To install all dependencies from requirements.txt, run the next cell")
print("⚠️ This may take some time depending on your internet connection")

In [None]:
# Install all packages from requirements.txt
# Uncomment and run this cell to install all dependencies
# install_packages_from_requirements()

## 9. Create Sample Cricket Data

Let's create some sample cricket data for our RAG system to use.

In [None]:
# Create sample cricket data for development
import pandas as pd
import json
from pathlib import Path

import random  # imported once here; previously re-imported for every simulated ball

# Create directories if they don't exist
raw_data_dir = data_dir / "raw"
processed_data_dir = data_dir / "processed"

raw_data_dir.mkdir(parents=True, exist_ok=True)
processed_data_dir.mkdir(parents=True, exist_ok=True)

print("🏏 Creating sample cricket data...")
print("=" * 50)

# 1. Create ball-by-ball data (one list per output column)
ball_by_ball_data = {
    'match_id': [],
    'inning': [],
    'over': [],
    'ball': [],
    'batsman': [],
    'bowler': [],
    'runs_scored': [],
    'wicket_type': [],
    'venue': []
}

# Sample match data: Indian batsmen against Australian bowlers. The match id is
# only a label for the synthetic match.
match_id = "IPL_2023_04_01"
batsmen = ["V Kohli", "R Sharma", "S Dhawan", "R Pant", "H Pandya"]
bowlers = ["P Cummins", "M Starc", "J Hazlewood", "N Lyon"]
venues = ["Eden Gardens", "Wankhede Stadium", "M. Chinnaswamy Stadium"]

# Outcome tables for the simulation, built once outside the loops
run_outcomes = [0, 1, 2, 3, 4, 6]
run_weights = [0.4, 0.3, 0.1, 0.05, 0.1, 0.05]
wicket_types = ['bowled', 'caught', 'lbw', 'run out', 'stumped']

# Generate synthetic ball-by-ball data: 2 innings x 20 overs x 6 balls.
# The order of the random draws per ball is kept as before so that a seeded
# run yields the same data.
for inn in range(1, 3):
    for o in range(1, 21):  # 20 overs
        for b in range(1, 7):  # 6 balls per over
            ball_by_ball_data['match_id'].append(match_id)
            ball_by_ball_data['inning'].append(inn)
            ball_by_ball_data['over'].append(o)
            ball_by_ball_data['ball'].append(b)
            ball_by_ball_data['batsman'].append(random.choice(batsmen))
            ball_by_ball_data['bowler'].append(random.choice(bowlers))

            # Simulate run scoring
            runs = random.choices(run_outcomes, weights=run_weights)[0]
            ball_by_ball_data['runs_scored'].append(runs)

            # Simulate wickets
            if random.random() < 0.05:  # 5% chance of wicket
                wicket_type = random.choice(wicket_types)
            else:
                wicket_type = None
            ball_by_ball_data['wicket_type'].append(wicket_type)

            ball_by_ball_data['venue'].append(venues[0])  # All same venue for simplicity

# Create DataFrame and save to CSV
ball_by_ball_df = pd.DataFrame(ball_by_ball_data)
ball_by_ball_path = raw_data_dir / "ball_by_ball.csv"
ball_by_ball_df.to_csv(ball_by_ball_path, index=False)
print(f"✅ Created ball-by-ball data: {ball_by_ball_path} ({len(ball_by_ball_df)} records)")

# 2. Create match commentary data
commentary_data = {
    "match_id": match_id,
    "match_date": "2023-04-01",
    "teams": ["India", "Australia"],
    "venue": venues[0],
    "commentary": []
}

# Commentary endings keyed by runs scored off the ball; any other value
# (in practice 6) falls back to the SIX line, as in the original if/elif chain.
run_phrases = {
    0: "no run, good defensive shot.",
    1: "1 run, pushed into the gap for a single.",
    2: "2 runs, good placement and quick running between wickets.",
    3: "3 runs, excellent running by the batsmen.",
    4: "FOUR! Beautifully timed shot to the boundary.",
}
six_phrase = "SIX! Massive hit over the boundary rope!"

# Generate sample commentary
for inn in range(1, 3):
    team_batting = "India" if inn == 1 else "Australia"
    team_bowling = "Australia" if inn == 1 else "India"

    # Innings introduction, recorded as over 0 / ball 0
    commentary_data["commentary"].append({
        "text": f"Welcome to the {inn}{'st' if inn == 1 else 'nd'} innings where {team_batting} will be batting against {team_bowling}.",
        "timestamp": f"2023-04-01T{14+inn}:00:00Z",
        "inning": inn,
        "over": 0,
        "ball": 0
    })

    # Select the innings once instead of filtering the whole frame per ball.
    # Keeping the first row per (over, ball) and ordering by them reproduces
    # the row the per-ball lookup used to pick.
    innings_balls = (
        ball_by_ball_df[ball_by_ball_df['inning'] == inn]
        .drop_duplicates(subset=['over', 'ball'])
        .sort_values(['over', 'ball'], kind='stable')
    )

    for row in innings_balls.itertuples(index=False):
        # Plain ints keep the JSON dump independent of NumPy scalar types
        o = int(row.over)
        b = int(row.ball)
        batsman = row.batsman
        bowler = row.bowler
        runs = int(row.runs_scored)
        wicket = row.wicket_type

        # Create commentary text. pd.notna covers both None and NaN, whichever
        # pandas uses to store the "no wicket" marker.
        if pd.notna(wicket):
            commentary = f"{bowler} to {batsman}, OUT! {wicket.upper()}! {batsman} has to walk back to the pavilion."
        else:
            commentary = f"{bowler} to {batsman}, {run_phrases.get(runs, six_phrase)}"

        commentary_data["commentary"].append({
            "text": commentary,
            # Seconds run 00..50 across the six balls; b*10 produced an
            # invalid ":60" for the last ball of every over.
            "timestamp": f"2023-04-01T{14+inn}:{o:02d}:{(b - 1) * 10:02d}Z",
            "inning": inn,
            "over": o,
            "ball": b
        })

# Save commentary to JSON
commentary_path = raw_data_dir / f"commentary_{match_id}.json"
with open(commentary_path, 'w', encoding='utf-8') as f:
    json.dump(commentary_data, f, indent=2)
print(f"✅ Created commentary data: {commentary_path} ({len(commentary_data['commentary'])} entries)")

# 3. Create match report data
# Hand-written narrative report stored under the same match_id as the synthetic
# ball-by-ball and commentary data.
# NOTE(review): the report describes Royal Challengers Bangalore vs Mumbai Indians
# at the M. Chinnaswamy Stadium, while commentary_data above lists the teams as
# India/Australia and uses venues[0] — confirm whether this mismatch is intended
# for the sample corpus.
match_report = {
    "match_id": match_id,
    "title": "IPL 2023: Royal Challengers Bangalore vs Mumbai Indians - Match Analysis",
    "date": "2023-04-01",
    "author": "Cricket Analytics Team",
    # Free-text body of the report
    "content": """
    In an electrifying encounter at the M. Chinnaswamy Stadium, Royal Challengers Bangalore defeated Mumbai Indians by 7 wickets. 
    
    The match saw Virat Kohli returning to form with a spectacular 82* off 49 balls, including 6 fours and 4 sixes. His partnership with Faf du Plessis (61 off 41) was the highlight of RCB's chase, as they comfortably chased down MI's target of 168 with 8 balls to spare.
    
    Earlier, Mumbai Indians' innings was anchored by Rohit Sharma's patient 43 and Suryakumar Yadav's explosive 52 off just 33 deliveries. However, RCB's bowling unit, led by Harshal Patel (3/28) and Mohammed Siraj (2/22), did well to restrict MI to a below-par total.
    
    Key Moments:
    1. Harshal Patel's double-wicket over (16th) removed both set batsmen, Tilak Varma and Suryakumar Yadav
    2. Virat Kohli's assault against Jasprit Bumrah in the 14th over, taking 18 runs
    3. Faf du Plessis' calculated approach against MI's spinners in the middle overs
    
    Strategic Insights:
    - RCB successfully targeted MI's fifth bowling option, extracting 46 runs from 3 overs
    - MI's decision to hold back Bumrah until the 12th over backfired as RCB built momentum
    - RCB's bowling changes, particularly bringing Harshal Patel back for the 16th over, proved decisive
    
    Player Performance Analysis:
    - Virat Kohli showed exceptional strike rotation, with a non-boundary SR of 118.5
    - Harshal Patel's variations were particularly effective in the death overs, conceding just 5.2 RPO
    - Suryakumar Yadav's innovative shots against spin (SR of 192.3) kept MI in the game
    """,
    # Short structured list of the highlights named in the text above
    "key_moments": [
        "Harshal Patel's double-wicket over (16th)",
        "Virat Kohli's assault against Jasprit Bumrah in the 14th over",
        "Faf du Plessis' calculated approach against MI's spinners"
    ],
    # Per-player figures: batting entries carry runs/balls/fours/sixes/strike_rate,
    # bowling entries carry overs/runs/wickets/economy
    "player_performances": {
        "V Kohli": {"runs": 82, "balls": 49, "fours": 6, "sixes": 4, "strike_rate": 167.35},
        "F du Plessis": {"runs": 61, "balls": 41, "fours": 4, "sixes": 3, "strike_rate": 148.78},
        "H Patel": {"overs": 4, "runs": 28, "wickets": 3, "economy": 7.00},
        "M Siraj": {"overs": 4, "runs": 22, "wickets": 2, "economy": 5.50},
        "R Sharma": {"runs": 43, "balls": 37, "fours": 3, "sixes": 1, "strike_rate": 116.22},
        "S Yadav": {"runs": 52, "balls": 33, "fours": 5, "sixes": 2, "strike_rate": 157.58}
    }
}

# Save match report to JSON (raw data directory, file named after the match id)
report_path = raw_data_dir / f"report_{match_id}.json"
with open(report_path, 'w') as f:
    json.dump(match_report, f, indent=2)
print(f"✅ Created match report data: {report_path}")

print("\n🔄 Processing data for RAG system...")

# Process the data for RAG
# 1. Batting aggregates per batsman: total runs and number of balls faced
player_stats = ball_by_ball_df.groupby(['batsman']).agg(
    runs_scored=('runs_scored', 'sum'),
    balls_faced=('match_id', 'count'),
)
player_stats = player_stats.assign(
    strike_rate=lambda stats: stats['runs_scored'] / stats['balls_faced'] * 100,
    role='batsman',
)

# Bowling aggregates per bowler: runs conceded and wickets (non-null wicket types)
deliveries_by_bowler = ball_by_ball_df.groupby(['bowler'])
bowling_stats = deliveries_by_bowler.agg(
    runs_scored=('runs_scored', 'sum'),
    wickets=('wicket_type', lambda x: sum(x.notna())),
)

# Six deliveries make one over
bowling_stats['overs'] = deliveries_by_bowler.size() / 6
bowling_stats['economy'] = bowling_stats['runs_scored'] / bowling_stats['overs']
bowling_stats['role'] = 'bowler'

# Save processed stats
for stats_frame, filename in (
    (player_stats, "batting_stats.csv"),
    (bowling_stats, "bowling_stats.csv"),
):
    stats_frame.to_csv(processed_data_dir / filename)

print(f"✅ Created processed batting stats: {len(player_stats)} records")
print(f"✅ Created processed bowling stats: {len(bowling_stats)} records")

# 2. Flatten commentary for RAG: every entry carries the match-level metadata
processed_commentary = [
    {
        "text": comment["text"],
        "match_id": commentary_data["match_id"],
        "teams": commentary_data["teams"],
        "venue": commentary_data["venue"],
        "inning": comment["inning"],
        "over": comment["over"],
        "ball": comment["ball"],
        "type": "commentary"
    }
    for comment in commentary_data["commentary"]
]

# Save processed commentary
processed_commentary_path = processed_data_dir / f"processed_commentary_{match_id}.json"
with open(processed_commentary_path, 'w') as f:
    json.dump(processed_commentary, f, indent=2)

print(f"✅ Created processed commentary: {len(processed_commentary)} entries")

print("\n✅ Sample cricket data creation completed!")

## 10. Configure IDE Settings (VS Code)

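Let's create workspace settings for an optimal development experience with our Cricket Analytics project. Note that this cell rewrites the `.vscode/settings.json`, `.vscode/extensions.json`, and `.vscode/launch.json` files produced by the earlier "Configure IDE Settings" section, so run only the variant you want to keep.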

In [None]:
# Create VS Code workspace settings
vscode_dir = project_root / ".vscode"
vscode_dir.mkdir(parents=True, exist_ok=True)

# Interpreter inside the project's virtual environment. The venv layout differs
# by platform: Scripts/python.exe on Windows, bin/python elsewhere.
venv_python = "./venv/Scripts/python.exe" if os.name == "nt" else "./venv/bin/python"

# VS Code settings for Cricket Analytics project
vscode_settings = {
    "python.defaultInterpreterPath": venv_python,
    "python.terminal.activateEnvironment": True,
    "python.formatting.provider": "black",
    "python.linting.enabled": True,
    "python.linting.pylintEnabled": False,
    "python.linting.flake8Enabled": True,
    "python.testing.pytestEnabled": True,
    "python.testing.unittestEnabled": False,
    # Hide generated artefacts from the file explorer
    "files.exclude": {
        "**/__pycache__": True,
        "**/*.pyc": True,
        ".pytest_cache": True,
        "data/vectordb": True
    },
    "jupyter.askForKernelRestart": False,
    "jupyter.interactiveWindow.collapseCellInputCode": "firstLine"
}

# Save VS Code settings (the file is rewritten wholesale, not merged)
settings_path = vscode_dir / "settings.json"
with open(settings_path, 'w', encoding='utf-8') as f:
    json.dump(vscode_settings, f, indent=2)

print(f"✅ Created VS Code settings at {settings_path}")

# Create VS Code extensions recommendations
extensions = {
    "recommendations": [
        "ms-python.python",
        "ms-python.flake8",
        "ms-python.black-formatter",
        "ms-toolsai.jupyter",
        "ms-vscode.vscode-json",
        "redhat.vscode-yaml",
        "ms-python.pylint",
        "ms-python.isort"
    ]
}

extensions_path = vscode_dir / "extensions.json"
with open(extensions_path, 'w', encoding='utf-8') as f:
    json.dump(extensions, f, indent=2)

print(f"✅ Created VS Code extensions recommendations at {extensions_path}")

# Create launch configuration for debugging
launch_config = {
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Cricket Analytics API",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/src/api/__init__.py",
            "console": "integratedTerminal",
            "envFile": "${workspaceFolder}/.env"
        },
        {
            "name": "Cricket RAG Test",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/src/rag/__init__.py",
            "console": "integratedTerminal"
        },
        {
            "name": "Current File",
            "type": "python",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal"
        }
    ]
}

launch_path = vscode_dir / "launch.json"
with open(launch_path, 'w', encoding='utf-8') as f:
    json.dump(launch_config, f, indent=2)

print(f"✅ Created VS Code launch configuration at {launch_path}")
print("\n🔧 VS Code workspace configured for Cricket Analytics development!")

## 11. Test Environment Setup

Let's run comprehensive tests to verify that our development environment is properly configured.

In [None]:
# Comprehensive environment testing
from typing import Optional

import matplotlib.pyplot as plt
import pandas as pd

def test_environment_setup():
    """Run six environment checks and print a pass/fail line for each.

    The checks are: core library imports, required project directories,
    top-level configuration files, VS Code files, writing a sample CSV under
    ``data_dir / "raw"``, and drawing a bar chart from that sample.

    Returns:
        True when every check passed, False otherwise.
    """
    print("🧪 Testing Environment Setup")
    print("=" * 50)

    passed = 0
    attempted = 0
    sample_data: Optional[pd.DataFrame] = None

    def absent(relative_paths):
        # Entries that do not exist beneath the project root
        return [rel for rel in relative_paths if not (project_root / rel).exists()]

    # Test 1: Python imports
    attempted += 1
    try:
        import sys
        import os
        import json
        import numpy as np
    except ImportError as e:
        print(f"❌ Test 1: Failed to import core libraries - {e}")
    else:
        print("✅ Test 1: Core Python libraries import successfully")
        passed += 1

    # Tests 2-4: each one only verifies that a set of paths exists
    existence_checks = (
        # Test 2: Project structure
        (
            ["src", "data", "config", "notebooks", "tests"],
            "✅ Test 2: All required directories exist",
            "❌ Test 2: Missing directories - ",
        ),
        # Test 3: Configuration files
        (
            ["requirements.txt", "README.md", ".env.example"],
            "✅ Test 3: All configuration files exist",
            "❌ Test 3: Missing files - ",
        ),
        # Test 4: VS Code configuration
        (
            [".vscode/settings.json", ".vscode/extensions.json"],
            "✅ Test 4: VS Code configuration files exist",
            "❌ Test 4: Missing VS Code files - ",
        ),
    )
    for expected_paths, success_line, failure_prefix in existence_checks:
        attempted += 1
        not_found = absent(expected_paths)
        if not_found:
            print(f"{failure_prefix}{not_found}")
        else:
            print(success_line)
            passed += 1

    # Test 5: Create sample data
    attempted += 1
    try:
        columns = {
            'player': ['V Kohli', 'R Sharma', 'K Williamson'],
            'runs': [50, 75, 42],
            'balls': [45, 68, 38],
            'strike_rate': [111.11, 110.29, 110.53],
        }
        # Assigned before the write so the chart test can still use the frame
        # when saving the CSV fails
        sample_data = pd.DataFrame(columns)

        # Save sample data
        sample_path = data_dir / "raw" / "sample_cricket_data.csv"
        sample_data.to_csv(sample_path, index=False)
    except Exception as e:
        print(f"❌ Test 5: Failed to create sample data - {e}")
    else:
        print("✅ Test 5: Sample cricket data created successfully")
        passed += 1

    # Test 6: Basic data visualization
    attempted += 1
    try:
        if sample_data is None:
            raise ValueError("Sample data unavailable for visualization")
        plt.figure(figsize=(8, 5))
        plt.bar(sample_data['player'], sample_data['runs'])
        plt.title('Sample Cricket Runs Analysis')
        plt.xlabel('Player')
        plt.ylabel('Runs')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"❌ Test 6: Visualization failed - {e}")
    else:
        print("✅ Test 6: Data visualization working")
        passed += 1

    # Test Summary
    print("\n📊 Test Summary:")
    print(f"   Tests Passed: {passed}/{attempted}")
    print(f"   Success Rate: {(passed/attempted)*100:.1f}%")

    everything_passed = passed == attempted
    if everything_passed:
        print("\n🎉 All tests passed! Environment setup is complete.")
    else:
        print("\n⚠️ Some tests failed. Please check the issues above.")
    return everything_passed

# Run the tests; test_success is True only when every check passed
test_success = test_environment_setup()

## 5. Configure IDE Settings (VS Code)

Let's create VS Code workspace settings optimized for our cricket analytics project.

In [None]:
# Prepare the .vscode folder and the paths of the two files written into it
vscode_dir = project_root / ".vscode"
vscode_dir.mkdir(exist_ok=True)
settings_path = vscode_dir / "settings.json"
extensions_path = vscode_dir / "extensions.json"

# Editor, linting and Jupyter preferences for the Cricket Analytics project
vscode_settings = {
    "python.defaultInterpreterPath": "./venv/Scripts/python.exe",
    "python.linting.enabled": True,
    "python.linting.pylintEnabled": True,
    "python.formatting.provider": "black",
    "python.sortImports.args": ["--profile", "black"],
    "jupyter.askForKernelRestart": False,
    "files.associations": {
        "*.yaml": "yaml",
        "*.yml": "yaml",
    },
    "files.exclude": {
        "**/__pycache__": True,
        "**/.env": True,
        "**/data/vectordb": True,
    },
    "search.exclude": {
        "**/data/raw": True,
        "**/data/processed": True,
        "**/logs": True,
    },
    "python.terminal.activateEnvironment": True,
    "editor.rulers": [88],
    "editor.formatOnSave": True,
}

# Extensions recommended for Cricket Analytics development
vscode_extensions = {
    "recommendations": [
        "ms-python.python",
        "ms-python.black-formatter",
        "ms-toolsai.jupyter",
        "ms-python.pylint",
        "redhat.vscode-yaml",
        "ms-vscode.vscode-json",
        "GitHub.copilot",
        "ms-python.isort",
    ],
}

# Write settings.json (serialize to text first, then write it in one call)
with settings_path.open('w') as f:
    f.write(json.dumps(vscode_settings, indent=2))

# Write extensions.json the same way
with extensions_path.open('w') as f:
    f.write(json.dumps(vscode_extensions, indent=2))

# Report which files were written into the .vscode folder
print("⚙️ VS Code Configuration:")
print("=" * 50)
print(f"✅ Created settings.json: {settings_path}")
print(f"✅ Created extensions.json: {extensions_path}")
print(f"📁 VS Code workspace directory: {vscode_dir}")

print("\n🔧 Configured Settings:")
for key, value in vscode_settings.items():
    # Only containers have a length. Booleans and other scalars are printed
    # as-is, because calling len() on a bool raises TypeError.
    if isinstance(value, (dict, list)):
        print(f"   {key}: {type(value).__name__} with {len(value)} items")
    else:
        print(f"   {key}: {value}")

print("\n📦 Recommended Extensions:")
for ext in vscode_extensions["recommendations"]:
    print(f"   - {ext}")

print("\n💡 Next steps:")
print("   1. Restart VS Code to apply settings")
print("   2. Install recommended extensions")
print("   3. Set up Python virtual environment")
print("   4. Configure debugging if needed")

## 6. Test Environment Setup

Let's run comprehensive tests to verify that our cricket analytics environment is properly configured.

In [None]:
# Comprehensive environment testing
import importlib
import traceback
from typing import Optional
from dataclasses import dataclass
import pandas as pd
import matplotlib.pyplot as plt

@dataclass
class EnvironmentTestResult:
    """Outcome of a single environment check, as returned by ``test_package_import``."""
    # Name of the item that was checked (the module name for import checks)
    name: str
    # Human-readable label printed next to the name in the report
    description: str
    # Result of the check; test_package_import uses "success", "error" or "warning"
    status: str
    # Text of the exception when the check did not succeed, otherwise None
    error: Optional[str] = None
    # Importance of the result; test_package_import uses "info", "error" or the default
    severity: str = "warning"

def test_package_import(package_name: str, description: str = "") -> EnvironmentTestResult:
    """Attempt to import ``package_name`` and describe what happened.

    Args:
        package_name: Dotted module name handed to ``importlib.import_module``.
        description: Label copied unchanged into the returned result.

    Returns:
        A result with status "success" (severity "info") when the import
        works, "error" (severity "error") when it raises ImportError, and
        "warning" (severity "warning") for any other exception. For the two
        failure cases ``error`` holds the exception text.
    """
    status, severity, problem = "success", "info", None
    try:
        importlib.import_module(package_name)
    except ImportError as import_failure:
        status, severity, problem = "error", "error", str(import_failure)
    except Exception as other_failure:
        # The module was found but failed while executing its top-level code
        status, severity, problem = "warning", "warning", str(other_failure)
    return EnvironmentTestResult(
        name=package_name,
        description=description,
        status=status,
        error=problem,
        severity=severity,
    )

def _print_import_report(targets) -> None:
    """Try to import each target and print one status line per module.

    Args:
        targets: Iterable of ``(module_name, label, description)`` tuples.
            ``module_name`` is what gets imported; ``label`` is the name
            shown in the printed line.
    """
    for module_name, label, description in targets:
        result = test_package_import(module_name, description)
        if result.status == "success":
            print(f"   ✅ {label} - {description}")
        else:
            print(f"   {result.status.upper()} {label} - {description} ({result.error})")


def run_environment_tests() -> None:
    """Print a report on package imports, basic functionality and project modules.

    Nothing is returned and no exception is raised for a failed check; every
    outcome is printed. Side effects: all open matplotlib figures are closed
    at the start, and a scratch file ``data/test.txt`` under the project root
    is written and removed again.
    """
    test_df = pd.DataFrame()
    plt.close("all")

    print("🧪 Cricket Analytics Environment Tests")
    print("=" * 60)

    # Core Python packages
    print("\n📦 Core Python Packages:")
    core_packages = [
        ("pandas", "Data manipulation and analysis"),
        ("numpy", "Numerical computing"),
        ("matplotlib", "Plotting and visualization"),
        ("seaborn", "Statistical visualization"),
        ("plotly", "Interactive plotting"),
        ("json", "JSON handling"),
        ("pathlib", "Path operations"),
        ("datetime", "Date and time operations")
    ]
    _print_import_report((package, package, desc) for package, desc in core_packages)

    # AI/ML packages (optional for now)
    print("\n🤖 AI/ML Packages (Optional):")
    ai_packages = [
        ("langchain", "LangChain framework"),
        ("chromadb", "Vector database"),
        ("sentence_transformers", "Sentence embeddings"),
        ("openai", "OpenAI API client"),
        ("fastapi", "API framework"),
        ("uvicorn", "ASGI server")
    ]
    _print_import_report((package, package, desc) for package, desc in ai_packages)

    # Test basic functionality
    print("\n🔧 Functionality Tests:")

    # Test data operations
    try:
        test_df = pd.DataFrame({
            "player": ["Virat Kohli", "Rohit Sharma", "KL Rahul"],
            "runs": [10570, 9205, 2265],
            "matches": [254, 227, 42]
        })
        test_df["average"] = test_df["runs"] / test_df["matches"]
        print("   ✅ Pandas DataFrame operations")
    except Exception as exc:
        print(f"   ❌ Pandas DataFrame operations: {exc}")
        # Fall back to an empty frame so the plotting test skips the bars
        test_df = pd.DataFrame()

    # Test plotting
    try:
        fig, ax = plt.subplots(1, 1, figsize=(6, 4))
        if not test_df.empty:
            ax.bar(test_df["player"], test_df["runs"])
        ax.set_title("Player Runs")
        plt.close(fig)
        print("   ✅ Matplotlib plotting")
    except Exception as exc:
        print(f"   ❌ Matplotlib plotting: {exc}")
        plt.close("all")

    # Test file operations
    try:
        expected_text = "Cricket Analytics Test"
        test_file = project_root / "data" / "test.txt"
        try:
            test_file.write_text(expected_text)
            content = test_file.read_text()
        finally:
            # Remove the scratch file even when the write or read failed
            if test_file.exists():
                test_file.unlink()
        # A round trip only counts as working when the text comes back intact
        if content != expected_text:
            raise ValueError("read-back content does not match what was written")
        print("   ✅ File I/O operations")
    except Exception as exc:
        print(f"   ❌ File I/O operations: {exc}")

    # Test JSON operations
    try:
        test_data = {"match_id": "test", "runs": 180}
        parsed_data = json.loads(json.dumps(test_data))
        if parsed_data != test_data:
            raise ValueError("round-tripped data differs from the original")
        print("   ✅ JSON serialization/deserialization")
    except Exception as exc:
        print(f"   ❌ JSON operations: {exc}")

    print("\n📊 Environment Summary:")
    print(f"   Project Root: {project_root}")
    print(f"   Python Version: {sys.version.split()[0]}")
    print(f"   Current Working Directory: {Path.cwd()}")
    print(f"   Data Directory: {data_dir}")

    # Check if our custom modules can be imported
    print("\n🏏 Cricket Analytics Modules:")
    try:
        cricket_modules = [
            ("data", "Cricket data processing"),
            ("rag", "RAG system implementation"),
            ("langgraph", "Multi-hop reasoning"),
            ("api", "FastAPI application")
        ]
        # Modules are imported as "src.<name>" but reported by their short name
        _print_import_report(
            (f"src.{module}", module, desc) for module, desc in cricket_modules
        )
    except Exception as exc:
        print(f"   ⚠️ Custom modules check failed: {exc}")

# Run the checks now so the report is printed as part of this cell's output
run_environment_tests()

## 5. Configure IDE Settings (VS Code)

Let's create VS Code workspace settings optimized for our cricket analytics project.

In [None]:
# Prepare the .vscode folder and the paths of the two files written into it
vscode_dir = project_root / ".vscode"
vscode_dir.mkdir(exist_ok=True)
settings_path = vscode_dir / "settings.json"
extensions_path = vscode_dir / "extensions.json"

# Editor, linting, testing and Jupyter preferences for the project
vscode_settings = {
    "python.defaultInterpreterPath": "./venv/Scripts/python.exe",
    "python.linting.enabled": True,
    "python.linting.pylintEnabled": True,
    "python.formatting.provider": "black",
    "python.testing.pytestEnabled": True,
    "python.testing.pytestArgs": ["tests"],
    "files.exclude": {
        "**/__pycache__": True,
        "**/*.pyc": True,
        "data/raw/*.csv": True,
        "data/vectordb": True,
    },
    "jupyter.askForKernelRestart": False,
    "jupyter.interactiveWindow.textEditor.executeSelection": True,
    "editor.formatOnSave": True,
    "editor.codeActionsOnSave": {
        "source.organizeImports": True,
    },
    "files.associations": {
        "*.yaml": "yaml",
        "*.yml": "yaml",
    },
    "workbench.colorTheme": "Default Dark+",
    "terminal.integrated.defaultProfile.windows": "PowerShell",
}

# Extensions recommended to anyone opening the workspace
vscode_extensions = {
    "recommendations": [
        "ms-python.python",
        "ms-python.pylint",
        "ms-toolsai.jupyter",
        "ms-vscode.vscode-json",
        "redhat.vscode-yaml",
        "ms-python.black-formatter",
        "ms-python.isort",
        "GitHub.copilot",
        "ms-vscode.powershell",
    ],
}

# Write settings.json (serialize to text first, then write it in one call)
with settings_path.open('w') as f:
    f.write(json.dumps(vscode_settings, indent=2))

# Write extensions.json the same way
with extensions_path.open('w') as f:
    f.write(json.dumps(vscode_extensions, indent=2))

# Debug configurations for launch.json. Every entry shares the same type,
# request, console and working directory; only the name, what gets run and
# the optional environment differ.
launch_config = {
    "version": "0.2.0",
    "configurations": [
        {
            "name": label,
            "type": "python",
            "request": "launch",
            **entry_point,
            "console": "integratedTerminal",
            "cwd": "${workspaceFolder}",
            **extras,
        }
        for label, entry_point, extras in (
            (
                "Python: Current File",
                {"program": "${file}"},
                {"env": {"PYTHONPATH": "${workspaceFolder}/src"}},
            ),
            (
                "Python: FastAPI",
                {"program": "${workspaceFolder}/src/api/__init__.py"},
                {"env": {"PYTHONPATH": "${workspaceFolder}/src"}},
            ),
            (
                "Python: Test Suite",
                {"module": "pytest", "args": ["tests/"]},
                {},
            ),
        )
    ],
}

# Serialize to text first, then write launch.json in one call
launch_path = vscode_dir / "launch.json"
with launch_path.open('w') as f:
    f.write(json.dumps(launch_config, indent=2))

# Header plus one line per file that was written
print(
    "🔧 VS Code Configuration:",
    "=" * 50,
    f"✅ Created {settings_path}",
    f"✅ Created {extensions_path}",
    f"✅ Created {launch_path}",
    sep="\n",
)

# One bullet per recommended extension
print("\n🔌 Recommended Extensions:")
for ext in vscode_extensions["recommendations"]:
    print(f"   • {ext}")

# Closing summary of what the configuration provides
print(
    "\n⚡ VS Code is now optimized for cricket analytics development!",
    "   • Python formatting with Black",
    "   • Jupyter notebook support",
    "   • Integrated testing with pytest",
    "   • Debug configurations for API and tests",
    sep="\n",
)

## 6. Test Environment Setup

Let's run comprehensive tests to verify that our cricket analytics environment is properly set up.

In [None]:
# Comprehensive environment testing
from typing import Optional

import importlib
import importlib.util
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd

def test_cricket_analytics_environment():
    """Run six environment checks, print a report and return the tally.

    The checks are: importing numpy, building a sample DataFrame, drawing a
    bar chart, a CSV write/read round trip under ``data_dir / "raw"``, the
    presence of the ``data`` package under ``src_dir``, and two environment
    variables. A failed check is printed, never raised.

    Returns:
        A ``(tests_passed, total_tests)`` tuple.
    """

    print("🧪 Testing Cricket Analytics Environment")
    print("=" * 60)

    tests_passed = 0
    total_tests = 0
    df: Optional[pd.DataFrame] = None

    # Test 1: Core Python libraries
    print("\n1️⃣ Testing Core Libraries:")
    try:
        import numpy as np
        print("   ✅ Core data science libraries working")
        tests_passed += 1
    except ImportError as e:
        print(f"   ❌ Core libraries failed: {e}")
    total_tests += 1

    # Test 2: Sample data processing
    print("\n2️⃣ Testing Data Processing:")
    try:
        # Create sample cricket data
        sample_data = {
            'player': ['V Kohli', 'R Sharma', 'MS Dhoni', 'H Pandya'],
            'runs': [89, 45, 23, 67],
            'balls_faced': [56, 38, 29, 42],
            'strike_rate': [158.9, 118.4, 79.3, 159.5],
            'match_type': ['T20', 'T20', 'T20', 'T20']
        }

        df = pd.DataFrame(sample_data)
        # Every row receives the same value: the mean of the 'runs' column
        df['average'] = df['runs'].mean()

        print(f"   ✅ Created sample cricket dataset: {len(df)} players")
        print(f"   ✅ Data processing working (avg runs: {df['runs'].mean():.1f})")
        tests_passed += 1
    except Exception as e:
        print(f"   ❌ Data processing failed: {e}")
    total_tests += 1

    # Test 3: Visualization
    print("\n3️⃣ Testing Visualization:")
    try:
        if df is None:
            raise ValueError("Sample dataframe unavailable for visualization")
        fig = plt.figure(figsize=(8, 5))
        try:
            plt.bar(df['player'], df['runs'], color='skyblue', alpha=0.7)
            plt.title('Sample Cricket Runs Analysis')
            plt.ylabel('Runs Scored')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()
        finally:
            # Release the figure so repeated runs do not accumulate open figures
            plt.close(fig)

        print("   ✅ Matplotlib visualization working")
        tests_passed += 1
    except Exception as e:
        print(f"   ❌ Visualization failed: {e}")
    total_tests += 1

    # Test 4: File I/O
    print("\n4️⃣ Testing File Operations:")
    try:
        if df is None:
            raise ValueError("Sample dataframe unavailable for file tests")
        test_file = data_dir / "raw" / "test_cricket_data.csv"
        df.to_csv(test_file, index=False)

        # Read it back and compare row counts. An explicit raise is used
        # because assert statements are skipped under "python -O" and give
        # an empty failure message.
        df_loaded = pd.read_csv(test_file)
        if len(df_loaded) != len(df):
            raise ValueError(
                f"row count mismatch: wrote {len(df)}, read {len(df_loaded)}"
            )

        print(f"   ✅ File I/O working: {test_file}")
        tests_passed += 1
    except Exception as e:
        print(f"   ❌ File I/O failed: {e}")
    total_tests += 1

    # Test 5: Project module imports
    print("\n5️⃣ Testing Project Modules:")
    try:
        # Make project modules importable without stacking a duplicate
        # sys.path entry every time this function is re-run
        src_entry = str(src_dir)
        if src_entry not in sys.path:
            sys.path.insert(0, src_entry)

        # Build a spec without executing the module. spec_from_file_location
        # does not check that the file exists, so that is verified separately;
        # otherwise this check would pass even for a missing package.
        module_init = src_dir / "data" / "__init__.py"
        spec_data = importlib.util.spec_from_file_location("cricket_data", module_init)
        if module_init.is_file() and spec_data and spec_data.loader:
            print("   ✅ Cricket data module structure valid")
            tests_passed += 1
        else:
            print("   ⚠️ Cricket data module structure needs verification")
    except Exception as e:
        print(f"   ⚠️ Module import test inconclusive: {e}")
    total_tests += 1

    # Test 6: Environment variables
    print("\n6️⃣ Testing Environment Configuration:")
    try:
        required_env_vars = ['CRICKET_DATA_DIR', 'CRICKET_LOG_LEVEL']
        # Unset and empty variables are both treated as missing
        missing_vars = [var for var in required_env_vars if not os.environ.get(var)]

        if not missing_vars:
            print("   ✅ All required environment variables set")
            tests_passed += 1
        else:
            print(f"   ⚠️ Missing environment variables: {missing_vars}")
    except Exception as e:
        print(f"   ❌ Environment test failed: {e}")
    total_tests += 1

    # Test Summary
    print("\n📊 Test Summary:")
    print("=" * 60)
    print(f"Tests Passed: {tests_passed}/{total_tests}")
    print(f"Success Rate: {(tests_passed/total_tests)*100:.1f}%")

    if tests_passed == total_tests:
        print("🎉 All tests passed! Environment is ready for cricket analytics.")
    elif tests_passed >= total_tests * 0.8:
        print("✅ Environment mostly ready. Minor issues detected.")
    else:
        print("⚠️ Significant issues detected. Please review the details above.")

    return tests_passed, total_tests

# Run the comprehensive environment test suite and keep the (passed, total) tally
env_passed, env_total = test_cricket_analytics_environment()

## 🎯 Next Steps & Development Roadmap

Congratulations! You've successfully set up the Cricket Analytics RAG Project workspace. Here's what comes next: