# PawConnect Data Exploration

This notebook explores the pet shelter data to understand patterns, distributions, and insights for building the recommendation system.

## Table of Contents
1. Setup and Data Loading
2. Pet Demographics Analysis
3. Behavioral Attributes
4. Shelter Statistics
5. Correlation Analysis
6. Insights and Recommendations

## 1. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Add parent directory to path
# Path().absolute() resolves to the kernel's current working directory, so this
# appends the directory one level above it to the import search path.
# NOTE(review): presumably that parent is the project root containing the
# `pawconnect_ai` package imported below — this only holds when the kernel is
# started from the notebook's own folder; confirm.
sys.path.append(str(Path().absolute().parent))

# PawConnect imports
from pawconnect_ai.config import settings
from pawconnect_ai.sub_agents.pet_search_agent import PetSearchAgent

# Plot defaults shared by every figure in this notebook:
# seaborn's white grid theme and a 12x6 inch default canvas.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Libraries imported successfully")

In [None]:
# Flip the shared settings object into mock/testing mode for this exploration
# session, then echo the resulting configuration.
for flag_name in ('mock_apis', 'testing_mode'):
    setattr(settings, flag_name, True)

print(f"Environment: {settings.environment}")
print(f"Mock APIs: {settings.mock_apis}")

### Load Sample Pet Data

In [None]:
# Initialize search agent
search_agent = PetSearchAgent()

# Get mock pet data
import asyncio
pets = await search_agent.search_pets(
    location="Seattle, WA",
    limit=100
)

print(f"Loaded {len(pets)} pets for analysis")

In [None]:
def _pet_to_record(pet):
    """Flatten one pet object into a plain dict with one key per analysis column."""
    traits = pet.attributes
    shelter = pet.shelter
    return {
        'pet_id': pet.pet_id,
        'name': pet.name,
        'species': pet.species.value,
        'breed': pet.breed,
        'age': pet.age.value,
        'size': pet.size.value,
        'gender': pet.gender.value,
        'good_with_children': traits.good_with_children,
        'good_with_dogs': traits.good_with_dogs,
        'good_with_cats': traits.good_with_cats,
        'house_trained': traits.house_trained,
        'energy_level': traits.energy_level,
        'days_in_shelter': pet.days_in_shelter,
        'is_urgent': pet.is_urgent,
        'shelter_city': shelter.city,
        'shelter_state': shelter.state
    }


# One record per pet, assembled into a single DataFrame in one go
pet_data = [_pet_to_record(pet) for pet in pets]

df = pd.DataFrame(pet_data)
print(f"Created DataFrame with {len(df)} rows and {len(df.columns)} columns")
df.head()

## 2. Pet Demographics Analysis

### Species Distribution

In [None]:
# How many pets of each species are in the sample — absolute counts (left)
# and share of the total (right).
species_counts = df['species'].value_counts()

fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: bar chart of raw counts
bar_ax.bar(species_counts.index, species_counts.values)
bar_ax.set_title('Pet Species Distribution', fontsize=14, fontweight='bold')
bar_ax.set(xlabel='Species', ylabel='Count')

# Right panel: the same counts as percentages
pie_ax.pie(species_counts.values, labels=species_counts.index, autopct='%1.1f%%')
pie_ax.set_title('Species Percentage', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print("\nSpecies Statistics:")
print(species_counts)

### Age and Size Distribution

In [None]:
# Age and size are ordinal categories, so plot them in their natural order
# rather than by frequency.
age_order = ['baby', 'young', 'adult', 'senior']
size_order = ['small', 'medium', 'large', 'extra_large']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# (axes, column, expected category order, human-readable label, bar color)
panels = [
    (ax1, 'age', age_order, 'Age', 'skyblue'),
    (ax2, 'size', size_order, 'Size', 'lightcoral'),
]

for ax, column, order, label, color in panels:
    counts = df[column].value_counts()

    # reindex() silently discards labels that are not in `order`; surface them
    # so a spelling/enum mismatch does not quietly remove pets from the chart.
    unexpected = [value for value in counts.index if value not in order]
    if unexpected:
        print(f"Warning: {column} values not in expected order (omitted from plot): {unexpected}")

    # fill_value=0 draws a category with no pets as an empty bar instead of NaN
    counts.reindex(order, fill_value=0).plot(kind='bar', ax=ax, color=color)
    ax.set_title(f'{label} Distribution', fontsize=14, fontweight='bold')
    ax.set_xlabel(f'{label} Category')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

### Breed Analysis

In [None]:
# The ten most frequent breeds, drawn as a horizontal bar chart with the most
# common breed at the top.
top_breeds = df['breed'].value_counts().head(10)
positions = range(len(top_breeds))

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(positions, top_breeds.values)
ax.set_yticks(positions)
ax.set_yticklabels(top_breeds.index)
ax.set_xlabel('Count')
ax.set_title('Top 10 Most Common Breeds', fontsize=14, fontweight='bold')
ax.invert_yaxis()  # value_counts is sorted descending; flip so rank 1 is on top
plt.tight_layout()
plt.show()

print(f"\nTotal unique breeds: {df['breed'].nunique()}")

## 3. Behavioral Attributes

In [None]:
# One pie chart per behavioral flag, laid out on a 2x2 grid.
behavioral_cols = ['good_with_children', 'good_with_dogs', 'good_with_cats', 'house_trained']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, col in zip(axes.flat, behavioral_cols):
    # value_counts() leaves out missing values by default.
    # NOTE(review): if these flags can be unknown/None, the percentages are of
    # the known answers only — confirm against the data model.
    counts = df[col].value_counts()

    ax.pie(counts.values, labels=counts.index, autopct='%1.1f%%', startangle=90)
    ax.set_title(col.replace('_', ' ').title(), fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

### Energy Level Analysis

In [None]:
# Share of each energy level within every species; normalize='index' makes
# each species row sum to 1, scaled here to percentages.
energy_by_species = pd.crosstab(df['species'], df['energy_level'], normalize='index').mul(100)

ax = energy_by_species.plot.bar(stacked=False, figsize=(12, 6))
ax.set_title('Energy Levels by Species (%)', fontsize=14, fontweight='bold')
ax.set_xlabel('Species')
ax.set_ylabel('Percentage')
ax.tick_params(axis='x', rotation=45)
ax.legend(title='Energy Level')
plt.tight_layout()
plt.show()

## 4. Shelter Statistics

In [None]:
# Length of stay: overall distribution (left) and spread per species (right).
stay_days = df['days_in_shelter']
mean_stay = stay_days.mean()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: histogram with the mean marked as a dashed line
stay_days.hist(bins=20, ax=ax1, edgecolor='black')
ax1.set_title('Days in Shelter Distribution', fontsize=14, fontweight='bold')
ax1.set(xlabel='Days', ylabel='Frequency')
ax1.axvline(mean_stay, color='red', linestyle='--', label=f'Mean: {mean_stay:.1f}')
ax1.legend()

# Right panel: one box per species
df.boxplot(column='days_in_shelter', by='species', ax=ax2)
ax2.set_title('Days in Shelter by Species', fontsize=14, fontweight='bold')
ax2.set(xlabel='Species', ylabel='Days')
fig.suptitle('')  # drop the figure-level title that grouped boxplot adds

plt.tight_layout()
plt.show()

print("\nDays in Shelter Statistics:")
print(stay_days.describe())

### Urgent Cases Analysis

In [None]:
# Compare urgent and non-urgent pets: how many there are of each and how long
# they have been in the shelter on average.
urgent_stats = df.groupby('is_urgent').agg(
    count=('pet_id', 'count'),
    avg_days_in_shelter=('days_in_shelter', 'mean'),
)

urgent_share = (df['is_urgent'].sum() / len(df)) * 100

print("Urgent Cases Statistics:")
print(urgent_stats)
print(f"\nPercentage of urgent cases: {urgent_share:.1f}%")

## 5. Correlation Analysis

In [None]:
# Encode the categorical features as numbers so they can enter a correlation matrix.

# Age and size are ordinal, so rank them explicitly. Plain category codes are
# assigned alphabetically (adult=0, baby=1, senior=2, young=3), which scrambles
# the ordering and makes the resulting coefficients meaningless.
age_rank = {label: rank for rank, label in enumerate(['baby', 'young', 'adult', 'senior'])}
size_rank = {label: rank for rank, label in enumerate(['small', 'medium', 'large', 'extra_large'])}


def _category_codes(series):
    """Return integer category codes for `series`, with missing values as NaN.

    `.cat.codes` marks missing entries with -1; left as-is, that would be
    treated as a real level by the correlation.
    """
    codes = series.astype('category').cat.codes
    return codes.where(codes >= 0)


df_numeric = df.copy()
# NOTE(review): species is nominal, so its code order is arbitrary — read its
# coefficients with caution.
df_numeric['species_num'] = _category_codes(df['species'])
# Labels outside the rank tables map to NaN and are skipped by .corr()
df_numeric['age_num'] = df['age'].map(age_rank)
df_numeric['size_num'] = df['size'].map(size_rank)
# NOTE(review): the set and natural order of energy_level labels is not visible
# here, so it keeps alphabetical codes — replace with an explicit rank once confirmed.
df_numeric['energy_num'] = _category_codes(df['energy_level'])

# Select numeric columns
corr_cols = ['species_num', 'age_num', 'size_num', 'energy_num', 'days_in_shelter', 'is_urgent']
corr_matrix = df_numeric[corr_cols].corr()

# Plot correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 6. Insights and Recommendations

### Key Findings:

1. **Species Distribution**: [Analysis based on actual data]
2. **Age Patterns**: [Analysis based on actual data]
3. **Shelter Stay Duration**: [Analysis based on actual data]
4. **Urgent Cases**: [Analysis based on actual data]

### Recommendations for the Recommendation System:

1. **Feature Engineering**:
   - Create composite compatibility scores
   - Weight urgent cases higher
   - Consider days in shelter as urgency factor

2. **Model Features**:
   - Use behavioral attributes (good with kids/pets)
   - Include energy level matching
   - Factor in size and home type compatibility

3. **Business Logic**:
   - Boost senior pets in recommendations
   - Prioritize long-stay pets
   - Match energy levels to user activity

### Next Steps:

1. Build recommendation model (see `model_training.ipynb`)
2. Implement A/B testing framework
3. Collect user feedback for model improvement
4. Monitor recommendation performance metrics

In [None]:
# Persist the flattened pet table as CSV for the model-training notebook.
# The path is relative to the kernel's working directory; the target folder is
# created if it does not exist yet.
output_path = Path('..') / 'data' / 'processed_pet_data.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"✓ Processed data saved to {output_path}")