# Inter-IIT Round 2: Physics-Informed Hybrid Fusion

## Strategy Overview
1. **Wine Chemistry Models**: Learn relationships (pH from acids, density from sugar/alcohol)
2. **Robust KNN Matching**: K=5 neighbors with weighted interpolation
3. **Constraint Enforcement**: SO2 constraint, physics validation

---

## Step 1: Environment Setup

In [None]:
# Install and activate Genuity
# `!pip` is IPython shell magic, so this cell only runs inside a notebook kernel.
!pip install genuity
import genuity
# NOTE(review): the license token is hard-coded in the notebook source, so anyone
# who can read this file can read the token. Consider loading it from a secret
# store or an environment variable instead - confirm with the key owner.
# NOTE(review): `genuity` is not referenced by any other cell visible here;
# verify that the activation is actually required.
genuity.activate_license(
    "eyJ1c2VyIjoicmFnaGF2QGV4YW1wbGUuY29tIiwicGxhbiI6InRyaWFsIiwiZXhwIjoxNzY1"
    "Nzc1NzcwfQ.axr0ShXS82rLNCEG6kByiaDtMv3eFkTsDEfLSkPtoMsxjM_AiGEJkg2tV2Cw"
    "WXXJ_irMjlCITLC96K-uvH-EGBQ"
)

In [None]:
import sys
import os
import json
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
import warnings
warnings.filterwarnings('ignore')

# Add dataset to path
# The dataset directory is placed first on sys.path so that modules stored in it
# take precedence on import (presumably where the `generators` package lives -
# verify against the dataset layout).
DATASET_PATH = "/kaggle/input/round-2-inter-iit"
sys.path.insert(0, DATASET_PATH)

print("‚úì Libraries imported")

In [None]:
# Import generators
from generators import G1, G2, G3, G4, G5
print("‚úì Generators imported successfully!")

# Load metadata
# The JSON files are decoded explicitly as UTF-8 so the result does not depend
# on the locale default that open() falls back to when no encoding is given.
METADATA_DIR = os.path.join(DATASET_PATH, "metadata")

# TARGET_COLUMNS is later used to select and order the columns of the output frame.
with open(os.path.join(METADATA_DIR, "columns_L.json"), "r", encoding="utf-8") as f:
    TARGET_COLUMNS = json.load(f)
with open(os.path.join(METADATA_DIR, "tables_info.json"), "r", encoding="utf-8") as f:
    TABLES_INFO = json.load(f)

print(f"\nTarget columns ({len(TARGET_COLUMNS)}): {TARGET_COLUMNS}")

## Step 2: Physics Models Class

Learn wine chemistry relationships:
- `pH = f(volatile_acidity, citric_acid)`
- `density = f(residual_sugar, alcohol)`
- `SO2 ratio = free_SO2 / total_SO2`

In [None]:
class PhysicsModels:
    """Learn physical relationships from generator data.

    Three independent pieces are fitted on demand:

    * a gradient-boosting regressor predicting ``pH`` from the acid columns,
    * a gradient-boosting regressor predicting ``density`` from sugar/alcohol,
    * summary statistics of the ``free / total`` sulfur dioxide ratio.

    Each ``fit_*`` method silently returns ``None`` (and leaves the previous
    state untouched) when the columns it needs are not present in ``df``.
    """

    # Ratio used to repair SO2 violations when no usable ratio has been learned.
    DEFAULT_SO2_RATIO = 0.3

    def __init__(self):
        self.ph_model = None
        self.ph_features = None        # columns the pH model was trained on
        self.density_model = None
        self.density_features = None   # columns the density model was trained on
        self.so2_ratio_stats = None    # {'mean': ..., 'std': ...} once fitted

    @staticmethod
    def _r_squared(y_true, y_pred):
        """Return the coefficient of determination, or NaN for a constant target."""
        ss_res = np.sum((y_true - y_pred) ** 2)
        ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
        if ss_tot == 0:
            # A constant target has no variance to explain; avoid dividing by zero.
            return float("nan")
        return 1 - ss_res / ss_tot

    def _fit_regressor(self, df, target, features, label):
        """Fit one regressor of ``target`` on whichever ``features`` exist in ``df``.

        Returns ``(model, used_features, r2)``, or ``(None, None, None)`` when
        the target or all of the features are missing. The reported R^2 is
        computed on the training rows themselves, so it is an optimistic figure.
        """
        available = [f for f in features if f in df.columns]
        if target not in df.columns or not available:
            return None, None, None

        X = df[available].values
        y = df[target].values

        model = GradientBoostingRegressor(n_estimators=100, max_depth=4, random_state=42)
        model.fit(X, y)

        r2 = self._r_squared(y, model.predict(X))
        print(f"   {label} model: R¬≤ = {r2:.4f}, features = {available}")
        return model, available, r2

    def fit_ph_model(self, df):
        """Train: pH = f(acids). Returns the in-sample R^2, or None if not fitted."""
        model, used, r2 = self._fit_regressor(
            df, 'pH', ['volatile acidity', 'citric acid'], 'pH'
        )
        if model is None:
            return None
        self.ph_model = model
        self.ph_features = used
        return r2

    def fit_density_model(self, df):
        """Train: density = f(sugar, alcohol). Returns the in-sample R^2, or None."""
        model, used, r2 = self._fit_regressor(
            df, 'density', ['residual sugar', 'alcohol'], 'Density'
        )
        if model is None:
            return None
        self.density_model = model
        self.density_features = used
        return r2

    def fit_so2_ratio(self, df):
        """Learn typical free_SO2 / total_SO2 ratio.

        Only rows with ``total > 0`` contribute. Returns the stats dict, or
        ``None`` when the columns are missing or no row has a positive total
        (in which case no statistics are stored).
        """
        if 'free sulfur dioxide' in df.columns and 'total sulfur dioxide' in df.columns:
            free = df['free sulfur dioxide'].values
            total = df['total sulfur dioxide'].values
            valid = total > 0
            if not np.any(valid):
                # Nothing to average; storing NaN here would later poison the repair step.
                return None
            ratios = free[valid] / total[valid]

            self.so2_ratio_stats = {'mean': np.mean(ratios), 'std': np.std(ratios)}
            print(f"   SO2 ratio: mean = {self.so2_ratio_stats['mean']:.3f}")
            return self.so2_ratio_stats
        return None

    def predict_ph(self, df):
        """Predict pH from the trained feature columns of ``df``; None if unfitted."""
        if self.ph_model is None:
            return None
        return self.ph_model.predict(df[self.ph_features].values)

    def predict_density(self, df):
        """Predict density from the trained feature columns of ``df``; None if unfitted."""
        if self.density_model is None:
            return None
        return self.density_model.predict(df[self.density_features].values)

    def _repair_ratio(self):
        """Return the ratio used for repairs: the learned mean clipped to [0, 1]."""
        if self.so2_ratio_stats:
            learned = self.so2_ratio_stats.get('mean')
            if learned is not None and np.isfinite(learned):
                # A mean above 1 (possible when the training data itself violates
                # free <= total) would leave the repaired value above the total.
                return float(np.clip(learned, 0.0, 1.0))
        return self.DEFAULT_SO2_RATIO

    def enforce_so2_constraint(self, free, total):
        """Ensure free_SO2 <= total_SO2.

        Rows where ``free > total`` get ``free`` replaced by ``total * ratio``.
        The inputs are copied, never modified. ``free`` is returned as a float
        array so that the repaired values are not truncated when integer data
        is passed in. For non-negative totals the result satisfies
        ``free <= total`` on every repaired row.

        Returns:
            (free, total) as numpy arrays.
        """
        free = np.array(free, dtype=float)
        total = np.array(total)
        violations = free > total
        n_violations = int(np.sum(violations))
        if n_violations > 0:
            free[violations] = total[violations] * self._repair_ratio()
            print(f"   Fixed {n_violations} SO2 violations")
        return free, total

print("‚úì PhysicsModels class defined")

## Step 3: Robust KNN Matching Functions

Instead of taking 1 nearest neighbor, we:
1. Find K=5 closest neighbors
2. Weight by inverse distance (closer = higher weight)
3. Compute weighted average

In [None]:
def find_nearest_matches_robust(base_df, pool_df, overlap_cols, k=5, warn_threshold=2.0):
    """
    Robust KNN matching with K neighbors and weighted interpolation.

    The overlap columns are standardized with a scaler fitted on the pool, the
    ``k`` nearest pool rows are found for every base row, and every pool column
    (the overlap columns included) is replaced by the inverse-distance-weighted
    average over those neighbors. Averaging means integer-valued pool columns
    come back as floats.

    Args:
        base_df: rows to find matches for; must contain ``overlap_cols``.
        pool_df: candidate rows; must contain ``overlap_cols``.
        overlap_cols: column names shared by both frames, used for the distance.
        k: number of neighbors; reduced to the pool size when the pool is smaller.
        warn_threshold: a base row counts as a poor match when its nearest
            distance exceeds ``warn_threshold`` times the mean nearest distance.

    Returns:
        result_df: DataFrame with weighted-interpolated values, one row per base
            row, indexed 0..n-1
        match_stats: Dictionary with matching quality statistics

    Raises:
        ValueError: if ``k`` is less than 1 or the pool is empty.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if len(pool_df) == 0:
        raise ValueError("pool_df is empty; nothing to match against")
    # kneighbors() refuses to return more neighbors than there are fitted rows.
    k = min(k, len(pool_df))

    scaler = StandardScaler()
    pool_norm = scaler.fit_transform(pool_df[overlap_cols])
    base_norm = scaler.transform(base_df[overlap_cols])

    # Find K nearest neighbors
    nn = NearestNeighbors(n_neighbors=k, algorithm='ball_tree')
    nn.fit(pool_norm)
    distances, indices = nn.kneighbors(base_norm)

    # Match quality statistics, based on the closest neighbor of each base row
    first_dist = distances[:, 0]
    mean_dist = np.mean(first_dist)
    n_poor = int(np.sum(first_dist > warn_threshold * mean_dist))

    match_stats = {
        'mean_distance': mean_dist,
        'max_distance': np.max(first_dist),
        'n_poor_matches': n_poor,
        'poor_match_pct': 100 * n_poor / len(base_df)
    }

    if n_poor > 0:
        print(f"      ‚ö†Ô∏è {n_poor} rows ({match_stats['poor_match_pct']:.1f}%) have poor matches")

    # Weighted interpolation (closer = higher weight); the small constant keeps
    # an exact match (distance 0) from dividing by zero.
    weights = 1.0 / (distances + 1e-6)
    weights = weights / weights.sum(axis=1, keepdims=True)

    # `indices` are positions into the pool, so index the raw values, not the
    # (possibly non-default) pandas index. Building the frame in one step
    # avoids growing it one column at a time.
    result_df = pd.DataFrame({
        col: np.sum(pool_df[col].values[indices] * weights, axis=1)
        for col in pool_df.columns
    })

    return result_df, match_stats


def physics_informed_blend(values_list, weights=None):
    """Combine several equally-shaped value arrays into one weighted sum.

    When ``weights`` is omitted, each array is weighted by the inverse of its
    own variance (plus 1e-6), normalized so that the weights sum to one.
    Explicit ``weights`` are applied exactly as given, without normalization.
    """
    if weights is None:
        precisions = [1.0 / (np.var(series) + 1e-6) for series in values_list]
        precision_total = sum(precisions)
        weights = [p / precision_total for p in precisions]

    # The accumulator takes its shape from the first array and is always float.
    blended = np.zeros_like(values_list[0], dtype=float)
    for series, weight in zip(values_list, weights):
        blended += weight * np.array(series)
    return blended


def print_match_summary(stats_dict):
    """Print one line per generator with its mean distance and poor-match count.

    ``stats_dict`` maps a generator label to a dict holding at least the
    ``'n_poor_matches'`` and ``'mean_distance'`` entries.
    """
    print("\n   [Match Quality Summary]")
    for gen_name in stats_dict:
        entry = stats_dict[gen_name]
        poor_count = entry['n_poor_matches']
        # The marker reflects whether any row of this generator was flagged.
        if poor_count == 0:
            marker = "‚úì"
        else:
            marker = "‚ö†Ô∏è"
        print(f"   {marker} {gen_name}: mean_dist={entry['mean_distance']:.4f}, poor={poor_count}")

print("‚úì Matching functions defined")

## Step 4: Generate Data Pools

In [None]:
N_SAMPLES = 1000   # Final output size
POOL_SIZE = 5000   # Size of candidate pools (larger = better matches)

print("Generating data from all 5 generators...")


def _draw_pool(label, generator):
    """Draw POOL_SIZE rows from one generator and report its shape and columns."""
    pool = generator.generate(POOL_SIZE)
    print(f"‚úì {label}: {pool.shape} - {list(pool.columns)}")
    return pool


# One candidate pool per generator, drawn in order G1..G5.
g1_pool = _draw_pool("G1", G1)

g2_pool = _draw_pool("G2", G2)

g3_pool = _draw_pool("G3", G3)

g4_pool = _draw_pool("G4", G4)

g5_pool = _draw_pool("G5", G5)

## Step 5: Learn Physics Models

In [None]:
print("Learning physics-based relationships...\n")

physics = PhysicsModels()

# pH model from G2 (has pH + volatile acidity + citric acid)
print("1. pH = f(acids):")
physics.fit_ph_model(g2_pool)

# Density model - must be trained on rows where density and its predictors were
# generated together. Pasting predictors from one pool next to density from
# another pool pairs unrelated rows by position, so the model would only learn
# noise. Instead, pick the single pool that carries density alongside the most
# predictors; fit_density_model() uses whichever predictors that pool has.
print("\n2. Density = f(sugar, alcohol):")
_density_predictors = ['residual sugar', 'alcohol']


def _n_density_predictors(pool):
    """Count how many density predictors are present in a pool."""
    return sum(col in pool.columns for col in _density_predictors)


_density_sources = [
    pool for pool in (g1_pool, g2_pool, g3_pool, g4_pool, g5_pool)
    if 'density' in pool.columns and _n_density_predictors(pool) > 0
]
density_train = None
if _density_sources:
    density_train = max(_density_sources, key=_n_density_predictors)
    physics.fit_density_model(density_train)
else:
    # With no model fitted, physics.density_model stays None.
    print("   Skipped: no generator provides density together with sugar/alcohol")

# SO2 ratio from G3
print("\n3. SO2 ratio constraint:")
physics.fit_so2_ratio(g3_pool)

## Step 6: Robust KNN Matching

In [None]:
print("Performing robust KNN matching (K=5 with weighted interpolation)...\n")

# G3 as anchor: its first N_SAMPLES rows define the rows of the final dataset.
base_df = g3_pool.iloc[:N_SAMPLES].reset_index(drop=True)
print(f"Anchor (G3): {base_df.shape}")

match_stats_all = {}


def _match_and_record(label, anchor_df, pool_df, key_cols, banner):
    """Announce one matching step, run it with K=5, and store its stats under `label`."""
    print(banner)
    matched, stats = find_nearest_matches_robust(anchor_df, pool_df, key_cols, k=5)
    match_stats_all[label] = stats
    return matched, stats


# G2, G4 and G5 are matched directly against the anchor rows.
matched_g2, g2_stats = _match_and_record(
    'G2', base_df, g2_pool,
    ['free sulfur dioxide', 'total sulfur dioxide', 'pH'],
    "\nMatching G2 on [free SO2, total SO2, pH]...",
)

matched_g4, g4_stats = _match_and_record(
    'G4', base_df, g4_pool,
    ['sulphates', 'density'],
    "Matching G4 on [sulphates, density]...",
)

matched_g5, g5_stats = _match_and_record(
    'G5', base_df, g5_pool,
    ['pH'],
    "Matching G5 on [pH]...",
)

# G1 is matched against the already-interpolated G5 rows rather than the
# anchor, so this step has to run after the G5 match.
matched_g1, g1_stats = _match_and_record(
    'G1', matched_g5, g1_pool,
    ['fixed acidity', 'residual sugar', 'chlorides'],
    "Matching G1 via G5 on [fixed acidity, residual sugar, chlorides]...",
)

# Summary
print_match_summary(match_stats_all)

## Step 7: Assemble Final Dataset

In [None]:
print("Assembling final dataset with physics-informed blending...\n")

# From G3 (anchor); density is a placeholder value at this stage.
final_df = pd.DataFrame({
    col: base_df[col].values
    for col in ('free sulfur dioxide', 'total sulfur dioxide', 'density', 'sulphates')
})
print("‚úì G3: free SO2, total SO2, density, sulphates")

# From G4 (matched)
for col in ('alcohol', 'quality'):
    final_df[col] = matched_g4[col].values
print("‚úì G4: alcohol, quality")

# From G5 (matched)
for col in ('fixed acidity', 'residual sugar', 'chlorides'):
    final_df[col] = matched_g5[col].values
print("‚úì G5: fixed acidity, residual sugar, chlorides")

# From G1+G2 (blended): both matches carry these columns, so the two
# estimates are combined with the default inverse-variance weights.
for col in ('volatile acidity', 'citric acid'):
    final_df[col] = physics_informed_blend(
        [matched_g1[col].values, matched_g2[col].values]
    )
print("‚úì G1+G2 (blended): volatile acidity, citric acid")

## Step 8: Apply Physics Constraints

In [None]:
print("Applying physics constraints...\n")

# pH: 60/40 mix of the model prediction and the anchor's own pH; without a
# fitted model the anchor value is used unchanged.
print("1. Physics-based pH:")
temp_ph = final_df[['volatile acidity', 'citric acid']].copy()
if physics.ph_model is None:
    final_df['pH'] = base_df['pH'].values
else:
    ph_physics = physics.predict_ph(temp_ph)
    ph_matched = base_df['pH'].values
    final_df['pH'] = 0.6 * ph_physics + 0.4 * ph_matched
    print("   Blended: 60% physics + 40% matched")

# Density: 50/50 mix of the model prediction and the anchor's density; without
# a fitted model the value already stored in final_df is kept.
print("\n2. Physics-based density:")
temp_density = final_df[['residual sugar', 'alcohol']].copy()
if physics.density_model is not None:
    density_physics = physics.predict_density(temp_density)
    density_matched = base_df['density'].values
    final_df['density'] = 0.5 * density_physics + 0.5 * density_matched
    print("   Blended: 50% physics + 50% matched")

# SO2 constraint
print("\n3. SO2 constraint (free <= total):")
free, total = physics.enforce_so2_constraint(
    final_df['free sulfur dioxide'].values,
    final_df['total sulfur dioxide'].values,
)
final_df['free sulfur dioxide'] = free
final_df['total sulfur dioxide'] = total

# Reorder columns to the order given by the metadata.
final_df = final_df[TARGET_COLUMNS]
print(f"\n‚úì Final shape: {final_df.shape}")

## Step 9: Validation

In [None]:
_rule = "=" * 60
print(_rule)
print("VALIDATION")
print(_rule)

# Basic checks
# NOTE: the leading marker on these lines is printed regardless of the value;
# compare each figure with its stated expectation by eye.
_n_rows = len(final_df)
print("\n[Basic Checks]")
print(f"  ‚úì Rows: {_n_rows} (expected: 1000)")
print(f"  ‚úì Columns: {len(final_df.columns)} (expected: 12)")
print(f"  ‚úì Nulls: {final_df.isna().sum().sum()} (expected: 0)")
print(f"  ‚úì Column match: {list(final_df.columns) == TARGET_COLUMNS}")

# Physics checks
print("\n[Physics Checks]")
so2_violations = (final_df['free sulfur dioxide'] > final_df['total sulfur dioxide']).sum()
print(f"  SO2 violations: {so2_violations} (expected: 0)")

# Inclusive on both ends, like the >= / <= pair it stands for.
ph_ok = final_df['pH'].between(2.8, 4.0).sum()
print(f"  pH in range [2.8, 4.0]: {ph_ok}/{_n_rows} ({100*ph_ok/_n_rows:.1f}%)")


def _pearson(col_a, col_b):
    """Pearson correlation between two columns of the final dataset."""
    return np.corrcoef(final_df[col_a], final_df[col_b])[0, 1]


# Correlation checks
print("\n[Correlation Checks - Physics Validation]")
corr_acid_ph = _pearson('fixed acidity', 'pH')
print(f"  Fixed acidity ‚Üî pH: {corr_acid_ph:.3f} (expected: negative)")

corr_alc_dens = _pearson('alcohol', 'density')
print(f"  Alcohol ‚Üî Density: {corr_alc_dens:.3f} (expected: negative)")

corr_sugar_dens = _pearson('residual sugar', 'density')
print(f"  Residual sugar ‚Üî Density: {corr_sugar_dens:.3f} (expected: positive)")

corr_so2 = _pearson('free sulfur dioxide', 'total sulfur dioxide')
print(f"  Free SO2 ‚Üî Total SO2: {corr_so2:.3f} (expected: positive)")

In [None]:
# Statistics
# Per-column summary from describe(), rounded to 4 decimals for display.
print("\nDataset Statistics:")
display(final_df.describe().round(4))

In [None]:
# Preview
# Show the first 10 rows of the assembled dataset.
print("\nPreview:")
display(final_df.head(10))

## Step 10: Save Output

In [None]:
# Written relative to the working directory, without the index column.
OUTPUT_PATH = "physics_informed_dataset.csv"
final_df.to_csv(OUTPUT_PATH, index=False)

print(f"‚úì Dataset saved to: {OUTPUT_PATH}")
print("\nüéâ SUCCESS! Physics-informed dataset ready for submission.")
# Size is read back from disk after the write and shown in KiB-sized units.
_size_kb = os.path.getsize(OUTPUT_PATH) / 1024
print(f"\nFile size: {_size_kb:.1f} KB")

---

## Summary

**Strategy:** Physics-Informed Hybrid Fusion

**Key Features:**
1. ✓ G3 as anchor (most connected generator)
2. ✓ Robust KNN with K=5 neighbors + weighted interpolation
3. ✓ Physics models (pH from acids, density from sugar/alcohol)
4. ✓ SO2 constraint enforcement
5. ✓ Physics-informed blending for overlapping columns

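**Validation (reported by Step 9, not enforced):**
- Correlation signs are printed next to the sign expected from wine chemistry
- SO2 violations are counted (expected: 0 after constraint enforcement)
- The share of pH values inside the valid range [2.8, 4.0] is reported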