In [6]:
# Airfoil Similarity Search with Dask

import dask.dataframe as dd
import numpy as np
import pandas as pd
import json
import os

# ----------- Configuration -----------

# Parquet file holding the airfoil dataset; point this elsewhere if needed.
DATA_PATH = './airfoil_data.parquet'

# ----------- Sample Payload -----------

# Desired value for each feature that takes part in the search.
targets = dict(
    cl=0.8,
    cd=0.012,
    cm=-0.05,
    reynolds_number=1_000_000,
    angle_of_attack=5,
    cl_cd_ratio=66.7,
)

# Relative importance of each feature; the six weights add up to 1
# (up to floating-point rounding).
weights = dict(
    cl=0.1923076923076923,
    cd=0.1923076923076923,
    cm=0.11538461538461538,
    reynolds_number=0.15384615384615385,
    angle_of_attack=0.17307692307692307,
    cl_cd_ratio=0.17307692307692307,
)

# ----------- Similarity Function -----------

# Columns that are copied verbatim into every match record.
_REPORTED_COLUMNS = ("reynolds_number", "angle_of_attack", "cl", "cd", "cm", "cl_cd_ratio")


def _score_partition(partition, features, target_values, weight_values, limit):
    """Score one pandas partition and keep its best ``limit`` rows (all rows when ``limit`` is None)."""
    scores = np.zeros(len(partition), dtype=float)
    for name, target, weight in zip(features, target_values, weight_values):
        # Unparseable entries become NaN so only the affected rows are discarded below.
        column = pd.to_numeric(partition[name], errors="coerce")
        values = column.to_numpy(dtype=float, na_value=np.nan)
        scores += (weight * (values - target)) ** 2

    scored = partition.assign(similarity_score=scores)
    # A NaN score cannot be ranked (NaN compares false to everything), so drop those rows.
    scored = scored[~np.isnan(scores)]
    # mergesort is stable: tied rows keep their original relative order.
    scored = scored.sort_values("similarity_score", kind="mergesort")
    return scored if limit is None else scored.head(limit)


def _build_match(row, features, targets):
    """Convert one scored row (a pandas Series) into the result dictionary."""
    raw_geometry = row.get("geometry")
    geometry = []
    if isinstance(raw_geometry, str):
        try:
            geometry = json.loads(raw_geometry)
        except ValueError as exc:  # json.JSONDecodeError is a ValueError
            # Keep the match; only its geometry is unusable.
            print(f"Error decoding geometry for {row.get('airfoil_name', '')!r}: {exc}")

    return {
        "airfoil_name": row.get("airfoil_name", ""),
        "airfoil_file": row.get("airfoil_file", ""),
        "reynolds_number": row["reynolds_number"],
        "angle_of_attack": row["angle_of_attack"],
        "cl": row["cl"],
        "cd": row["cd"],
        "cm": row["cm"],
        "cl_cd_ratio": row["cl_cd_ratio"],
        "geometry": geometry,
        "airfoil_id": row.get("airfoil_id", ""),
        "similarity_score": float(row["similarity_score"]),
        "feature_scores": {
            f: pd.to_numeric(row[f], errors="coerce") - targets[f] for f in features
        },
    }


def find_best_airfoils_dask(targets, weights, top_k=3, data=None):
    """Return the rows that lie closest to ``targets``, best match first.

    The score of a row is ``sum((weights[f] * (row[f] - targets[f])) ** 2)``
    over every feature ``f`` in ``targets``; lower is better. Differences are
    used as-is (no per-feature normalisation).
    NOTE(review): with the sample payload, ``reynolds_number`` deltas are many
    orders of magnitude larger than the others and will likely dominate the
    score -- confirm this is intended.

    Scoring is vectorised per partition and each partition only hands back its
    best ``top_k`` rows, so result dictionaries (and geometry JSON decoding)
    are produced for the winners only rather than for the whole dataset.

    Parameters
    ----------
    targets : dict
        Feature name -> desired value. Its key order defines the feature order.
    weights : dict
        Feature name -> weight. Must contain every key of ``targets``.
    top_k : int or None, default 3
        Applied as a slice ``[:top_k]`` to the ranked matches.
    data : pandas.DataFrame or Dask DataFrame, optional
        Frame to search. When omitted, ``DATA_PATH`` is read with
        ``dd.read_parquet``.

    Returns
    -------
    list of dict
        One dictionary per match with the airfoil columns, ``similarity_score``
        and ``feature_scores`` (signed ``row[f] - targets[f]`` per feature).
        Rows whose score is NaN (missing or non-numeric feature value) are
        excluded. An empty list is returned, after printing one message, when
        a required column is absent.

    Raises
    ------
    KeyError
        If ``weights`` lacks a feature present in ``targets``.
    """
    features = list(targets.keys())
    target_values = [targets[f] for f in features]
    weight_values = [weights[f] for f in features]

    if data is None:
        data = dd.read_parquet(DATA_PATH)

    required = set(features) | set(_REPORTED_COLUMNS)
    missing = sorted(name for name in required if name not in data.columns)
    if missing:
        # Every row would fail in the same way; report it once instead of per row.
        print(f"Error: dataset is missing required columns: {missing}")
        return []

    # Only a non-negative top_k can be applied per partition; None or negative
    # values need the complete ranking before the final slice.
    limit = top_k if (top_k is not None and top_k >= 0) else None

    def best_of(partition):
        return _score_partition(partition, features, target_values, weight_values, limit)

    if isinstance(data, pd.DataFrame):
        candidates = best_of(data)
    else:
        # best_of has no side effects and accepts dummy rows, so Dask can
        # infer the output schema by itself.
        candidates = data.map_partitions(best_of).compute()

    ranked = candidates.sort_values("similarity_score", kind="mergesort").iloc[:top_k]
    return [_build_match(row, features, targets) for _, row in ranked.iterrows()]

# ----------- Run Similarity Search -----------

# Search with the sample payload and keep the three closest airfoils.
top_matches = find_best_airfoils_dask(targets, weights, top_k=3)

# ----------- Display Results -----------

# One report per match, numbered from 1.
for i, match in enumerate(top_matches, start=1):
    name = match['airfoil_name']
    score = match['similarity_score']
    deltas = match['feature_scores']

    print(f"\nMatch #{i}")
    print(f"Airfoil: {name}")
    print(f"Similarity Score: {score:.6e}")
    print("Feature Deltas:")
    for feature in deltas:
        print(f"  {feature}: {deltas[feature]}")


ImportError: An error occurred while calling the read_parquet method registered to the pandas backend.
Original Message: pyarrow>=10.0.1 is required for PyArrow backed StringArray.

In [5]:
!pip install pyarrow


Collecting pyarrow
  Downloading pyarrow-20.0.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Downloading pyarrow-20.0.0-cp311-cp311-win_amd64.whl (25.8 MB)
   ---------------------------------------- 0.0/25.8 MB ? eta -:--:--
   - -------------------------------------- 0.8/25.8 MB 4.8 MB/s eta 0:00:06
   -- ------------------------------------- 1.6/25.8 MB 4.0 MB/s eta 0:00:07
   --- ------------------------------------ 2.1/25.8 MB 3.9 MB/s eta 0:00:07
   ---- ----------------------------------- 3.1/25.8 MB 3.9 MB/s eta 0:00:06
   ------ --------------------------------- 3.9/25.8 MB 4.0 MB/s eta 0:00:06
   ------ --------------------------------- 4.5/25.8 MB 3.8 MB/s eta 0:00:06
   -------- ------------------------------- 5.2/25.8 MB 3.7 MB/s eta 0:00:06
   --------- ------------------------------ 6.3/25.8 MB 3.7 MB/s eta 0:00:06
   ---------- ----------------------------- 7.1/25.8 MB 3.8 MB/s eta 0:00:05
   ------------ --------------------------- 7.9/25.8 MB 3.8 MB/s eta 0:00:05
   --