In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import joblib




# --------------------------- Load dataset
# Use StringIO to simulate file loading
df = pd.read_csv("./vehicles_merged.csv")

# --------------------------- Numeric preprocessing
numeric_cols = ['Seating Capacity','EFF (km/l)/(km/kwh)','Ground Clearance (range)']
# Ensure numeric columns are actually numeric, coercing errors to NaN before filling
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# --------------------------- Feature definitions
# Note: The body type list is slightly simplified here to match existing data columns
body_types = [
    "sedan","hatchback","suv","mpv","pickup","coupe",
    "convertible","wagon","van","crossover","kei / microvan",
    "roadster","other","liftback","mpv / minivan"
]
road_cols = ['City/Urban', "Suburban/Normal", "Mid Off-Road", "Off-Road/Hilly Terrain"]
fuel_types = df['Fuel Type'].str.lower().unique().tolist()

# One-hot encoding (re-running this to ensure consistency for all body types)
for bt in body_types:
    df[f'Body_{bt}'] = df['Body Type'].apply(
        lambda x: 1 if bt.lower() in [t.strip().lower() for t in str(x).replace('/', ',').split(',')] else 0
    )
for f in fuel_types:
    df[f'Fuel_{f}'] = df['Fuel Type'].str.lower().apply(lambda x: 1 if x == f else 0)

feature_cols = [f'Body_{bt}' for bt in body_types] + numeric_cols + road_cols + [f'Fuel_{f}' for f in fuel_types]

road_mapping = {
    "town": "City/Urban", "urban": "City/Urban", "city": "City/Urban",
    "suburban": "Suburban/Normal", "mid off-road": "Mid Off-Road",
    "off-road": "Off-Road/Hilly Terrain", "hilly": "Off-Road/Hilly Terrain"
}

# --------------------------- User Input (Hardcoded for demonstration)
print("--- Using Sample Vehicle Preferences ---")
user_body = input("Enter body type: (hatchback, sedan, suv, etc.)").strip().lower() # User wants a hatchback
user_seating = int(input("Enter seating capacity (number of seats 2,4,5,7,etc.): ").strip())         # User wants 5 seats
user_road = input("Enter primary road type: (city, suburban, off-road, etc.)").strip().lower()       # User primarily drives in the city
user_traffic = input("Enter Usual traffic condition:(high,mixed,mid and low)").strip().lower()    # User experiences high traffic (prefers high efficiency)
user_fuel = input("Enter fuel type you prefer:(petrol,diesel,hybrid and electric)")     # User prefers petrol

print(f"Preferences: Body={user_body}, Seats={user_seating}, Road={user_road}, Traffic={user_traffic}, Fuel={user_fuel}\n")

# Map road input to standard road types
user_road_mapped = road_mapping.get(user_road, "City/Urban")

# --------------------------- Adaptive fuel preference ordering
if user_traffic == "high":
    preferred_fuels = ["electric","hybrid","petrol","diesel"]
elif user_traffic == "mixed":
    preferred_fuels = ["hybrid","petrol","electric","diesel"]
elif user_traffic == "mid":
    preferred_fuels = ["hybrid","petrol","diesel","electric"]
else:
    preferred_fuels = ["petrol","diesel","hybrid","electric"]

# Merge with user's fuel choice
user_fuel_list = [user_fuel] if user_fuel and user_fuel in fuel_types else []
combined_fuels = user_fuel_list + [f for f in preferred_fuels if f not in user_fuel_list]
user_fuel_list = list(dict.fromkeys(combined_fuels))

# --------------------------- Filtering by fuel & body type
filtered_df = df[df['Fuel Type'].str.lower().isin(user_fuel_list)].copy()
body_col_name = f'Body_{user_body}'

# Ensure the required body type column exists before filtering
if body_col_name not in filtered_df.columns:
     print(f"Error: Body type column {body_col_name} not found in features.")
     # Fallback: if body column not found, skip body filtering but proceed
else:
    filtered_df = filtered_df[filtered_df[body_col_name] == 1]

# Seating capacity ±1
filtered_df = filtered_df[filtered_df['Seating Capacity'].apply(lambda x: abs(x-user_seating) <= 1)]

if filtered_df.empty:
    print("No vehicles match your selected body type and fuel type (after applying strict +/- 1 seat filter).")
    # For demonstration, we won't exit, but in a real app, this should return an error.
    # We will simply skip the rest of the logic if filtered_df is empty.
else:
    # --------------------------- Data scaling
    X_filtered = filtered_df[feature_cols].fillna(0)
    scaler = StandardScaler()
    X_filtered[numeric_cols] = scaler.fit_transform(X_filtered[numeric_cols])

    # --------------------------- User vector construction
    body_vec = [1 if bt == user_body else 0 for bt in body_types]
    road_vec = [1 if rc == user_road_mapped else 0 for rc in road_cols]

    traffic_eff_map = {
        "high": df['EFF (km/l)/(km/kwh)'].quantile(0.45),
        "mixed": df['EFF (km/l)/(km/kwh)'].quantile(0.55),
        "mid": df['EFF (km/l)/(km/kwh)'].quantile(0.5),
        "low": df['EFF (km/l)/(km/kwh)'].quantile(0.7)
    }
    fuel_efficiency = traffic_eff_map.get(user_traffic, df['EFF (km/l)/(km/kwh)'].mean())
    ground_clearance = df['Ground Clearance (range)'].max() if user_road in ["off-road","mid off-road","hilly"] else df['Ground Clearance (range)'].mean()
    fuel_vec = [1 if f == user_fuel else 0 for f in fuel_types]

    # Unscaled user vector (order must match feature_cols structure)
    user_vector = body_vec + [user_seating] + road_vec + [fuel_efficiency, ground_clearance] + fuel_vec

    # Scale numeric features
    numeric_scaled_input = np.array([[user_seating, fuel_efficiency, ground_clearance]])
    numeric_scaled = scaler.transform(numeric_scaled_input)[0]

    # Determine indices for insertion back into user_vector
    # Feature_cols structure: [Body Types] + [Seating] + [Road] + [EFF, GC] + [Fuel]
    start_idx = len(body_types) # Index of 'Seating Capacity'
    eff_idx = start_idx + 1 + len(road_cols) # Index of 'EFF'
    gc_idx = eff_idx + 1 # Index of 'Ground Clearance'

    # Replace unscaled numeric values with scaled ones
    user_vector[start_idx] = numeric_scaled[0] # Seating
    user_vector[eff_idx] = numeric_scaled[1]   # EFF
    user_vector[gc_idx] = numeric_scaled[2]    # GC

    user_vector = np.nan_to_num(user_vector)

    # --------------------------- Feature weighting
    weights = []
    weights += [3.5] * len(body_types) # Body Type
    weights.append(3.5) # Seating Capacity
    weights += [3] * len(road_cols) # Road Type
    weights += [3.5, 3.5] # EFF, Ground Clearance
    weights += [15] * len(fuel_types) # Fuel Type (High weight ensures fuel preference is critical)
    weights = np.array(weights)

    X_weighted = X_filtered.values * weights
    user_vector_weighted = np.array(user_vector) * weights

    # --------------------------- Weighted K-NN Recommendation
    top_n = 100
    knn_weighted = NearestNeighbors(n_neighbors=min(top_n, len(X_filtered)), metric='euclidean')
    knn_weighted.fit(X_weighted)

    distances, indices = knn_weighted.kneighbors([user_vector_weighted])
    recommended_all = filtered_df.iloc[indices[0]].copy()
    recommended_all['Distance'] = distances[0]

    # --- Post-KNN Prioritization (Hybrid Sorting) ---
    # Used to break ties or refine ordering, heavily prioritizing fuel consistency.
    preferred_fuel_set = preferred_fuels # Use the adaptive order
    
    # Fuel Priority: index in the preferred list (lower index is higher priority)
    recommended_all['Fuel_Priority'] = recommended_all['Fuel Type'].str.lower().apply(
        lambda f: preferred_fuel_set.index(f) if f in preferred_fuel_set else len(preferred_fuel_set)
    )
    # Seat Priority: exact match is best (0)
    recommended_all['Seat_Priority'] = recommended_all['Seating Capacity'].apply(lambda s: 0 if s == user_seating else 1)

    # Final Sort: 1. Strict fuel order, 2. KNN distance, 3. Seat match
    recommended_all.sort_values(by=['Fuel_Priority','Distance','Seat_Priority'], inplace=True)
    recommended_all['NN_Distance'] = recommended_all['Distance'].round(3)

    # --------------------------- Display
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 200)

    print("\n--- Weighted Hybrid K-NN Recommendation Results ---")
    print(recommended_all[['Manufacturer','Model','Body Type','Seating Capacity','Fuel Type','EFF (km/l)/(km/kwh)','Ground Clearance (range)','NN_Distance']].head(40).to_string(index=False))

    # --------------------------- Summary of top 5
    print("\nTop 5 Recommended Vehicles Summary:\n")
    for i, row in recommended_all.head(5).iterrows():
        print(f"- {row['Manufacturer']} {row['Model']}")
        print(f"  → Body Type: {row['Body Type']} (Matched user preference: {user_body})")
        print(f"  → Seating Capacity: {row['Seating Capacity']} (Matched user preference: {user_seating})")
        print(f"  → Fuel Type: {row['Fuel Type']} (User preference order: {', '.join(preferred_fuels)})")
        print(f"  → Efficiency: {row['EFF (km/l)/(km/kwh)']} km/l or km/kwh")
        print(f"  → Ground Clearance: {row['Ground Clearance (range)']} mm")
        print(f"  → Similarity Score (Lower Distance is Better): {row['NN_Distance']}\n")

    
X_final=X_filtered[feature_cols].fillna(0).values

scaler_final = StandardScaler()
X_scaled=scaler_final.fit_transform(X_final[:,len(body_types):len(body_types)+len(numeric_cols)])
X_model=np.hstack((X_final[:,:len(body_types)],X_scaled,X_final[:,len(body_types)+len(numeric_cols):]))

knn_final = NearestNeighbors(n_neighbors=100, metric='euclidean')
knn_final.fit(X_model)

joblib.dump(scaler_final, 'scaler.joblib')
joblib.dump(knn_final, 'knn_model.joblib')
joblib.dump(feature_cols, 'feature_cols.joblib') # Save feature order
joblib.dump(df[['Manufacturer','Model','Fuel Type', 'Seating Capacity']].copy(), 'vehicle_metadata.joblib')
# You must also save the original dataframe indices to map results back!
joblib.dump(df.index.tolist(), 'original_indices.joblib')

print("Model, Scaler, and Metadata saved successfully.")



--- Using Sample Vehicle Preferences ---
Preferences: Body=hatchback, Seats=5, Road=city, Traffic=high, Fuel=petrol






--- Weighted Hybrid K-NN Recommendation Results ---
 Manufacturer                Model       Body Type  Seating Capacity Fuel Type  EFF (km/l)/(km/kwh)  Ground Clearance (range)  NN_Distance
       Toyota          pixis epoch kei / hatchback                 5    Petrol                30.00                       150        7.612
       Toyota             passo xl       hatchback                 5    Petrol                26.00                       150        7.630
       Toyota              passo x       hatchback                 5    Petrol                26.00                       150        7.630
       Toyota                passo       hatchback                 5    Petrol                26.00                       150        7.630
       Nissan        dayz ek wagon       hatchback                 5    Petrol                27.50                       155        7.634
       Nissan          dayz bolero       hatchback                 5    Petrol                27.50              