In [14]:
import numpy
import spacy

nlp = spacy.load("en_core_web_sm")

def merge_currency(doc):
    """
    Merge currency symbols like $ with adjacent numbers into a single token.
    """
    with doc.retokenize() as retokenizer:
        spans = []
        for i, token in enumerate(doc[:-1]):  # Avoid index out of bounds
            if token.text in ["$", "€", "£"] and doc[i + 1].like_num:
                span = doc[token.i : token.i + 2]  # Create span for "$ 20,000"
                spans.append(span)
        for span in spans:
            retokenizer.merge(span)
            
def extract_price_range_dynamic(user_input):
    doc = nlp(user_input)
    merge_currency(doc)

    price_range = {"MinPrice": None, "MaxPrice": None}

    def parse_number(text):
        try:
            text = text.lower().replace("k", "000").replace("$", "").replace(",", "").replace(" ", "").strip("?.!")
            return float(text)
        except ValueError:
            return None

    lower_bound = None
    upper_bound = None
    in_range = False
    single_number_encountered = False  # Flag for single numbers without context

    for token in doc:
        value = parse_number(token.text)

        # Handle range keywords like "between"
        if token.text.lower() == "between":
            in_range = True
            continue

        if value is not None:  # Numeric token
            # Check immediate head and ancestors
            head = token.head.text.lower()
            ancestors = [ancestor.text.lower() for ancestor in token.ancestors]
            children = [child.text.lower() for child in token.children]  # Include children for better context

            # Handle "more than", "less than" by looking at ancestors and children
            if ("more" in ancestors or "above" in ancestors or "over" in ancestors) and "than" in ancestors:
                lower_bound = value
            elif ("less" in ancestors or "below" in ancestors or "under" in ancestors) and "than" in ancestors:
                upper_bound = value

            # Handle explicit range
            elif in_range:
                if lower_bound is None:
                    lower_bound = value
                else:
                    upper_bound = value
                    in_range = False

            # Handle standalone comparison keywords
            elif head in ["under", "below", "less"]:
                upper_bound = value
            elif head in ["over", "above", "more"]:
                lower_bound = value

            # Handle ranges connected by "to", "and"
            elif token.text in ["to", "and"]:
                if lower_bound is None:
                    lower_bound = value
                else:
                    upper_bound = value

            # Default catch for exact values
            elif head in ["for", "within", "around"]:
                upper_bound = value

            # If no context is found, treat the number as an upper bound by default
            else:
                single_number_encountered = True
                upper_bound = value

        # Handle qualitative terms like "cheap" or "luxury"
        elif token.lemma_ in ["cheap", "affordable"]:
            upper_bound = 20000
        elif token.lemma_ in ["luxury", "expensive"]:
            lower_bound = 50000

    # Swap bounds if lower > upper
    if lower_bound and upper_bound and lower_bound > upper_bound:
        lower_bound, upper_bound = upper_bound, lower_bound

    # If a single number was encountered without context, assign it as an upper bound
    if single_number_encountered and lower_bound is None and upper_bound is None:
        upper_bound = value

    price_range["MinPrice"] = lower_bound
    price_range["MaxPrice"] = upper_bound
    return price_range


In [15]:
def extract_colors(user_input):
    # List of unique colors
    UNIQUE_COLORS = ['White', 'Gray', 'Black', 'Silver', 'Red', 'Blue', 
                     'Green', 'Gold', 'Copper', 'Brown', 'Orange', 'Tan', 
                     'Teal', 'Yellow', 'Maroon']

    # Define darker and lighter color groups
    DARK_COLORS = ['Black', 'Gray', 'Navy', 'Charcoal', 'Brown', 'Maroon', 'Dark Green']
    LIGHT_COLORS = ['White', 'Silver', 'Yellow', 'Beige', 'Light Gray', 'Light Blue', 'Pale Orange']

    # Synonyms for tone-related words
    TONE_MAP = {
        "dark": "dark",
        "light": "light",
        "bright": "light",
        "deep": "dark",
        "pale": "light",
    }

    # Synonyms for colors
    COLOR_SYNONYMS = {
        "grey": "Gray",
        "charcoal": "Black",
        "scarlet": "Red",
        "navy": "Blue",
        "emerald": "Green",
        "golden": "Gold",
        "bronze": "Copper",
        "chocolate": "Brown",
        "amber": "Orange",
        "beige": "Tan",
        "turquoise": "Teal",
        "lemon": "Yellow",
        "burgundy": "Maroon",
    }

    # Reverse mapping for quick lookup
    COLOR_MAP = {color.lower(): color for color in UNIQUE_COLORS}
    for synonym, base_color in COLOR_SYNONYMS.items():
        COLOR_MAP[synonym.lower()] = base_color

    doc = nlp(user_input)
    detected_colors = []
    match_dark = False
    match_light = False
    explicit_color_found = False

    for token in doc:
        token_text = token.text.lower()

        # Check for tone keywords ("dark", "light") with broader context
        if token_text in TONE_MAP:
            tone = TONE_MAP[token_text]
            if tone == "dark":
                match_dark = True
            elif tone == "light":
                match_light = True

        # Check if the token matches any color or synonym
        if token_text in COLOR_MAP:
            base_color = COLOR_MAP[token_text]
            detected_colors.append(base_color)
            explicit_color_found = True

        # Handle tone + color combination (e.g., "dark blue")
        if token_text in TONE_MAP:
            for child in token.children:
                child_text = child.text.lower()
                if child_text in COLOR_MAP:
                    base_color = COLOR_MAP[child_text]
                    detected_colors.append(base_color)
                    explicit_color_found = True

    # If no specific color is mentioned but tone is detected, use dark/light groups
    if not explicit_color_found:
        if match_dark:
            detected_colors.extend(DARK_COLORS)
        if match_light:
            detected_colors.extend(LIGHT_COLORS)

    # Remove duplicates and preserve order
    detected_colors = list(dict.fromkeys(detected_colors))

    return detected_colors


In [16]:
examples = [
    "I need a pickup truck with 4WD, leather seats, and under 50,000 km.",
    "Do you have a sporty car with a sunroof and under 50,000 km?",
    "Do you have a red convertible under 30,000 km?",
    "Do you have a family SUV with leather seats and low mileage?",
    "Do you have an electric car with a range of at least 300 miles and advanced safety features?",
    "I'm looking for an SUV with captain's chairs and a panoramic sunroof.",
    "Do you have a minivan with stow-and-go seating and rear-seat entertainment?",
    "I'm looking for a high-performance car with launch control and Brembo brakes.",
    "Do you have the car with VIN 1HGCM82633A123456?",
    "I'm looking for the car with VIN 5FNYF6H57LB789012.",
    "Do you have an SUV with off-road capabilities, a roof rack, and skid plates?",
    "Do you have an SUV with remote start, a heated steering wheel, and a power liftgate?",
    "I'm looking for an SUV with third-row seating.",
    "Do you have a hybrid SUV?",
    "I want something fun to drive.",
    "I currently drive a 2018 Toyota Camry. Do you have anything similar?",
    "I'm driving a 2018 Hyundai Tucson right now. Any similar SUVs in stock?",
    "I'm looking for a fun car to drive.",
    "I want an electric car.",
]

for example in examples:
    result = extract_price_range_dynamic(example)
    result2 = extract_colors(example)
    print(f"Input: {example}")
    print(f"Extracted Price Range: {result}")
    print(f"Extracted Color: {result2}")
    print("-" * 50)

Input: I need a pickup truck with 4WD, leather seats, and under 50,000 km.
Extracted Price Range: {'MinPrice': None, 'MaxPrice': 50000.0}
Extracted Color: []
--------------------------------------------------
Input: Do you have a sporty car with a sunroof and under 50,000 km?
Extracted Price Range: {'MinPrice': None, 'MaxPrice': 50000.0}
Extracted Color: []
--------------------------------------------------
Input: Do you have a red convertible under 30,000 km?
Extracted Price Range: {'MinPrice': None, 'MaxPrice': 30000.0}
Extracted Color: ['Red']
--------------------------------------------------
Input: Do you have a family SUV with leather seats and low mileage?
Extracted Price Range: {'MinPrice': None, 'MaxPrice': None}
Extracted Color: []
--------------------------------------------------
Input: Do you have an electric car with a range of at least 300 miles and advanced safety features?
Extracted Price Range: {'MinPrice': None, 'MaxPrice': 300.0}
Extracted Color: []
----------------

In [17]:
# Define the categories
PRICE_CATEGORIES = {
    "$0-$10K": (0, 10000),
    "$10K-$20K": (10000, 20000),
    "$20K-$30K": (20000, 30000),
    "$30K-$50K": (30000, 50000),
    "$50K+": (50000, float("inf"))
}

def map_price_to_category(price_range):
    """
    Maps the extracted price range to predefined price categories.

    Args:
        price_range (dict): The extracted price range with 'MinPrice' and 'MaxPrice'.

    Returns:
        str: The corresponding price category or None if no match is found.
    """
    min_price = price_range["MinPrice"]
    max_price = price_range["MaxPrice"]

    # If there's only a max price, find the category it falls into
    if max_price is not None and min_price is None:
        for category, (low, high) in PRICE_CATEGORIES.items():
            if low <= max_price <= high:
                return category

    # If there's only a min price, find the category it falls into
    if min_price is not None and max_price is None:
        for category, (low, high) in PRICE_CATEGORIES.items():
            if low <= min_price < high:
                return category

    # If both bounds are defined, find a category that fits the range
    if min_price is not None and max_price is not None:
        for category, (low, high) in PRICE_CATEGORIES.items():
            if low >= min_price and high <= max_price:
                return category

        # Special case: if the range spans multiple categories, return the closest upper bound
        for category, (low, high) in PRICE_CATEGORIES.items():
            if low <= min_price <= high or low <= max_price <= high:
                return category

    # If no match is found
    return None


In [18]:
for example in examples:
    extracted_price_range = extract_price_range_dynamic(example)  # Use your existing price extraction function
    print(f"Input: {example}")
    print(f"Extracted Price Range: {extracted_price_range}")
    print("--------------------------------------------------")

Input: I need a pickup truck with 4WD, leather seats, and under 50,000 km.
Extracted Price Range: {'MinPrice': None, 'MaxPrice': 50000.0}
--------------------------------------------------
Input: Do you have a sporty car with a sunroof and under 50,000 km?
Extracted Price Range: {'MinPrice': None, 'MaxPrice': 50000.0}
--------------------------------------------------
Input: Do you have a red convertible under 30,000 km?
Extracted Price Range: {'MinPrice': None, 'MaxPrice': 30000.0}
--------------------------------------------------
Input: Do you have a family SUV with leather seats and low mileage?
Extracted Price Range: {'MinPrice': None, 'MaxPrice': None}
--------------------------------------------------
Input: Do you have an electric car with a range of at least 300 miles and advanced safety features?
Extracted Price Range: {'MinPrice': None, 'MaxPrice': 300.0}
--------------------------------------------------
Input: I'm looking for an SUV with captain's chairs and a panoramic su

In [19]:
parsed_criteria = {
    'MinPrice': 20000,               # Extracted minimum price (if provided by NLP)
    'MaxPrice': 50000,               # Extracted maximum price (if provided by NLP)
    'Body': 'Sport Utility',         # Extracted "SUV"
    'Ext_Color_Generic': 'Red'       # Extracted "Red"
}

In [20]:
import pandas as pd

# --- Step 1: Load the Data ---
df = pd.read_csv('vehicles.csv')

# --- Step 2: Eliminate Unnecessary Columns ---
columns_to_drop = ['MRSP', 'Int_Uphalstery', 'Internet_Price']
df = df.drop(columns=columns_to_drop, errors='ignore')

# --- Step 3: Create Price_Range ---
price_bins = [0, 10000, 20000, 30000, 50000, float('inf')]
price_labels = ['$0-$10K', '$10K-$20K', '$20K-$30K', '$30K-$50K', '$50K+']
df['Price_Range'] = pd.cut(df['SellingPrice'], bins=price_bins, labels=price_labels)

# --- Step 4: Create Miles_Range ---
miles_bins = [0, 10000, 30000, 60000, 100000, float('inf')]
miles_labels = ['0-10K', '10K-30K', '30K-60K', '60K-100K', '100K+']
df['Miles_Range'] = pd.cut(df['Miles'], bins=miles_bins, labels=miles_labels)

# --- Step 5: Drop Rows with Missing Ranges ---
df = df.dropna(subset=['Price_Range', 'Miles_Range'])

# --- Step 6: Define the get_combined_ranges Function ---
def get_combined_ranges(user_min, user_max, bins, labels):
    combined_ranges = []
    for i in range(len(bins) - 1):
        bin_min = bins[i]
        bin_max = bins[i + 1]
        if bin_min >= user_min and bin_max <= user_max:
            combined_ranges.append(labels[i])
    return combined_ranges

# --- Step 7: Define the Ranking Algorithm ---
HIERARCHY_WEIGHTS = {
    'Price_Range': 5,
    'Miles_Range': 4,
    'Body': 4,
    'Fuel_Type': 3,
    'Drivetrain': 3,
    'Make': 3,
    'Year': 3,
    'Transmission': 2,
    'PassengerCapacity': 2,
    'Style_Description': 2,
    'Ext_Color_Generic': 2,
    'InteriorColor': 1
}

def calculate_vehicle_score(vehicle, user_criteria):
    score = 0
    for key, value in user_criteria.items():
        if pd.notna(vehicle.get(key)) and vehicle.get(key) == value:
            score += HIERARCHY_WEIGHTS.get(key, 0)
    return score

def rank_vehicles(df, user_criteria, top_n=10):
    if df.empty:
        return "No matches found. Please update your search criteria."
    df['Score'] = df.apply(lambda row: calculate_vehicle_score(row, user_criteria), axis=1)
    ranked_vehicles = df.sort_values(by='Score', ascending=False)
    return ranked_vehicles.head(top_n)
def filter_vehicles(parsed_criteria):
    """
    Filter vehicles based on parsed NLP criteria.

    Parameters:
        parsed_criteria (dict): User's criteria extracted from NLP input.

    Returns:
        pd.DataFrame or str: Filtered vehicles DataFrame or a message if no matches are found.
    """
    filtered_df = df.copy()

    # Handle price range dynamically
   # min_price = parsed_criteria.get('MinPrice')
   # max_price = parsed_criteria.get('MaxPrice')

    #if min_price is not None or max_price is not None:
        # Apply SellingPrice filters
     #   if min_price is not None:
      #      filtered_df = filtered_df[filtered_df['SellingPrice'] >= min_price]
       # if max_price is not None:
        #    filtered_df = filtered_df[filtered_df['SellingPrice'] <= max_price]

    # Apply other filters based on parsed criteria
    for key, value in parsed_criteria.items():
        print(len(filtered_df))
        #if key not in ['MinPrice', 'MaxPrice']:  # Skip MinPrice and MaxPrice
        if key in filtered_df.columns:
            filtered_df = filtered_df[filtered_df[key] == value]

    return filtered_df


# --- Example NLP Input ---
parsed_criteria = {
    'Price_Range': '$20K-$30K',
    'Ext_Color_Generic': 'Red',
    'Miles_Range': '10K-30K',

}

# Apply filtering
filtered_df = filter_vehicles(parsed_criteria)

# Rank and display results or handle no matches
if isinstance(filtered_df, str) or filtered_df.empty:
    print("No matches found. Please update your search criteria.")
else:
    result = rank_vehicles(filtered_df, parsed_criteria, top_n=10)
    if isinstance(result, str):
        print(result)
    else:
        print(result[['Year', 'Make', 'Model', 'SellingPrice', 'Miles', 'Body', 'Ext_Color_Generic', 'Price_Range', 'Miles_Range', 'Score']])


1453
467
46
      Year           Make     Model  SellingPrice  Miles           Body  \
513   2019  Mercedes-Benz   E-Class         22997  24656        4dr Car   
800   2023           Audi        Q3         25498  24116  Sport Utility   
890   2021     Alfa Romeo    Giulia         24498  23480        4dr Car   
972   2023         Toyota     Camry         20998  24243        4dr Car   
1005  2021           Jeep  Wrangler         29998  22147    Convertible   
1178  2022       INFINITI       Q50         23998  20948        4dr Car   
1428  2022  Mercedes-Benz       GLC         29998  17892  Sport Utility   
1585  2023         Nissan    Sentra         20498  12675        4dr Car   
2014  2023         Nissan    Altima         21498  10857        4dr Car   
2015  2023         Nissan    Altima         20998  20896        4dr Car   

     Ext_Color_Generic Price_Range Miles_Range  Score  
513                Red   $20K-$30K     10K-30K     11  
800                Red   $20K-$30K     10K-30K    

In [21]:
examples = [
    "I'm looking for a car under $40,000.",
    "Do you have anything between $20,000 and $50,000?",
    "I want an affordable car.",
    "Show me luxury cars over $70,000.",
    "Give me a cheap vehicle under $15k.",
    "I need a car below $25,000 but above $10,000.",
    "I’m searching for something costing more than 50k.",
    "What can I get for less than $30,000?",
    "Can I get something for $40,000?",
    "Is $20k to $50k within my range?",
    "I am looking for a car that costs less than 50000$?",
    "My budget is 20,000$",
    "I prefer below 15,000",
    "Show me cars costing between $25,000 and $30,000.",
    "Cars over $50k and below $100k are my preference.",
]
for example in examples:
    result = extract_price_range_dynamic(example)
    print(f"Input: {example}")
    print(f"Extracted Price Range: {result}")
    print("-" * 50)

Input: I'm looking for a car under $40,000.
Extracted Price Range: {'MinPrice': None, 'MaxPrice': 40000.0}
--------------------------------------------------
Input: Do you have anything between $20,000 and $50,000?
Extracted Price Range: {'MinPrice': 20000.0, 'MaxPrice': 50000.0}
--------------------------------------------------
Input: I want an affordable car.
Extracted Price Range: {'MinPrice': None, 'MaxPrice': 20000}
--------------------------------------------------
Input: Show me luxury cars over $70,000.
Extracted Price Range: {'MinPrice': 70000.0, 'MaxPrice': None}
--------------------------------------------------
Input: Give me a cheap vehicle under $15k.
Extracted Price Range: {'MinPrice': None, 'MaxPrice': 15000.0}
--------------------------------------------------
Input: I need a car below $25,000 but above $10,000.
Extracted Price Range: {'MinPrice': 10000.0, 'MaxPrice': 25000.0}
--------------------------------------------------
Input: I’m searching for something costin

In [22]:
def transform_nlp_output_to_criteria(nlp_output, price_bins, price_labels):
    """
    Transforms NLP output into the required format for parsed_criteria.

    Parameters:
        nlp_output (dict): Extracted NLP features including MinPrice, MaxPrice, BodyType, etc.
        price_bins (list): Bins for price ranges.
        price_labels (list): Labels for price ranges.

    Returns:
        dict: Formatted parsed_criteria.
    """
    parsed_criteria = {}

    # Handle PriceRange
    min_price = nlp_output.get('MinPrice')
    max_price = nlp_output.get('MaxPrice')

    if min_price is not None or max_price is not None:
        # Assign numerical range for price
        parsed_criteria['MinPrice'] = min_price
        parsed_criteria['MaxPrice'] = max_price

        # Map numerical price range to price category
        if min_price is not None and max_price is not None:
            combined_ranges = [
                price_labels[i]
                for i in range(len(price_bins) - 1)
                if price_bins[i] >= min_price and price_bins[i + 1] <= max_price
            ]
            parsed_criteria['Price_Range'] = combined_ranges
        elif max_price is not None:
            for i in range(len(price_bins) - 1):
                if price_bins[i] <= max_price < price_bins[i + 1]:
                    parsed_criteria['Price_Range'] = price_labels[i]
        elif min_price is not None:
            for i in range(len(price_bins) - 1):
                if price_bins[i] <= min_price < price_bins[i + 1]:
                    parsed_criteria['Price_Range'] = price_labels[i]

    # Handle BodyType
    if 'BodyType' in nlp_output and nlp_output['BodyType']:
        parsed_criteria['Body'] = nlp_output['BodyType']

    # Handle Color
    if 'Color' in nlp_output and nlp_output['Color']:
        parsed_criteria['Ext_Color_Generic'] = nlp_output['Color']

    # Handle other extracted features directly
    additional_keys = ['Miles', 'Fuel_Type', 'Drivetrain', 'PassengerCapacity']
    for key in additional_keys:
        if key in nlp_output:
            parsed_criteria[key] = nlp_output[key]

    return parsed_criteria


In [23]:

# --- Example NLP Input ---
parsed_criteria = {
    'MinPrice': 20000,               # Extracted minimum price (if provided by NLP)
    'MaxPrice': 50000,               # Extracted maximum price (if provided by NLP)

}

# Apply filtering
filtered_df = filter_vehicles(parsed_criteria)

# Rank and display results or handle no matches
if isinstance(filtered_df, str) or filtered_df.empty:
    print("No matches found. Please update your search criteria.")
else:
    result = rank_vehicles(filtered_df, parsed_criteria, top_n=10)
    if isinstance(result, str):
        print(result)
    else:
        print(result[['Year', 'Make', 'Model', 'SellingPrice', 'Miles', 'Body', 'Ext_Color_Generic', 'Price_Range', 'Miles_Range', 'Score']])


1453
1453
      Year     Make                  Model  SellingPrice  Miles  \
0     2019    Mazda                   CX-9         18498  60081   
1388  2021    Honda           Accord Sedan         18498  78728   
1386  2024      BMW                     i4         39998   4689   
1385  2019  Hyundai                 Sonata         12998  44487   
1384  2022    Honda                   HR-V         16498  58796   
1382  2024   Nissan                 Armada         48998  13415   
1381  2021   Jaguar                 F-PACE         28998  48094   
1380  2022   Jaguar                 E-PACE         28698  29052   
1378  2020   Nissan                  Rogue         15998  47910   
1377  2015     MINI  Cooper Hardtop 4 Door         12497  75421   

               Body Ext_Color_Generic Price_Range Miles_Range  Score  
0     Sport Utility             White   $10K-$20K    60K-100K      0  
1388        4dr Car              Gray   $10K-$20K    60K-100K      0  
1386      Hatchback             Black  

In [24]:
import pandas as pd

# --- Step 1: Load the Data ---
df = pd.read_csv('vehicles.csv')

# --- Step 2: Eliminate Unnecessary Columns ---
columns_to_drop = ['MRSP', 'Int_Uphalstery', 'Internet_Price']
df = df.drop(columns=columns_to_drop, errors='ignore')

# --- Step 3: Create Price_Range ---
price_bins = [0, 10000, 20000, 30000, 50000, float('inf')]
price_labels = ['$0-$10K', '$10K-$20K', '$20K-$30K', '$30K-$50K', '$50K+']
df['Price_Range'] = pd.cut(df['SellingPrice'], bins=price_bins, labels=price_labels)

# --- Step 4: Create Miles_Range ---
miles_bins = [0, 10000, 30000, 60000, 100000, float('inf')]
miles_labels = ['0-10K', '10K-30K', '30K-60K', '60K-100K', '100K+']
df['Miles_Range'] = pd.cut(df['Miles'], bins=miles_bins, labels=miles_labels)

# --- Step 5: Drop Rows with Missing Ranges ---
df = df.dropna(subset=['Price_Range', 'Miles_Range'])

# --- Step 6: Define the get_combined_ranges Function ---
def get_combined_ranges(user_min, user_max, bins, labels):
    combined_ranges = []
    for i in range(len(bins) - 1):
        bin_min = bins[i]
        bin_max = bins[i + 1]
        if bin_min >= user_min and bin_max <= user_max:
            combined_ranges.append(labels[i])
    return combined_ranges

# --- Step 7: Define the Ranking Algorithm ---
HIERARCHY_WEIGHTS = {
    'Price_Range': 5,
    'Miles_Range': 4,
    'Body': 4,
    'Fuel_Type': 3,
    'Drivetrain': 3,
    'Make': 3,
    'Year': 3,
    'Transmission': 2,
    'PassengerCapacity': 2,
    'Style_Description': 2,
    'Ext_Color_Generic': 2,
    'InteriorColor': 1
}

def calculate_vehicle_score(vehicle, user_criteria):
    score = 0
    for key, value in user_criteria.items():
        if vehicle.get(key) != value:
            #print("HELLO")
            continue
        if pd.notna(vehicle.get(key)) and vehicle.get(key) == value:
            score += HIERARCHY_WEIGHTS.get(key, 0)
    return score

def rank_vehicles(df, user_criteria, top_n=10):
    if df.empty:
        return "No matches found. Please update your search criteria."
    df['Score'] = df.apply(lambda row: calculate_vehicle_score(row, user_criteria), axis=1)
    ranked_vehicles = df.sort_values(by='Score', ascending=False)
    return ranked_vehicles.head(top_n)

def filter_vehicles(parsed_criteria):
    """
    Filter vehicles based on parsed NLP criteria.

    Parameters:
        parsed_criteria (dict): User's criteria extracted from NLP input.

    Returns:
        pd.DataFrame or str: Filtered vehicles DataFrame or a message if no matches are found.
    """
    filtered_df = df.copy()

    # Handle price range dynamically
    min_price = parsed_criteria.get('MinPrice')
    max_price = parsed_criteria.get('MaxPrice')

    if min_price is not None or max_price is not None:
        # Apply SellingPrice filters
        if min_price is not None:
            filtered_df = filtered_df[filtered_df['SellingPrice'] >= min_price]
        if max_price is not None:
            filtered_df = filtered_df[filtered_df['SellingPrice'] <= max_price]

    # Apply other filters based on parsed criteria
    for key, value in parsed_criteria.items():
        if key not in ['MinPrice', 'MaxPrice']:  # Skip MinPrice and MaxPrice
            if key in filtered_df.columns:
                filtered_df = filtered_df[filtered_df[key] == value]

    return filtered_df


# --- Example NLP Input ---
parsed_criteria = {
    'MinPrice': 0,               # Extracted minimum price (if provided by NLP)
    'MaxPrice': 20000,               # Extracted maximum price (if provided by NLP)
    'Body': 'Sport Utility',         # Extracted "SUV"
    'Ext_Color_Generic': 'Red'       # Extracted "Red"
}
# Apply filtering
filtered_df = filter_vehicles(parsed_criteria)
print(f"Length of results {len(filtered_df)}")
print(filtered_df)
# Rank and display results or handle no matches
if isinstance(filtered_df, str) or filtered_df.empty:
    print("No matches found. Please update your search criteria.")
else:
    result = rank_vehicles(filtered_df, parsed_criteria, top_n=10)
    if isinstance(result, str):
        print(result)
    else:
        print(result[['Year', 'Make', 'Model', 'SellingPrice', 'Miles', 'Body', 'Ext_Color_Generic', 'Price_Range', 'Miles_Range', 'Score']])

Length of results 21
      Type    Stock                VIN  Year           Make            Model  \
42    Used  X730820  X4PPH8TXJ0XJMP9WC  2015       INFINITI             QX50   
133   Used  I128501  WMZ9NU4VY6YVAC9NU  2018       INFINITI             QX30   
377   Used  U566964  GNCY1VALLUNNTY351  2019     Mitsubishi    Eclipse Cross   
387   Used  W969043  TYN0GFU8YG6PXRTN2  2021          Mazda             CX-3   
599   Used  C316063  PK904D3HSFGZVUV78  2017         Nissan            Rogue   
651   Used  B203674  VWN4CGYZ1HSRWJESM  2018            GMC          Terrain   
725   Used  B060315  LJ7KWSBKYFNPB50JE  2023     Mitsubishi  Outlander Sport   
766   Used  I275593  WPKHD37BA8D63VH8S  2022            Kia         Sportage   
894   Used  N631673  M83Y87W4GSJ6YL8T9  2017           Jeep          Patriot   
1159  Used  W254551  MFMEL7YP5PCRMJNW2  2019     Mitsubishi  Outlander Sport   
1169  Used  B129878  ESM4LCKSHUTJ7T21E  2017     Mitsubishi  Outlander Sport   
1226  Used  L235809

In [25]:
from transformers import pipeline

# Load a zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def parse_query_to_criteria(prompt):
    """
    Parse user query into structured criteria using Hugging Face Transformers.
    """
    # Define potential labels (categories)
    labels = ["Price Range", "Color", "Mileage Range"]

    # Classify the prompt
    result = classifier(prompt, labels, multi_label=True)

    # Map results to a dictionary
    parsed_criteria = {}
    for i, label in enumerate(result["labels"]):
        if result["scores"][i] > 0.5:  # Threshold for relevance
            parsed_criteria[label] = label  # Here, you can refine extraction logic
        else:
            parsed_criteria[label] = None

    return parsed_criteria


# Example usage
user_query = "Do you have any red cars under $30,000 with mileage between 10-20K?"
parsed_criteria = parse_query_to_criteria(user_query)
print("Parsed Criteria:", parsed_criteria)


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/facebook/bart-large-mnli/cfbb687dbbd9df99fe865e1860350a22aebac4d26ee4bcb50217f1df606a018e?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1732661131&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMjY2MTEzMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9mYWNlYm9vay9iYXJ0LWxhcmdlLW1ubGkvY2ZiYjY4N2RiYmQ5ZGY5OWZlODY1ZTE4NjAzNTBhMjJhZWJhYzRkMjZlZTRiY2I1MDIxN2YxZGY2MDZhMDE4ZT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=cOT4FjBJfA--TiawBHJDp80vh6dm0st8H6PBBZl2M8yfgVr1JwhliIdroO7BPcHCZBqL3v6D5XOI51sxzD1OKj5vnqmN8-4ABvmlNkwhOZ8Tnqs21-jun3EWv6g%7E04eb9a35kjwU0n4HjxPDJuxyQxYBfs0r%7EBM6QtjDuwC%7E--Aa2N1x5nJXCHFw9zZFJIiYtTiZ76i7n2M1pTNcdwRTmhFVyJfgVQeRoRWYQ61gVfCRVE9t904Ant2ED-A7kDMYTefy8QOp5sysCzf0sP3W8PVggEcfCShSYgf%7EEGTanX16T5ZzqbJVL13nGo0k%7EwRRg62L%7EBT35%7EpR8rJg4g6Djg__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSC

model.safetensors:  53%|#####3    | 870M/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


RuntimeError: Numpy is not available

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the model
model_name = "chavinlo/alpaca-native"
tokenizer = AutoTokenizer.from_pretrained(model_name,use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name)

print("Model loaded successfully!")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/khaledabusalma/hackathon/CodeJam14KKSB/venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/khaledabusalma/hackathon/CodeJam14KKSB/venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/khaledabusalma/hackathon/CodeJam14KKSB/venv/lib/python3.12/site-packages/ipyk

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/9.88G [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/repos/e5/93/e593fa89fead22d845aaf9265b67a72b4fa7641684f6cbfb35f92c098cc2a20d/c3f2585b8a77a6b5a2b402f7fb898b072e96b0cb3a8cbe935b71704f4a057a81?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model-00001-of-00003.bin%3B+filename%3D%22pytorch_model-00001-of-00003.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1732658105&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMjY1ODEwNX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy9lNS85My9lNTkzZmE4OWZlYWQyMmQ4NDVhYWY5MjY1YjY3YTcyYjRmYTc2NDE2ODRmNmNiZmIzNWY5MmMwOThjYzJhMjBkL2MzZjI1ODViOGE3N2E2YjVhMmI0MDJmN2ZiODk4YjA3MmU5NmIwY2IzYThjYmU5MzViNzE3MDRmNGEwNTdhODE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=QyQKj0h5uEZCPMZreW2Zb5USAqd1Rh5ZSL9amM37YIJSwkesBN5JNUUGZ9RIlo5Ab9x7SL6nYmmovDYnA8rvvHMK9UUjTQSV%7EizKErc8lyJkhY7VXIr54s5j3zoPvALdKyLpB6byfkKOLco7mO4u9mnwpU

pytorch_model-00001-of-00003.bin:  18%|#7        | 1.74G/9.88G [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/repos/e5/93/e593fa89fead22d845aaf9265b67a72b4fa7641684f6cbfb35f92c098cc2a20d/c3f2585b8a77a6b5a2b402f7fb898b072e96b0cb3a8cbe935b71704f4a057a81?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model-00001-of-00003.bin%3B+filename%3D%22pytorch_model-00001-of-00003.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1732658105&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMjY1ODEwNX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy9lNS85My9lNTkzZmE4OWZlYWQyMmQ4NDVhYWY5MjY1YjY3YTcyYjRmYTc2NDE2ODRmNmNiZmIzNWY5MmMwOThjYzJhMjBkL2MzZjI1ODViOGE3N2E2YjVhMmI0MDJmN2ZiODk4YjA3MmU5NmIwY2IzYThjYmU5MzViNzE3MDRmNGEwNTdhODE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=QyQKj0h5uEZCPMZreW2Zb5USAqd1Rh5ZSL9amM37YIJSwkesBN5JNUUGZ9RIlo5Ab9x7SL6nYmmovDYnA8rvvHMK9UUjTQSV%7EizKErc8lyJkhY7VXIr54s5j3zoPvALdKyLpB6byfkKOLco7mO4u9mnwpU

pytorch_model-00001-of-00003.bin:  18%|#8        | 1.78G/9.88G [00:00<?, ?B/s]

KeyboardInterrupt: 