In [11]:
import pandas as pd

In [12]:
# read an excel and find all the sheet names
excel_file = pd.ExcelFile(r'..\database\raw_data\HYUNDAI NOV25.xlsx')


In [13]:
all_sheets_names = excel_file.sheet_names

In [55]:
# model_df_dict = pd.read_excel(r'..\database\raw_data\HYUNDAI NOV25.xlsx')


# Read all sheets into a dictionary
model_df_dict = pd.read_excel(r'..\database\raw_data\HYUNDAI NOV25.xlsx', sheet_name=None)

# Combine sheets and add sheet_name column
combined_df = pd.concat(
    [df.assign(sheet_name=sheet_name) for sheet_name, df in model_df_dict.items()],
    ignore_index=True
)



In [57]:
combined_df[["Model", "Description ", "Description 2", "sheet_name"]]

Unnamed: 0,Model,Description,Description 2,sheet_name
0,ELCS4V2BES00\t,Elantra,Essential,2026
1,ELCS4V2BPR00\t,Elantra,Preferred,2026
2,ELCS4V2BPRTE,Elantra,with Tech Pkg,2026
3,ELCS4V2BLU00\t,Elantra,Luxury,2026
4,ELCS4V2BLUME,Elantra,Luxury (Two-Tone Interior),2026
...,...,...,...,...
888,KE2SEFEME2BO\t,Kona EV,Preferred N-Line with Ultimate Pk,CPO 2025
889,KE2SEFEME400,Kona EV,Ultimate,CPO 2025
890,KE2SEFEME466,Kona EV,Ultimate w/Two-Tone Interior,CPO 2025
891,KE2SEFEME4BF,Kona EV,Ultimate w/Sage-Green Interior,CPO 2025


In [77]:
def clean_df(df):
    df = df[["Model", "Description ", "Description", "Description 2", "sheet_name"]]

    trim_col_data = df["Description "].fillna("Description")
    df.insert(1, 'combined_description', trim_col_data)
    df = df.drop(columns=['Description ', 'Description'])

    df.columns = ['ModelNumber', 'Model', 'Trim', 'Source_sheets']

    df = df.drop_duplicates(subset=['ModelNumber'], keep='first')    

    # remove /t
    ModelNumber = df["ModelNumber"].str.replace('\t', '', regex=True)
    trim = df["Trim"].replace('\t', '', regex=True)
    
    df["ModelNumber"] = ModelNumber
    df["Trim"] = trim

    # drop rows with null ModelNumbers
    df.dropna(subset=["ModelNumber"], inplace=True)
    

    # grab the year 
    # Extract 4-digit year from sheet_name and create a new column 'Year'
    year = df['Source_sheets'].str.extract(r'(\b\d{4}\b)')

    df.insert(0, 'Year', year)
    # Convert to integer (optional)
    df['Year'] = df['Year'].astype('Int64')  # Keeps NaN if no year found

    return df

cleaned_models = clean_df(combined_df)
cleaned_models


Unnamed: 0,Year,ModelNumber,Model,Trim,Source_sheets
0,2026,ELCS4V2BES00,Elantra,Essential,2026
1,2026,ELCS4V2BPR00,Elantra,Preferred,2026
2,2026,ELCS4V2BPRTE,Elantra,with Tech Pkg,2026
3,2026,ELCS4V2BLU00,Elantra,Luxury,2026
4,2026,ELCS4V2BLUME,Elantra,Luxury (Two-Tone Interior),2026
...,...,...,...,...,...
660,2023,TU4SAA25BO00,Tucson,N Line AWD,CPO 2023
663,2023,TU4SAA16J3AB,Tucson PHEV,Luxury PHEV AWD,CPO 2023
667,2023,SA4SAA25B241,Santa Fe,2.5L Preferred AWD w/ Trend,CPO 2023
668,2023,SA4SAA25DY00,Santa Fe,2.5T Preferred AWD Urban Package,CPO 2023


In [80]:
def data_correction(df):
    # Replace rows where Description contains 'with Tech Pkg' with 'Yes' in a new column
    df.loc[df['Trim'].str.contains('with Tech Pkg', na=False), 'Trim'] = 'Preferred with Tech Pkg'
    return df

cleaned_models = data_correction(cleaned_models)

In [81]:
cleaned_models.to_csv(r'..\database\dbs\Hyundai_models_db.csv', index=False)

In [82]:
cleaned_models

Unnamed: 0,Year,ModelNumber,Model,Trim,Source_sheets
0,2026,ELCS4V2BES00,Elantra,Essential,2026
1,2026,ELCS4V2BPR00,Elantra,Preferred,2026
2,2026,ELCS4V2BPRTE,Elantra,Preferred with Tech Pkg,2026
3,2026,ELCS4V2BLU00,Elantra,Luxury,2026
4,2026,ELCS4V2BLUME,Elantra,Luxury (Two-Tone Interior),2026
...,...,...,...,...,...
660,2023,TU4SAA25BO00,Tucson,N Line AWD,CPO 2023
663,2023,TU4SAA16J3AB,Tucson PHEV,Luxury PHEV AWD,CPO 2023
667,2023,SA4SAA25B241,Santa Fe,2.5L Preferred AWD w/ Trend,CPO 2023
668,2023,SA4SAA25DY00,Santa Fe,2.5T Preferred AWD Urban Package,CPO 2023


In [91]:
def search_vehicle_by_year_and_model(year, model):
    # Filter the DataFrame based on year and model
    filtered_df = cleaned_models[
        (cleaned_models['Year'] == year) & 
        (cleaned_models['Model'].str.contains(model, case=False, na=False))
    ]
    return filtered_df


def search_vehicle_by_model(model):
    # Filter the DataFrame based on model
    filtered_df = cleaned_models[ 
        (cleaned_models['Model'].str.contains(model, case=False, na=False))
    ]
    return filtered_df

def search_vehicle_by_model_and_trim_keyword(model, trim_search_key):
    # Filter the DataFrame based on model and trim key 
    filtered_df = cleaned_models[ 
        (cleaned_models['Model'].str.contains(model, case=False, na=False)) 
        & (cleaned_models['Trim'].str.contains(trim_search_key, case=False, na=False))
    ]
    return filtered_df

In [None]:
search_vehicle_by_model_and_trim_keyword('GV60', "trend")

Unnamed: 0,Year,ModelNumber,Model,Trim,Source_sheets
35,2026,TUCWDK2EPRTR,Tucson,Preferred AWD w/Trend Pkg,2026
37,2026,TUCWDK2EPMTM,Tucson,Preferred AWD w/Trend Pkg,2026
176,2025,TU4SAA25B241,Tucson,Trend,2025 (old)
240,2024,TUCWDK2EPRTR,Tucson,Trend,2024
308,2024,TU4SAA25B241,Tucson,Trend,2024 (old)
396,2020,TU3SAA24B241,Tucson,Preferred AWD w/Trend Package,CPO 2020
488,2021,TU3SAA24B241,Tucson,Preferred AWD w/Trend Package,CPO 2021


In [98]:
res = search_vehicle_by_model("Palisade")
res["Trim"].unique().tolist()

['3.5L Preferred Trend 8-Pass',
 '3.5L XRT PRO 7-Pass',
 '3.5L Ultimate Calligraphy 7-Pass',
 '2.5T Luxury HEV 8-Pass',
 '2.5T Luxury HEV 7-Pass',
 '2.5T Ultimate Calligraphy HEV 7-Pass',
 'Preferred',
 'Urban 8 Passenger',
 'Urban 7 Passenger',
 'Ultimate Calligraphy',
 'Ultimate Calligraphy (Beige Interior)',
 'Calligraphy Night',
 'Preferred AWD',
 'Urban AWD 8-Pass',
 'Urban AWD 7-Pass',
 'Ultimate Caligraphy AWD (Beige Int.)',
 'Essential 8-Passenger FWD',
 'Essential 8-Passenger AWD',
 'Luxury 7-Passenger AWD',
 'Ultimate 7-Passenger AWD',
 'Ultimate 7-Passenger CP AWD',
 'Ultimate Calligraphy 7-Passenger AWD',
 'Preferred 8-Passenger AWD']