In [85]:
import pandas as pd
import ast
import glob
import os

In [86]:
# Collect the raw workbooks in a stable, sorted order: glob.glob makes no
# ordering guarantee. File names starting with '~$' are skipped because Excel
# leaves such lock files next to an open workbook; they match '*.xlsx' but
# cannot be read as workbooks.
file_paths = sorted(
    path
    for path in glob.glob(os.path.join('../raw_data/', '*.xlsx'))
    if not os.path.basename(path).startswith('~$')
)

In [87]:
def expand_top(top_list):
    """Return the 'value' entry of every item in `top_list`, in order."""
    values = []
    for entry in top_list:
        values.append(entry['value'])
    return values
 

In [88]:
def expand_data(data_list):
    """Map each item's 'heading' to the list of 'value' entries in its 'list'.

    If two items share a heading, the later one replaces the earlier one.
    """
    return {
        section['heading']: [entry['value'] for entry in section['list']]
        for section in data_list
    }

In [89]:
def normalize_data(df):
    """Build a frame of feature lists from the 'new_car_feature' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'new_car_feature' column whose cells are dicts with a
        'top' entry (passed to `expand_top`) and a 'data' entry (passed to
        `expand_data`).

    Returns
    -------
    pandas.DataFrame
        Indexed like `df`, with a 'top_features' column followed by the
        columns produced by normalizing the per-row `expand_data` dicts.

    The input frame is not modified.
    """
    features = df['new_car_feature']

    # Extract 'top' features as a standalone Series instead of writing a new
    # column into the caller's frame.
    top_features = features.apply(lambda x: expand_top(x['top'])).rename('top_features')

    # Extract 'data' features: one {heading: [values]} dict per row.
    data_features = features.apply(lambda x: expand_data(x['data']))

    # Normalize from a plain list, then restore df's index so the column-wise
    # concat below pairs rows by position even when df has a non-default index.
    data_features_df = pd.json_normalize(data_features.tolist())
    data_features_df.index = df.index

    return pd.concat([top_features.to_frame(), data_features_df], axis=1)

In [90]:
def car_spec_data_generation(df, new_car_detail_flag = False):
    """Rebuild `df` row by row into a new DataFrame.

    With `new_car_detail_flag` False (the default), each row's 'top' entry is
    read as a list of {'key': ..., 'value': ...} items and becomes one output
    row whose columns are the keys. With the flag True, every row is passed
    through unchanged and the rows are reassembled into a DataFrame.
    """
    if new_car_detail_flag:
        # Pass-through mode: keep each row exactly as iterrows yields it.
        records = [record for _, record in df.iterrows()]
    else:
        # Key/value mode: pivot each row's 'top' list into a flat dict.
        records = [
            {entry['key']: entry['value'] for entry in record['top']}
            for _, record in df.iterrows()
        ]

    return pd.DataFrame(records)

In [91]:
def combine_multiple_dfs(new_car_overview_df, new_car_feature_df, new_car_specs_df, new_car_detail_df):
    """Place the four frames side by side, pairing rows by position.

    Each frame gets a fresh 0..n-1 index (the old index is discarded) before
    the column-wise concatenation; the inputs themselves are left untouched.
    """
    parts = (new_car_overview_df, new_car_feature_df, new_car_specs_df, new_car_detail_df)
    realigned = [part.reset_index(drop = True) for part in parts]
    return pd.concat(realigned, axis = 1)

In [92]:
def cleaned_final_df(final_df_combined, rename_dict, unwanted_columns=None):
    """Deduplicate, prune and rename the columns of the combined frame.

    Parameters
    ----------
    final_df_combined : pandas.DataFrame
        Combined frame; it is not modified.
    rename_dict : dict
        Mapping of current column names to their final names.
    unwanted_columns : iterable of str, optional
        Columns to remove. Defaults to the built-in list below. Names that are
        not present in the frame are skipped instead of raising KeyError, so
        an input lacking some of these columns is still cleaned.

    Returns
    -------
    pandas.DataFrame
        A new frame with unique column names, the unwanted columns removed and
        the remaining columns renamed.
    """
    if unwanted_columns is None:
        unwanted_columns = [
            'Registration Year', 'transmission', 'Kms Driven', 'Engine Displacement',
            'trendingText.imgUrl', 'trendingText.heading', 'trendingText.desc',
            'priceFixedText', 'owner', 'it', 'ft', 'Ownership', 'Year of Manufacture',
        ]

    # Keep only the first occurrence of each column *name*; the check is on
    # names, not on whether the duplicated columns hold equal values.
    deduplicated = final_df_combined.loc[:, ~final_df_combined.columns.duplicated()]

    cars_df = deduplicated.drop(columns=list(unwanted_columns), errors='ignore')

    return cars_df.rename(columns=rename_dict)

In [93]:
# Column-name mapping applied to the combined frame: each pair is
# (current column name, final column name).
# NOTE(review): 'bt' is mapped to 'Battery_Type'; it may actually denote body
# type in the raw data. Confirm against the source files before relying on it.
rename_dict = dict([
    ('Insurance Validity', 'Insurance_Validity_Period'),
    ('Fuel Type', 'Fuel_Type'),
    ('Seats', 'Number_of_Seats'),
    ('RTO', 'Regional_Transport_Office'),
    ('Transmission', 'Transmission_Type'),
    ('top_features', 'Top_Features'),
    ('Comfort & Convenience', 'Comfort_and_Convenience'),
    ('Interior', 'Interior_Features'),
    ('Exterior', 'Exterior_Features'),
    ('Safety', 'Safety_Features'),
    ('Entertainment & Communication', 'Entertainment_and_Communication'),
    ('Mileage', 'Mileage_(km/l)'),
    ('Engine', 'Engine_Capacity'),
    ('Max Power', 'Maximum_Power'),
    ('Torque', 'Torque'),
    ('Wheel Size', 'Wheel_Size'),
    ('bt', 'Battery_Type'),
    ('km', 'Kilometers_Driven'),
    ('ownerNo', 'Number_of_Owners'),
    ('oem', 'Original_Equipment_Manufacturer'),
    ('model', 'Car_Model'),
    ('modelYear', 'Model_Year'),
    ('centralVariantId', 'Central_Variant_ID'),
    ('variantName', 'Variant_Name'),
    ('price', 'Listed_Price'),
    ('priceActual', 'Actual_Price'),
    ('priceSaving', 'Price_Saving_Amount'),
])

In [94]:
def generate_structured_excel_data(file_paths, output_dir='../cleaned_data'):
    """Flatten each raw workbook and write a cleaned workbook for it.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the raw .xlsx files. The part of each file name before the
        first underscore (extension removed) is used to name the output file.
    output_dir : str, optional
        Directory that receives '<name>_cars_cleaned.xlsx'. It is created if
        it does not exist. Defaults to '../cleaned_data'.

    For every input file, the four nested columns are parsed from text into
    Python objects, expanded into flat frames, combined side by side, cleaned
    with `cleaned_final_df` (using the module-level `rename_dict`) and saved.
    A confirmation line is printed per file.
    """
    # Columns whose cells hold Python-literal text (dicts/lists stored as strings).
    nested_columns = ('new_car_overview', 'new_car_feature', 'new_car_specs', 'new_car_detail')

    # to_excel does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    for file in file_paths:
        # get file name
        base_name = os.path.basename(file)
        file_name = os.path.splitext(base_name)[0].split('_')[0]

        df = pd.read_excel(file)

        # Parse every nested column from its text form into Python objects.
        for column in nested_columns:
            df[column] = df[column].apply(ast.literal_eval)

        # new_car_overview: pivot each row's 'top' key/value list into columns.
        new_car_expanded = pd.json_normalize(df['new_car_overview'].tolist())
        new_car_overview_df = car_spec_data_generation(new_car_expanded)

        # new_car_feature: top features plus one column per feature heading.
        new_car_feature_df = normalize_data(df)

        # new_car_specs: pivot each row's 'top' key/value list into columns.
        specs_expanded = pd.json_normalize(df['new_car_specs'].tolist())
        new_car_specs_df = car_spec_data_generation(specs_expanded)

        # new_car_detail: already flat after normalization, passed through as-is.
        detail_expanded = pd.json_normalize(df['new_car_detail'].tolist())
        new_car_detail_df = car_spec_data_generation(detail_expanded, new_car_detail_flag=True)

        final_df_combined = combine_multiple_dfs(new_car_overview_df, new_car_feature_df, new_car_specs_df, new_car_detail_df)

        car_specs = cleaned_final_df(final_df_combined, rename_dict)

        output_path = os.path.join(output_dir, f'{file_name}_cars_cleaned.xlsx')
        car_specs.to_excel(output_path, index= False)

        print(f'Structured data created for {file_name}')


In [95]:
# Run the pipeline over every workbook path collected in `file_paths`;
# one confirmation line is printed per processed file.
generate_structured_excel_data(file_paths)

Structured data created for bangalore
Structured data created for chennai
Structured data created for delhi
Structured data created for hyderabad
Structured data created for jaipur
Structured data created for kolkata
