In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
from sklearn.preprocessing import LabelEncoder
from pandas import json_normalize
import json
import ast

In [4]:
# Lift pandas' display limits so rendered frames are never truncated:
# full cell contents, every row, and every column are shown.
# A value of None disables the corresponding limit (costly on large frames).
for _option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(_option, None)

In [40]:
# Function to read Excel files
def Read_CSV(FilePath):
    """Load an Excel workbook into a DataFrame and report the outcome on stdout.

    Despite the name, the file is read with ``pd.read_excel``.

    Parameters
    ----------
    FilePath
        Location of the workbook; passed unchanged to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when any exception was raised while
        reading (the error is printed instead of being propagated).
    """
    try:
        workbook = pd.read_excel(FilePath)
        print(f'The Excel file from {FilePath} got read successfully','\n')
    except Exception as read_error:
        # Best-effort: callers test the result against None and skip the file.
        print(f'Error While Reading the file from {FilePath} : ', read_error, '\n')
        return None
    return workbook

# Flatten 'new_car_detail' column
def process_new_car_detail(data):
    """Flatten the 'new_car_detail' column into one row of plain fields per car.

    Each cell is parsed with ``ast.literal_eval`` into a mapping. The entries
    of its nested 'trendingText' mapping are merged into the top level
    (overriding same-named top-level keys) and the 'trendingText' key itself
    is dropped.

    Parameters
    ----------
    data
        Frame holding a 'new_car_detail' column of Python-literal strings.

    Returns
    -------
    pandas.DataFrame
        One row per input cell, one column per flattened key.
    """
    parsed_details = data['new_car_detail'].apply(ast.literal_eval)

    records = [
        {
            key: value
            for key, value in {**detail, **detail['trendingText']}.items()
            if key != 'trendingText'
        }
        for detail in parsed_details
    ]

    return pd.DataFrame(records)

# Flatten 'new_car_overview' column
def process_new_car_overview(data):
    """Flatten the 'new_car_overview' column into one row per car.

    Each cell is parsed with ``ast.literal_eval``; every entry of its 'top'
    list contributes one field, named by the entry's 'key' and holding the
    entry's 'value'.

    Parameters
    ----------
    data
        Frame holding a 'new_car_overview' column of Python-literal strings.

    Returns
    -------
    pandas.DataFrame
        One row per input cell, one column per distinct overview key.
    """
    parsed_overviews = data['new_car_overview'].apply(ast.literal_eval)

    records = [
        {entry['key']: entry['value'] for entry in overview['top']}
        for overview in parsed_overviews
    ]

    return pd.DataFrame(records)

# Flatten 'new_car_feature' column
def process_new_car_feature(data):
    """Summarise the 'new_car_feature' column as feature counts per car.

    Each cell is parsed with ``ast.literal_eval``. The resulting row holds
    'Features' (the length of the cell's 'top' list) plus one count per entry
    of its 'data' list: the column is named by the entry's 'heading' and
    holds the length of the entry's 'list'.

    Parameters
    ----------
    data
        Frame holding a 'new_car_feature' column of Python-literal strings.

    Returns
    -------
    pandas.DataFrame
        One row per input cell; headings absent from a cell are left missing.
    """
    parsed_features = data['new_car_feature'].apply(ast.literal_eval)

    def count_features(feature):
        # Start with the headline count, then add one count per heading.
        counts = {'Features': len(feature['top'])}
        counts.update(
            (group['heading'], len(group['list'])) for group in feature['data']
        )
        return counts

    return pd.DataFrame([count_features(feature) for feature in parsed_features])

# Flatten 'new_car_specs' column
def process_new_car_specs(data):
    """Flatten the 'new_car_specs' column into one row of specifications per car.

    Each cell is parsed with ``ast.literal_eval``. Fields come first from the
    cell's 'top' list and then from the 'list' of every entry in its 'data'
    list; each item supplies a column name ('key') and its content ('value').
    A key seen again later overwrites the earlier value.

    Parameters
    ----------
    data
        Frame holding a 'new_car_specs' column of Python-literal strings.

    Returns
    -------
    pandas.DataFrame
        One row per input cell, one column per distinct specification key.
    """
    parsed_specs = data['new_car_specs'].apply(ast.literal_eval)

    records = []
    for spec in parsed_specs:
        record = {entry['key']: entry['value'] for entry in spec['top']}
        # Detailed groups are applied after the headline specs, so they win
        # whenever both define the same key.
        record.update(
            (field['key'], field['value'])
            for group in spec['data']
            for field in group['list']
        )
        records.append(record)

    return pd.DataFrame(records)

# Function to combine all processed data
def process_and_combine_city_data(city, data):
    """Build the structured frame for one city.

    Runs the four column flatteners (detail, overview, feature, specs) on
    ``data``, places their results side by side, and prepends a 'city'
    column holding ``city`` on every row.

    Parameters
    ----------
    city
        Label written to the leading 'city' column.
    data
        Raw frame containing the 'new_car_detail', 'new_car_overview',
        'new_car_feature' and 'new_car_specs' columns.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the flattened parts, with 'city' first.
    """
    flatteners = (
        process_new_car_detail,
        process_new_car_overview,
        process_new_car_feature,
        process_new_car_specs,
    )
    # Reset each index so the parts line up positionally when joined by column.
    parts = [flatten(data).reset_index(drop=True) for flatten in flatteners]

    Complete_data = pd.concat(parts, axis=1)
    Complete_data.insert(0, 'city', city)
    return Complete_data

# Source workbook for each city, paired with the label written to the 'city'
# column. Forward slashes are used because they resolve on Windows, Linux and
# macOS alike, whereas a backslash is only a path separator on Windows; this
# also matches the separator used for the output files below.
filepath = [
    ('DataSets/chennai_cars.xlsx', 'Chennai'),
    ('DataSets/bangalore_cars.xlsx', 'Bangalore'),
    ('DataSets/delhi_cars.xlsx', 'Delhi'),
    ('DataSets/hyderabad_cars.xlsx', 'Hyderabad'),
    ('DataSets/jaipur_cars.xlsx', 'Jaipur'),
    ('DataSets/kolkata_cars.xlsx', 'Kolkata')
]

# Structured frame of every city that was read successfully, in `filepath` order.
all_data = []

# Read, flatten and export each city in turn.
for path, city in filepath:
    df = Read_CSV(path)  # Returns None (after printing the error) on failure
    if df is None:
        # Skip this city; the remaining cities are still processed.
        continue

    city_data = process_and_combine_city_data(city, df)  # Flatten and combine the nested columns
    all_data.append(city_data)

    city_file_name = f'DataSets/Structured_{city}_cars.csv'  # One output file per city
    city_data.to_csv(city_file_name, index=False)  # Save the processed data to CSV
    print(f'Structured data for {city} saved as {city_file_name}','\n')



The Excel file from DataSets\chennai_cars.xlsx got read successfully 

Structured data for Chennai saved as DataSets/New_Structured_Data/Structured_Chennai_cars.csv 

The Excel file from DataSets\bangalore_cars.xlsx got read successfully 

Structured data for Bangalore saved as DataSets/New_Structured_Data/Structured_Bangalore_cars.csv 

The Excel file from DataSets\delhi_cars.xlsx got read successfully 

Structured data for Delhi saved as DataSets/New_Structured_Data/Structured_Delhi_cars.csv 

The Excel file from DataSets\hyderabad_cars.xlsx got read successfully 

Structured data for Hyderabad saved as DataSets/New_Structured_Data/Structured_Hyderabad_cars.csv 

The Excel file from DataSets\jaipur_cars.xlsx got read successfully 

Structured data for Jaipur saved as DataSets/New_Structured_Data/Structured_Jaipur_cars.csv 

The Excel file from DataSets\kolkata_cars.xlsx got read successfully 

Structured data for Kolkata saved as DataSets/New_Structured_Data/Structured_Kolkata_cars.csv

In [41]:
# Per-city structured CSV files to merge into a single data set.
file_paths = [
    'DataSets/Structured_Bangalore_cars.csv',
    'DataSets/Structured_Chennai_cars.csv',
    'DataSets/Structured_Delhi_cars.csv',
    'DataSets/Structured_Hyderabad_cars.csv',
    'DataSets/Structured_Jaipur_cars.csv',
    'DataSets/Structured_Kolkata_cars.csv'
]

# Frames that were loaded successfully, in `file_paths` order.
dataframes = []

# Load each file; a file that cannot be read is reported and skipped so the
# remaining cities can still be combined.
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path) 
        dataframes.append(df)
        print(f"Loaded data from {file_path} with shape {df.shape}")
    except Exception as e:
        print(f"Error loading file {file_path}: {e}")

# pd.concat fails with an uninformative "No objects to concatenate" when given
# an empty list, so stop here with a message that points at the real problem.
if not dataframes:
    raise ValueError(
        'None of the structured city files could be loaded; nothing to combine.'
    )

# Stack all cities row-wise with a fresh 0..n-1 index.
combined_data = pd.concat(dataframes, ignore_index=True)

# Save the combined data to a single file
output_file = 'DataSets/Combined_Cars_Data.csv'
combined_data.to_csv(output_file, index=False)

print(f"Combined data saved to {output_file} with shape {combined_data.shape}")

Loaded data from DataSets/New_Structured_Data/Structured_Bangalore_cars.csv with shape (1481, 76)
Loaded data from DataSets/New_Structured_Data/Structured_Chennai_cars.csv with shape (1419, 76)
Loaded data from DataSets/New_Structured_Data/Structured_Delhi_cars.csv with shape (1485, 76)
Loaded data from DataSets/New_Structured_Data/Structured_Hyderabad_cars.csv with shape (1483, 76)
Loaded data from DataSets/New_Structured_Data/Structured_Jaipur_cars.csv with shape (1120, 76)
Loaded data from DataSets/New_Structured_Data/Structured_Kolkata_cars.csv with shape (1381, 76)
Combined data saved to DataSets/New_Structured_Data/Combined_Cars_Data.csv with shape (8369, 76)
