In [21]:
import pandas as pd
import ast
import glob
import os

import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

In [22]:
file_paths = glob.glob(os.path.join('../cleaned_data/', '*.xlsx'))

In [23]:
def Clean_third_party_insurance(df):
    df['Insurance_Validity_Period']=df['Insurance_Validity_Period'].replace({
        'Third Party insurance':'Third Party',
        'Zero Dep':'Zero Depreciation',
        np.nan:'Unknown',
        'Not Available': 'Unknown',
        '1': 'One Year',
        '2':'Two Years'
    })

    return df
    

In [24]:
def Clean_number_of_seats(df):
    
    df['Number_of_Seats'] = df['Number_of_Seats'].str.replace('Seats', '',regex= False).str.strip()
    df['Number_of_Seats'] = pd.to_numeric(df['Number_of_Seats'], errors='coerce')
    mean_value = df['Number_of_Seats'].mean()
    df['Number_of_Seats'] = df['Number_of_Seats'].fillna(mean_value)
    df['Number_of_Seats'] = df['Number_of_Seats'].astype(int)
    
    return df

In [25]:
def Clean_engine_capacity(df):
    df['Engine_Capacity'] = df['Engine_Capacity'].str.replace('CC', '', regex=False).str.strip()
    df['Engine_Capacity'] = pd.to_numeric(df['Engine_Capacity'], errors='coerce')
    df['Engine_Capacity'] = df.groupby('Car_Model')['Engine_Capacity'].transform(lambda x: x.fillna(x.mean()))
    #print("Engine_Capacity nulls after cleaning:", df['Engine_Capacity'].isnull().sum())
    remaining_missing = df['Engine_Capacity'].isnull().sum()
    if remaining_missing > 0:
        global_mean = df['Engine_Capacity'].mean()
        df['Engine_Capacity'].fillna(global_mean, inplace=True)
    
    df['Engine_Capacity'] = df['Engine_Capacity'].astype(float)
    return df
   
    # def extract_first_numeric(value):
    #     if isinstance(value, str):
    #         numbers = re.findall(r'\d+\.?\d*', value)  # Find all numeric sequences
    #         return numbers[0] if numbers else np.nan
    #     return np.nan
    # df['Engine_Capacity'] = df['Engine_Capacity'].astype(str).apply(extract_first_numeric)
    #return df

In [26]:
def Clean_List_feature_columns(df):
    df['Comfort_and_Convenience'] = df['Comfort_and_Convenience'].astype(str)
    df['Comfort_and_Convenience'] = df['Comfort_and_Convenience'].fillna("unknown")
    df['Interior_Features'] = df['Interior_Features'].astype(str)
    df['Interior_Features'] = df['Interior_Features'].fillna("unknown")
    df['Exterior_Features'] = df['Exterior_Features'].astype(str)
    df['Exterior_Features'] = df['Exterior_Features'].fillna("unknown")
    df['Safety_Features'] = df['Safety_Features'].astype(str)
    df['Safety_Features'] = df['Safety_Features'].fillna("unknown")
    df['Entertainment_and_Communication'] = df['Entertainment_and_Communication'].astype(str)
    df['Entertainment_and_Communication'] = df['Entertainment_and_Communication'].fillna("unknown")


    return df

In [27]:
def Clean_Mileage(df):
    df['Mileage_(km/l)'] = df['Mileage_(km/l)'].str.replace('kmpl','',regex = False).str.strip().str.replace('km/kg','',regex = False).str.strip()
    df['Mileage_(km/l)'] = pd.to_numeric(df['Mileage_(km/l)'], errors='coerce')
    overall_mean = df['Mileage_(km/l)'].mean()
    df['Mileage_(km/l)'] = df['Mileage_(km/l)'].fillna(overall_mean)
    return df

In [28]:
def Clean_Maximum_Power(value):
    value = str(value).strip()
    
    # Extract numeric value
    numeric_value = re.sub(r'[^\d.]+', '', value)
    
    # Conversion factors
    ps_to_bhp = 0.986
    kw_to_bhp = 1.341
    
    if 'PS' in value:
        return float(numeric_value) * ps_to_bhp
    elif 'kW' in value:
        return float(numeric_value) * kw_to_bhp
    elif 'bhp' in value:
        return float(numeric_value)
    else:
        try:
            return float(numeric_value)  # Default case for numeric values with no unit
        except ValueError:
            return np.nan  # Return NaN for any unrecognized format



In [29]:
def Clean_torque(df):
    df['Torque'] = df['Torque'].str.replace('Nm','',regex = False).str.strip().str.replace('nm','',regex = False).str.strip()
    df['Torque'] = pd.to_numeric(df['Torque'], errors='coerce')
    df['Torque'] = df['Torque'].fillna(df['Torque'].mean())
    return df

In [30]:
def Clean_wheel_size(df):
    df['Wheel_Size'] = df['Wheel_Size'].str.replace('R','',regex = False).str.strip()
    df['Wheel_Size'] = pd.to_numeric(df['Wheel_Size'], errors='coerce')
    df['Wheel_Size'] = df['Wheel_Size'].fillna(df['Wheel_Size'].mean())
    return df

In [31]:
def Clean_Battery_type(df):
    most_frequent_value = df['Battery_Type'].mode()[0]
    df['Battery_Type'] = df['Battery_Type'].fillna(most_frequent_value)
    return df

In [32]:
# Correct the loop to iterate over each tuple correctly
dfs = []
for file_path in file_paths:
    city = os.path.basename(file_path).split('_')[0]
    df = pd.read_excel(file_path)
    df['city'] = city
    df = pd.read_excel(file_path)  # Load the data
    df = df.drop(columns=['Regional_Transport_Office', 'Original_Equipment_Manufacturer', 'Central_Variant_ID', 'Variant_Name'])
    df = Clean_third_party_insurance(df)
    df = Clean_number_of_seats(df)
    df = Clean_List_feature_columns(df)
    df = Clean_engine_capacity(df)
    df = Clean_Mileage(df)
    df['Maximum_Power'] = df['Maximum_Power'].apply(Clean_Maximum_Power)
    mean_value = df['Maximum_Power'].mean()
    df['Maximum_Power'] = df['Maximum_Power'].fillna(mean_value)
    df = Clean_torque(df)
    df = Clean_wheel_size(df)
    df = Clean_Battery_type(df)
    dfs.append(df)  # Append the DataFrame to the list
    

# Concatenate all DataFrames into one
combined_df = pd.concat(dfs, ignore_index=True)
print(combined_df['city'].unique())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Engine_Capacity'].fillna(global_mean, inplace=True)


['bangalore' 'chennai' 'delhi' 'hyderabad' 'jaipur' 'kolkata']


In [33]:
combined_df.isnull().sum()

Insurance_Validity_Period          0
Fuel_Type                          0
Number_of_Seats                    0
Transmission_Type                  0
Top_Features                       0
Comfort_and_Convenience            0
Interior_Features                  0
Exterior_Features                  0
Safety_Features                    0
Entertainment_and_Communication    0
Mileage_(km/l)                     0
Engine_Capacity                    0
Maximum_Power                      0
Torque                             0
Wheel_Size                         0
Battery_Type                       0
Kilometers_Driven                  0
Number_of_Owners                   0
Car_Model                          0
Model_Year                         0
Listed_Price                       0
Actual_Price                       0
city                               0
dtype: int64

In [34]:
combined_df['city'].value_counts()

city
delhi        1485
hyderabad    1483
bangalore    1481
chennai      1419
kolkata      1381
jaipur       1120
Name: count, dtype: int64

In [35]:
combined_df.dtypes

Insurance_Validity_Period           object
Fuel_Type                           object
Number_of_Seats                      int64
Transmission_Type                   object
Top_Features                        object
Comfort_and_Convenience             object
Interior_Features                   object
Exterior_Features                   object
Safety_Features                     object
Entertainment_and_Communication     object
Mileage_(km/l)                     float64
Engine_Capacity                    float64
Maximum_Power                      float64
Torque                             float64
Wheel_Size                         float64
Battery_Type                        object
Kilometers_Driven                   object
Number_of_Owners                     int64
Car_Model                           object
Model_Year                           int64
Listed_Price                       float64
Actual_Price                       float64
city                                object
dtype: obje

In [36]:
# output_path = "../preprocessed_cars/cleaned_entire_datasets.xlsx"
# os.makedirs(os.path.dirname(output_path), exist_ok=True)  # Ensure the directory exists
# combined_df.to_excel(output_path, index=False)

In [37]:
combined_df.value_counts()

Insurance_Validity_Period  Fuel_Type  Number_of_Seats  Transmission_Type  Top_Features                                                                                                                                                                                                 Comfort_and_Convenience                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   Interior_Features                                                                                                                                      

In [41]:
print(combined_df['city'].unique())

['bangalore' 'chennai' 'delhi' 'hyderabad' 'jaipur' 'kolkata']


In [42]:
df = combined_df.to_excel("../preprocessed_cars/cleaned_entire_dataset.xlsx")

In [43]:
df = pd.read_excel("../preprocessed_cars/cleaned_entire_dataset.xlsx")

In [47]:
df.city.value_counts()

city
delhi        1485
hyderabad    1483
bangalore    1481
chennai      1419
kolkata      1381
jaipur       1120
Name: count, dtype: int64