In [1]:
import pandas as pd

# Load the CSV data
df = pd.read_csv("D:/Car price prediction/hyd_car_merged_dataset1.csv")

In [2]:
# Drop multiple columns
df = df.drop(columns=['priceActual','Registration Year', 'it','priceSaving','priceFixedText','owner','Engine','Ownership','Seats.1','transmission','km','Fuel Type','Year of Manufacture'])


In [3]:
df.isnull().sum()

ft                                 0
bt                                 2
ownerNo                            0
oem                                0
model                              0
modelYear                          0
centralVariantId                   0
variantName                        0
price                              0
trendingText                       0
Insurance Validity                 0
Seats                              0
Kms Driven                         1
RTO                              118
Engine Displacement                1
Transmission                       0
Comfort & Convenience              0
Interior                           0
Exterior                           0
Safety                             0
Entertainment & Communication      0
Mileage                           42
Max Power                         11
Torque                            11
Wheel Size                       570
dtype: int64

In [4]:
# Fill missing 'bt' values with "Unknown"
df['bt'].fillna("Unknown", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bt'].fillna("Unknown", inplace=True)


In [5]:
# Remove ' Kms', ' km', and commas from the 'Kms Driven' column
df['Kms Driven'] = df['Kms Driven'].str.replace(' Kms', '').str.replace(' km', '').str.replace(',', '')

# Convert the column to numeric, allowing for NaN values
df['Kms Driven'] = pd.to_numeric(df['Kms Driven'], errors='coerce')

# Fill NaN values with 0 (or any other value you prefer)
df['Kms Driven'].fillna(0, inplace=True)

# Convert the column to integers
df['Kms Driven'] = df['Kms Driven'].astype(int)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Kms Driven'].fillna(0, inplace=True)


In [6]:
df['RTO'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['RTO'].fillna('Unknown', inplace=True)


In [7]:
# Fill NaN values with 0
df['Engine Displacement'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Engine Displacement'].fillna(0, inplace=True)


In [8]:
df['Mileage'] = pd.to_numeric(df['Mileage'].str.replace(' kmpl', '').str.replace(' km', '').str.strip(), errors='coerce').fillna(0)

In [9]:
df['Max Power'] = pd.to_numeric(df['Max Power'].str.replace('bhp', '').str.strip(), errors='coerce')
df['Torque'] = pd.to_numeric(df['Torque'].str.replace('Nm', '').str.strip(), errors='coerce')
# Fill NaN values in 'Max Power' and 'Torque' with their respective means
df['Max Power'].fillna(df['Max Power'].mean(), inplace=True)
df['Torque'].fillna(df['Torque'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Max Power'].fillna(df['Max Power'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Torque'].fillna(df['Torque'].mean(), inplace=True)


In [10]:
df['Wheel Size'] = pd.to_numeric(df['Wheel Size'], errors='coerce')
mode_value1 = df['Wheel Size'].mode()[0]
# Fill NaN values in 'Wheel Size' with the mode
df['Wheel Size'].fillna(mode_value1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Wheel Size'].fillna(mode_value1, inplace=True)


In [11]:
# Ensure 'Seats' is treated as a string
df['Seats'] = df['Seats'].astype(str)
# Clean the 'Seats' column by removing the word 'Seats'
df['Seats'] = df['Seats'].str.replace(' Seats', '', regex=False)
# Convert to int, filling NaN values with 0 before conversion
df['Seats'] = pd.to_numeric(df['Seats'], errors='coerce').fillna(0).astype(int)

In [12]:
# Clean the 'Engine' column by removing the 'cc' suffix
df['Engine Displacement'] = df['Engine Displacement'].str.replace(' cc', '', regex=False)
# Convert to numeric if needed
df['Engine Displacement'] = pd.to_numeric(df['Engine Displacement'], errors='coerce')



In [13]:
# List of columns you want to convert to lowercase
a = ['ft', 'bt', 'oem','model','variantName','Insurance Validity','RTO','Transmission'] 
# Convert each specified column to lowercase
for i in a:
    df[i] = df[i].astype(str).str.lower()

In [14]:
# Cleaning the 'price' column
df['price'] = df['price'].replace(
  {'₹ ': '', ',': '', ' Lakh': '*100000', ' Crore': '*10000000'}, regex=True)
# Evaluate the string expressions to convert to numeric
df['price'] = df['price'].apply(lambda x: eval(x))
# Convert to float
df['price'] = df['price'].astype(int)

In [15]:
import pandas as pd

#fill with city name
df['city'] = 'Hyderabad'

# Display the updated DataFrame
print(df)


          ft         bt  ownerNo         oem            model  modelYear  \
0     petrol  hatchback        1  volkswagen  volkswagen polo       2022   
1     petrol  hatchback        2     hyundai      hyundai eon       2014   
2     petrol        suv        1     hyundai    hyundai venue       2023   
3     petrol  hatchback        1      maruti    maruti baleno       2017   
4     petrol        suv        1    mahindra    mahindra thar       2022   
...      ...        ...      ...         ...              ...        ...   
1478  diesel        suv        1    mahindra  mahindra xuv500       2013   
1479  diesel      sedan        1      jaguar        jaguar xf       2014   
1480  diesel      sedan        3         bmw     bmw 5 series       2018   
1481  diesel        suv        2        tata       tata nexon       2019   
1482  diesel        muv        1      toyota    toyota innova       2015   

      centralVariantId                 variantName    price  \
0                 7746  

In [16]:
# Save the cleaned DataFrame as a CSV file
df.to_csv('cleaned_hyd_car.csv', index=False)

In [17]:
df.dtypes

ft                                object
bt                                object
ownerNo                            int64
oem                               object
model                             object
modelYear                          int64
centralVariantId                   int64
variantName                       object
price                              int32
trendingText                      object
Insurance Validity                object
Seats                              int32
Kms Driven                         int32
RTO                               object
Engine Displacement              float64
Transmission                      object
Comfort & Convenience              int64
Interior                           int64
Exterior                           int64
Safety                             int64
Entertainment & Communication      int64
Mileage                          float64
Max Power                        float64
Torque                           float64
Wheel Size      