In [1]:
import pandas as pd

# Read the raw car-price CSV into a DataFrame.
# The path is relative, so the file is looked up in the kernel's working directory.
file_path = 'car_prices_rsa_update_011.csv'
df = pd.read_csv(file_path)

# Show a heading followed by the first five rows as a quick sanity check.
print("Dataset Overview:", df.head(), sep="\n")



Dataset Overview:
     Brand     Model Model_Description     Price  Engine_Size
0   Suzuki  S-Presso            1.0 GL  162900.0          1.0
1   Suzuki  S-Presso           1.0 GL+  169900.0          1.0
2   Suzuki   Celerio            1.0 GA  178900.0          1.0
3  Renault      Kwid          1.0 Life  180999.0          1.0
4   Suzuki  S-Presso     1.0 S-Edition  185900.0          1.0


In [2]:
# 1. Check for missing values
# isna() flags every null cell; summing the booleans gives a per-column null count.
print("\nMissing Values Count:", df.isna().sum(), sep="\n")


Missing Values Count:
Brand                   0
Model                   0
Model_Description      81
Price                   0
Engine_Size          1252
dtype: int64


In [4]:
# Drop columns with more than 50% missing values.
# `thresh` is the minimum number of non-null entries a column must have to be
# kept. pandas documents it as an int, so use half the row count rounded up
# (integer ceiling) rather than a float; this keeps exactly the same columns
# as comparing against len(df) * 0.5.
threshold = (len(df) + 1) // 2
# .copy() makes df_cleaned an explicitly independent frame, so modifying it
# afterwards cannot raise SettingWithCopyWarning and never affects `df`.
df_cleaned = df.dropna(axis=1, thresh=threshold).copy()

In [5]:
# Steps 2-4 are written as one chain that rebinds df_cleaned to a new frame
# instead of mutating it in place (no inplace=True, no column setitem). This
# avoids SettingWithCopyWarning on a frame derived from `df` and gives the
# same result if the cell is run again.
df_cleaned = (
    df_cleaned
    # 2. Standardize column names: trim whitespace, lower-case, spaces -> underscores
    .rename(columns=lambda col: col.strip().lower().replace(" ", "_"))
    # 3. Handle duplicates: keep the first occurrence of each fully identical row
    .drop_duplicates()
    # 4. Convert data types: make 'price' numeric; unparseable values become NaN
    .assign(price=lambda frame: pd.to_numeric(frame['price'], errors='coerce'))
)


In [6]:
# 5. Detect and handle outliers (Example: Price column)
# Keep only rows whose price lies within 1.5 * IQR below the first quartile
# and 1.5 * IQR above the third quartile.
q1, q3 = df_cleaned['price'].quantile([0.25, 0.75])
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr
# between() is inclusive at both ends; rows with a NaN price fail the test and
# are removed as well.
# Note: the bounds are computed from df_cleaned, which this statement then
# overwrites, so running the cell again recomputes them on the filtered data.
df_cleaned = df_cleaned[df_cleaned['price'].between(lower_bound, upper_bound)]


In [9]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
cleaned_file_path = 'cleaned_car_prices.csv'
df_cleaned.to_csv(cleaned_file_path, index=False)

# Show a heading followed by the first five cleaned rows.
print("\nCleaned Dataset Overview:", df_cleaned.head(), sep="\n")



Cleaned Dataset Overview:
     brand     model model_description     price  engine_size
0   Suzuki  S-Presso            1.0 GL  162900.0          1.0
1   Suzuki  S-Presso           1.0 GL+  169900.0          1.0
2   Suzuki   Celerio            1.0 GA  178900.0          1.0
3  Renault      Kwid          1.0 Life  180999.0          1.0
4   Suzuki  S-Presso     1.0 S-Edition  185900.0          1.0
