In [92]:

import pandas as pd
import numpy as np
car_data = pd.read_csv('Group3_Car_Dataset_Errors.txt', sep='\t')
car_data.head()

Unnamed: 0.1,Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,1,Maruti,145500,Diesel,First Owner,450000
1,2,Skoda,120000,Diesel,Second Owner,370000
2,3,Honda,140000,Petrol,Third Owner,158000
3,4,Hyundai,127000,Diesel,First Owner,225000
4,5,Maruti,120000,Petrol,First Owner,130000


### 1: Check and drop rows with all missing values


In [93]:
print(car_data.isnull().sum())

car_data.dropna(inplace=True)
car_data.isnull().sum()

Unnamed: 0       0
brand            1
km_driven        0
fuel             0
owner            0
selling_price    1
dtype: int64


Unnamed: 0       0
brand            0
km_driven        0
fuel             0
owner            0
selling_price    0
dtype: int64

### 2: Replace undetermined values with NAN

In [94]:
car_data.replace(['NA', 'N/A', '?', 'None', '-'], np.nan, inplace=True)


### 3: Remove rid of duplicate rows


In [95]:
car_data.drop_duplicates(inplace=True)


### 4: Fix data types in `km_driven` and `selling_price`


In [96]:
print(car_data.info())
car_data['km_driven'] = pd.to_numeric(car_data['km_driven'], errors='coerce')
car_data['selling_price'] = pd.to_numeric(car_data['selling_price'], errors='coerce')
car_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8126 entries, 0 to 8127
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     8126 non-null   int64 
 1   brand          8126 non-null   object
 2   km_driven      8126 non-null   object
 3   fuel           8126 non-null   object
 4   owner          8126 non-null   object
 5   selling_price  8126 non-null   object
dtypes: int64(1), object(5)
memory usage: 444.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 8126 entries, 0 to 8127
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     8126 non-null   int64  
 1   brand          8126 non-null   object 
 2   km_driven      8125 non-null   float64
 3   fuel           8126 non-null   object 
 4   owner          8126 non-null   object 
 5   selling_price  8124 non-null   float64
dtypes: float64(2), int64(1), object(3)
memor

### 5: Handle outliers in `km_driven`


In [97]:
km_mean = car_data['km_driven'].mean()
km_std = car_data['km_driven'].std()
car_data['km_driven'] = np.where(car_data['km_driven'] > km_mean + 3 * km_std, km_mean, car_data['km_driven'])


### 6: Handle outliers in `selling_price`


In [98]:
price_mean = car_data['selling_price'].mean()
price_std = car_data['selling_price'].std()
car_data['selling_price'] = np.where(car_data['selling_price'] > price_mean + 3 * price_std, price_mean, car_data['selling_price'])


### 7: Fill missing `km_driven` with the mean


In [99]:
print(car_data['km_driven'].isnull().sum())
car_data['km_driven'] = car_data['km_driven'].fillna(car_data['km_driven'].mean())
car_data['km_driven'].isnull().sum()

1


np.int64(0)

### 8: Fill missing `selling_price` with median


In [100]:
print(car_data['selling_price'].isnull().sum())
car_data['selling_price'] = car_data['selling_price'].fillna(car_data['selling_price'].median())
car_data['selling_price'].isnull().sum()


2


np.int64(0)

### 9: Remove extra whitespace in text columns


In [101]:
text_cols = car_data.select_dtypes(include=['object']).columns
car_data[text_cols] = car_data[text_cols].apply(lambda x: x.str.strip())


### 10: Set multi-index with `brand` and `owner`


In [102]:
car_data.set_index(['brand', 'owner'], inplace=True)
car_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,km_driven,fuel,selling_price
brand,owner,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Maruti,First Owner,1,145500.0,Diesel,450000.0
Skoda,Second Owner,2,120000.0,Diesel,370000.0
Honda,Third Owner,3,140000.0,Petrol,158000.0
Hyundai,First Owner,4,127000.0,Diesel,225000.0
Maruti,First Owner,5,120000.0,Petrol,130000.0
