## Importing Libraries

In [1]:
# Libraries for loading and preprocessing / cleaning
import pandas as pd
import numpy as np

## Loading Dataset

In [2]:
# Create a variable to generate the dataset
data = pd.read_csv('Datasets/Toyota.csv')

In [3]:
df = data.copy()

In [4]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
2,2,13950,24.0,41711,Diesel,90,,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170
5,5,12950,32.0,61000,Diesel,90,0.0,0,2000,3,1170
6,6,16900,27.0,??,Diesel,????,,0,2000,3,1245
7,7,18600,30.0,75889,,90,1.0,0,2000,3,1245
8,8,21500,27.0,19700,Petrol,192,0.0,0,1800,3,1185
9,9,12950,23.0,71138,Diesel,????,,0,1900,3,1105


In [5]:
df.tail(10)

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
1426,1426,9950,78.0,30964,Petrol,110,,1,1600,3,1080
1427,1427,8950,,29000,Petrol,86,1.0,1,1300,3,1045
1428,1428,8450,72.0,??,Petrol,86,,0,1300,3,1015
1429,1429,8950,78.0,24000,Petrol,86,1.0,1,1300,5,1065
1430,1430,8450,80.0,23000,Petrol,86,0.0,0,1300,3,1015
1431,1431,7500,,20544,Petrol,86,1.0,0,1300,3,1025
1432,1432,10845,72.0,??,Petrol,86,0.0,0,1300,3,1015
1433,1433,8500,,17016,Petrol,86,0.0,0,1300,3,1015
1434,1434,7250,70.0,??,,86,1.0,0,1300,3,1015
1435,1435,6950,76.0,1,Petrol,110,0.0,0,1600,5,1114


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1436 non-null   int64  
 1   Price       1436 non-null   int64  
 2   Age         1336 non-null   float64
 3   KM          1436 non-null   object 
 4   FuelType    1336 non-null   object 
 5   HP          1436 non-null   object 
 6   MetColor    1286 non-null   float64
 7   Automatic   1436 non-null   int64  
 8   CC          1436 non-null   int64  
 9   Doors       1436 non-null   object 
 10  Weight      1436 non-null   int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 123.5+ KB


In [7]:
## Finding the missing value
df.isnull().sum()

Unnamed: 0      0
Price           0
Age           100
KM              0
FuelType      100
HP              0
MetColor      150
Automatic       0
CC              0
Doors           0
Weight          0
dtype: int64

In [8]:
## Display the total number of columns and rows
r = df.shape[0]
c = df.shape[1]
print(f"Total Number of Rows: {r}\nTotal Number of Columns: {c}")

Total Number of Rows: 1436
Total Number of Columns: 11


In [9]:
## display to the total count for unique items 
df.nunique()

Unnamed: 0    1436
Price          236
Age             77
KM            1256
FuelType         3
HP              13
MetColor         2
Automatic        2
CC              12
Doors            7
Weight          59
dtype: int64

In [10]:
for each_col in df.columns:
    print(df[each_col].unique())

[   0    1    2 ... 1433 1434 1435]
[13500 13750 13950 14950 12950 16900 18600 21500 20950 19950 19600 22500
 22000 22750 17950 16750 16950 15950 16250 17495 15750 15500 14750 19000
 15800 21950 20500 13250 15250 18950 15999 16500 18750 22250 12995 18450
 16895 14900 17250 15450 16650 17450 16450 18900 18990 18500 19450 18800
 32500 31000 31275 24950 22950 24990 17900 19250 16350 21750 15850 23000
 19900 23950 24500 17200 19500 16868 19750 20750 17650 17795 18245 23750
 18700 21125  6950  9500 11950  7750  4350  4750 11750 11900  9950 11495
 11250 10500 10450 11500 12500 10950 11450 11790 12450 11690 12750 11925
 12900 11650 10850  9940 13450 12495 12000 11480 14990 12850 11700 11895
 13875 12295 13995  9900 11990 10750 11695 11000 12400 12200 12695 14350
 10250  6500  6400  7000  8900  8500  8950  9250  9450  8250  4450  9000
  5150  7900 10900  9750 11290 10895 10995  9850  8695 10990  8750  9930
  9799  9700  9990  9475 10000 10495  9400  9650  9550 13000 11710  9980
 12250 11930 10

In [11]:
df.KM

0       46986
1       72937
2       41711
3       48000
4       38500
        ...  
1431    20544
1432       ??
1433    17016
1434       ??
1435        1
Name: KM, Length: 1436, dtype: object

## Data Preprocessing
- drop the unnamed column
- Replace special charactres

In [12]:
## deleting a column
del df['Unnamed: 0']
df

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
2,13950,24.0,41711,Diesel,90,,0,2000,3,1165
3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170
...,...,...,...,...,...,...,...,...,...,...
1431,7500,,20544,Petrol,86,1.0,0,1300,3,1025
1432,10845,72.0,??,Petrol,86,0.0,0,1300,3,1015
1433,8500,,17016,Petrol,86,0.0,0,1300,3,1015
1434,7250,70.0,??,,86,1.0,0,1300,3,1015


In [13]:
## replacing the special characters
df['KM'] = df['KM'].replace('??',np.nan)

In [14]:
df['HP'] = df['HP'].replace('????',np.nan)

In [15]:
df['Doors'] = df['Doors'].replace('three',3)
df['Doors'] = df['Doors'].replace('four',4)
df['Doors'] = df['Doors'].replace('five',5)

In [16]:
df.head()

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23.0,46986,Diesel,90,1.0,0,2000,3,1165
1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
2,13950,24.0,41711,Diesel,90,,0,2000,3,1165
3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170


In [17]:
## Converting the column types
df['KM'] = df['KM'].astype('float64')

In [18]:
df['FuelType'] = df['FuelType'].astype('category')

In [19]:
df['HP'] = df['HP'].astype('float64')

In [20]:
df['MetColor'] = df['MetColor'].astype('category')

In [21]:
df['Automatic'] = df['Automatic'].astype('category')

In [22]:
df['Doors'] = df['Doors'].astype('category')

In [23]:
df['CC'] = df['CC'].astype('category')

In [24]:
df.dtypes

Price           int64
Age           float64
KM            float64
FuelType     category
HP            float64
MetColor     category
Automatic    category
CC           category
Doors        category
Weight          int64
dtype: object

In [25]:
## finding the mean for the column
df['KM'] = df['KM'].fillna(df['KM'].mean())

In [26]:
df['HP'] = df['HP'].fillna(df['HP'].mean())

In [27]:
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [28]:
df['MetColor'].mode()

0    1.0
Name: MetColor, dtype: category
Categories (2, float64): [0.0, 1.0]

In [29]:
df['MetColor'] = df['MetColor'].fillna(df['MetColor'].mode().index[0])

In [30]:
df['FuelType'].value_counts()

FuelType
Petrol    1177
Diesel     144
CNG         15
Name: count, dtype: int64

In [31]:
df['FuelType'] = df['FuelType'].fillna(df['FuelType'].value_counts().index[0])

In [32]:
df.head(10)

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23.0,46986.0,Diesel,90.0,1.0,0,2000,3,1165
1,13750,23.0,72937.0,Diesel,90.0,1.0,0,2000,3,1165
2,13950,24.0,41711.0,Diesel,90.0,0.0,0,2000,3,1165
3,14950,26.0,48000.0,Diesel,90.0,0.0,0,2000,3,1165
4,13750,30.0,38500.0,Diesel,90.0,0.0,0,2000,3,1170
5,12950,32.0,61000.0,Diesel,90.0,0.0,0,2000,3,1170
6,16900,27.0,68647.239972,Diesel,101.478322,0.0,0,2000,3,1245
7,18600,30.0,75889.0,Petrol,90.0,1.0,0,2000,3,1245
8,21500,27.0,19700.0,Petrol,192.0,0.0,0,1800,3,1185
9,12950,23.0,71138.0,Diesel,101.478322,0.0,0,1900,3,1105


In [33]:
df.isnull().sum()

Price        0
Age          0
KM           0
FuelType     0
HP           0
MetColor     0
Automatic    0
CC           0
Doors        0
Weight       0
dtype: int64

In [34]:
df.to_csv('cleaned_toyota.csv')