# Reading The Data

In [1]:
import pandas as pd

In [2]:
# URL of the imports-85 automobile data file in the UCI Machine Learning Repository archive.
filename = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/'
    'imports-85.data'
)

In [4]:
# Column names for the data file, in file order; passed to read_csv as `names`.
headers = [
    "symboling",
    "normalized-losses",
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "engine-location",
    "wheel-base",
    "length",
    "width",
    "height",
    "curb-weight",
    "engine-type",
    "num-of-cylinders",
    "engine-size",
    "fuel-system",
    "bore",
    "stroke",
    "compression-ratio",
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
    "price",
]

In [5]:
# Load the data. The file is read without a header row (header=None), so the
# column names come from `headers` instead of from the first line of data.
df = pd.read_csv(filename, header=None, names=headers)

In [6]:
# Preview the first rows to check that the column names line up with the data.
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


# Identifying and Replacing Missing Values

In [7]:
# Check each column for NaN values. Missing entries are still the literal
# string '?' at this point (visible in the preview of the first rows), so no
# column reports any NaN yet.
df.isna().any()

symboling            False
normalized-losses    False
make                 False
fuel-type            False
aspiration           False
num-of-doors         False
body-style           False
drive-wheels         False
engine-location      False
wheel-base           False
length               False
width                False
height               False
curb-weight          False
engine-type          False
num-of-cylinders     False
engine-size          False
fuel-system          False
bore                 False
stroke               False
compression-ratio    False
horsepower           False
peak-rpm             False
city-mpg             False
highway-mpg          False
price                False
dtype: bool

In [9]:
import numpy as np

In [10]:
# Turn the '?' placeholders into real NaN values so pandas recognises them as missing.
# Note: this modifies df in place.
df.replace(to_replace='?', value=np.nan, inplace=True)

In [11]:
# Re-check after replacing '?' with NaN: columns that held placeholders now report True.
df.isnull().any()

symboling            False
normalized-losses     True
make                 False
fuel-type            False
aspiration           False
num-of-doors          True
body-style           False
drive-wheels         False
engine-location      False
wheel-base           False
length               False
width                False
height               False
curb-weight          False
engine-type          False
num-of-cylinders     False
engine-size          False
fuel-system          False
bore                  True
stroke                True
compression-ratio    False
horsepower            True
peak-rpm              True
city-mpg             False
highway-mpg          False
price                 True
dtype: bool

### Counting Missing Values

In [12]:
# Boolean mask with the same shape as df: True where a value is missing (NaN).
missing = df.isna()

In [17]:
# Preview the mask: True marks a missing entry, False a present one.
missing.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# For every column, print its name followed by the number of present (False)
# and missing (True) entries.
for column_name in missing.columns:
    print(column_name)
    print(missing[column_name].value_counts())

symboling
False    205
Name: symboling, dtype: int64
normalized-losses
False    164
True      41
Name: normalized-losses, dtype: int64
make
False    205
Name: make, dtype: int64
fuel-type
False    205
Name: fuel-type, dtype: int64
aspiration
False    205
Name: aspiration, dtype: int64
num-of-doors
False    203
True       2
Name: num-of-doors, dtype: int64
body-style
False    205
Name: body-style, dtype: int64
drive-wheels
False    205
Name: drive-wheels, dtype: int64
engine-location
False    205
Name: engine-location, dtype: int64
wheel-base
False    205
Name: wheel-base, dtype: int64
length
False    205
Name: length, dtype: int64
width
False    205
Name: width, dtype: int64
height
False    205
Name: height, dtype: int64
curb-weight
False    205
Name: curb-weight, dtype: int64
engine-type
False    205
Name: engine-type, dtype: int64
num-of-cylinders
False    205
Name: num-of-cylinders, dtype: int64
engine-size
False    205
Name: engine-size, dtype: int64
fuel-system
False    205
Name: 

### Imputing Missing Values

In [16]:
# NOTE: this binds a second name to the SAME DataFrame object; no copy is made.
# Any modification made through rep1 is therefore also visible through df.
rep1 = df

In [19]:
# Mean of 'bore'. The column held '?' placeholders, so it is cast to float
# before averaging; NaN entries are skipped by mean().
av1 = rep1['bore'].astype(float).mean()

In [20]:
# Mean of 'stroke'. The column held '?' placeholders, so it is cast to float
# before averaging; NaN entries are skipped by mean().
av2 = rep1['stroke'].astype(float).mean()

In [21]:
# Fill the missing 'bore' entries with the column mean (av1).
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the selected column: that form operates on an
# intermediate Series, emits a FutureWarning in pandas 2.x, and leaves rep1
# unchanged when Copy-on-Write is enabled.
rep1['bore'] = rep1['bore'].fillna(av1)

In [22]:
# Fill the missing 'stroke' entries with the column mean (av2).
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the selected column: that form operates on an
# intermediate Series, emits a FutureWarning in pandas 2.x, and leaves rep1
# unchanged when Copy-on-Write is enabled.
rep1['stroke'] = rep1['stroke'].fillna(av2)

In [24]:
# Mean of 'normalized-losses'. The column held '?' placeholders, so it is cast
# to float before averaging; NaN entries are skipped by mean().
av3 = rep1['normalized-losses'].astype(float).mean()

In [29]:
# Most frequent value of 'num-of-doors' (value_counts() ignores NaN by default).
# The result is only displayed; the cells shown here do not use it to fill the
# missing 'num-of-doors' entries.
rep1['num-of-doors'].value_counts().idxmax()

'four'

In [30]:
# Fill the missing 'normalized-losses' entries with the column mean (av3).
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the selected column: that form operates on an
# intermediate Series, emits a FutureWarning in pandas 2.x, and leaves rep1
# unchanged when Copy-on-Write is enabled.
rep1['normalized-losses'] = rep1['normalized-losses'].fillna(av3)