In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

In [2]:
names = ["symboling", "normalized-losses", "make", "fuel-type", "aspiration",
         "num-of-doors", "body-style", "drive-wheels", "engine-location", "wheel-base",
        "length", "width", "height", "curb-weight", "engine-type", "num-of-cylinders",
        "engine-size", "fuel-system", "bore", "stroke", "compression-ratio", "horsepower",
        "peak-rpm", "city-mpg", "highway-mpg", "price"]

# Dataset Head

In [3]:
data = pd.read_csv('imports-85.csv', names=names)
print(data.head(5))

   symboling normalized-losses         make fuel-type aspiration num-of-doors  \
0          3                 ?  alfa-romero       gas        std          two   
1          3                 ?  alfa-romero       gas        std          two   
2          1                 ?  alfa-romero       gas        std          two   
3          2               164         audi       gas        std         four   
4          2               164         audi       gas        std         four   

    body-style drive-wheels engine-location  wheel-base  ...  engine-size  \
0  convertible          rwd           front        88.6  ...          130   
1  convertible          rwd           front        88.6  ...          130   
2    hatchback          rwd           front        94.5  ...          152   
3        sedan          fwd           front        99.8  ...          109   
4        sedan          4wd           front        99.4  ...          136   

   fuel-system  bore  stroke compression-ratio hor

# Dataset Statistics

In [4]:
print(data.describe(include="all"))

         symboling normalized-losses    make fuel-type aspiration  \
count   205.000000               205     205       205        205   
unique         NaN                52      22         2          2   
top            NaN                 ?  toyota       gas        std   
freq           NaN                41      32       185        168   
mean      0.834146               NaN     NaN       NaN        NaN   
std       1.245307               NaN     NaN       NaN        NaN   
min      -2.000000               NaN     NaN       NaN        NaN   
25%       0.000000               NaN     NaN       NaN        NaN   
50%       1.000000               NaN     NaN       NaN        NaN   
75%       2.000000               NaN     NaN       NaN        NaN   
max       3.000000               NaN     NaN       NaN        NaN   

       num-of-doors body-style drive-wheels engine-location  wheel-base  ...  \
count           205        205          205             205  205.000000  ...   
unique     

In [5]:
data.replace('?', np.nan, inplace=True)
print(data.head(1))

   symboling normalized-losses         make fuel-type aspiration num-of-doors  \
0          3               NaN  alfa-romero       gas        std          two   

    body-style drive-wheels engine-location  wheel-base  ...  engine-size  \
0  convertible          rwd           front        88.6  ...          130   

   fuel-system  bore  stroke compression-ratio horsepower  peak-rpm city-mpg  \
0         mpfi  3.47    2.68               9.0        111      5000       21   

  highway-mpg  price  
0          27  13495  

[1 rows x 26 columns]


# Doctest

In [6]:
import doctest
"""
  >>> print(data['normalized-losses'].iat[1])
  nan
  >>> print(data['peak-rpm'].iat[131])
  nan
  >>> print(data['horsepower'].iat[130])
  nan
"""

doctest.testmod()

TestResults(failed=0, attempted=3)

# Count NaN

In [7]:
missing_data = data.isnull()
for i in data.columns:
    count = missing_data[i].sum()
    if count > 0:
        print(f"{i} has {count} missing values")

normalized-losses has 41 missing values
num-of-doors has 2 missing values
bore has 4 missing values
stroke has 4 missing values
horsepower has 2 missing values
peak-rpm has 2 missing values
price has 4 missing values


# Doctest

In [8]:
import doctest
"""
  >>> print(missing_data['normalized-losses'].value_counts())
  normalized-losses
  False    164
  True      41
  Name: count, dtype: int64
  >>> print(missing_data['symboling'].value_counts())
  symboling
  False    205
  Name: count, dtype: int64
  >>> print(missing_data['bore'].value_counts())
  bore
  False    201
  True       4
  Name: count, dtype: int64
  >>> print(missing_data['horsepower'].value_counts())
  horsepower
  False    203
  True       2
  Name: count, dtype: int64
"""

doctest.testmod()

TestResults(failed=0, attempted=4)

# Mean

In [9]:
data['normalized-losses'] = pd.to_numeric(data['normalized-losses'], errors = 'coerce')
mean_normalized_losses = data['normalized-losses'].mean()
data['normalized-losses'] = data['normalized-losses'].fillna(mean_normalized_losses)

# Bore

In [10]:
data['bore'] = pd.to_numeric(data['bore'], errors = 'coerce')
most_common_bore = data['bore'].value_counts().idxmax()
print(most_common_bore)
data['bore'] = data['bore'].fillna(most_common_bore)

3.62


# Doors

In [11]:
most_common_doors = data['num-of-doors'].value_counts().idxmax()
data['num-of-doors'] = data['num-of-doors'].fillna(most_common_doors)

# Drop non priced

In [12]:
data = data.dropna(subset=['price'], axis = 0)
print(f'missing data: {data.isnull().sum()}')

missing data: symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               4
compression-ratio    0
horsepower           2
peak-rpm             2
city-mpg             0
highway-mpg          0
price                0
dtype: int64


In [13]:
print(data.columns)

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')


In [14]:
print(data['num-of-doors'].unique())

['two' 'four']


In [15]:
import doctest
"""
  >>> print(data['normalized-losses'].iat[1])
  122.0
  >>> print(data['normalized-losses'].iat[15])
  122.0
"""

doctest.testmod()

TestResults(failed=0, attempted=2)

In [16]:
import doctest
"""
  >>> print(data['num-of-doors'].iat[27])
  four
  >>> print(data['bore'].iat[57])
  3.62
"""

doctest.testmod()

**********************************************************************
File "__main__", line 5, in __main__
Failed example:
    print(data['bore'].iat[57])
Expected:
    3.62
Got:
    3.39
[31m**********************************************************************[0m
1 item had failures:
   1 of   2 in __main__
[1;31m***Test Failed*** 1 failure[0m.


TestResults(failed=1, attempted=2)

In [17]:
import doctest
"""
  >>> print(len(data['price']))
  201
"""

doctest.testmod()

TestResults(failed=0, attempted=1)