In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv("cartrain.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,180,-1,90.0,toyota,gas,std,four,sedan,rwd,front,...,171,mpfi,3.27,3.35,9.2,156.0,5200.0,20,24,15690.0
1,17,0,,bmw,gas,std,four,sedan,rwd,front,...,209,mpfi,3.62,3.39,8.0,182.0,5400.0,15,20,36880.0
2,18,2,121.0,chevrolet,gas,std,two,hatchback,fwd,front,...,61,2bbl,2.91,3.03,9.5,48.0,5100.0,47,53,5151.0
3,199,-1,74.0,volvo,gas,turbo,four,wagon,rwd,front,...,130,mpfi,3.62,3.15,7.5,162.0,5100.0,17,22,18950.0
4,114,0,,peugot,diesel,turbo,four,wagon,rwd,front,...,152,idi,3.7,3.52,21.0,95.0,4150.0,25,25,17075.0


In [4]:
#msno.matrix(df)

In [5]:
df.isna().sum()

Unnamed: 0            0
symboling             0
normalized-losses    37
make                  0
fuel-type             0
aspiration            0
num-of-doors          0
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [6]:
df = df.dropna(subset=["price"],how='any')

In [7]:
#note bunch of functions created that could have been done differnetly

In [8]:
def missing_data_values(df,feature):       #fucntion to show % missing data
    df = df.copy()
    print(f'{feature} has {round(df[feature].isna().sum()/len(df.index),4)}% missing values')
          
for feature in df.columns:
    missing_data_values(df,feature)

Unnamed: 0 has 0.0% missing values
symboling has 0.0% missing values
normalized-losses has 0.1833% missing values
make has 0.0% missing values
fuel-type has 0.0% missing values
aspiration has 0.0% missing values
num-of-doors has 0.0% missing values
body-style has 0.0% missing values
drive-wheels has 0.0% missing values
engine-location has 0.0% missing values
wheel-base has 0.0% missing values
length has 0.0% missing values
width has 0.0% missing values
height has 0.0% missing values
curb-weight has 0.0% missing values
engine-type has 0.0% missing values
num-of-cylinders has 0.0% missing values
engine-size has 0.0% missing values
fuel-system has 0.0% missing values
bore has 0.0222% missing values
stroke has 0.0222% missing values
compression-ratio has 0.0% missing values
horsepower has 0.0111% missing values
peak-rpm has 0.0111% missing values
city-mpg has 0.0% missing values
highway-mpg has 0.0% missing values
price has 0.0% missing values


In [9]:
df = df.drop('normalized-losses',axis=1)

df.head()

Unnamed: 0.1,Unnamed: 0,symboling,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,180,-1,toyota,gas,std,four,sedan,rwd,front,104.5,...,171,mpfi,3.27,3.35,9.2,156.0,5200.0,20,24,15690.0
1,17,0,bmw,gas,std,four,sedan,rwd,front,110.0,...,209,mpfi,3.62,3.39,8.0,182.0,5400.0,15,20,36880.0
2,18,2,chevrolet,gas,std,two,hatchback,fwd,front,88.4,...,61,2bbl,2.91,3.03,9.5,48.0,5100.0,47,53,5151.0
3,199,-1,volvo,gas,turbo,four,wagon,rwd,front,104.3,...,130,mpfi,3.62,3.15,7.5,162.0,5100.0,17,22,18950.0
4,114,0,peugot,diesel,turbo,four,wagon,rwd,front,114.2,...,152,idi,3.7,3.52,21.0,95.0,4150.0,25,25,17075.0


In [10]:
df[['bore','stroke','horsepower','peak-rpm']].dtypes

bore          float64
stroke        float64
horsepower    float64
peak-rpm      float64
dtype: object

In [11]:
missing_data_columns = ['bore','stroke','horsepower','peak-rpm']

def replacing_missing_values_by_mode(feature):
    df[feature] = df[feature].fillna(df[feature].mode()[0])      #there are different modes, use [0]
    
for feature in missing_data_columns:
    replacing_missing_values_by_mode(feature)

In [12]:
df.isna().sum()

Unnamed: 0           0
symboling            0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [13]:
df.dtypes

Unnamed: 0             int64
symboling              int64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                float64
dtype: object

In [14]:
#list numerical values only

numerical_features = [feature for feature in df.columns if df[feature].dtypes != 'object']

In [15]:
numerical_features

['Unnamed: 0',
 'symboling',
 'wheel-base',
 'length',
 'width',
 'height',
 'curb-weight',
 'engine-size',
 'bore',
 'stroke',
 'compression-ratio',
 'horsepower',
 'peak-rpm',
 'city-mpg',
 'highway-mpg',
 'price']