In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np


In [2]:
data = pd.read_csv('train.csv')

In [3]:
print(data.isna().sum())

Unnamed: 0            0
symboling             0
normalized-losses    37
make                  0
fuel-type             0
aspiration            0
num-of-doors          0
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64


In [4]:
data.dropna(subset=['price'], inplace=True)

In [5]:
def missing_data_values(df,feature):
    df = df.copy()
    print(f'the feature {feature} has {round(df[feature].isna().sum()/len(df.index),2)}')

for feature in data.columns :
    missing_data_values(data,feature)

the feature Unnamed: 0 has 0.0
the feature symboling has 0.0
the feature normalized-losses has 0.18
the feature make has 0.0
the feature fuel-type has 0.0
the feature aspiration has 0.0
the feature num-of-doors has 0.0
the feature body-style has 0.0
the feature drive-wheels has 0.0
the feature engine-location has 0.0
the feature wheel-base has 0.0
the feature length has 0.0
the feature width has 0.0
the feature height has 0.0
the feature curb-weight has 0.0
the feature engine-type has 0.0
the feature num-of-cylinders has 0.0
the feature engine-size has 0.0
the feature fuel-system has 0.0
the feature bore has 0.02
the feature stroke has 0.02
the feature compression-ratio has 0.0
the feature horsepower has 0.01
the feature peak-rpm has 0.01
the feature city-mpg has 0.0
the feature highway-mpg has 0.0
the feature price has 0.0


In [6]:
data.drop('normalized-losses',axis=1, inplace=True)

In [7]:

print(data.isna().sum())

Unnamed: 0           0
symboling            0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 4
stroke               4
compression-ratio    0
horsepower           2
peak-rpm             2
city-mpg             0
highway-mpg          0
price                0
dtype: int64


In [8]:
data[['bore','stroke','horsepower','peak-rpm']].dtypes

bore          float64
stroke        float64
horsepower    float64
peak-rpm      float64
dtype: object

In [9]:
missing_data_columns = ['bore','stroke','horsepower','peak-rpm']
def replacing_missing_values_by_mode(feature) :
    data[feature] = data[feature].fillna(data[feature].mode()[0])

In [10]:
for feature in missing_data_columns:
    replacing_missing_values_by_mode(feature)

In [11]:
data.dtypes


Unnamed: 0             int64
symboling              int64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                float64
dtype: object

In [12]:
# numerical values

In [13]:
numerical_features = [feature for feature in data.columns if data[feature].dtypes != 'object']

In [14]:
numerical_features = ['Unnamed: 0',
 'symboling',
 'wheel-base',
 'length',
 'width',
 'height',
 'curb-weight',
 'engine-size',
 'bore',
 'stroke',
 'compression-ratio',
 'horsepower',
 'peak-rpm',
 'city-mpg',
 'highway-mpg',
 'price']

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [16]:
regressor = RandomForestRegressor()
X = data[numerical_features]
X = X.drop('price',axis=1)
y = data['price']


In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [18]:
regressor.fit(X_train,y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [19]:
regressor.score(X_test,y_test)

0.8878985193024753

In [20]:
array_of_importance = regressor.feature_importances_

In [21]:
mask = array_of_importance >= 0.15

In [22]:
copy = data[numerical_features].copy()
copy = copy.loc[:,mask]

In [23]:
numerical_features = ['curb-weight', 'engine-size']

In [24]:
# categorical data
categorical_features = [feature for feature in data.columns if data[feature].dtypes == "object"]

In [27]:
categorical_features = ['make',
 'fuel-type',
 'aspiration',
 'num-of-doors',
 'body-style',
 'drive-wheels',
 'engine-location',
 'engine-type',
 'num-of-cylinders',
 'fuel-system']

In [28]:
copy = data[categorical_features].copy()

In [29]:
copy = pd.get_dummies(copy,drop_first=True)

In [30]:
dummy_features = [feature for feature in copy.columns if feature not in categorical_features ]

In [32]:
dummy_features = ['make_audi',
 'make_bmw',
 'make_chevrolet',
 'make_dodge',
 'make_honda',
 'make_isuzu',
 'make_jaguar',
 'make_mazda',
 'make_mercedes-benz',
 'make_mercury',
 'make_mitsubishi',
 'make_nissan',
 'make_peugot',
 'make_plymouth',
 'make_porsche',
 'make_renault',
 'make_saab',
 'make_subaru',
 'make_toyota',
 'make_volkswagen',
 'make_volvo',
 'fuel-type_gas',
 'aspiration_turbo',
 'num-of-doors_four',
 'num-of-doors_two',
 'body-style_hardtop',
 'body-style_hatchback',
 'body-style_sedan',
 'body-style_wagon',
 'drive-wheels_fwd',
 'drive-wheels_rwd',
 'engine-location_rear',
 'engine-type_l',
 'engine-type_ohc',
 'engine-type_ohcf',
 'engine-type_ohcv',
 'engine-type_rotor',
 'num-of-cylinders_five',
 'num-of-cylinders_four',
 'num-of-cylinders_six',
 'num-of-cylinders_three',
 'num-of-cylinders_twelve',
 'num-of-cylinders_two',
 'fuel-system_2bbl',
 'fuel-system_4bbl',
 'fuel-system_idi',
 'fuel-system_mfi',
 'fuel-system_mpfi',
 'fuel-system_spdi',
 'fuel-system_spfi']

In [33]:
regressor = RandomForestRegressor()
X = copy
y = data['price']

X_train,X_test,y_train, y_test = train_test_split(X,y,test_size=0.2)


In [34]:
regressor.fit(X_train,y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [35]:
regressor.score(X_test,y_test)

0.7207873573110156

In [38]:
importance_array = regressor.feature_importances_

In [39]:
mask = importance_array >= 0.15

In [40]:
mask

array([False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False])

In [41]:
copy = copy.loc[:,mask]

In [42]:
copy.columns

Index(['make_mercedes-benz', 'num-of-cylinders_four'], dtype='object')

In [None]:
categorical_data = ['make_mercedes-benz', 'num-of-cylinders_four']