In [134]:
import numpy as np
import pandas as pd
import matplotlib as plt
%matplotlib inline

In [148]:
## Reading the file which has missign data
car_sales_missing = pd.read_csv("C:\Jayaprakash\DataScience\Projects\Test Data\car-sales-extended-missing-data.csv")
car_sales_missing.head(5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [145]:
#Finding out is there any null value
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

# Option 1 : Filling NaN data using Pandas

In [137]:
#Filling null value with hard coded value 
car_sales_missing["Make"].fillna("missing",inplace=True)
car_sales_missing["Colour"].fillna("missing", inplace=True)
car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean(), inplace = True )
car_sales_missing["Doors"].fillna(4, inplace=True)
car_sales_missing.head(3)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0


In [138]:
#Validating is null value replaced for features
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [149]:
# Dropping all null value labels and validating the count of null value label
car_sales_missing.dropna(inplace = True)
car_sales_missing.isna().sum(), len(car_sales_missing)

(Make             0
 Colour           0
 Odometer (KM)    0
 Doors            0
 Price            0
 dtype: int64,
 773)

In [150]:
# Creating features(X) and labels (y)
X = car_sales_missing.drop(["Price"],axis=1)
y = car_sales_missing["Price"]
X.shape, y.shape

((773, 4), (773,))

In [152]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

#Setting up the onehotencoder 
categorical_features = ["Make","Colour","Doors"]
onehotencoder =  OneHotEncoder()
transformer = ColumnTransformer([("onehot",onehotencoder,categorical_features)],remainder="passthrough")    

#Fitting the onehotencoder
transformer_X = transformer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(transformer_X,y,test_size = 0.2)

In [153]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train,y_train)
model.score(X_test,y_test)
#y_pred = model.predict(X_test)
#y_pred

0.2352354511821512

# Option 2 : Filling NaN data using scikit learn

In [317]:
## Reading the csv file and stroing it in Dataframe
sk_car_sales_missing = pd.read_csv("C:\Jayaprakash\DataScience\Projects\Test Data\car-sales-extended-missing-data.csv")

##Checking for nan count
sk_car_sales_missing.isna().sum(), len(sk_car_sales_missing)

(Make             49
 Colour           50
 Odometer (KM)    50
 Doors            50
 Price            50
 dtype: int64,
 1000)

In [318]:
##Dropping only the "price" column
sk_car_sales_missing.dropna(subset=["Price"], inplace =  True)
sk_car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [319]:
## creating Features by retaining all column execpt price
X = sk_car_sales_missing.drop(["Price"], axis=1)
## finalizing Labels
y = sk_car_sales_missing["Price"]

##immeditally split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

In [324]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

##Filling the null values with hard code values
cat_imputer = SimpleImputer(fill_value="missing",strategy="constant")
door_imputer = SimpleImputer(fill_value=4, strategy="constant")
num_imputer = SimpleImputer(strategy="mean")

## Setting the column names , looks 
cat_feature = ["Make","Colour"]
door_feature = ["Doors"]
num_feature = ["Odometer (KM)"]

imputer = ColumnTransformer([("cat_imputer",cat_imputer,cat_feature),
                            ("door_imputer",door_imputer,["Doors"]),
                             ("num_imputer",num_imputer,num_feature)
                            ])

## Please note that this fit is not to fit the model, it is only to fit to change the values of where we have nan values
filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.transform(X_test) ##Only training data should be fit 

## Note : transform returns nparry, hence we need to convert it into DataFrame, hence the below step

sk_car_sales_filled_train = pd.DataFrame(filled_X_train, columns=["Make","Colour","Doors","Odometer (KM)"] )
sk_car_sales_filled_test = pd.DataFrame(filled_X_test, columns=["Make","Colour","Doors","Odometer (KM)"] )
##type(sk_car_sales_filled_train)
##sk_car_sales_filled_train.isna().sum(), sk_car_sales_filled_test.isna().sum()

In [325]:
categorical_features = ["Make","Colour","Doors"]

onehotencoder = OneHotEncoder()
transformer =  ColumnTransformer([("onehotencoder",onehotencoder,categorical_features)], remainder="passthrough")

## Please note that this fit is not to fit the model, it is only to fit to change the values to one hot encoder
transformed_X_train = transformer.fit_transform(sk_car_sales_filled_train)
transformed_X_test = transformer.transform(sk_car_sales_filled_test)

In [326]:
pd.DataFrame(transformed_X_train.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,130350.667129
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,82429.000000
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,178134.000000
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,137233.000000
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,173693.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,162665.000000
756,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,149413.000000
757,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,45513.000000
758,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,130350.667129


In [327]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

model = RandomForestRegressor()
model.fit(transformed_X_train,y_train)
model.score(transformed_X_test,y_test)

0.26116902969720956