In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Load the car sales dataset from a CSV file located relative to the working directory.
car_sales = pd.read_csv("car-sales-extended.csv")

In [7]:
# Display the DataFrame (the rendered output is truncated to the first and last rows).
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
...,...,...,...,...,...
995,Toyota,Black,35820,4,32042
996,Nissan,White,155144,3,5716
997,Nissan,Blue,66604,4,31570
998,Honda,White,215883,4,4001


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
car_sales.describe()

Unnamed: 0,Odometer (KM),Doors,Price
count,1000.0,1000.0,1000.0
mean,131073.708,4.014,16045.665
std,68859.723885,0.379405,8630.794219
min,10148.0,3.0,2796.0
25%,71238.0,4.0,9481.5
50%,131202.0,4.0,14264.0
75%,192372.75,4.0,20738.75
max,249860.0,5.0,52458.0


In [12]:
# Split the data into features (x) and target (y)

x = car_sales.drop("Price", axis=1)
y = car_sales["Price"]


# Split the data into training and test sets.
# train_test_split returns the pieces in the order
# (x_train, x_test, y_train, y_test). Unpacking them in any other order
# silently swaps the 80% training portion with the 20% test portion.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [16]:
# Build a machine learning model
from sklearn.ensemble import RandomForestRegressor

# Instantiate the model with its default hyperparameters
model = RandomForestRegressor()

# x_train still contains the raw string columns ("Make", "Colour"), so fitting
# fails with a ValueError. The error is the point of this cell, so catch it and
# display it instead of letting it abort a full "Restart & Run All".
try:
    model.fit(x_train, y_train)
    print(model.score(x_test, y_test))
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: could not convert string to float: 'Nissan'

In [18]:
# The error above occurred because a machine learning model can only learn from numeric data.
# Every feature passed to the model must be represented as numbers,
# so we have to convert the object (string) columns into numbers.

In [19]:
car_sales.info()

# The info output shows two object-dtype columns (Make and Colour); these must be converted to numbers before modelling.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Make           1000 non-null   object
 1   Colour         1000 non-null   object
 2   Odometer (KM)  1000 non-null   int64 
 3   Doors          1000 non-null   int64 
 4   Price          1000 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 39.2+ KB


In [20]:
# List the distinct values in the "Make" column.
car_sales["Make"].unique()

array(['Honda', 'BMW', 'Toyota', 'Nissan'], dtype=object)

In [23]:
# List the distinct values in the "Colour" column.
car_sales["Colour"].unique()

array(['White', 'Blue', 'Red', 'Green', 'Black'], dtype=object)

In [None]:
# One way to convert these unique values into 1's and 0's is the OneHotEncoder.
# OneHotEncoder is a preprocessing tool in scikit-learn that converts categorical data into a format suitable for machine learning algorithms.
# It transforms each category into a new binary (0 or 1) column, ensuring no ordinal relationships are implied between categories.

In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns to one-hot encode. "Doors" is numeric but is treated as a category here.
catagorical_features = ["Make", "Doors", "Colour"]
one_hot = OneHotEncoder()

# ColumnTransformer applies a preprocessing step to selected columns only.
# Here the categorical columns are one-hot encoded, and every remaining column
# is passed through unchanged (remainder="passthrough").
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, catagorical_features)],
    remainder="passthrough",
)

transformed_x = transformer.fit_transform(x)
transformed_x


# ColumnTransformer is useful for datasets that mix numerical and categorical features:
# it lets different preprocessing steps target specific columns of the same dataset.

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 2.48360e+05]], shape=(1000, 13))

In [30]:
# View the encoded array as a DataFrame; the columns are simply numbered because the array carries no column names.
pd.DataFrame(transformed_x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,154365.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,215883.0


In [None]:
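# So the question is: "Why do we end up with 13 columns?"
# Each unique value of every categorical column became its own column:
# 4 makes + 3 door counts + 5 colours = 12 one-hot columns, plus the passthrough "Odometer (KM)" column = 13.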

In [35]:
# pandas offers a similar encoding through get_dummies. In the output below only
# the string columns (Make, Colour) are expanded into indicator columns; the
# integer "Doors" column is left as a single numeric column, unlike the
# OneHotEncoder step where it was listed among the categorical features.
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,False,True,False,False,False,False,False,False,True
1,5,True,False,False,False,False,True,False,False,False
2,4,False,True,False,False,False,False,False,False,True
3,4,False,False,False,True,False,False,False,False,True
4,3,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...
995,4,False,False,False,True,True,False,False,False,False
996,3,False,False,True,False,False,False,False,False,True
997,4,False,False,True,False,False,True,False,False,False
998,4,False,True,False,False,False,False,False,False,True


In [38]:
# All features are numeric now, so split the encoded data and refit the model.

# Seed NumPy's global RNG first: both the split and the model fit below are
# called without an explicit random_state, so the statement order here matters.
np.random.seed(42)
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y , test_size= 0.2)

# Fit the existing `model` object on the encoded training data.
model.fit(x_train, y_train)

In [42]:
# Score on the held-out test set; for a regressor, score() returns R^2 (the coefficient of determination).
model.score(x_test, y_test)

0.3148175799961177

In [41]:
# Try forests of 10, 20, ..., 90 trees and report each one's test-set score.
# `model` is rebound on every pass, so after the loop it refers to the
# 90-estimator forest.
np.random.seed(42)
for n_trees in range(10, 100, 10):
    print(f"Trying model with {n_trees} estimators...")
    model = RandomForestRegressor(n_estimators=n_trees)
    model.fit(x_train, y_train)
    # score() is R^2 for a regressor; it is shown here as a percentage.
    test_score = model.score(x_test, y_test)
    print(f"Model accuracy on test set: {test_score * 100:.2f}%")

Trying model with 10 estimators...
Model accuracy on test set: 27.44%
Trying model with 20 estimators...
Model accuracy on test set: 29.10%
Trying model with 30 estimators...
Model accuracy on test set: 27.08%
Trying model with 40 estimators...
Model accuracy on test set: 29.59%
Trying model with 50 estimators...
Model accuracy on test set: 30.15%
Trying model with 60 estimators...
Model accuracy on test set: 31.15%
Trying model with 70 estimators...
Model accuracy on test set: 32.18%
Trying model with 80 estimators...
Model accuracy on test set: 29.93%
Trying model with 90 estimators...
Model accuracy on test set: 31.48%


In [43]:
# Save the trained model to disk
import pickle

# Use a context manager so the file handle is flushed and closed even if
# pickling raises part-way through.
with open("random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [45]:
# Load the saved model back from disk and score it on the test set.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
# The context manager closes the file handle as soon as loading finishes.
with open("random_forest_model_1.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.score(x_test, y_test)

0.3148175799961177

In [None]:
#Interpretation of 0.3148:

    # The model explains about 31.5% of the variance in the test data.
    # About 68.5% of the variance in the test data is not captured by the model.
    # The model has room for improvement.