<a href="https://colab.research.google.com/github/Kassa-Hun/MachineLearning/blob/main/Model/4.2%20Scikit-learn-All-together.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Steps to follow
1. Fill missing values
2. Convert the data into numbers
3. Build a model

In [18]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

data_URL="https://raw.githubusercontent.com/Kassa-Hun/MachineLearning/main/Data/car-sales-extended-missing-data.csv"
data=pd.read_csv(data_URL)

np.random.seed(42)

data.dropna(subset=["Price"],inplace=True)

# Define d/t categogies and transformer Pipeline

cat_features=["Make","Colour"]
cat_trasnformer=Pipeline(steps=[ 
                                ("imputer",SimpleImputer(strategy="constant",fill_value="missing")),
                                ("onehot",OneHotEncoder(handle_unknown="ignore"))
])

door_feature=["Doors"]
door_transformer=Pipeline(steps=[
                                 ("imputer",SimpleImputer(strategy="constant",fill_value=4))
])

num_feature=["Odometer (KM)"]
num_transformer=Pipeline(steps=[
                                ("imputer",SimpleImputer(strategy="mean"))
])

#Setup preprocessing steps (Fill missing values, then convert to numbers)

preprocessor=ColumnTransformer(
    transformers=[
                  ("cat",cat_trasnformer,cat_features),
                  ("door",door_transformer,door_feature),
                  ("num",num_transformer,num_feature)
    ]
)

# Create a preprocessing and modeling Pipeline

model=Pipeline(steps=[
                      ("preprocessor",preprocessor),
                      ("model",RandomForestRegressor())
])

X=data.drop("Price",axis=1)
y=data["Price"]

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

model.fit(X_train,y_train)
model.score(X_test,y_test)


0.22188417408787875

In [29]:
pipe_grid={
    "preprocessor__num__imputer__strategy":["mean","median"],
    "model__n_estimators":[100,200,300,500,1000,1200],
    "model__max_depth":[None,5,10,15],
    "model__max_features":["auto"],
    "model__min_samples_split":[2,4,6]
}

gs_model=GridSearchCV(model,pipe_grid,cv=5,verbose=2)
gs_model.fit(X_train,y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, 

[Parallel(n_jobs=1)]: Done 720 out of 720 | elapsed: 13.5min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                    

In [30]:
gs_model.score(X_test,y_test)

0.338971729313237

In [None]:
0.33180777591164146

In [31]:
gs_model.best_params_

{'model__max_depth': 5,
 'model__max_features': 'auto',
 'model__min_samples_split': 6,
 'model__n_estimators': 1000,
 'preprocessor__num__imputer__strategy': 'mean'}