In [1]:
!pip install category_encoders



In [2]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
import joblib

In [3]:
from category_encoders.leave_one_out import LeaveOneOutEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [6]:
# Read the car listings CSV (path is relative to the working directory).
car_dataset = pd.read_csv(filepath_or_buffer = 'car data.csv')

In [7]:
# Per-column count of missing entries.
car_dataset.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [10]:
# Print the frequency table of every non-numeric column, separated by a rule.
non_numeric_columns = car_dataset.select_dtypes(exclude = np.number).columns
for column_name in non_numeric_columns:
    print(car_dataset[column_name].value_counts())
    print('-'*100)


city                        26
corolla altis               16
verna                       14
fortuner                    11
brio                        10
                            ..
Bajaj Discover 100           1
Bajaj Avenger Street 220     1
etios gd                     1
baleno                       1
Bajaj Avenger 150            1
Name: Car_Name, Length: 98, dtype: int64
----------------------------------------------------------------------------------------------------
Petrol    239
Diesel     60
CNG         2
Name: Fuel_Type, dtype: int64
----------------------------------------------------------------------------------------------------
Dealer        195
Individual    106
Name: Seller_Type, dtype: int64
----------------------------------------------------------------------------------------------------
Manual       261
Automatic     40
Name: Transmission, dtype: int64
----------------------------------------------------------------------------------------------------


In [11]:
# Feature column names split by dtype; the target 'Selling_Price' is removed
# from the numeric list so it is never fed to the model as an input.
cat_cols = car_dataset.select_dtypes(exclude = np.number).columns
num_cols = car_dataset.select_dtypes(include = np.number).columns.drop('Selling_Price')

In [13]:
# Single-step sub-pipelines: a leave-one-out encoder intended for the
# categorical features and a standard scaler intended for the numeric ones.
category_pipe = Pipeline([('encode', LeaveOneOutEncoder())])
numeric_pipe = Pipeline([('scale', StandardScaler())])

In [14]:
from sklearn.compose import ColumnTransformer

In [15]:
# Route cat_cols through category_pipe and num_cols through numeric_pipe.
preprocess_pipe = ColumnTransformer(
    transformers = [
        ('cat_pipe', category_pipe, cat_cols),
        ('numeric_pipe', numeric_pipe, num_cols),
    ]
)

In [16]:
# End-to-end model: column preprocessing followed by ordinary linear regression.
# NOTE(review): the step name 'linearrregression' (three r's) is kept verbatim
# because it is the key under which the estimator is addressed in the pipeline.
pipe = Pipeline(steps = [
    ('preprocess', preprocess_pipe),
    ('linearrregression', LinearRegression()),
])

In [17]:
# Target is the selling price; every other column is a feature.
Y = car_dataset['Selling_Price']
X = car_dataset.drop(labels = 'Selling_Price', axis = 'columns')

In [18]:
# Fit preprocessing and regression on the full dataset (no rows are held out).
pipe.fit(X, y = Y)

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat_pipe',
                                                  Pipeline(memory=None,
                                                           steps=[('encode',
                                                                   LeaveOneOutEncoder(cols=None,
                                                                                      drop_invariant=False,
                                                                                      handle_missing='value',
                                                                                      handle_unknown='value',
                                                                                      random_state=None,
                      

In [19]:
# Preview the first five rows.
car_dataset.head(n = 5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [32]:
# List the column labels (used as a reference when building the test row).
car_dataset.columns

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [29]:
# One hand-written listing to sanity-check the fitted pipeline.
# The row is passed as a list of rows with the feature names supplied up
# front, so pandas infers a dtype per column. Building a single column and
# transposing it would leave every column, including the numeric ones, as
# object dtype.
# Value order follows X.columns: Car_Name, Year, Present_Price, Kms_Driven,
# Fuel_Type, Seller_Type, Transmission, Owner.
test_data = pd.DataFrame(
    [['ritz', 2012, 5.60, 10000, "Petrol", 'Individual', 'Automatic', 0]],
    columns = X.columns,
)

In [30]:
# Predicted selling price for the hand-written listing.
pipe.predict(X = test_data)

array([3.13488911])

In [31]:
# Persist the fitted pipeline to 'pipe.pkl' in the working directory.
joblib.dump(value = pipe, filename = 'pipe.pkl')

['pipe.pkl']