In [1]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib

In [3]:
from category_encoders.leave_one_out import LeaveOneOutEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [4]:
# Read the car listings CSV directly from its raw GitHub URL.
DATA_URL = "https://raw.githubusercontent.com/Laxminarayen/Datascience-Batch-22/main/42.%20Flask-Deployment/car%20data.csv"
car_data = pd.read_csv(DATA_URL)

In [19]:
# Preview the first five rows of the raw data.
car_data.head(n=5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [5]:
# Count the missing values in each column.
car_data.isnull().sum()

Unnamed: 0,0
Car_Name,0
Year,0
Selling_Price,0
Present_Price,0
Kms_Driven,0
Fuel_Type,0
Seller_Type,0
Transmission,0
Owner,0


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
car_data.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [7]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
car_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [9]:
# Print the level frequencies of every non-numeric column, separated by a rule.
for column_name in car_data.select_dtypes(exclude=np.number).columns:
    print(car_data[column_name].value_counts())
    print('-' * 100)

Car_Name
city                        26
corolla altis               16
verna                       14
fortuner                    11
brio                        10
                            ..
Honda CB Trigger             1
Yamaha FZ S                  1
Bajaj Pulsar 135 LS          1
Activa 4g                    1
Bajaj Avenger Street 220     1
Name: count, Length: 98, dtype: int64
----------------------------------------------------------------------------------------------------
Fuel_Type
Petrol    239
Diesel     60
CNG         2
Name: count, dtype: int64
----------------------------------------------------------------------------------------------------
Seller_Type
Dealer        195
Individual    106
Name: count, dtype: int64
----------------------------------------------------------------------------------------------------
Transmission
Manual       261
Automatic     40
Name: count, dtype: int64
------------------------------------------------------------------------------------

In [11]:
# Feature names grouped by dtype. Selling_Price is removed from the numeric
# group because it is the value being predicted, not an input.
cat_cols = car_data.select_dtypes(exclude=np.number).columns
num_cols = car_data.select_dtypes(include=np.number).columns.drop('Selling_Price')

In [12]:
# One single-step pipeline per column group: leave-one-out encoding for the
# non-numeric columns, standard scaling for the numeric ones.
category_pipe = Pipeline([('encode', LeaveOneOutEncoder())])
numeric_pipe = Pipeline([('scale', StandardScaler())])

In [14]:
from sklearn.compose import ColumnTransformer

# Send each column group through its own preprocessing pipeline.
preprocess_pipe = ColumnTransformer(
    transformers=[
        ('cat_pipe', category_pipe, cat_cols),
        ('numeric_pipe', numeric_pipe, num_cols),
    ]
)

In [16]:
# Full model: preprocessing followed by an ordinary least-squares regressor.
pipe = Pipeline(steps=[
    ('preprocess', preprocess_pipe),
    ('linearregression', LinearRegression()),
])

In [17]:
# Separate the inputs (every column except the price) from the target.
X = car_data.drop('Selling_Price', axis=1)
Y = car_data.loc[:, 'Selling_Price']

In [18]:
# Measure performance on held-out rows before the final fit. The split is
# seeded so the reported score is reproducible across runs.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
pipe.fit(X_train, Y_train)
print(f"Hold-out R^2: {pipe.score(X_valid, Y_valid):.3f}")

# Refit on every row so the pipeline used for prediction and saved later is
# trained on the complete data set.
pipe.fit(X, Y)

In [None]:
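# Reference row (Car_Name, Year, Selling_Price, Present_Price, Kms_Driven, Fuel_Type, Seller_Type, Transmission, Owner):
# ritz, 2014, 3.35, 5.59, 27000, Petrol, Dealer, Manual, 0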

In [20]:
# List the column names, to get the field order needed for a hand-built input row.
car_data.columns

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [22]:
# Build a single-row frame with the same columns as X. Year, Present_Price and
# Kms_Driven are given as numbers (not strings) so the columns get numeric
# dtypes and scaling does not depend on implicit string-to-float coercion.
test_data = pd.DataFrame(
    [['ritz', 2010, 5.59, 50000, 'Petrol', 'Dealer', 'Manual', 0]],
    columns=X.columns,
)

In [23]:
# Predict Selling_Price for the single hand-built row.
pipe.predict(test_data)

array([1.69011156])

In [24]:
# Persist the fitted pipeline (preprocessing + regressor) to disk.
MODEL_PATH = 'pipe.pkl'
joblib.dump(pipe, MODEL_PATH)

['pipe.pkl']