In [6]:
import pandas as pd

In [7]:
# Load the car price dataset (car model, mileage, sell price, age) into a DataFrame.
df = pd.read_csv("carprices.csv")

In [8]:
df

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


#### Most ML algorithms need numeric inputs, so categorical (non-numeric) variables have to be converted to numeric features before building a model. This process is known as categorical data encoding.

#### Two commonly used methods are scikit-learn's OneHotEncoder (OHE) and the pandas get_dummies function.

#### get_dummies is a bit more convenient, which is probably why it is so popular. As a rule of thumb, get_dummies suits exploratory analysis, whereas OneHotEncoder suits computation and estimation, because a fitted encoder can apply exactly the same encoding to new data.

## The Dummy Variable Trap

#### When dealing with categorical features, a common convention is to drop one of the new columns from each feature. The argument comes from statistics: without dropping a column, the dummy columns of a feature sum to 1 in every row, so any one of them can be computed exactly from the others. For example, encoding gender as two variables, is_male and is_female, produces two features which are perfectly negatively correlated. This is called the dummy variable trap: perfect multicollinearity between the predictors.

#### One-hot encoding should not be performed if the number of categories is very high, because it creates one column per category and results in sparse data. Depending on the use case, some exploratory data analysis and feature engineering may be needed first.

## 1. Pandas get_dummies()

In [9]:
# Build one indicator column per distinct value of 'Car Model'.
dummies = pd.get_dummies(data=df['Car Model'])

In [10]:
dummies

Unnamed: 0,Audi A5,BMW X5,Mercedez Benz C class
0,False,True,False
1,False,True,False
2,False,True,False
3,False,True,False
4,False,True,False
5,True,False,False
6,True,False,False
7,True,False,False
8,True,False,False
9,False,False,True


#### Merge it with original dataframe

In [11]:
# Place the indicator columns to the right of the original columns (joined on the row index).
mergedDF = pd.concat([df, dummies], axis=1)

In [12]:
mergedDF

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5,Mercedez Benz C class
0,BMW X5,69000,18000,6,False,True,False
1,BMW X5,35000,34000,3,False,True,False
2,BMW X5,57000,26100,5,False,True,False
3,BMW X5,22500,40000,2,False,True,False
4,BMW X5,46000,31500,4,False,True,False
5,Audi A5,59000,29400,5,True,False,False
6,Audi A5,52000,32000,5,True,False,False
7,Audi A5,72000,19300,6,True,False,False
8,Audi A5,91000,12000,8,True,False,False
9,Mercedez Benz C class,67000,22000,6,False,False,True


#### Remove the Car Model column, as it is now represented by the dummy columns

In [16]:
# The text column is no longer needed once its indicator columns exist.
final = mergedDF.drop(columns=['Car Model'])

In [17]:
final

Unnamed: 0,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5,Mercedez Benz C class
0,69000,18000,6,False,True,False
1,35000,34000,3,False,True,False
2,57000,26100,5,False,True,False
3,22500,40000,2,False,True,False
4,46000,31500,4,False,True,False
5,59000,29400,5,True,False,False
6,52000,32000,5,True,False,False
7,72000,19300,6,True,False,False
8,91000,12000,8,True,False,False
9,67000,22000,6,False,False,True


#### Drop any one dummy variable column to avoid the dummy variable trap

In [18]:
# Drop one indicator column; a row with both remaining indicators False is then
# the dropped category ('Mercedez Benz C class').
finalDF = final.drop(columns=['Mercedez Benz C class'])

In [19]:
finalDF

Unnamed: 0,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5
0,69000,18000,6,False,True
1,35000,34000,3,False,True
2,57000,26100,5,False,True
3,22500,40000,2,False,True
4,46000,31500,4,False,True
5,59000,29400,5,True,False
6,52000,32000,5,True,False
7,72000,19300,6,True,False
8,91000,12000,8,True,False
9,67000,22000,6,False,False


In [20]:
from sklearn.linear_model import LinearRegression

In [21]:
# Estimator used for the price regression.
model = LinearRegression()

# Target: the sell price. Features: every other column of finalDF.
y = finalDF.loc[:, 'Sell Price($)']
x = finalDF.drop(columns='Sell Price($)')

In [22]:
x

Unnamed: 0,Mileage,Age(yrs),Audi A5,BMW X5
0,69000,6,False,True
1,35000,3,False,True
2,57000,5,False,True
3,22500,2,False,True
4,46000,4,False,True
5,59000,5,True,False
6,52000,5,True,False
7,72000,6,True,False
8,91000,8,True,False
9,67000,6,False,False


In [23]:
y

0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: Sell Price($), dtype: int64

In [24]:
# Fit the regression on the get_dummies-encoded features.
model.fit(X=x, y=y)

In [25]:
# Predict the sell price for: Mileage=74000, Age(yrs)=4, Audi A5=0, BMW X5=1 (i.e. a BMW X5).
# The query row is wrapped in a DataFrame that reuses the training column labels:
# the model was fitted on a DataFrame, and scikit-learn warns
# ("X does not have valid feature names") when it is then handed a bare list.
# Labelling the row also makes explicit which value belongs to which feature.
model.predict(pd.DataFrame([[74000, 4, 0, 1]], columns=x.columns))



array([19519.56914769])

## 2. Sklearn's OneHotEncoder

In [26]:
df

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [27]:
from sklearn.preprocessing import OneHotEncoder

#### Extract categorical columns from the dataframe

In [28]:
# Names of the object-dtype columns of df, as a plain Python list.
categorical_columns = list(df.select_dtypes(include='object').columns)

In [32]:
categorical_columns

['Car Model']

In [33]:
# sparse_output=False: the encoder returns a dense array instead of a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)

In [34]:
# Learn the categories of the categorical columns and encode them in a single step.
one_hot_encoded = encoder.fit_transform(X=df[categorical_columns])

In [35]:
one_hot_encoded

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

#### Create a dataframe with one hot encoded columns

In [36]:
# Wrap the encoded array in a DataFrame whose columns are named after the
# encoded feature and category (e.g. 'Car Model_BMW X5').
# index=df.index keeps the rows labelled exactly like the source frame; without it
# the new frame gets a fresh 0..n-1 index, and the column-wise concat that follows
# would misalign rows (and introduce NaNs) if df ever had a filtered or custom index.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

In [37]:
one_hot_df

Unnamed: 0,Car Model_Audi A5,Car Model_BMW X5,Car Model_Mercedez Benz C class
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
5,1.0,0.0,0.0
6,1.0,0.0,0.0
7,1.0,0.0,0.0
8,1.0,0.0,0.0
9,0.0,0.0,1.0


#### Concatenate the one-hot encoded dataframe with the original dataframe

In [38]:
# Append the one-hot columns to the right of the original columns (joined on the row index).
df_encoded = pd.concat([df, one_hot_df], axis='columns')

In [39]:
df_encoded

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs),Car Model_Audi A5,Car Model_BMW X5,Car Model_Mercedez Benz C class
0,BMW X5,69000,18000,6,0.0,1.0,0.0
1,BMW X5,35000,34000,3,0.0,1.0,0.0
2,BMW X5,57000,26100,5,0.0,1.0,0.0
3,BMW X5,22500,40000,2,0.0,1.0,0.0
4,BMW X5,46000,31500,4,0.0,1.0,0.0
5,Audi A5,59000,29400,5,1.0,0.0,0.0
6,Audi A5,52000,32000,5,1.0,0.0,0.0
7,Audi A5,72000,19300,6,1.0,0.0,0.0
8,Audi A5,91000,12000,8,1.0,0.0,0.0
9,Mercedez Benz C class,67000,22000,6,0.0,0.0,1.0


#### Drop the original categorical columns

In [40]:
# Remove the original text columns now that their one-hot columns are present.
# This cell reassigns its own input, so running it a second time used to raise
# KeyError (the columns are already gone). errors='ignore' makes a re-run a no-op.
df_encoded = df_encoded.drop(columns=categorical_columns, errors='ignore')

In [46]:
df_encoded.head()

Unnamed: 0,Mileage,Sell Price($),Age(yrs),Car Model_Audi A5,Car Model_BMW X5,Car Model_Mercedez Benz C class
0,69000,18000,6,0.0,1.0,0.0
1,35000,34000,3,0.0,1.0,0.0
2,57000,26100,5,0.0,1.0,0.0
3,22500,40000,2,0.0,1.0,0.0
4,46000,31500,4,0.0,1.0,0.0


In [42]:
# Feature matrix: every column except the target.
# NOTE(review): the name `input` shadows Python's built-in input(); it is kept
# because later cells refer to it. Consider renaming it (e.g. `features`) together with those cells.
input = df_encoded.drop(columns='Sell Price($)')

In [47]:
input.head()

Unnamed: 0,Mileage,Age(yrs),Car Model_Audi A5,Car Model_BMW X5,Car Model_Mercedez Benz C class
0,69000,6,0.0,1.0,0.0
1,35000,3,0.0,1.0,0.0
2,57000,5,0.0,1.0,0.0
3,22500,2,0.0,1.0,0.0
4,46000,4,0.0,1.0,0.0


In [44]:
# Target vector: the sell price column.
output = df_encoded.loc[:, 'Sell Price($)']

In [48]:
output.head()

0    18000
1    34000
2    26100
3    40000
4    31500
Name: Sell Price($), dtype: int64

In [49]:
# Fit on the OneHotEncoder-encoded features. This reuses the `model` object
# created in the get_dummies section rather than constructing a new estimator.
model.fit(X=input, y=output)

In [50]:
# Predict the sell price for: Mileage=74000, Age(yrs)=4, and the one-hot columns
# Audi A5=0, BMW X5=1, Mercedez Benz C class=0 (i.e. a BMW X5).
# The query row is wrapped in a DataFrame that reuses the training column labels:
# the model was fitted on a DataFrame, and scikit-learn warns
# ("X does not have valid feature names") when it is then handed a bare list.
model.predict(pd.DataFrame([[74000, 4, 0, 1, 0]], columns=input.columns))



array([19519.56914769])

### Both methods give the same answer :)