In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found there.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
# Load the car listings; the relative path is resolved against the notebook's working directory.
cars_csv = 'cars.csv'
df = pd.read_csv(cars_csv)

In [7]:
# Preview the first five rows to see the available columns.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


## Checking for unique values

In [None]:
# Number of distinct brands (missing values are not counted).
df.loc[:, 'brand'].nunique(dropna=True)

In [None]:
# Rows per brand, most frequent first.
df.loc[:, 'brand'].value_counts()

In [None]:
# Number of distinct fuel types (missing values are not counted).
df.loc[:, 'fuel'].nunique(dropna=True)

In [None]:
# Rows per fuel type, most frequent first.
df.loc[:, 'fuel'].value_counts()

In [None]:
# Number of distinct ownership categories (missing values are not counted).
df.loc[:, 'owner'].nunique(dropna=True)

In [None]:
# Rows per ownership category, most frequent first.
df.loc[:, 'owner'].value_counts()

## OneHotEncoding

One-hot encoding is a technique used in data preprocessing and feature engineering. It's employed to convert categorical variables into a binary matrix, where each category becomes a binary column and is represented by either a 1 or 0. This is done to make the data suitable for machine learning algorithms that require numerical inputs, as well as to prevent any ordinal relationship assumptions between categories.

![OneHotEncoding](https://miro.medium.com/max/1400/1*ggtP4a5YaRx6l09KQaYOnw.png)


## 1. OneHotEncoding using Pandas

In [None]:
# Expand each listed categorical column into one indicator column per category (K columns each).
categorical_cols = ['fuel', 'owner']
pd.get_dummies(df, columns=categorical_cols)

## 2. K-1 OneHotEncoding

* The reason for using K-1 encoded columns instead of K is to avoid the "dummy variable trap." The dummy variable trap is a situation where including all K columns introduces perfect multicollinearity into the dataset, which can negatively impact models such as linear regression. Perfect multicollinearity occurs when one independent variable can be written exactly as a linear combination of the others (here, the K dummy columns of a feature always sum to 1), making it impossible for the model to separate their individual effects on the target variable.

In [None]:
# Same expansion, but the first level of each column is dropped (K-1 indicator columns each).
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
)

* In machine learning projects we usually avoid pandas' `get_dummies` for one-hot encoding, because pandas does not remember which categories it saw or which column ended up at which position. The same encoding therefore cannot be reliably re-applied to new or test data.

## 3. OneHotEncoding using Sklearn

In [None]:
from sklearn.model_selection import train_test_split

# Features are the first four columns of df; the target is its last column.
# 20% of the rows are held out for testing.
# random_state is meant to be an integer seed: 1 is exactly what the boolean True
# evaluates to (bool is a subclass of int), so the split stays the same while the
# intent becomes explicit.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:4],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=1,
)

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# drop='first' removes one indicator per feature (K-1 encoding); a dense array is
# requested so the encoded block can be stacked with NumPy afterwards.
try:
    # scikit-learn >= 1.2 names the dense/sparse switch `sparse_output`;
    # the old `sparse` keyword no longer exists from 1.4 onwards.
    ohe = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only know the original `sparse` keyword.
    ohe = OneHotEncoder(drop='first', sparse=False)

In [None]:
# Learn the category levels from the training split and encode it in one step.
train_categoricals = X_train[['fuel', 'owner']]
X_train_new = ohe.fit_transform(train_categoricals)

In [None]:
# Encode the test split with the already-fitted encoder (no re-fitting on test data).
test_categoricals = X_test[['fuel', 'owner']]
X_test_new = ohe.transform(test_categoricals)

In [None]:
# (rows, encoded indicator columns) of the transformed training block.
n_rows, n_encoded_cols = X_train_new.shape
(n_rows, n_encoded_cols)

In [None]:
# The two columns that were not one-hot encoded, as a raw NumPy array.
X_train.loc[:, ['brand', 'km_driven']].values

Now `X_train_new` and `X_train[['brand','km_driven']].values` are both NumPy arrays, so we have to stack them horizontally.

In [None]:
# Place the untouched columns and the encoded indicators side by side (column-wise join).
untouched_cols = X_train[['brand', 'km_driven']].values
np.concatenate((untouched_cols, X_train_new), axis=1)

In [None]:
# Shape of the combined training matrix: same rows, untouched + encoded columns.
untouched_cols = X_train[['brand', 'km_driven']].values
combined_train = np.hstack((untouched_cols, X_train_new))
combined_train.shape

## 4. OneHotEncoding with Top Categories

In [None]:
# Frequency of every brand, used below to decide which brands are rare.
brand_column = df['brand']
counts = brand_column.value_counts()

In [None]:
# Brands with at most this many rows are later pooled into a single 'uncommon' label.
threshold = 100
# Kept as the cell's final expression so the number of distinct brands is actually
# displayed; placed before the assignment its value was silently discarded.
df['brand'].nunique()

In [None]:
# Labels of the brands whose row count does not exceed the threshold.
rare_mask = counts.le(threshold)
repl = counts.loc[rare_mask].index

In [None]:
# Collapse every rare brand into 'uncommon', then one-hot encode the reduced label set.
brand_grouped = df['brand'].replace(to_replace=repl, value='uncommon')
pd.get_dummies(brand_grouped)