# Dealing With Categorical Values

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np

## Importing Dataset

In [2]:
dataset = pd.read_csv("Salary_Dataset.csv")

In [3]:
dataset.head()

Unnamed: 0,country,Salary,YearsExperience,Purchased
0,Dubai,39343.0,1.1,No
1,Canada,46205.0,1.3,Yes
2,Canada,37731.0,1.5,No
3,Canada,43525.0,2.0,No
4,USA,39891.0,2.2,No


# Lets perform Encoding

## Applying One-Hot Encoding

<h3> First Do it with pandas </h3>

Categorical value converts into some label/numerical values using some methods- 
1. One-Hot Encoding, Alphabetically sorting
2. Label Encoding

For country:- One-hot Encoding

For Purchased:- Label Encoding

In [4]:
# country_dummy = pd.get_dummies(dataset['country'], dtype=int)
# country_dummy.head()
country_dummy = pd.get_dummies(dataset['country']).astype(int)
country_dummy.head()

Unnamed: 0,Australia,Canada,Dubai,USA
0,0,0,1,0
1,0,1,0,0
2,0,1,0,0
3,0,1,0,0
4,0,0,0,1


In [5]:
# concatenate the dataframes into original dataframes.
dataset = pd.concat([dataset, country_dummy], axis=1)
dataset.head(10)

Unnamed: 0,country,Salary,YearsExperience,Purchased,Australia,Canada,Dubai,USA
0,Dubai,39343.0,1.1,No,0,0,1,0
1,Canada,46205.0,1.3,Yes,0,1,0,0
2,Canada,37731.0,1.5,No,0,1,0,0
3,Canada,43525.0,2.0,No,0,1,0,0
4,USA,39891.0,2.2,No,0,0,0,1
5,Dubai,56642.0,2.9,No,0,0,1,0
6,Canada,60150.0,3.0,Yes,0,1,0,0
7,Australia,54445.0,3.2,No,1,0,0,0
8,Dubai,64445.0,3.2,Yes,0,0,1,0
9,Dubai,57189.0,3.7,No,0,0,1,0


In [6]:
# Drop the country column and rearrange the index of column.
dataset.drop('country', axis=1, inplace=True)
dataset.head()

Unnamed: 0,Salary,YearsExperience,Purchased,Australia,Canada,Dubai,USA
0,39343.0,1.1,No,0,0,1,0
1,46205.0,1.3,Yes,0,1,0,0
2,37731.0,1.5,No,0,1,0,0
3,43525.0,2.0,No,0,1,0,0
4,39891.0,2.2,No,0,0,0,1


In [7]:
dataset= dataset[['Australia', 'Canada', 'Dubai', 'USA', 'YearsExperience', 'Salary', 'Purchased']]
dataset.head()

Unnamed: 0,Australia,Canada,Dubai,USA,YearsExperience,Salary,Purchased
0,0,0,1,0,1.1,39343.0,No
1,0,1,0,0,1.3,46205.0,Yes
2,0,1,0,0,1.5,37731.0,No
3,0,1,0,0,2.0,43525.0,No
4,0,0,0,1,2.2,39891.0,No


In [8]:
# dataset_1.to_csv('New_salary', index=False)

# <h3> Let's do it with Scikit-Learn </h3>
Also Restart your kernel if you are using same notebook


In [8]:
dataset = pd.read_csv("Salary_Dataset.csv")

In [9]:
dataset.head(5)

Unnamed: 0,country,Salary,YearsExperience,Purchased
0,Dubai,39343.0,1.1,No
1,Canada,46205.0,1.3,Yes
2,Canada,37731.0,1.5,No
3,Canada,43525.0,2.0,No
4,USA,39891.0,2.2,No


In [10]:
# Lets first perform label encoding
# Label encoding will be performed on Purchased column as it has only two unique value in it.
from sklearn.preprocessing import LabelEncoder
# have to make an instance 
le = LabelEncoder()
dataset['Purchased'] = le.fit_transform(dataset['Purchased'])
dataset.head()

Unnamed: 0,country,Salary,YearsExperience,Purchased
0,Dubai,39343.0,1.1,0
1,Canada,46205.0,1.3,1
2,Canada,37731.0,1.5,0
3,Canada,43525.0,2.0,0
4,USA,39891.0,2.2,0


In [None]:
# Lets perform one-hot encoding on country column.
# one-hot encoding with ColumnTransformer 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
dataset = pd.DataFrame(ct.fit_transform(dataset))
dataset.columns = ['Australia',	'Canada', 'Dubai', 'USA', 'Salary', 'YearsExperience','Purchased']
dataset.head()


ValueError: Length mismatch: Expected axis has 10 elements, new values have 7 elements

In [11]:
#Lets link the headers name.