In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the home-price records from the CSV in the working directory
# and preview the first five rows.
data = pd.read_csv("homeprices.csv")
data.head(n=5)

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


#### Using pandas to get dummies

In [3]:
# One indicator column per distinct town, requested directly as integers
# (1/0) so no separate cast is needed afterwards.
dummies = pd.get_dummies(data["town"], dtype=int)
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [4]:
# Put the indicator columns to the right of the original columns;
# rows are matched on the index.
merged = pd.concat(objs=[data, dummies], axis="columns")
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [5]:
# The text column "town" is redundant now that the indicator columns encode it.
# One indicator column ("west windsor") is dropped as well: its value is implied
# by the remaining indicators, and keeping all of them would make the features
# perfectly collinear (multicollinearity, the "dummy variable trap").
final = merged.drop(columns=["town", "west windsor"])
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


#### Using the scikit-learn library

In [6]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   town    13 non-null     object
 1   area    13 non-null     int64 
 2   price   13 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 440.0+ bytes


In [7]:
# Labels of the object-dtype columns; the encoding loop below iterates over these.
cat_columns = data.select_dtypes(include="object").columns
cat_columns

Index(['town'], dtype='object')

In [8]:
from sklearn.preprocessing import OneHotEncoder

# Encoder that omits the first category of each feature and returns a dense
# array instead of a sparse matrix.
ohe = OneHotEncoder(
    sparse_output=False,
    drop="first",
)

In [9]:
# Replace each categorical column in `data` with its one-hot encoded columns.
# NOTE: this rebinds `data`, so the cell cannot be re-run on its own: on a second
# run `data[col]` raises KeyError because the column has already been dropped.
# Re-load `data` before re-running.
for col in cat_columns:
    # The encoder expects 2-D input, hence the reshape to a single column.
    col_encoded = ohe.fit_transform(data[col].to_numpy().reshape(-1, 1))
    col_encoded_df = pd.DataFrame(
        col_encoded,
        columns=ohe.get_feature_names_out([col]),
        # Reuse the source index: the column-wise concat below aligns on index
        # labels, so a fresh 0..n-1 index would misalign rows (and introduce
        # NaNs) whenever `data` does not carry a default index.
        index=data.index,
    )
    data = pd.concat([data.drop(columns=col), col_encoded_df], axis=1)

In [10]:
# Display the frame after the categorical columns were replaced by their encoded columns.
data

Unnamed: 0,area,price,town_robinsville,town_west windsor
0,2600,550000,0.0,0.0
1,3000,565000,0.0,0.0
2,3200,610000,0.0,0.0
3,3600,680000,0.0,0.0
4,4000,725000,0.0,0.0
5,2600,585000,0.0,1.0
6,2800,615000,0.0,1.0
7,3300,650000,0.0,1.0
8,3600,710000,0.0,1.0
9,2600,575000,1.0,0.0


In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Re-read the raw file: `data` was overwritten with its encoded form in an earlier cell.
data = pd.read_csv('homeprices.csv')

# Feature frame: every column except 'price'.
X = data.drop('price', axis=1)
categorical_columns = ['town']

# Create a ColumnTransformer to apply one-hot encoding to the categorical columns
# and pass the remaining columns through unchanged.
# sparse_output=False keeps the encoder output dense. With the default (sparse)
# encoder output, ColumnTransformer returns a SciPy sparse matrix whenever the
# overall density is below its sparse_threshold, and the DataFrame construction
# below would then fail.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), categorical_columns),
    ],
    remainder='passthrough',
)

# Apply the transformations
X_encoded = preprocessor.fit_transform(X)

# Display the DataFrame after one-hot encoding; the original row index is kept
# so the encoded rows can still be matched back to `data`.
df_encoded = pd.DataFrame(
    X_encoded,
    columns=preprocessor.get_feature_names_out(X.columns),
    index=X.index,
)
df_encoded


Unnamed: 0,onehot__town_robinsville,onehot__town_west windsor,remainder__area
0,0.0,0.0,2600.0
1,0.0,0.0,3000.0
2,0.0,0.0,3200.0
3,0.0,0.0,3600.0
4,0.0,0.0,4000.0
5,0.0,1.0,2600.0
6,0.0,1.0,2800.0
7,0.0,1.0,3300.0
8,0.0,1.0,3600.0
9,1.0,0.0,2600.0
