In [None]:
# Label encoding: replace each category with an integer code (codes follow the sorted order of the labels)

In [1]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.preprocessing import LabelEncoder
# Toy data: one nominal column containing three distinct colours.
data = {'Category': ['Red', 'Green', 'Blue', 'Red', 'Green']}

# The encoder is created up front so it stays available (fitted) after the cell runs.
le = LabelEncoder()

# Build the frame and attach the integer codes in one chained expression;
# the encoder learns the label -> code mapping from the very column it encodes.
df = pd.DataFrame(data).assign(
    Category_LabelEncoded=lambda frame: le.fit_transform(frame['Category'])
)

print(df)

  Category  Category_LabelEncoded
0      Red                      2
1    Green                      1
2     Blue                      0
3      Red                      2
4    Green                      1


In [None]:
# One-hot encoding: expand 'Category' into one indicator column per distinct value.
# `df` is the frame built in the label-encoding cell above, so the
# 'Category_LabelEncoded' column is carried through unchanged.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 defaults to
# True/False booleans, which would not match the 0/1 output recorded below.
df_encoded = pd.get_dummies(df, columns=['Category'], prefix='Category', dtype=int)
print(df_encoded)


   Category_LabelEncoded  Category_Blue  Category_Green  Category_Red
0                      2              0               0             1
1                      1              0               1             0
2                      0              1               0             0
3                      2              0               0             1
4                      1              0               1             0


In [None]:
# Ordinal encoding: replace each category with an integer that preserves the categories' natural order

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Explicit rank for every size; the integers carry the ordering Small < Medium < Large.
ordinal_mapping = {'Small': 1, 'Medium': 2, 'Large': 3}

# Toy data: one column whose categories have a natural order.
data = {'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small']}

# Build the frame and add the encoded column by looking each size up in the mapping.
df = pd.DataFrame(data).assign(
    Size_OrdinalEncoded=lambda frame: frame['Size'].map(ordinal_mapping)
)

print(df)

     Size  Size_OrdinalEncoded
0   Small                    1
1  Medium                    2
2   Large                    3
3  Medium                    2
4   Small                    1


In [None]:
# Frequency encoding: replace each category with the number of times it occurs in the column

In [None]:
# Toy data: colours with differing occurrence counts.
data = {'Color': ['Red', 'Blue', 'Green', 'black', 'Red', 'Red', 'Blue', 'Green', 'pink']}
df = pd.DataFrame(data)

# Lookup table {category: occurrence count}, kept as a plain dict.
color_column = df['Color']
frequency_map = color_column.value_counts().to_dict()

# Replace every colour by its count and store the result as a new column.
df = df.assign(Color_FrequencyEncoded=color_column.map(frequency_map))

print(df)


   Color  Color_FrequencyEncoded
0    Red                       3
1   Blue                       2
2  Green                       2
3  black                       1
4    Red                       3
5    Red                       3
6   Blue                       2
7  Green                       2
8   pink                       1


In [None]:
# Target encoding: replace each category with the mean of the target variable for that category

In [None]:
# Toy data: a categorical feature plus a binary target.
data = {
    'Category': ['Red', 'Green', 'Blue', 'Red', 'Green'],
    'Target': [1, 0, 1, 0, 1],
}
df = pd.DataFrame(data)

# Per-category mean of the target, indexed by category.
# Note: the means are computed from the same rows they are then applied to.
target = df['Target'].groupby(df['Category']).mean()

# Look each row's category up in the per-category means.
df = df.assign(Category_TargetEncoded=df['Category'].map(target))
print(df)


  Category  Target  Category_TargetEncoded
0      Red       1                     0.5
1    Green       0                     0.5
2     Blue       1                     1.0
3      Red       0                     0.5
4    Green       1                     0.5


In [None]:
# Binary encoding: give each category an integer code and spread that code's binary digits across columns

In [None]:
# Install the category_encoders library
!pip install category_encoders



In [None]:
import pandas as pd
import category_encoders as ce
# Toy data: four distinct colours, some repeated.
data = {'Color': ['Blue', 'Green', 'Green', 'red', 'red', 'pink', 'pink']}
df = pd.DataFrame(data)

# Binary encoder restricted to the 'Color' column.
encoder = ce.BinaryEncoder(cols=['Color'])

# Learn the encoding from the frame, then apply it to that same frame.
encoder.fit(df)
df_binary = encoder.transform(df)
print(df_binary)

   Color_0  Color_1  Color_2
0        0        0        1
1        0        1        0
2        0        1        0
3        0        1        1
4        0        1        1
5        1        0        0
6        1        0        0


In [None]:
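# Hashing trick: map each category to a bucket index by hashing it and taking the result modulo the number of buckets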

In [None]:
import hashlib


def stable_hash(value, n_buckets):
    """Return a reproducible bucket index in ``[0, n_buckets)`` for ``value``.

    The built-in ``hash()`` is salted per interpreter process for strings
    (see ``PYTHONHASHSEED``), so it yields different buckets after every
    kernel restart. A SHA-256 digest of the UTF-8 text is used instead, which
    gives the same bucket for the same category on every run and machine.

    Parameters
    ----------
    value : object
        Category to hash; it is converted with ``str()`` before hashing.
    n_buckets : int
        Number of buckets; must be a positive integer.

    Returns
    -------
    int
        Bucket index between 0 and ``n_buckets - 1``.
    """
    digest = hashlib.sha256(str(value).encode('utf-8')).hexdigest()
    return int(digest, 16) % n_buckets


data = {'Color': ['Red', 'Blue', 'Green', 'Red', 'Green']}
df = pd.DataFrame(data)

# Apply hashing trick
hash_range = 100  # Number of buckets; hash values fall in [0, hash_range)
df['Color_Hash'] = df['Color'].apply(lambda x: stable_hash(x, hash_range))
print(df)


   Color  Color_Hash
0    Red           1
1   Blue          98
2  Green           9
3    Red           1
4  Green           9


In [None]:
# Handling categorical data - Example: One-Hot Encoding for the 'Gender' column in employees dataset

In [None]:
import pandas as pd
# Location of the employees CSV file.
path_to_dataset = '/content/employees.csv'

# Read the CSV into a DataFrame.
df = pd.read_csv(path_to_dataset)

# Show the first rows before any encoding (heading and preview on separate lines).
print("Initial structure:", df.head(), sep="\n")

# Replace the 'Gender' column with one indicator column per distinct value,
# each named with the 'Gender' prefix; `df` is rebound to the encoded frame.
df = pd.get_dummies(df, columns=['Gender'], prefix='Gender')

# Show the first rows again, now with the encoded columns.
print("Modified structure after one-hot encoding:", df.head(), sep="\n")

Initial structure:
  First Name  Gender Start Date Last Login Time  Salary  Bonus %  \
0    Douglas    Male   8/6/1993        12:42 PM   97308    6.945   
1     Thomas    Male  3/31/1996         6:53 AM   61933    4.170   
2      Maria  Female  4/23/1993        11:17 AM  130590   11.858   
3      Jerry    Male   3/4/2005         1:00 PM  138705    9.340   
4      Larry    Male  1/24/1998         4:47 PM  101004    1.389   

  Senior Management             Team  
0              True        Marketing  
1              True              NaN  
2             False          Finance  
3              True          Finance  
4              True  Client Services  
Modified structure after one-hot encoding:
  First Name Start Date Last Login Time  Salary  Bonus % Senior Management  \
0    Douglas   8/6/1993        12:42 PM   97308    6.945              True   
1     Thomas  3/31/1996         6:53 AM   61933    4.170              True   
2      Maria  4/23/1993        11:17 AM  130590   11.858     