In [3]:
import numpy as np
import pandas as pd 

In [4]:
df = pd.read_csv("train.csv",usecols=["Sex"])
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


# Handling Categorical Data

### 1. One Hot Encoding

In [5]:
pd.get_dummies(df,drop_first=True).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [6]:
df = pd.read_csv("Mercedes.csv",usecols=["X0","X1","X2","X3","X4","X5","X6"])
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


### One hot encoding for lots of variables

In [7]:
list_10 = df.X1.value_counts().sort_values(ascending=False).index[:10]
list_10

Index(['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o'], dtype='object')

In [8]:
for categories in list_10:
    df[categories] = np.where(df["X1"]==categories,1,0)
df

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o
0,k,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,k,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,ak,s,as,c,d,aa,d,0,1,0,0,0,0,0,0,0,0
4205,j,o,t,d,d,aa,h,0,0,0,0,0,0,0,0,0,1
4206,ak,v,r,a,d,aa,g,0,0,0,0,1,0,0,0,0,0
4207,al,r,e,f,d,aa,l,0,0,0,0,0,1,0,0,0,0


### 2. Ordinal Number Encoding

In [9]:
import datetime

In [10]:
today_date = datetime.datetime.today()
today_date

datetime.datetime(2021, 11, 24, 18, 28, 11, 92140)

In [11]:
# make list of last 15 days
days = [today_date-datetime.timedelta(x) for x in range(15)]

In [12]:
data = pd.DataFrame({"Day":days})
data.head()

Unnamed: 0,Day
0,2021-11-24 18:28:11.092140
1,2021-11-23 18:28:11.092140
2,2021-11-22 18:28:11.092140
3,2021-11-21 18:28:11.092140
4,2021-11-20 18:28:11.092140


In [13]:
data["weekday"] = data["Day"].dt.day_name()
data.head()

Unnamed: 0,Day,weekday
0,2021-11-24 18:28:11.092140,Wednesday
1,2021-11-23 18:28:11.092140,Tuesday
2,2021-11-22 18:28:11.092140,Monday
3,2021-11-21 18:28:11.092140,Sunday
4,2021-11-20 18:28:11.092140,Saturday


In [14]:
dict = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7
}


In [15]:
data["weekday_ordinal"] = data["weekday"].map(dict)

In [16]:
data.head()

Unnamed: 0,Day,weekday,weekday_ordinal
0,2021-11-24 18:28:11.092140,Wednesday,3
1,2021-11-23 18:28:11.092140,Tuesday,2
2,2021-11-22 18:28:11.092140,Monday,1
3,2021-11-21 18:28:11.092140,Sunday,7
4,2021-11-20 18:28:11.092140,Saturday,6


### 3. Count/Frequency Encoding

In [17]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None,index_col=None) 
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [18]:
columns = [1,3,5,6,7,8,9,13]
data = train_set[columns]

In [19]:
data.columns = ["Emp", "Degree", "Status", "Designation","family_job","Race","Sex","Country"]
data.head()

Unnamed: 0,Emp,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [20]:
data.nunique()

Emp             9
Degree         16
Status          7
Designation    15
family_job      6
Race            5
Sex             2
Country        42
dtype: int64

In [21]:
country_map = data["Country"].value_counts().to_dict()


In [22]:
data["Country_freq"] = data["Country"].map(country_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Country_freq"] = data["Country"].map(country_map)


In [23]:
data.head()

Unnamed: 0,Emp,Degree,Status,Designation,family_job,Race,Sex,Country,Country_freq
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,95


### 4. Target Guided Ordinal Encoding

In [24]:
df = pd.read_csv("train.csv", usecols=["Survived","Cabin"])
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [25]:
df["Cabin"].fillna("Missing",inplace=True)

In [26]:
df["Cabin"] = df["Cabin"].astype(str).str[0]
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [27]:
ordinal_labels = df.groupby(["Cabin"])["Survived"].mean().sort_values().index

In [28]:
ordinal_labels2 = {k:i for i,k in enumerate(ordinal_labels)}
ordinal_labels2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [29]:
df["Cabin_ordinalTarget"] = df["Cabin"].map(ordinal_labels2)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinalTarget
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


### 4. Mean Ordinal Encoding

In [30]:
mean_ordinal = df.groupby(["Cabin"])["Survived"].mean().to_dict()
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [31]:
df["Cabon_meanOrdinal"] = df["Cabin"].map(mean_ordinal)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinalTarget,Cabon_meanOrdinal
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


In [32]:
### 5. Probability Ratio Encoding

In [33]:
df = pd.read_csv("train.csv", usecols=["Survived","Cabin"])
df["Cabin"].fillna("Missing",inplace=True)
df["Cabin"] = df["Cabin"].astype(str).str[0]
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [34]:
prob_df = df.groupby(["Cabin"])["Survived"].mean()

In [35]:
prob_df = pd.DataFrame(prob_df)
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [36]:
prob_df["Died"] = 1-prob_df["Survived"]
prob_df.head()

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [37]:
prob_df["Probability_Ratio"] = prob_df["Survived"]/prob_df["Died"]
prob_df.head()

Unnamed: 0_level_0,Survived,Died,Probability_Ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [38]:
prob_encoded = prob_df["Probability_Ratio"].to_dict()
prob_encoded

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [39]:
df["Cabin_encodedProbability"] = df["Cabin"].map(prob_encoded)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_encodedProbability
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
