The EDA Playlist by KN: https://www.youtube.com/watch?v=6WDFfaYtN6s&list=PLZoTAELRMXVPwYGE2PXD3x0bfKnR0cJjN&index=1

In [3]:
import pandas as pd
import numpy as np

In [10]:
# Read only the six columns X1..X6 from the training CSV; every other column is skipped.
categorical_columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6']
data = pd.read_csv('train.csv', usecols=categorical_columns)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


data source: https://www.kaggle.com/code/aditya1702/mercedes-benz-data-exploration/data

In [11]:
# Report the cardinality of each column: how many distinct labels it holds.
for column_name in data.columns:
    n_labels = len(data[column_name].unique())
    print(f"{column_name} : {n_labels} labels")

X1 : 27 labels
X2 : 44 labels
X3 : 7 labels
X4 : 4 labels
X5 : 29 labels
X6 : 12 labels


In [12]:
# One-hot encode every column (dropping the first level of each) to see how wide
# the frame becomes when all labels are expanded.
one_hot_encoded = pd.get_dummies(data, drop_first=True)
one_hot_encoded.shape

(4209, 117)

In [13]:
# Frequency of each X2 label, most common first; show the 20 most frequent.
x2_counts = data['X2'].value_counts()
x2_counts.sort_values(ascending=False).head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [14]:
# Names of the ten most frequent X2 labels, in descending order of frequency.
top_10 = (
    data['X2']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [16]:
# Add one 0/1 indicator column per frequent label: 1 where X2 equals that label, else 0.
# Rows whose X2 is not among the top labels end up with 0 in every indicator column.
x2_values = data['X2']
for label in top_10:
    matches_label = x2_values == label
    data[label] = np.where(matches_label, 1, 0)

# Show X2 next to the new indicator columns to check the encoding by eye.
data.loc[:, ['X2', *top_10]].head(40)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


## Feature Engineering

- Handling ordinal data - ordinal data encoding
- Data that has a sequence or level of importance

In [2]:
import pandas as pd
import numpy as np
import datetime

In [52]:
# Build a 20-row frame of timestamps: "now" followed by the 19 preceding days,
# each exactly one day earlier than the row above it.
df_base = datetime.datetime.today()
df_date_list = [
    df_base - datetime.timedelta(days=days_back)
    for days_back in range(20)
]
df = pd.DataFrame({'day': df_date_list})
df

Unnamed: 0,day
0,2025-01-14 19:13:39.977794
1,2025-01-13 19:13:39.977794
2,2025-01-12 19:13:39.977794
3,2025-01-11 19:13:39.977794
4,2025-01-10 19:13:39.977794
5,2025-01-09 19:13:39.977794
6,2025-01-08 19:13:39.977794
7,2025-01-07 19:13:39.977794
8,2025-01-06 19:13:39.977794
9,2025-01-05 19:13:39.977794


In [53]:
# Derive the weekday name (e.g. 'Monday') from each timestamp in the 'day' column.
day_names = df['day'].dt.day_name()
df['days_of_week'] = day_names
df.head()

Unnamed: 0,day,days_of_week
0,2025-01-14 19:13:39.977794,Tuesday
1,2025-01-13 19:13:39.977794,Monday
2,2025-01-12 19:13:39.977794,Sunday
3,2025-01-11 19:13:39.977794,Saturday
4,2025-01-10 19:13:39.977794,Friday


In [56]:
# Ordinal encoding for a categorical variable: each weekday name is replaced by
# its position in the week, Monday -> 1 through Sunday -> 7.
weekday_order = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]
weekday_map = {name: position for position, name in enumerate(weekday_order, start=1)}

In [58]:
# Translate each weekday name into its ordinal code using the weekday_map lookup.
df['day_ordinal'] = df['days_of_week'].map(weekday_map)
df.head(10)

Unnamed: 0,day,days_of_week,day_ordinal
0,2025-01-14 19:13:39.977794,Tuesday,2
1,2025-01-13 19:13:39.977794,Monday,1
2,2025-01-12 19:13:39.977794,Sunday,7
3,2025-01-11 19:13:39.977794,Saturday,6
4,2025-01-10 19:13:39.977794,Friday,5
5,2025-01-09 19:13:39.977794,Thursday,4
6,2025-01-08 19:13:39.977794,Wednesday,3
7,2025-01-07 19:13:39.977794,Tuesday,2
8,2025-01-06 19:13:39.977794,Monday,1
9,2025-01-05 19:13:39.977794,Sunday,7


For a larger number of ordinal categories, this manual mapping approach is not suitable.

## Handling missing values

1. Deleting the row - not recommended
2. Replacing with the most frequent category - not recommended
3. Apply classification to identify the missing value
4. Apply unsupervised ML (clustering) to identify the missing value

In [1]:
import pandas as pd

Handling Missing Values using Titanic dataset

In [6]:
# Load the Titanic training set (all columns) from the local 'titanic' folder.
# Note: this rebinds `df`, which held the weekday example frame earlier in the notebook.
df = pd.read_csv('titanic/train.csv')

In [13]:
# Preview the first 20 rows to get a feel for the columns and spot obvious NaN values.
df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [8]:
# (rows, columns) of the loaded frame.
df.shape

(891, 12)

In [14]:
# Count the missing values in each column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

There is a relationship between the missing values in Age and Cabin, so we cannot say that the missingness is unrelated to the other features. These columns therefore cannot be treated as MCAR (Missing Completely At Random).

In [16]:
# Show the rows whose 'Embarked' value is missing.
embarked_missing = df['Embarked'].isna()
df.loc[embarked_missing]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


The missing values in 'Embarked' are actually MCAR (Missing Completely At Random): these 2 missing values do not appear to be related to any other feature.

In [19]:
# Flag missing cabins: cabin_null is 1 where 'Cabin' is NaN and 0 otherwise.

import numpy as np
cabin_is_missing = df['Cabin'].isna()
df['cabin_null'] = np.where(cabin_is_missing, 1, 0)

In [20]:
# Fraction of rows with a missing Cabin: the mean of the 0/1 flag.
# This is a proportion between 0 and 1; multiply by 100 to read it as a percentage.
df['cabin_null'].mean()

0.7710437710437711

In [21]:
# List the column names to confirm the new flag column was added.
# NOTE(review): the recorded output lists a 'cabin_nill' column that no visible cell
# creates — presumably left over from an earlier run of a since-edited cell.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'cabin_nill',
       'cabin_null'],
      dtype='object')

In [25]:
# Remove the misspelled 'cabin_nill' column if it is present.
# No visible cell creates this column, so on a fresh kernel (Restart & Run All) a
# plain drop would raise KeyError. errors='ignore' makes the cell a no-op when the
# column is absent and safe to re-run.
df = df.drop(columns=['cabin_nill'], errors='ignore')

In [27]:
# Re-check the column names after the drop: only 'cabin_null' should remain as the flag column.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'cabin_null'],
      dtype='object')

In [28]:
# Preview the first rows to compare 'Cabin' with the derived 'cabin_null' flag.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,cabin_null
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


In [30]:
## Compare how often Cabin is missing among non-survivors (Survived == 0) and survivors (Survived == 1).
## cabin_null is 1 where Cabin is NaN, so each group's mean is its share of missing cabins;
## the working assumption is that a missing Cabin is more common among non-survivors.
df.groupby('Survived')['cabin_null'].mean()

Survived
0    0.876138
1    0.602339
Name: cabin_null, dtype: float64

- About 87.6% of the passengers who did not survive have a missing Cabin value.
- About 60.2% of the passengers who survived have a missing Cabin value.

#### Capturing NaN values with a new indicator feature

In [10]:
# Reload the Titanic data keeping only the three columns used in this section.
# Note: this rebinds `df`, discarding the frame with the cabin_null column.
selected_columns = ['Age', 'Fare', 'Survived']
df = pd.read_csv('titanic/train.csv', usecols=selected_columns)
df.head(10)

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05
5,0,,8.4583
6,0,54.0,51.8625
7,0,2.0,21.075
8,1,27.0,11.1333
9,1,14.0,30.0708


In [11]:
# Record where Age is missing before it gets imputed: Age_NaN is 1 for NaN ages, 0 otherwise.
age_is_missing = df['Age'].isna()
df['Age_NaN'] = np.where(age_is_missing, 1, 0)

In [12]:
# Preview the first 10 rows to check that Age_NaN is 1 exactly where Age is NaN.
df.head(10)

Unnamed: 0,Survived,Age,Fare,Age_NaN
0,0,22.0,7.25,0
1,1,38.0,71.2833,0
2,1,26.0,7.925,0
3,1,35.0,53.1,0
4,0,35.0,8.05,0
5,0,,8.4583,1
6,0,54.0,51.8625,0
7,0,2.0,21.075,0
8,1,27.0,11.1333,0
9,1,14.0,30.0708,0


In [13]:
# Mean of the known ages (NaN values are skipped by default).
df['Age'].mean()

29.69911764705882

In [14]:
# Median of the known ages (NaN values are skipped by default).
df['Age'].median()

28.0

In [16]:
# Impute missing ages with the median age; the Age_NaN flag still marks which rows
# were originally missing.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selection `df['Age']`: that chained in-place call
# emits a FutureWarning in recent pandas 2.x and, under Copy-on-Write, modifies only
# a temporary object and leaves `df` unchanged.
df['Age'] = df['Age'].fillna(df['Age'].median())
df.head(20)

Unnamed: 0,Survived,Age,Fare,Age_NaN
0,0,22.0,7.25,0
1,1,38.0,71.2833,0
2,1,26.0,7.925,0
3,1,35.0,53.1,0
4,0,35.0,8.05,0
5,0,28.0,8.4583,1
6,0,54.0,51.8625,0
7,0,2.0,21.075,0
8,1,27.0,11.1333,0
9,1,14.0,30.0708,0
