# Two Types of Duplicate Features in Machine Learning

## 1. Duplicate Values (Same value for each record)

## Example
<img src="t1.png">

In [1]:
import pandas as pd

In [45]:
ds = {
    
    "Car_Make" : ["Toyota","Toyota","Toyota","Toyota","Toyota","Toyota"],
    "Car_Model": ["Camry","Corola","Camry","Corola","Camry","Camry"],
    "Car_Year" : [2018,2019,2018,2019,2018,2018],
    "Sell_Year" :[2018,2019,2018,2019,2018,2018]
}

In [46]:
data = pd.DataFrame(ds)
data

Unnamed: 0,Car_Make,Car_Model,Car_Year,Sell_Year
0,Toyota,Camry,2018,2018
1,Toyota,Corola,2019,2019
2,Toyota,Camry,2018,2018
3,Toyota,Corola,2019,2019
4,Toyota,Camry,2018,2018
5,Toyota,Camry,2018,2018


# install fast_ml lib
## pip install fast_ml

In [47]:
from fast_ml.utilities import display_all

from fast_ml.feature_selection import get_duplicate_features

## check duplicate column

In [48]:
dub_data = get_duplicate_features(data)
dub_data

Unnamed: 0,Desc,feature1,feature2
0,Duplicate Values,Car_Year,Sell_Year
1,Duplicate Index,Car_Model,Car_Year
2,Duplicate Index,Car_Model,Sell_Year


# Feature 1

In [51]:
duplicate_index_features_list = dub_data.query("Desc=='Duplicate Index'")['feature1'].to_list()

print(duplicate_index_features_list)

['Car_Model', 'Car_Model']


# Store all the duplicate features as a list for removing from the dataset.



# Feature 2

# Index

In [52]:
duplicate_index_features_list = dub_data.query("Desc=='Duplicate Index'")['feature2'].to_list()

print(duplicate_index_features_list)

['Car_Year', 'Sell_Year']


# Before

In [53]:
data.shape

(6, 4)

In [54]:
data.head()

Unnamed: 0,Car_Make,Car_Model,Car_Year,Sell_Year
0,Toyota,Camry,2018,2018
1,Toyota,Corola,2019,2019
2,Toyota,Camry,2018,2018
3,Toyota,Corola,2019,2019
4,Toyota,Camry,2018,2018


# Drop columns

In [55]:
data.drop(columns=duplicate_index_features_list,inplace=True)

# After dropping data

In [56]:
data.shape

(6, 2)

In [57]:
data.head()

Unnamed: 0,Car_Make,Car_Model
0,Toyota,Camry
1,Toyota,Corola
2,Toyota,Camry
3,Toyota,Corola
4,Toyota,Camry


# 2. Duplicate Index (value of two features are different but they occur at the same index)

<img src="t2.png">

In [61]:
data = pd.DataFrame(ds)
data

Unnamed: 0,Car_Make,Car_Model,Car_Year,Sell_Year
0,Toyota,Camry,2018,2018
1,Toyota,Corola,2019,2019
2,Toyota,Camry,2018,2018
3,Toyota,Corola,2019,2019
4,Toyota,Camry,2018,2018
5,Toyota,Camry,2018,2018


In [62]:
dub_data = get_duplicate_features(data)
dub_data

Unnamed: 0,Desc,feature1,feature2
0,Duplicate Values,Car_Year,Sell_Year
1,Duplicate Index,Car_Model,Car_Year
2,Duplicate Index,Car_Model,Sell_Year


# Values

In [65]:
duplicate_index_features_list = dub_data.query("Desc=='Duplicate Values'")['feature2'].to_list()

print(duplicate_index_features_list)

['Sell_Year']


In [68]:
print(data.shape)
data.head()

(6, 4)


Unnamed: 0,Car_Make,Car_Model,Car_Year,Sell_Year
0,Toyota,Camry,2018,2018
1,Toyota,Corola,2019,2019
2,Toyota,Camry,2018,2018
3,Toyota,Corola,2019,2019
4,Toyota,Camry,2018,2018


In [69]:
data.drop(columns=duplicate_index_features_list,inplace=True)

In [70]:
print(data.shape)
data.head()

(6, 3)


Unnamed: 0,Car_Make,Car_Model,Car_Year
0,Toyota,Camry,2018
1,Toyota,Corola,2019
2,Toyota,Camry,2018
3,Toyota,Corola,2019
4,Toyota,Camry,2018


# Duplicate Pairs

In [43]:
from fast_ml.feature_selection import get_duplicate_pairs

In [73]:
data = pd.DataFrame(ds)
data

Unnamed: 0,Car_Make,Car_Model,Car_Year,Sell_Year
0,Toyota,Camry,2018,2018
1,Toyota,Corola,2019,2019
2,Toyota,Camry,2018,2018
3,Toyota,Corola,2019,2019
4,Toyota,Camry,2018,2018
5,Toyota,Camry,2018,2018


In [104]:
dub_pair = get_duplicate_pairs(data)
dub_pair

{'Car_Year': ['Sell_Year']}

## remove Sell_Year

In [106]:
data.drop(columns=dub_pair)

Unnamed: 0,Car_Make,Car_Model,Sell_Year
0,Toyota,Camry,2018
1,Toyota,Corola,2019
2,Toyota,Camry,2018
3,Toyota,Corola,2019
4,Toyota,Camry,2018
5,Toyota,Camry,2018


# constant feature

In [107]:
from fast_ml.feature_selection import get_constant_features

In [109]:
data

Unnamed: 0,Car_Make,Car_Model,Car_Year,Sell_Year
0,Toyota,Camry,2018,2018
1,Toyota,Corola,2019,2019
2,Toyota,Camry,2018,2018
3,Toyota,Corola,2019,2019
4,Toyota,Camry,2018,2018
5,Toyota,Camry,2018,2018


In [111]:
const = get_constant_features(data)
const

Unnamed: 0,Desc,Var,Value,Perc
0,Constant,Car_Make,Toyota,100.0


In [126]:
const_value = const.query("Desc=='Constant'")["Var"].to_list()
const_value

['Car_Make']

In [127]:
data.get(const_value)

Unnamed: 0,Car_Make
0,Toyota
1,Toyota
2,Toyota
3,Toyota
4,Toyota
5,Toyota


# Remove Duplicate rows

In [130]:
data

Unnamed: 0,Car_Make,Car_Model,Car_Year,Sell_Year
0,Toyota,Camry,2018,2018
1,Toyota,Corola,2019,2019
2,Toyota,Camry,2018,2018
3,Toyota,Corola,2019,2019
4,Toyota,Camry,2018,2018
5,Toyota,Camry,2018,2018


In [131]:
data.duplicated()

0    False
1    False
2     True
3     True
4     True
5     True
dtype: bool

In [132]:
data.duplicated().sum()

4

# drop rows 

In [134]:
data.drop_duplicates()

Unnamed: 0,Car_Make,Car_Model,Car_Year,Sell_Year
0,Toyota,Camry,2018,2018
1,Toyota,Corola,2019,2019
