![ine-divider](https://user-images.githubusercontent.com/7065401/92672068-398e8080-f2ee-11ea-82d6-ad53f7feb5c0.png)
<hr>

# Feature Engineering

## Hands-on Feature Encodings

In this project, you will practice the different feature encodings you learned in previous lessons.


In [1]:
# Necessary imports
import pandas as pd

![orange-divider](https://user-images.githubusercontent.com/7065401/98619088-44ab6000-22e1-11eb-8f6d-5532e68ab274.png)

## Binary Encoding

In [8]:
# Location of the binary-feature dataset, given as a path relative to the working directory
fpath = "datasets/binary_data.csv"

# Read the CSV into a DataFrame and preview its first five rows
df_binary = pd.read_csv(filepath_or_buffer=fpath)
df_binary.head(n=5)

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4
0,0,0,0,T,Y
1,0,1,0,T,Y
2,0,0,0,F,Y
3,0,1,0,F,Y
4,0,0,0,F,N


In [9]:
# Translate the Y/N flags in bin_4 into 1/0 and store them in a new column
df_binary['bin_4_num'] = df_binary['bin_4'].map({'Y': 1, 'N': 0})

# Show the original and the encoded column side by side
df_binary.loc[:, ['bin_4', 'bin_4_num']].head()

Unnamed: 0,bin_4,bin_4_num
0,Y,1
1,Y,1
2,Y,1
3,Y,1
4,N,0


In [13]:
# Same Y/N -> 1/0 conversion, this time overwriting bin_4 itself instead of
# adding a new column
df_binary['bin_4'] = (
    df_binary['bin_4']
    .replace(to_replace={'Y': 1, 'N': 0})
    .astype(int)
)

df_binary.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,bin_4_num
0,0,0,0,T,1,1
1,0,1,0,T,1,1
2,0,0,0,F,1,1
3,0,1,0,F,1,1
4,0,0,0,F,0,0


![orange-divider](https://user-images.githubusercontent.com/7065401/98619088-44ab6000-22e1-11eb-8f6d-5532e68ab274.png)

## Nominal/Multi-category/Discrete Encoding

In [14]:
# Read the nominal-feature dataset and preview its first five rows
fpath = 'datasets/nominal_data.csv'
df_nom = pd.read_csv(filepath_or_buffer=fpath)
df_nom.head(n=5)

Unnamed: 0,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9
0,Green,Triangle,Snake,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51
1,Green,Trapezoid,Hamster,Russia,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21
2,Blue,Trapezoid,Lion,Russia,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0
3,Red,Trapezoid,Snake,Canada,Oboe,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71
4,Red,Trapezoid,Lion,Canada,Oboe,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7


In [20]:
# Dummy encode nom_1 as integer 0/1 columns; drop_first=True leaves out the
# column for the first category
df_nom_dummies = pd.get_dummies(df_nom['nom_1'], drop_first=True, dtype=int)
df_nom_dummies.head()

Unnamed: 0,Polygon,Square,Star,Trapezoid,Triangle
0,0,0,0,0,1
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0


In [28]:
# Check that the number of dummy columns is 1 fewer than the number
# of unique categories in nom_1
n_dummy_columns = len(df_nom_dummies.columns)
n_categories = df_nom["nom_1"].nunique()

print(f'df_non_dummies has {n_dummy_columns} unique categories')
print(f'df_nom has {n_categories} unique categories')

# Enforce the invariant instead of relying on a visual comparison of the
# two printed numbers
assert n_dummy_columns == n_categories - 1, (
    f'expected {n_categories - 1} dummy columns, got {n_dummy_columns}'
)

df_non_dummies has 5 unique categories
df_nom has 6 unique categories


![orange-divider](https://user-images.githubusercontent.com/7065401/98619088-44ab6000-22e1-11eb-8f6d-5532e68ab274.png)

## Ordinal Encoding

In [29]:
# Read the ordinal-feature dataset and preview its first five rows
fpath = 'datasets/ordinal_data.csv'
df_ord = pd.read_csv(filepath_or_buffer=fpath)
df_ord.head(n=5)

Unnamed: 0,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5
0,2,Grandmaster,Cold,h,D,kr
1,1,Grandmaster,Hot,a,A,bF
2,1,Expert,Lava Hot,h,R,Jc
3,1,Grandmaster,Boiling Hot,i,D,kW
4,1,Grandmaster,Freezing,a,R,qP


In [32]:
# List the distinct labels in ord_2 before assigning them an order
df_ord.loc[:, 'ord_2'].unique()

array(['Cold', 'Hot', 'Lava Hot', 'Boiling Hot', 'Freezing', 'Warm'],
      dtype=object)

In [33]:
# Encode ord_2 into 'ord_2_mapped': each label is replaced by its rank,
# from 0 for 'Freezing' up to 5 for 'Lava Hot'
mapping = dict([
    ('Freezing', 0),
    ('Cold', 1),
    ('Warm', 2),
    ('Hot', 3),
    ('Boiling Hot', 4),
    ('Lava Hot', 5),
])

df_ord['ord_2_mapped'] = df_ord['ord_2'].map(mapping)

# Show the original labels next to their codes
df_ord.loc[:, ['ord_2', 'ord_2_mapped']].head()

Unnamed: 0,ord_2,ord_2_mapped
0,Cold,1
1,Hot,3
2,Lava Hot,5
3,Boiling Hot,4
4,Freezing,0


In [None]:
# Alternative: list the categories once, in rank order, and derive the integer
# codes from each label's position. The order of this list is what defines the
# encoding, so it must run from the lowest rank to the highest.
categories = ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot']

# Pair every label with its position in the list
mapping = dict(zip(categories, range(len(categories))))

# Replace each label in the column with its code
df_ord['ord_2_mapped'] = df_ord['ord_2'].map(mapping)

# Preview the original labels next to their codes
df_ord.loc[:, ['ord_2', 'ord_2_mapped']].head()

Unnamed: 0,ord_2,ord_2_mapped
0,Cold,1
1,Hot,3
2,Lava Hot,5
3,Boiling Hot,4
4,Freezing,0


In [36]:
# First step toward 'ord_4_mapped': inspect which distinct values ord_4 holds
df_ord.loc[:, 'ord_4'].unique()

array(['D', 'A', 'R', 'E', 'P', 'K', 'V', 'Q', 'Z', 'L', 'F', 'T', 'U',
       'S', 'Y', 'B', 'H', 'J', 'N', 'G', 'W', 'I', 'O', 'C', 'X', 'M'],
      dtype=object)

In [40]:
# The ord_4 values are letters, so alphabetical order serves as the ranking
# and the mapping can be generated instead of typed out by hand

# Distinct values in ascending alphabetical order
alphabetical_sorted = df_ord['ord_4'].drop_duplicates().sort_values().tolist()

# Pair each value with its position in the sorted list
alphabetical_mapping = dict(zip(alphabetical_sorted, range(len(alphabetical_sorted))))

df_ord['ord_4_mapped'] = df_ord['ord_4'].map(alphabetical_mapping)

df_ord.loc[:, ['ord_4', 'ord_4_mapped']].head(10)

Unnamed: 0,ord_4,ord_4_mapped
0,D,3
1,A,0
2,R,17
3,D,3
4,R,17
5,E,4
6,P,15
7,K,10
8,V,21
9,Q,16


![orange-divider](https://user-images.githubusercontent.com/7065401/98619088-44ab6000-22e1-11eb-8f6d-5532e68ab274.png)

## Cyclical Encoding

In [41]:
# Read the cyclical-feature dataset and preview its first five rows
fpath = 'datasets/cyclical_data.csv'
df_cyclical = pd.read_csv(filepath_or_buffer=fpath)
df_cyclical.head(n=5)

Unnamed: 0,day,month
0,2,2
1,7,8
2,7,2
3,2,1
4,7,8


In [42]:
# Encode the cyclical column 'month' as integer 0/1 dummy columns;
# drop_first=True leaves out the column for the first month value
df_month_dummies = pd.get_dummies(df_cyclical['month'], drop_first=True, dtype=int)

df_month_dummies.head(10)

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12
0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,0
8,0,0,1,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,0


In [43]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Check the dummy features for collinearity: prepend a constant column, then
# compute the variance inflation factor of every column in the resulting matrix
X = add_constant(df_month_dummies)
vif_values = [
    variance_inflation_factor(X.values, position)
    for position, _ in enumerate(X.columns)
]

# Label each VIF with the column it belongs to
pd.Series(vif_values, index=X.columns)

const    7.344301
2        1.790601
3        1.727955
4        1.476324
5        1.222654
6        1.011880
7        1.387932
8        1.367468
9        1.475247
10       1.427718
11       1.490139
12       1.481168
dtype: float64

In [45]:
# Place the raw 'day' column and the month dummies side by side in one frame
df_features = pd.concat(
    objs=[df_cyclical['day'], df_month_dummies],
    axis='columns',
)

df_features.head()

Unnamed: 0,day,2,3,4,5,6,7,8,9,10,11,12
0,2,1,0,0,0,0,0,0,0,0,0,0
1,7,0,0,0,0,0,0,1,0,0,0,0
2,7,1,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,0,0,0,0,0
4,7,0,0,0,0,0,0,1,0,0,0,0


![orange-divider](https://user-images.githubusercontent.com/7065401/98619088-44ab6000-22e1-11eb-8f6d-5532e68ab274.png)

## Date Features

In [46]:
# Read the daily usage dataset and preview its first five rows
fpath = 'datasets/daily_usage.csv'
df_usage = pd.read_csv(filepath_or_buffer=fpath)
df_usage.head(n=5)

Unnamed: 0,transaction_date,usage
0,2020-10-11 19:22:35,33
1,2020-08-29 07:07:55,23
2,2020-10-15 07:07:25,24
3,2020-01-25 19:55:46,41
4,2020-08-29 21:46:28,18


In [None]:
# Prepare transaction_date for deriving a 'month' feature: parse it into
# datetime values and make it the index of df_usage.
#
# The guard keeps the cell safe to re-run: once transaction_date has become the
# index it is no longer a column, so an unguarded second run would raise a
# KeyError when the column is looked up again.
if 'transaction_date' in df_usage.columns:
    df_usage = (
        df_usage
        # Format date as datetime values
        .assign(transaction_date=pd.to_datetime(df_usage['transaction_date']))
        # Make the date feature the index (returns a new frame; no inplace mutation)
        .set_index('transaction_date')
    )

df_usage.head()

Unnamed: 0_level_0,usage
transaction_date,Unnamed: 1_level_1
2020-10-11 19:22:35,33
2020-08-29 07:07:55,23
2020-10-15 07:07:25,24
2020-01-25 19:55:46,41
2020-08-29 21:46:28,18


In [49]:
# Read the month number of every row from the datetime index and store it in
# a new 'month' column
month_numbers = df_usage.index.month
df_usage['month'] = month_numbers

df_usage.head()

Unnamed: 0_level_0,usage,month
transaction_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-10-11 19:22:35,33,10
2020-08-29 07:07:55,23,8
2020-10-15 07:07:25,24,10
2020-01-25 19:55:46,41,1
2020-08-29 21:46:28,18,8


![orange-divider](https://user-images.githubusercontent.com/7065401/98619088-44ab6000-22e1-11eb-8f6d-5532e68ab274.png)
