![ine-divider](https://user-images.githubusercontent.com/7065401/92672068-398e8080-f2ee-11ea-82d6-ad53f7feb5c0.png)
<hr>

# Feature Engineering

## Hands-on Feature Encodings

In this project, you will practice the different feature encodings you learned in previous lessons.  


In [1]:
# Necessary imports
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

![orange-divider](https://user-images.githubusercontent.com/7065401/98619088-44ab6000-22e1-11eb-8f6d-5532e68ab274.png)

## Binary Encoding

In [2]:
# Read the binary-feature CSV into a DataFrame and preview its first rows
fpath = 'datasets/binary_data.csv'
df_binary = pd.read_csv(filepath_or_buffer=fpath)
df_binary.head(5)

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4
0,0,0,0,T,Y
1,0,1,0,T,Y
2,0,0,0,F,Y
3,0,1,0,F,Y
4,0,0,0,F,N


In [3]:
# Translate the Y/N flags in bin_4 into 1/0 integers, stored in a new
# column so the original text column is left untouched
df_binary['bin_4_num'] = df_binary['bin_4'].map({'Y': 1, 'N': 0})
df_binary.head(5)

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,bin_4_num
0,0,0,0,T,Y,1
1,0,1,0,T,Y,1
2,0,0,0,F,Y,1
3,0,1,0,F,Y,1
4,0,0,0,F,N,0


![orange-divider](https://user-images.githubusercontent.com/7065401/98619088-44ab6000-22e1-11eb-8f6d-5532e68ab274.png)

## Nominal/Multi-category/Discrete Encoding

In [4]:
# Read the nominal-feature CSV into a DataFrame and preview its first rows.
# The path is relative to the notebook's working directory, matching the
# other dataset loads in this notebook, so it does not depend on one
# machine's absolute directory layout.
fpath = 'datasets/nominal_data.csv'
df_nom = pd.read_csv(fpath)
df_nom.head()

Unnamed: 0,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9
0,Green,Triangle,Snake,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51
1,Green,Trapezoid,Hamster,Russia,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21
2,Blue,Trapezoid,Lion,Russia,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0
3,Red,Trapezoid,Snake,Canada,Oboe,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71
4,Red,Trapezoid,Lion,Canada,Oboe,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7


In [7]:
# One-hot encode nom_1, dropping the first level so that k categories
# yield k-1 dummy columns
df_nom_dummies = pd.get_dummies(df_nom['nom_1'], drop_first=True)
df_nom_dummies.head(5)

(6,
    Polygon  Square  Star  Trapezoid  Triangle
 0        0       0     0          0         1
 1        0       0     0          1         0
 2        0       0     0          1         0
 3        0       0     0          1         0
 4        0       0     0          1         0)

In [8]:
# Check that the number of dummy columns is 1 fewer than the number
# of unique categories in nom_1
n_categories = df_nom.nom_1.nunique()
n_dummy_cols = df_nom_dummies.shape[1]
print("There are {} unique categories in nom_1".format(n_categories))
print("There are {} columns in dummies".format(n_dummy_cols))

# Enforce the invariant instead of relying on a visual comparison of the
# two printed numbers
assert n_dummy_cols == n_categories - 1, (
    "Expected {} dummy columns, found {}".format(n_categories - 1, n_dummy_cols)
)

There are 6 unique categories in nom_1
There are 5 columns in dummies


![orange-divider](https://user-images.githubusercontent.com/7065401/98619088-44ab6000-22e1-11eb-8f6d-5532e68ab274.png)

## Ordinal Encoding

In [9]:
# Read the ordinal-feature CSV into a DataFrame and preview its first rows
fpath = 'datasets/ordinal_data.csv'
df_ord = pd.read_csv(filepath_or_buffer=fpath)
df_ord.head(5)

Unnamed: 0,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5
0,2,Grandmaster,Cold,h,D,kr
1,1,Grandmaster,Hot,a,A,bF
2,1,Expert,Lava Hot,h,R,Jc
3,1,Grandmaster,Boiling Hot,i,D,kW
4,1,Grandmaster,Freezing,a,R,qP


In [10]:
# List the distinct values of ord_2 before writing a mapping for them
df_ord['ord_2'].unique()

array(['Cold', 'Hot', 'Lava Hot', 'Boiling Hot', 'Freezing', 'Warm'],
      dtype=object)

In [11]:
# Encode ord_2 as integers: each label's code is its position in the
# hand-written ordering below (Freezing lowest, Lava Hot highest)
ord2_mapping = dict(zip(
    ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'],
    range(6),
))
df_ord['ord_2_mapped'] = df_ord['ord_2'].map(ord2_mapping)
df_ord.head(5)

Unnamed: 0,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,ord_2_mapped
0,2,Grandmaster,Cold,h,D,kr,1
1,1,Grandmaster,Hot,a,A,bF,3
2,1,Expert,Lava Hot,h,R,Jc,5
3,1,Grandmaster,Boiling Hot,i,D,kW,4
4,1,Grandmaster,Freezing,a,R,qP,0


In [13]:
# Encode ord_4
# Take the distinct ord_4 values in sorted order and use each value's
# position in that ordering as its integer code
alpha_list = df_ord['ord_4'].sort_values().unique().tolist()
alpha_mapping = dict(zip(alpha_list, range(len(alpha_list))))
df_ord['ord_4_mapped'] = df_ord['ord_4'].map(alpha_mapping)
df_ord.head(5)

Unnamed: 0,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,ord_2_mapped,ord_4_mapped
0,2,Grandmaster,Cold,h,D,kr,1,3
1,1,Grandmaster,Hot,a,A,bF,3,0
2,1,Expert,Lava Hot,h,R,Jc,5,17
3,1,Grandmaster,Boiling Hot,i,D,kW,4,3
4,1,Grandmaster,Freezing,a,R,qP,0,17


![orange-divider](https://user-images.githubusercontent.com/7065401/98619088-44ab6000-22e1-11eb-8f6d-5532e68ab274.png)

## Cyclical Encoding

In [14]:
# Read the cyclical-feature CSV into a DataFrame and preview its first rows
fpath = 'datasets/cyclical_data.csv'
df_cyclical = pd.read_csv(filepath_or_buffer=fpath)
df_cyclical.head(5)

Unnamed: 0,day,month
0,2,2
1,7,8
2,7,2
3,2,1
4,7,8


In [15]:
# Encode the cyclical column 'month' as dummy variables, dropping the
# first level so that k month values yield k-1 columns
df_month_dummies = pd.get_dummies(df_cyclical['month'], drop_first=True)
df_month_dummies.head(5)

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12
0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0


In [16]:
# Check for collinearity in the month dummy features by computing the
# variance inflation factor (VIF) of every column, including the constant
# column that add_constant introduces.
#
# The dummies are cast to float first: recent pandas versions return
# boolean dummy columns, and combining those with the float constant makes
# X.values an object-dtype array that variance_inflation_factor cannot
# process. With integer 0/1 dummies the cast leaves the results unchanged.
X = add_constant(df_month_dummies.astype(float))
pd.Series(
    [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    index=X.columns,
)

const    7.344301
2        1.790601
3        1.727955
4        1.476324
5        1.222654
6        1.011880
7        1.387932
8        1.367468
9        1.475247
10       1.427718
11       1.490139
12       1.481168
dtype: float64

In [19]:
# Place the 'day' column from the cyclical data side by side with the
# month dummy columns in a single frame
day_frame = df_cyclical[['day']]
df_joined = pd.concat([day_frame, df_month_dummies], axis='columns')
df_joined.head(5)

Unnamed: 0,day,2,3,4,5,6,7,8,9,10,11,12
0,2,1,0,0,0,0,0,0,0,0,0,0
1,7,0,0,0,0,0,0,1,0,0,0,0
2,7,1,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,0,0,0,0,0
4,7,0,0,0,0,0,0,1,0,0,0,0


![orange-divider](https://user-images.githubusercontent.com/7065401/98619088-44ab6000-22e1-11eb-8f6d-5532e68ab274.png)

## Date Features

In [20]:
# Read the daily-usage CSV into a DataFrame and preview its first rows
fpath = 'datasets/daily_usage.csv'
df_usage = pd.read_csv(filepath_or_buffer=fpath)
df_usage.head(5)

Unnamed: 0,transaction_date,usage
0,2020-10-11 19:22:35,33
1,2020-08-29 07:07:55,23
2,2020-10-15 07:07:25,24
3,2020-01-25 19:55:46,41
4,2020-08-29 21:46:28,18


In [22]:
# Add a 'month' feature to the usage data: the month number in which each
# transaction occurred, taken from transaction_date.

# Parse transaction_date into datetimes so the .dt accessor can be used
df_usage['transaction_date'] = pd.to_datetime(df_usage['transaction_date'])

# Extract the month number from each timestamp
df_usage['month'] = df_usage['transaction_date'].dt.month
df_usage.head(5)

Unnamed: 0,transaction_date,usage,month
0,2020-10-11 19:22:35,33,10
1,2020-08-29 07:07:55,23,8
2,2020-10-15 07:07:25,24,10
3,2020-01-25 19:55:46,41,1
4,2020-08-29 21:46:28,18,8


![orange-divider](https://user-images.githubusercontent.com/7065401/98619088-44ab6000-22e1-11eb-8f6d-5532e68ab274.png)
