# FEATURE EXTRACTION
## - CORE PART OF DATA PREPROCESSING OR ML PIPELINE

1. GENERATE SYNTHETIC DATA
2. EXTRACT/ENCODING
   
    A) STATISTICAL FEATURES[MEAN, STD....]
   
    B) DATA/TIME FEATURES
   
    C) ENCODING CATEGORICAL FEATURES
   
    D) TEXT FEATURE EXTRACTION USING TF-IDF

In [1]:
import numpy as np # numerical operations
import pandas as pd # handling tabular data

In [3]:
np.random.seed(52)

In [5]:
data = {
    'id': range(1,101),
    'age': np.random.randint(19,60,100),
    'income': np.random.normal(10000, 15000, 100),
    'signup_date': pd.date_range(start = '2002-01-01', periods = 100, freq='D'),
    'category':  np.random.choice(['A','B','C'],100),
    'feedback':  np.random.choice(['Great Product','Not Good','Average Experience','Loved it','Bad Service'],100)
    
}
data

{'id': range(1, 101),
 'age': array([47, 30, 32, 42, 41, 47, 51, 24, 30, 46, 54, 54, 52, 54, 52, 24, 35,
        30, 34, 31, 58, 55, 41, 26, 58, 26, 39, 53, 33, 39, 29, 19, 20, 56,
        37, 25, 44, 41, 34, 50, 40, 45, 19, 45, 57, 49, 28, 42, 36, 29, 44,
        44, 50, 29, 32, 20, 31, 39, 30, 28, 35, 41, 31, 47, 22, 49, 51, 28,
        56, 32, 26, 39, 26, 51, 48, 36, 31, 56, 33, 49, 32, 52, 55, 30, 33,
        51, 54, 52, 27, 58, 49, 48, 41, 39, 57, 40, 59, 20, 35, 35]),
 'income': array([   724.35039238,  24952.61795517, -21951.93711528,  27282.40137744,
        -10725.74198216,   -874.4447684 ,   8055.61453057, -16611.16964393,
          8168.31691636,  19775.7238158 ,  23364.36329477,   5712.28877997,
          1056.75751611,  35296.11138941,  34795.52552829,  -2532.36873494,
        -28359.0189666 ,   1710.67432668,    887.25593182,  31052.46221692,
          8322.72160314,   -892.2600232 ,  -2149.16221108,   9017.8283577 ,
        -12508.05760408,  -3801.46735509,  -5441.441938

In [7]:
df = pd.DataFrame(data)
df

Unnamed: 0,id,age,income,signup_date,category,feedback
0,1,47,724.350392,2002-01-01,B,Great Product
1,2,30,24952.617955,2002-01-02,B,Not Good
2,3,32,-21951.937115,2002-01-03,B,Average Experience
3,4,42,27282.401377,2002-01-04,A,Not Good
4,5,41,-10725.741982,2002-01-05,C,Great Product
...,...,...,...,...,...,...
95,96,40,20907.394712,2002-04-06,A,Great Product
96,97,59,33408.353447,2002-04-07,C,Great Product
97,98,20,-5843.810075,2002-04-08,B,Great Product
98,99,35,-23649.447557,2002-04-09,B,Loved it


In [9]:
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback
0,1,47,724.350392,2002-01-01,B,Great Product
1,2,30,24952.617955,2002-01-02,B,Not Good
2,3,32,-21951.937115,2002-01-03,B,Average Experience
3,4,42,27282.401377,2002-01-04,A,Not Good
4,5,41,-10725.741982,2002-01-05,C,Great Product


In [11]:
df.tail()

Unnamed: 0,id,age,income,signup_date,category,feedback
95,96,40,20907.394712,2002-04-06,A,Great Product
96,97,59,33408.353447,2002-04-07,C,Great Product
97,98,20,-5843.810075,2002-04-08,B,Great Product
98,99,35,-23649.447557,2002-04-09,B,Loved it
99,100,35,5704.370782,2002-04-10,B,Average Experience


In [13]:
# Statistical Feature Extraction - insight -> how much income per unit age
df['income_per_age'] = df['income']/df['age']
df

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age
0,1,47,724.350392,2002-01-01,B,Great Product,15.411710
1,2,30,24952.617955,2002-01-02,B,Not Good,831.753932
2,3,32,-21951.937115,2002-01-03,B,Average Experience,-685.998035
3,4,42,27282.401377,2002-01-04,A,Not Good,649.580985
4,5,41,-10725.741982,2002-01-05,C,Great Product,-261.603463
...,...,...,...,...,...,...,...
95,96,40,20907.394712,2002-04-06,A,Great Product,522.684868
96,97,59,33408.353447,2002-04-07,C,Great Product,566.243279
97,98,20,-5843.810075,2002-04-08,B,Great Product,-292.190504
98,99,35,-23649.447557,2002-04-09,B,Loved it,-675.698502


In [15]:
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age
0,1,47,724.350392,2002-01-01,B,Great Product,15.41171
1,2,30,24952.617955,2002-01-02,B,Not Good,831.753932
2,3,32,-21951.937115,2002-01-03,B,Average Experience,-685.998035
3,4,42,27282.401377,2002-01-04,A,Not Good,649.580985
4,5,41,-10725.741982,2002-01-05,C,Great Product,-261.603463


In [17]:
# Compute Z-score of Income -> how far each income value is from the mean in terms of SD
df['income_zscore'] = (df['income']-df['income'].mean())/df['income'].std()
df

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore
0,1,47,724.350392,2002-01-01,B,Great Product,15.411710,-0.416061
1,2,30,24952.617955,2002-01-02,B,Not Good,831.753932,1.045485
2,3,32,-21951.937115,2002-01-03,B,Average Experience,-685.998035,-1.783985
3,4,42,27282.401377,2002-01-04,A,Not Good,649.580985,1.186027
4,5,41,-10725.741982,2002-01-05,C,Great Product,-261.603463,-1.106776
...,...,...,...,...,...,...,...,...
95,96,40,20907.394712,2002-04-06,A,Great Product,522.684868,0.801461
96,97,59,33408.353447,2002-04-07,C,Great Product,566.243279,1.555568
97,98,20,-5843.810075,2002-04-08,B,Great Product,-292.190504,-0.812279
98,99,35,-23649.447557,2002-04-09,B,Loved it,-675.698502,-1.886386


In [19]:
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore
0,1,47,724.350392,2002-01-01,B,Great Product,15.41171,-0.416061
1,2,30,24952.617955,2002-01-02,B,Not Good,831.753932,1.045485
2,3,32,-21951.937115,2002-01-03,B,Average Experience,-685.998035,-1.783985
3,4,42,27282.401377,2002-01-04,A,Not Good,649.580985,1.186027
4,5,41,-10725.741982,2002-01-05,C,Great Product,-261.603463,-1.106776


In [21]:
df.describe()

Unnamed: 0,id,age,income,signup_date,income_per_age,income_zscore
count,100.0,100.0,100.0,100,100.0,100.0
mean,50.5,39.84,7621.459088,2002-02-19 12:00:00,190.706117,-3.8163920000000003e-17
min,1.0,19.0,-35686.808479,2002-01-01 00:00:00,-1115.212765,-2.612528
25%,25.75,30.75,-4677.42361,2002-01-25 18:00:00,-129.924489,-0.7419177
50%,50.5,39.5,8121.949079,2002-02-19 12:00:00,221.787505,0.03019156
75%,75.25,50.0,19929.397113,2002-03-16 06:00:00,519.112839,0.742464
max,100.0,59.0,41058.544752,2002-04-10 00:00:00,2160.97604,2.017059
std,29.011492,11.201299,16577.151507,,512.735237,1.0


In [23]:
df[['age','income','income_per_age','income_zscore']].describe() # df[...] df[[...,...,...,..]]

Unnamed: 0,age,income,income_per_age,income_zscore
count,100.0,100.0,100.0,100.0
mean,39.84,7621.459088,190.706117,-3.8163920000000003e-17
std,11.201299,16577.151507,512.735237,1.0
min,19.0,-35686.808479,-1115.212765,-2.612528
25%,30.75,-4677.42361,-129.924489,-0.7419177
50%,39.5,8121.949079,221.787505,0.03019156
75%,50.0,19929.397113,519.112839,0.742464
max,59.0,41058.544752,2160.97604,2.017059


In [27]:
# Date/Time Feature Extraction
df['signup_day'] = df['signup_date'].dt.day 
df

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore,signup_day
0,1,47,724.350392,2002-01-01,B,Great Product,15.411710,-0.416061,1
1,2,30,24952.617955,2002-01-02,B,Not Good,831.753932,1.045485,2
2,3,32,-21951.937115,2002-01-03,B,Average Experience,-685.998035,-1.783985,3
3,4,42,27282.401377,2002-01-04,A,Not Good,649.580985,1.186027,4
4,5,41,-10725.741982,2002-01-05,C,Great Product,-261.603463,-1.106776,5
...,...,...,...,...,...,...,...,...,...
95,96,40,20907.394712,2002-04-06,A,Great Product,522.684868,0.801461,6
96,97,59,33408.353447,2002-04-07,C,Great Product,566.243279,1.555568,7
97,98,20,-5843.810075,2002-04-08,B,Great Product,-292.190504,-0.812279,8
98,99,35,-23649.447557,2002-04-09,B,Loved it,-675.698502,-1.886386,9


In [31]:
df['signup_month'] = df['signup_date'].dt.month
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore,signup_day,signup_month
0,1,47,724.350392,2002-01-01,B,Great Product,15.41171,-0.416061,1,1
1,2,30,24952.617955,2002-01-02,B,Not Good,831.753932,1.045485,2,1
2,3,32,-21951.937115,2002-01-03,B,Average Experience,-685.998035,-1.783985,3,1
3,4,42,27282.401377,2002-01-04,A,Not Good,649.580985,1.186027,4,1
4,5,41,-10725.741982,2002-01-05,C,Great Product,-261.603463,-1.106776,5,1


In [33]:
df['signup_weekday'] = df['signup_date'].dt.weekday
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore,signup_day,signup_month,signup_weekday
0,1,47,724.350392,2002-01-01,B,Great Product,15.41171,-0.416061,1,1,1
1,2,30,24952.617955,2002-01-02,B,Not Good,831.753932,1.045485,2,1,2
2,3,32,-21951.937115,2002-01-03,B,Average Experience,-685.998035,-1.783985,3,1,3
3,4,42,27282.401377,2002-01-04,A,Not Good,649.580985,1.186027,4,1,4
4,5,41,-10725.741982,2002-01-05,C,Great Product,-261.603463,-1.106776,5,1,5


In [35]:
df['days_from_signup'] =   (pd.Timestamp.today()- df['signup_date']).dt.days
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore,signup_day,signup_month,signup_weekday,days_from_signup
0,1,47,724.350392,2002-01-01,B,Great Product,15.41171,-0.416061,1,1,1,8498
1,2,30,24952.617955,2002-01-02,B,Not Good,831.753932,1.045485,2,1,2,8497
2,3,32,-21951.937115,2002-01-03,B,Average Experience,-685.998035,-1.783985,3,1,3,8496
3,4,42,27282.401377,2002-01-04,A,Not Good,649.580985,1.186027,4,1,4,8495
4,5,41,-10725.741982,2002-01-05,C,Great Product,-261.603463,-1.106776,5,1,5,8494


In [37]:
df['years_from_signup'] =   ((pd.Timestamp.today()- df['signup_date']).dt.days)/365
df.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore,signup_day,signup_month,signup_weekday,days_from_signup,years_from_signup
0,1,47,724.350392,2002-01-01,B,Great Product,15.41171,-0.416061,1,1,1,8498,23.282192
1,2,30,24952.617955,2002-01-02,B,Not Good,831.753932,1.045485,2,1,2,8497,23.279452
2,3,32,-21951.937115,2002-01-03,B,Average Experience,-685.998035,-1.783985,3,1,3,8496,23.276712
3,4,42,27282.401377,2002-01-04,A,Not Good,649.580985,1.186027,4,1,4,8495,23.273973
4,5,41,-10725.741982,2002-01-05,C,Great Product,-261.603463,-1.106776,5,1,5,8494,23.271233


In [39]:
#Ecode Categorical Variables -> one-hot encoding
df_encoded=pd.get_dummies(df, columns = ['category'], prefix='cat')
df_encoded.head()

Unnamed: 0,id,age,income,signup_date,feedback,income_per_age,income_zscore,signup_day,signup_month,signup_weekday,days_from_signup,years_from_signup,cat_A,cat_B,cat_C
0,1,47,724.350392,2002-01-01,Great Product,15.41171,-0.416061,1,1,1,8498,23.282192,False,True,False
1,2,30,24952.617955,2002-01-02,Not Good,831.753932,1.045485,2,1,2,8497,23.279452,False,True,False
2,3,32,-21951.937115,2002-01-03,Average Experience,-685.998035,-1.783985,3,1,3,8496,23.276712,False,True,False
3,4,42,27282.401377,2002-01-04,Not Good,649.580985,1.186027,4,1,4,8495,23.273973,True,False,False
4,5,41,-10725.741982,2002-01-05,Great Product,-261.603463,-1.106776,5,1,5,8494,23.271233,False,False,True


In [41]:
#Text Feature Extraction(TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer


In [43]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['feedback'])
print(tfidf_matrix)

  (0, 4)	0.7071067811865476
  (0, 8)	0.7071067811865476
  (1, 7)	0.7071067811865476
  (1, 3)	0.7071067811865476
  (2, 0)	0.7071067811865476
  (2, 2)	0.7071067811865476
  (3, 7)	0.7071067811865476
  (3, 3)	0.7071067811865476
  (4, 4)	0.7071067811865476
  (4, 8)	0.7071067811865476
  (5, 4)	0.7071067811865476
  (5, 8)	0.7071067811865476
  (6, 4)	0.7071067811865476
  (6, 8)	0.7071067811865476
  (7, 6)	0.7071067811865476
  (7, 5)	0.7071067811865476
  (8, 6)	0.7071067811865476
  (8, 5)	0.7071067811865476
  (9, 1)	0.7071067811865476
  (9, 9)	0.7071067811865476
  (10, 7)	0.7071067811865476
  (10, 3)	0.7071067811865476
  (11, 4)	0.7071067811865476
  (11, 8)	0.7071067811865476
  (12, 4)	0.7071067811865476
  :	:
  (87, 8)	0.7071067811865476
  (88, 7)	0.7071067811865476
  (88, 3)	0.7071067811865476
  (89, 1)	0.7071067811865476
  (89, 9)	0.7071067811865476
  (90, 1)	0.7071067811865476
  (90, 9)	0.7071067811865476
  (91, 4)	0.7071067811865476
  (91, 8)	0.7071067811865476
  (92, 0)	0.7071067811865476

In [45]:
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),columns=vectorizer.get_feature_names_out())
tfidf_df


Unnamed: 0,average,bad,experience,good,great,it,loved,not,product,service
0,0.000000,0.0,0.000000,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.0
1,0.000000,0.0,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.000000,0.0
2,0.707107,0.0,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,0.000000,0.0,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.000000,0.0
4,0.000000,0.0,0.000000,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.0
...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.0,0.000000,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.0
96,0.000000,0.0,0.000000,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.0
97,0.000000,0.0,0.000000,0.000000,0.707107,0.000000,0.000000,0.000000,0.707107,0.0
98,0.000000,0.0,0.000000,0.000000,0.000000,0.707107,0.707107,0.000000,0.000000,0.0


In [47]:
df_combined = pd.concat([df,tfidf_df], axis = 1)
df_combined.head()

Unnamed: 0,id,age,income,signup_date,category,feedback,income_per_age,income_zscore,signup_day,signup_month,...,average,bad,experience,good,great,it,loved,not,product,service
0,1,47,724.350392,2002-01-01,B,Great Product,15.41171,-0.416061,1,1,...,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0
1,2,30,24952.617955,2002-01-02,B,Not Good,831.753932,1.045485,2,1,...,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0,0.0
2,3,32,-21951.937115,2002-01-03,B,Average Experience,-685.998035,-1.783985,3,1,...,0.707107,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,42,27282.401377,2002-01-04,A,Not Good,649.580985,1.186027,4,1,...,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0,0.0
4,5,41,-10725.741982,2002-01-05,C,Great Product,-261.603463,-1.106776,5,1,...,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0


In [49]:
print(df_combined[['feedback']+ list(tfidf_df.columns)].head())


             feedback   average  bad  experience      good     great   it  \
0       Great Product  0.000000  0.0    0.000000  0.000000  0.707107  0.0   
1            Not Good  0.000000  0.0    0.000000  0.707107  0.000000  0.0   
2  Average Experience  0.707107  0.0    0.707107  0.000000  0.000000  0.0   
3            Not Good  0.000000  0.0    0.000000  0.707107  0.000000  0.0   
4       Great Product  0.000000  0.0    0.000000  0.000000  0.707107  0.0   

   loved       not   product  service  
0    0.0  0.000000  0.707107      0.0  
1    0.0  0.707107  0.000000      0.0  
2    0.0  0.000000  0.000000      0.0  
3    0.0  0.707107  0.000000      0.0  
4    0.0  0.000000  0.707107      0.0  
