# <font size='10' color='#1ABC9C'> Feature Engineering </font>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
## When using the 'inline' backend, 
## your matplotlib graphs will be included in your notebook, next to the code.
import seaborn as sns

pd.pandas.set_option('display.max_columns',None)

In [2]:
dataset=pd.read_csv('IncomeData.csv')

##What's the shape of data set
print(dataset.shape)

(4508, 13)


In [3]:
## What are the Top 5 records
dataset.head()

Unnamed: 0,age,yrsed,edcat,yrsempl,income,creddebt,othdebt,default,jobsat,homeown,address,cars,carvalue
0,45,6,1,4,17,0.371518,1.294482,0,4,1,22,1,9.1
1,67,6,1,15,12,0.375552,0.392448,0,3,1,28,1,5.9
2,68,6,1,7,9,0.20097,0.78903,0,5,0,21,1,5.8
3,75,6,1,35,16,0.314096,0.757904,0,4,0,11,1,5.8
4,38,7,1,8,37,0.14319,0.41181,0,3,0,11,1,22.1


## Spliting the Dataset into Training Data and Test Data

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset,dataset['income'], test_size = 0.3,random_state=31) 


In [5]:
X_train.shape ,  X_test.shape

((3155, 13), (1353, 13))

## Check for Missing Values - X_train

In [6]:
## What percentage of Missing Values are there in the each feature

## Create a List of all features with missing values
features_miss_val=[features for features in X_train.columns if X_train[features].isnull().sum()>1] 
print(features_miss_val)

[]


no missing values in the training data set.

## <font size = 8  color='#F1C40F'>Transforming the Skewed Distributions - X_train</font>

### For continuous numerical features
creddebt- Continuous and Numeric Data

othdebt - Continuous and Numeric Data

carvalue- Continuous and Numeric Data

Income - Continuous and Dependent Variable

In [7]:
X_train.head()

Unnamed: 0,age,yrsed,edcat,yrsempl,income,creddebt,othdebt,default,jobsat,homeown,address,cars,carvalue
1727,79,13,2,34,16,0.376672,0.887328,0,4,0,44,2,8.5
880,25,18,4,0,36,4.130604,2.313396,1,2,1,4,1,23.4
983,34,19,4,3,62,1.777912,4.856088,1,2,1,8,1,21.2
2380,65,17,4,23,21,0.926877,1.194123,0,4,0,12,2,9.8
788,29,17,4,3,13,0.25636,0.62764,0,3,1,6,1,5.1


In [8]:
num_features = ['creddebt','othdebt','carvalue','income']

for feature in num_features:
    X_train[feature] = np.log(X_train[feature])
X_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = np.log(X_train[feature])


Unnamed: 0,age,yrsed,edcat,yrsempl,income,creddebt,othdebt,default,jobsat,homeown,address,cars,carvalue
1727,79,13,2,34,2.772589,-0.97638,-0.119541,0,4,0,44,2,2.140066
880,25,18,4,0,3.583519,1.418424,0.838717,1,2,1,4,1,3.152736
983,34,19,4,3,4.127134,0.57544,1.580233,1,2,1,8,1,3.054001
2380,65,17,4,23,3.044522,-0.075934,0.177412,0,4,0,12,2,2.282382
788,29,17,4,3,2.564949,-1.361173,-0.465789,0,3,1,6,1,1.629241


output after transformation

### Handling Rare Categorical Features - X_train
we need to remove those categorical variables that tend to occur less than 1% of the observation

edcat - Categorical Data

default - Categorical Data

jobsat - Categorical Data

homeown - Categorical Data

In [9]:
categ_features = [feature for feature in X_train.columns if 'edcat' in feature or 'default' in feature or 'jobsat' in feature or 'homeown' in feature]
print(categ_features)

['edcat', 'default', 'jobsat', 'homeown']


In [10]:
# finding the rarity of variable in categorcal feature

for feature in categ_features:
    percent_rarity=X_train.groupby(feature)['income'].count()/len(X_train)
    temp_df=percent_rarity[percent_rarity>0.01].index
    X_train[feature]= np.where(X_train[feature].isin(temp_df),X_train[feature],'Rare_Var')

X_train.head(300)

rare_value_edcat = [value for value in X_train['edcat'].values if 'Rare_Var' in value]
print(rare_value_edcat)


rare_value_default = [value for value in X_train['default'].values if 'Rare_Var' in value]
print(rare_value_default)


rare_value_jobsat = [value for value in X_train['jobsat'].values if 'Rare_Var' in value]
print(rare_value_jobsat)


rare_value_homeown = [value for value in X_train['homeown'].values if 'Rare_Var' in value]
print(rare_value_homeown)


[]
[]
[]
[]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature]= np.where(X_train[feature].isin(temp_df),X_train[feature],'Rare_Var')


It seems we don't have any rare value in the training dataset

### Feature Scaling - X-Train
We need to do feature scaling because we have different feature measured with different units 
hence its good to do scaling before proceeding to apply machine learnign Algorithm.
#### Helps in applying M.L. Algo in a better way
###### Remember scaling is not applied on the Dependent Variable (income)

In [11]:
scale_feature = [feature for feature in X_train.columns if feature not in ['income'] ]
print(scale_feature)
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

scaler.fit(X_train[scale_feature])

['age', 'yrsed', 'edcat', 'yrsempl', 'creddebt', 'othdebt', 'default', 'jobsat', 'homeown', 'address', 'cars', 'carvalue']


MinMaxScaler()

In [12]:
X_train = pd.concat([X_train['income'].reset_index(drop=True),pd.DataFrame(scaler.transform(X_train[scale_feature]),columns=scale_feature)],axis=1)

In [13]:
X_train.head()

Unnamed: 0,income,age,yrsed,edcat,yrsempl,creddebt,othdebt,default,jobsat,homeown,address,cars,carvalue
0,2.772589,1.0,0.411765,0.25,0.653846,0.475553,0.424034,0.0,0.75,0.0,0.77193,0.166667,0.354501
1,3.583519,0.114754,0.705882,0.75,0.0,0.717623,0.532861,1.0,0.25,1.0,0.070175,0.0,0.620105
2,4.127134,0.262295,0.764706,0.75,0.057692,0.632413,0.617074,1.0,0.25,1.0,0.140351,0.0,0.594209
3,3.044522,0.770492,0.647059,0.75,0.442308,0.566571,0.457758,0.0,0.75,0.0,0.210526,0.166667,0.391828
4,2.564949,0.180328,0.647059,0.75,0.057692,0.436658,0.384711,0.0,0.5,1.0,0.105263,0.0,0.220521


In [14]:
X_train.dtypes

income      float64
age         float64
yrsed       float64
edcat       float64
yrsempl     float64
creddebt    float64
othdebt     float64
default     float64
jobsat      float64
homeown     float64
address     float64
cars        float64
carvalue    float64
dtype: object

In [15]:
X_train.to_csv('X_train.csv',index=False)

### <font color = "Red">We need to do same steps for test data also</font>

## Check for Missing Values - X_test

In [16]:
## What percentage of Missing Values are there in the each feature

## Create a List of all features with missing values
features_miss_val=[features for features in X_test.columns if X_test[features].isnull().sum()>1] 
print(features_miss_val)

[]


## <font size = 8  color='#F1C40F'>Transforming the Skewed Distributions - X_test</font>

### For continuous numerical features
creddebt- Continuous and Numeric Data

othdebt - Continuous and Numeric Data

carvalue- Continuous and Numeric Data

Income - Continuous and Dependent Variable

In [17]:
X_test.head()

Unnamed: 0,age,yrsed,edcat,yrsempl,income,creddebt,othdebt,default,jobsat,homeown,address,cars,carvalue
2022,46,15,3,11,88,0.6622,2.4178,0,4,1,18,2,46.1
1139,53,8,1,14,40,0.21168,0.26832,0,2,1,18,2,15.5
595,33,15,3,6,49,1.20736,2.56564,0,2,1,11,1,25.7
1420,20,12,2,1,18,1.245888,0.608112,1,1,0,0,2,7.5
2486,67,18,4,5,52,0.989976,4.938024,1,1,0,26,2,23.1


In [18]:
num_features = ['creddebt','othdebt','carvalue','income']

for feature in num_features:
    X_test[feature] = np.log(X_test[feature])
X_test.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = np.log(X_test[feature])


Unnamed: 0,age,yrsed,edcat,yrsempl,income,creddebt,othdebt,default,jobsat,homeown,address,cars,carvalue
2022,46,15,3,11,4.477337,-0.412188,0.882858,0,4,1,18,2,3.830813
1139,53,8,1,14,3.688879,-1.55268,-1.315575,0,2,1,18,2,2.74084
595,33,15,3,6,3.89182,0.188436,0.942208,0,2,1,11,1,3.246491
1420,20,12,2,1,2.890372,0.219849,-0.497396,1,1,0,0,2,2.014903
2486,67,18,4,5,3.951244,-0.010075,1.596965,1,1,0,26,2,3.139833


output after transformation

### Handling Rare Categorical Features - X_test
we need to remove those categorical variables that tend to occur less than 1% of the observation

edcat - Categorical Data

default - Categorical Data

jobsat - Categorical Data

homeown - Categorical Data

In [19]:
categ_features = [feature for feature in X_test.columns if 'edcat' in feature or 'default' in feature or 'jobsat' in feature or 'homeown' in feature]
print(categ_features)

['edcat', 'default', 'jobsat', 'homeown']


In [20]:
# finding the rarity of variable in categorcal feature

for feature in categ_features:
    percent_rarity=X_test.groupby(feature)['income'].count()/len(X_test)
    temp_df=percent_rarity[percent_rarity>0.01].index
    X_test[feature]= np.where(X_test[feature].isin(temp_df),X_test[feature],'Rare_Var')

rare_value_edcat = [value for value in X_test['edcat'].values if 'Rare_Var' in value]
print(rare_value_edcat)


rare_value_default = [value for value in X_test['default'].values if 'Rare_Var' in value]
print(rare_value_default)


rare_value_jobsat = [value for value in X_test['jobsat'].values if 'Rare_Var' in value]
print(rare_value_jobsat)


rare_value_homeown = [value for value in X_test['homeown'].values if 'Rare_Var' in value]
print(rare_value_homeown)


[]
[]
[]
[]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature]= np.where(X_test[feature].isin(temp_df),X_test[feature],'Rare_Var')


### Feature Scaling - X-Test
We need to do feature scaling because we have different feature measured with different units 
hence its good to do scaling before proceeding to apply machine learnign Algorithm.
#### Helps in applying M.L. Algo in a better way
###### Remember scaling is not applied on the Dependent Variable (income)

In [21]:
##### checking for infinity
  
temp = X_test.isin([np.inf, -np.inf])
print(temp)

#### printing the count of infinity values")
  
count = np.isinf(temp).values.sum()
print("It contains " + str(count) + " infinite values")

count = np.isnan(temp).values.sum()
print("It contains " + str(count) + " nan values")

X_test.dtypes

        age  yrsed  edcat  yrsempl  income  creddebt  othdebt  default  \
2022  False  False  False    False   False     False    False    False   
1139  False  False  False    False   False     False    False    False   
595   False  False  False    False   False     False    False    False   
1420  False  False  False    False   False     False    False    False   
2486  False  False  False    False   False     False    False    False   
...     ...    ...    ...      ...     ...       ...      ...      ...   
483   False  False  False    False   False     False    False    False   
1600  False  False  False    False   False     False    False    False   
3505  False  False  False    False   False     False    False    False   
2350  False  False  False    False   False     False    False    False   
4417  False  False  False    False   False     False    False    False   

      jobsat  homeown  address   cars  carvalue  
2022   False    False    False  False     False  
1139   Fals

age           int64
yrsed         int64
edcat        object
yrsempl       int64
income      float64
creddebt    float64
othdebt     float64
default      object
jobsat       object
homeown      object
address       int64
cars          int64
carvalue    float64
dtype: object

In [None]:
scale_feature = [feature for feature in X_test.columns if feature not in ['income'] ]
print(scale_feature)
from sklearn.preprocessing import MinMaxScaler
scaler2=MinMaxScaler()

#this line helps to solve the Value error of infinte and large for float 62
X_test = X_test.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
scaler2.fit(X_test[scale_feature])



In [23]:
X_test = pd.concat([X_test['income'].reset_index(drop=True),pd.DataFrame(scaler.transform(X_test[scale_feature]),columns=scale_feature)],axis=1)


In [24]:
X_test.head()

Unnamed: 0,income,age,yrsed,edcat,yrsempl,creddebt,othdebt,default,jobsat,homeown,address,cars,carvalue
0,4.477337,0.459016,0.529412,0.5,0.211538,0.532582,0.537874,0.0,0.75,1.0,0.315789,0.166667,0.797952
1,3.688879,0.57377,0.117647,0.0,0.269231,0.4173,0.288202,0.0,0.25,1.0,0.315789,0.166667,0.512073
2,3.89182,0.245902,0.529412,0.5,0.115385,0.593294,0.544614,0.0,0.25,1.0,0.192982,0.0,0.644696
3,2.890372,0.032787,0.352941,0.25,0.019231,0.596469,0.381121,1.0,0.0,0.0,0.0,0.166667,0.321673
4,3.951244,0.803279,0.705882,0.75,0.096154,0.573228,0.618974,1.0,0.0,0.0,0.45614,0.166667,0.616721


In [25]:
X_test.to_csv('X_test.csv',index=False)

# Now We Are Ready For Feature Selection 