# **Feature Engineering:**
### 1. Feature Creation
### 2. Feature Enoding
### 3. Feature Selection

# Importing Libraries

In [1]:
# Standard Libraries for Data Manipulation
import pandas as pd 
import numpy as np
pd.set_option('display.max_columns', None)

# For Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preventing warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Loading the dataset
df = pd.read_csv("cleaned_survey.csv")
df.head()

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,Female,United States,No,No,Yes,Often,25,No,Yes,Yes,Not sure,No,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44,Male,United States,No,No,No,Rarely,1000,No,No,Don't know,No,Don't know,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32,Male,Canada,No,No,No,Rarely,25,No,Yes,No,No,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31,Male,United Kingdom,No,Yes,Yes,Often,100,No,Yes,No,Yes,No,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31,Male,United States,No,No,No,Never,500,Yes,Yes,Yes,No,Don't know,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No


## Feature Creation

### Creating a column on the basis of help and facilities provided by employer or company
For this we will use the columns:
> benfits, care_options, seek_help, leave, anonymity, mental_vs_physical

In [3]:
print("Benefits :", df['benefits'].unique())
print("Care Options :", df["care_options"].unique())
print("Seek Help :", df["seek_help"].unique())
print("Leave :", df["leave"].unique())
print("Anonymity :", df["anonymity"].unique())
print("Mental vs Physical :", df["mental_vs_physical"].unique())

Benefits : ['Yes' "Don't know" 'No']
Care Options : ['Not sure' 'No' 'Yes']
Seek Help : ['Yes' "Don't know" 'No']
Leave : ['Somewhat easy' "Don't know" 'Somewhat difficult' 'Very difficult'
 'Very easy']
Anonymity : ['Yes' "Don't know" 'No']
Mental vs Physical : ['Yes' "Don't know" 'No']


In [4]:
def facilties(row):
    '''
    This function will be used to create the new feature 'facilities'
    '''
    label=0

    # benifits
    if row['benefits'] == 'Yes':
        label += 1
    elif row['benefits'] == "No":
        label += -1
    
    # care_options
    if row['care_options'] == 'Yes':
        label += 1
    elif row['care_options'] == "No":
        label += -1
    
    # seek_help
    if row['seek_help'] == 'Yes':
        label += 1
    elif row['seek_help'] == "No":
        label += -1
    
    # leave
    if row['leave'] == 'Very easy':
        label += 1
    elif row['leave'] == 'Somewhat easy':
        label += 0.5
    elif row['leave'] == "Don't know":
        label += 0
    elif row['leave'] == 'Somewhat difficult':
        label += -0.5
    elif row['leave'] == 'Very difficult':
        label += -1
    
    # anonymity
    if row['anonymity'] == 'Yes':
        label += 1
    elif row['anonymity'] == "No":
        label += -1

    # mental_vs_physical
    if row['mental_vs_physical'] == 'Yes':
        label += 1
    elif row['mental_vs_physical'] == "No":
        label += -1
    
    return label

In [5]:
df["facilities"]=df.apply(facilties, axis=1)
df.drop(columns=['benefits', 'care_options', 'seek_help', 'leave',"anonymity","mental_vs_physical"], inplace=True,axis=1)

## Creating a column on the basis of what employees priotize -- mental or physical health
For this we would use following features:
> mental_health_consequence, phys_health_consequence, mental_health_interview, and phys_health_interview

In [6]:
print("Mental Health Consequence :", df['mental_health_consequence'].unique())
print("Physical Health Consequence :", df['phys_health_consequence'].unique())
print("Mental Health Interview :", df['mental_health_interview'].unique())
print("Physical Health Interview :", df['phys_health_interview'].unique())

Mental Health Consequence : ['No' 'Maybe' 'Yes']
Physical Health Consequence : ['No' 'Yes' 'Maybe']
Mental Health Interview : ['No' 'Yes' 'Maybe']
Physical Health Interview : ['Maybe' 'No' 'Yes']


In [7]:
def priotize_mental_health(row):
    '''
    This function will be used to create the new feature 'priortize mental health'
    '''

    label=0

    # mental_health_consequence
    if row['mental_health_consequence'] == 'Yes':
        label += 1
    elif row['mental_health_consequence'] == "No":
        label += -1
    
    # phys_health_consequence
    if row['phys_health_consequence'] == 'Yes':
        label += -1
    elif row['phys_health_consequence'] == "No":
        label += 1
    
    # mental_health_interview
    if row['mental_health_interview'] == 'Yes':
        label += 1
    elif row['mental_health_interview'] == "No":
        label += -1
    
    # phys_health_interview
    if row['phys_health_interview'] == 'Yes':
        label += -1
    elif row['phys_health_interview'] == "No":
        label += 1
    
    return label

In [8]:
df["priotize_mental_health"]=df.apply(priotize_mental_health, axis=1)
df.head()

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,wellness_program,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,obs_consequence,facilities,priotize_mental_health
0,37,Female,United States,No,No,Yes,Often,25,No,Yes,No,No,No,Some of them,Yes,No,Maybe,No,4.5,-1
1,44,Male,United States,No,No,No,Rarely,1000,No,No,Don't know,Maybe,No,No,No,No,No,No,-1.0,1
2,32,Male,Canada,No,No,No,Rarely,25,No,Yes,No,No,No,Yes,Yes,Yes,Yes,No,-4.5,0
3,31,Male,United Kingdom,No,Yes,Yes,Often,100,No,Yes,No,Yes,Yes,Some of them,No,Maybe,Maybe,Yes,-3.5,0
4,31,Male,United States,No,No,No,Never,500,Yes,Yes,Don't know,No,No,Some of them,Yes,Yes,Yes,No,0.0,0


In [9]:
df.drop(columns=['mental_health_consequence', 'phys_health_consequence', 'mental_health_interview','phys_health_interview'], inplace=True)
df.head()

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,wellness_program,coworkers,supervisor,obs_consequence,facilities,priotize_mental_health
0,37,Female,United States,No,No,Yes,Often,25,No,Yes,No,Some of them,Yes,No,4.5,-1
1,44,Male,United States,No,No,No,Rarely,1000,No,No,Don't know,No,No,No,-1.0,1
2,32,Male,Canada,No,No,No,Rarely,25,No,Yes,No,Yes,Yes,No,-4.5,0
3,31,Male,United Kingdom,No,Yes,Yes,Often,100,No,Yes,No,Some of them,No,Yes,-3.5,0
4,31,Male,United States,No,No,No,Never,500,Yes,Yes,Don't know,Some of them,Yes,No,0.0,0


In [10]:
df.head()

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,wellness_program,coworkers,supervisor,obs_consequence,facilities,priotize_mental_health
0,37,Female,United States,No,No,Yes,Often,25,No,Yes,No,Some of them,Yes,No,4.5,-1
1,44,Male,United States,No,No,No,Rarely,1000,No,No,Don't know,No,No,No,-1.0,1
2,32,Male,Canada,No,No,No,Rarely,25,No,Yes,No,Yes,Yes,No,-4.5,0
3,31,Male,United Kingdom,No,Yes,Yes,Often,100,No,Yes,No,Some of them,No,Yes,-3.5,0
4,31,Male,United States,No,No,No,Never,500,Yes,Yes,Don't know,Some of them,Yes,No,0.0,0


# Feature Encoding

In [11]:
X = df.drop(columns=['Age'], axis=1)   # Features
y = df['Age']                          # Target Variable

# Differentiating features based on feature types
numerical_features = [feature for feature in X.columns if df[feature].dtype !='O']
categorical_features = [feature for feature in X.columns if df[feature].dtype =='O']

print("Numerical Features:", numerical_features)
print("Categorical Features:", categorical_features)

Numerical Features: ['no_employees', 'facilities', 'priotize_mental_health']
Categorical Features: ['Gender', 'Country', 'self_employed', 'family_history', 'treatment', 'work_interfere', 'remote_work', 'tech_company', 'wellness_program', 'coworkers', 'supervisor', 'obs_consequence']


In [12]:
# Differentiating categorical features based on number of unique values
categorical_features_binary= [feature for feature in categorical_features if df[feature].nunique() == 2]
categorical_features_3= [feature for feature in categorical_features if df[feature].nunique()==3]
categorical_feature_low= [feature for feature in categorical_features if 3 < df[feature].nunique() < 8]

print("Binary Categorical Features:", categorical_features_binary)
print("Categorical Features with 3 unique values:", categorical_features_3)
print("Categorical Features with less than 8 unique values:", categorical_feature_low)

Binary Categorical Features: ['self_employed', 'family_history', 'treatment', 'remote_work', 'tech_company', 'obs_consequence']
Categorical Features with 3 unique values: ['Gender', 'wellness_program', 'coworkers', 'supervisor']
Categorical Features with less than 8 unique values: ['work_interfere']


In [13]:
# Making custom encoding maps for each category

binary_encoding_map = {'Yes': 1,'No': 0}
gender_encoding_map= {'Other': 0, 'Female': 1,'Male': 2}
well_encoding_map = {'Yes': 1, 'No': -1, "Don't know": 0}
coworker_encoding_map = {'Yes': 1, 'No': -1, "Some of them": 0}
supervisor_encoding_map = {'Yes': 1, 'No': -1, "Some of them": 0}
work_interfere_encoding_map = {'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Often': 3}

In [14]:
# Custom encoding map for Countries
counrty_counts= df['Country'].value_counts()
country_encoder_map=counrty_counts.rank(method='dense',ascending=False).astype(int).to_dict()

In [15]:
# Custom class for encoding categorical features
from utilities import Label_Map_Encoder

In [16]:
# Creating Pipeline and ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Creating a ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(
    [("binary", Pipeline([("encoder",Label_Map_Encoder(binary_encoding_map))]), categorical_features_binary),
     ("gender",Pipeline([("encoder",Label_Map_Encoder(gender_encoding_map))]),[ 'Gender']),
     ("well", Pipeline([("encoder",Label_Map_Encoder(well_encoding_map))]), ['wellness_program']),
     ("coworker", Pipeline([("encoder",Label_Map_Encoder(coworker_encoding_map))]), ['coworkers']),
     ("supervisor", Pipeline([("encoder",Label_Map_Encoder(supervisor_encoding_map))]), ['supervisor']),
     ("work_interfere",Pipeline([("encoder",Label_Map_Encoder(work_interfere_encoding_map)),('standard',StandardScaler())]), ['work_interfere']),
     ("country", Pipeline([('encoder',Label_Map_Encoder(country_encoder_map)),("standard",StandardScaler())]), ['Country']),
     ("standardization", StandardScaler(), numerical_features)]
)
preprocessor

0,1,2
,transformers,"[('binary', ...), ('gender', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,encoding_map,"{'No': 0, 'Yes': 1}"

0,1,2
,encoding_map,"{'Female': 1, 'Male': 2, 'Other': 0}"

0,1,2
,encoding_map,"{""Don't know"": 0, 'No': -1, 'Yes': 1}"

0,1,2
,encoding_map,"{'No': -1, 'Some of them': 0, 'Yes': 1}"

0,1,2
,encoding_map,"{'No': -1, 'Some of them': 0, 'Yes': 1}"

0,1,2
,encoding_map,"{'Never': 0, 'Often': 3, 'Rarely': 1, 'Sometimes': 2}"

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,encoding_map,"{'Australia': 6, 'Austria': 14, 'Belgium': 11, 'Bosnia and Herzegovina': 16, ...}"

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [17]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
features=categorical_features_binary+['Gender']+['wellness_program']+['coworkers']+['supervisor']+['work_interefrer']+["Country"]+numerical_features
# Converting encoded numpy arrays to pandas dataframe
X_encoded=preprocessor.fit_transform(X_train)
X_transformed= pd.DataFrame(X_encoded,columns=features)

## Feature Selection

In [18]:
# Using mutual information regressor for selecting features
from sklearn.feature_selection import mutual_info_regression

X_transformed = preprocessor.fit_transform(X)
mi_scores = mutual_info_regression(X_transformed, y, random_state=42)

mi_series = pd.Series(mi_scores, index=features).sort_values(ascending=False)
print(mi_series)

Country                   0.041052
no_employees              0.038464
Gender                    0.020419
tech_company              0.017553
remote_work               0.003480
self_employed             0.001415
family_history            0.000000
obs_consequence           0.000000
treatment                 0.000000
wellness_program          0.000000
coworkers                 0.000000
work_interefrer           0.000000
supervisor                0.000000
facilities                0.000000
priotize_mental_health    0.000000
dtype: float64


> **Observation**: Country,no_employees,Gender, tech_company has highest dependency on target Age i.e we can continue with just these columns and ignore other features since they have low dependency.

In [19]:
final_preprocessor= ColumnTransformer([
    ("country", Pipeline([('encoder',Label_Map_Encoder(country_encoder_map)),("standard",StandardScaler())]), ['Country']),
    ("standardization", StandardScaler(),['no_employees']),
    ("gender",Pipeline([("encoder",Label_Map_Encoder(gender_encoding_map))]),['Gender']),
    ("binary", Pipeline([("encoder",Label_Map_Encoder(binary_encoding_map))]), ['tech_company'])
])
X_train,X_test=X_train[['Country','no_employees','Gender','tech_company']],X_test[['Country','no_employees','Gender','tech_company']]

# Model Training and Evaluation

In [20]:
# Different to try with
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR


# Dictionary for easy model training and evaluation
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "ElasticNet": ElasticNet(),
    "KNN Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "SVR": SVR()
}

In [21]:
# Model Training and Evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score,root_mean_squared_error

for name,model in models.items():
    # Creating a pipeline for preprocessing and model training
    pipeline = Pipeline([
        ("preprocessor", final_preprocessor),
        ("model", model)
    ])
    
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print('-'*10,name,'-'*10)
    print(f"MAE: {mae:.2f}, MSE: {mse:.2f}, RMSE: {rmse:.2f}, R2: {r2:.2f}")
    print("="*45)

---------- Linear Regression ----------
MAE: 5.36, MSE: 46.50, RMSE: 6.82, R2: 0.03
---------- Ridge Regression ----------
MAE: 5.36, MSE: 46.50, RMSE: 6.82, R2: 0.03
---------- Lasso Regression ----------
MAE: 5.51, MSE: 48.12, RMSE: 6.94, R2: 0.00
---------- ElasticNet ----------
MAE: 5.48, MSE: 47.63, RMSE: 6.90, R2: 0.01
---------- KNN Regressor ----------
MAE: 6.16, MSE: 61.31, RMSE: 7.83, R2: -0.27
---------- Decision Tree ----------
MAE: 5.75, MSE: 54.04, RMSE: 7.35, R2: -0.12
---------- Random Forest ----------
MAE: 5.63, MSE: 51.25, RMSE: 7.16, R2: -0.06
---------- SVR ----------
MAE: 5.27, MSE: 45.87, RMSE: 6.77, R2: 0.05


In [22]:
params={
    'Linear Regression':{}, # No parameters to play with
    'Ridge Regression' :{
        'model__alpha':[0.01, 0.1, 1, 10, 100]
    },
    'Lasso Regression':{
        'model__alpha':[0.0001, 0.001, 0.01, 0.1, 1, 10]
    },
    'ElasticNet':{
        'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'model__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9, 1]
    },
    "KNN Regressor": {
        'model__n_neighbors': [3, 5, 10, 15, 20, 30, 50],
        'model__weights': ['uniform', 'distance'],
        'model__metric': ['euclidean', 'manhattan', 'minkowski']
    },
    "Decision Tree": {
        'model__max_depth': [3, 5, 10, 20, None],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    },
    "Random Forest": {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__max_features': ['auto', 'sqrt', 'log2']
    },
    "SVR": {
        'model__kernel': ['rbf', 'linear', 'poly'],
        'model__C': [0.1, 1, 10, 100],
        'model__epsilon': [0.01, 0.1, 0.2, 0.5],
        'model__gamma': ['scale', 'auto']
    }   
}

In [23]:
from sklearn.model_selection import GridSearchCV
best_models={}
for name, model in models.items():
    model_cv=GridSearchCV(Pipeline([
        ('preprocessor',final_preprocessor),
        ("model",model)
        ]),
        param_grid=params[name],
        scoring='r2',
        cv=5,
        n_jobs=-1,
        verbose=1     
        )
    model_cv.fit(X_train,y_train)
    best_models[name]=model_cv.best_estimator_
    
# Evaluation of tuned models
for name,model in best_models.items():
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print('-'*10,name,'-'*10)
    print(f"MAE: {mae:.2f}, MSE: {mse:.2f}, RMSE: {rmse:.2f}, R2: {r2:.2f}")
    print("="*45)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 42 candidates, totalling 210 fits
Fitting 5 folds for each of 45 candidates, totalling 225 fits
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Fitting 5 folds for each of 96 candidates, totalling 480 fits
---------- Linear Regression ----------
MAE: 5.36, MSE: 46.50, RMSE: 6.82, R2: 0.03
---------- Ridge Regression ----------
MAE: 5.37, MSE: 46.50, RMSE: 6.82, R2: 0.03
---------- Lasso Regression ----------
MAE: 5.37, MSE: 46.50, RMSE: 6.82, R2: 0.03
---------- ElasticNet ----------
MAE: 5.37, MSE: 46.50, RMSE: 6.82, R2: 0.03
---------- KNN Regressor ----------
MAE: 5.40, MSE: 47.17, RMSE: 6.87, R2: 0.02
---------- Decision Tree ----------
MAE: 5.43, MSE: 47.67, RMSE: 6.90, R2: 0.01
---------- Random Fore

>### **Selection of Model :**
> Since, SVR (Support Vector Regressor) performed best as compared to other models, we would be using SVR for our final predicting model.

## Saving the Model on Hard disk

In [24]:
import joblib

joblib.dump(best_models['SVR'],"SVR_regressor.pkl")

['SVR_regressor.pkl']