# **Feature Engineering:**
### 1. Feature Creation
### 2. Feature Enoding

## Importing Libraries

In [1]:
# Standard Libraries for Data Manipulation
import pandas as pd 
import numpy as np
pd.set_option('display.max_columns', None)

# For Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preventing warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Loading the dataset
df = pd.read_csv("cleaned_survey.csv")
df.head()

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,Female,United States,No,No,Yes,Often,25,No,Yes,Yes,Not sure,No,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44,Male,United States,No,No,No,Rarely,1000,No,No,Don't know,No,Don't know,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32,Male,Canada,No,No,No,Rarely,25,No,Yes,No,No,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31,Male,United Kingdom,No,Yes,Yes,Often,100,No,Yes,No,Yes,No,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31,Male,United States,No,No,No,Never,500,Yes,Yes,Yes,No,Don't know,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No


## Feature Creation

### Creating a column on the basis of help and facilities provided by employer or company
For this we will use the columns:
> benfits, care_options, seek_help, leave, anonymity, mental_vs_physical

In [3]:
def facilties(row):
    '''
    This function will be used to create the new feature 'facilities'
    '''
    label=0

    # benifits
    if row['benefits'] == 'Yes':
        label += 1
    elif row['benefits'] == "No":
        label += -1
    
    # care_options
    if row['care_options'] == 'Yes':
        label += 1
    elif row['care_options'] == "No":
        label += -1
    
    # seek_help
    if row['seek_help'] == 'Yes':
        label += 1
    elif row['seek_help'] == "No":
        label += -1
    
    # leave
    if row['leave'] == 'Very easy':
        label += 1
    elif row['leave'] == 'Somewhat easy':
        label += 0.5
    elif row['leave'] == "Don't know":
        label += 0
    elif row['leave'] == 'Somewhat difficult':
        label += -0.5
    elif row['leave'] == 'Very difficult':
        label += -1
    
    # anonymity
    if row['anonymity'] == 'Yes':
        label += 1
    elif row['anonymity'] == "No":
        label += -1

    # mental_vs_physical
    if row['mental_vs_physical'] == 'Yes':
        label += 1
    elif row['mental_vs_physical'] == "No":
        label += -1
    
    return label

In [4]:
df["facilities"]=df.apply(facilties, axis=1)

In [5]:
df.drop(columns=['benefits', 'care_options', 'seek_help', 'leave',"anonymity","mental_vs_physical"], inplace=True,axis=1)

## Creating a column on the basis of what employees priotize -- mental or physical health
For this we would use following features:
> mental_health_consequence, phys_health_consequence, mental_health_interview, and phys_health_interview

In [6]:
print("Mental Health Consequence :", df['mental_health_consequence'].unique())
print("Physical Health Consequence :", df['phys_health_consequence'].unique())
print("Mental Health Interview :", df['mental_health_interview'].unique())
print("Physical Health Interview :", df['phys_health_interview'].unique())

Mental Health Consequence : ['No' 'Maybe' 'Yes']
Physical Health Consequence : ['No' 'Yes' 'Maybe']
Mental Health Interview : ['No' 'Yes' 'Maybe']
Physical Health Interview : ['Maybe' 'No' 'Yes']


In [7]:
def priotize_mental_health(row):
    '''
    This function will be used to create the new feature 'priortize mental health'
    '''

    label=0

    # mental_health_consequence
    if row['mental_health_consequence'] == 'Yes':
        label += 1
    elif row['mental_health_consequence'] == "No":
        label += -1
    
    # phys_health_consequence
    if row['phys_health_consequence'] == 'Yes':
        label += -1
    elif row['phys_health_consequence'] == "No":
        label += 1
    
    # mental_health_interview
    if row['mental_health_interview'] == 'Yes':
        label += 1
    elif row['mental_health_interview'] == "No":
        label += -1
    
    # phys_health_interview
    if row['phys_health_interview'] == 'Yes':
        label += -1
    elif row['phys_health_interview'] == "No":
        label += 1
    
    return label

In [8]:
df["priotize_mental_health"]=df.apply(priotize_mental_health, axis=1)
# Displaying the first few rows of the updated DataFrame
df.head()

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,wellness_program,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,obs_consequence,facilities,priotize_mental_health
0,37,Female,United States,No,No,Yes,Often,25,No,Yes,No,No,No,Some of them,Yes,No,Maybe,No,4.5,-1
1,44,Male,United States,No,No,No,Rarely,1000,No,No,Don't know,Maybe,No,No,No,No,No,No,-1.0,1
2,32,Male,Canada,No,No,No,Rarely,25,No,Yes,No,No,No,Yes,Yes,Yes,Yes,No,-4.5,0
3,31,Male,United Kingdom,No,Yes,Yes,Often,100,No,Yes,No,Yes,Yes,Some of them,No,Maybe,Maybe,Yes,-3.5,0
4,31,Male,United States,No,No,No,Never,500,Yes,Yes,Don't know,No,No,Some of them,Yes,Yes,Yes,No,0.0,0


In [9]:
df.drop(columns=['mental_health_consequence', 'phys_health_consequence', 'mental_health_interview','phys_health_interview'], inplace=True)
df.head()

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,wellness_program,coworkers,supervisor,obs_consequence,facilities,priotize_mental_health
0,37,Female,United States,No,No,Yes,Often,25,No,Yes,No,Some of them,Yes,No,4.5,-1
1,44,Male,United States,No,No,No,Rarely,1000,No,No,Don't know,No,No,No,-1.0,1
2,32,Male,Canada,No,No,No,Rarely,25,No,Yes,No,Yes,Yes,No,-4.5,0
3,31,Male,United Kingdom,No,Yes,Yes,Often,100,No,Yes,No,Some of them,No,Yes,-3.5,0
4,31,Male,United States,No,No,No,Never,500,Yes,Yes,Don't know,Some of them,Yes,No,0.0,0


# Feature Encoding

In [10]:
X=df.drop("treatment", axis=1)  # Independent features
y=df["treatment"]               # Target feature

# Differentiating features based on feature types
numerical_features = [feature for feature in X.columns if df[feature].dtype !='O']
categorical_features = [feature for feature in X.columns if df[feature].dtype =='O']

print("Numerical Features:", numerical_features)
print("Categorical Features:", categorical_features)

Numerical Features: ['Age', 'no_employees', 'facilities', 'priotize_mental_health']
Categorical Features: ['Gender', 'Country', 'self_employed', 'family_history', 'work_interfere', 'remote_work', 'tech_company', 'wellness_program', 'coworkers', 'supervisor', 'obs_consequence']


In [11]:
# Differentiating categorical features based on number of unique values
categorical_features_binary= [feature for feature in categorical_features if df[feature].nunique() == 2]
categorical_features_3= [feature for feature in categorical_features if df[feature].nunique()==3]
categorical_feature_low= [feature for feature in categorical_features if 3 < df[feature].nunique() < 8]

print("Binary Categorical Features:", categorical_features_binary)
print("Categorical Features with 3 unique values:", categorical_features_3)
print("Categorical Features with less than 8 unique values:", categorical_feature_low)

Binary Categorical Features: ['self_employed', 'family_history', 'remote_work', 'tech_company', 'obs_consequence']
Categorical Features with 3 unique values: ['Gender', 'wellness_program', 'coworkers', 'supervisor']
Categorical Features with less than 8 unique values: ['work_interfere']


In [12]:
# Making custom encoding maps for each category

binary_encoding_map = {'Yes': 1,'No': 0}
gender_encoding_map= {'Other': 0, 'Female': 1,'Male': 2}
well_encoding_map = {'Yes': 1, 'No': -1, "Don't know": 0}
coworker_encoding_map = {'Yes': 1, 'No': -1, "Some of them": 0}
supervisor_encoding_map = {'Yes': 1, 'No': -1, "Some of them": 0}
work_interfere_encoding_map = {'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Often': 3}

In [13]:
# Custom encoding map for Countries
counrty_counts= df['Country'].value_counts()
country_encoder_map=counrty_counts.rank(method='dense',ascending=False).astype(int).to_dict()

In [14]:
# Custom class for encoding categorical features
from utilities import Label_Map_Encoder

In [15]:
# Creating Pipeline and ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Creating a ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(
    [("binary", Pipeline([("encoder",Label_Map_Encoder(binary_encoding_map))]), categorical_features_binary),
     ("gender",Pipeline([("encoder",Label_Map_Encoder(gender_encoding_map))]),[ 'Gender']),
     ("well", Pipeline([("encoder",Label_Map_Encoder(well_encoding_map))]), ['wellness_program']),
     ("coworker", Pipeline([("encoder",Label_Map_Encoder(coworker_encoding_map))]), ['coworkers']),
     ("supervisor", Pipeline([("encoder",Label_Map_Encoder(supervisor_encoding_map))]), ['supervisor']),
     ("work_interfere",Pipeline([("encoder",Label_Map_Encoder(work_interfere_encoding_map)),('standard',StandardScaler())]), ['work_interfere']),
     ("country", Pipeline([('encoder',Label_Map_Encoder(country_encoder_map)),("standard",StandardScaler())]), ['Country']),
     ("standardization", StandardScaler(), numerical_features)]
)
preprocessor

0,1,2
,transformers,"[('binary', ...), ('gender', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,encoding_map,"{'No': 0, 'Yes': 1}"

0,1,2
,encoding_map,"{'Female': 1, 'Male': 2, 'Other': 0}"

0,1,2
,encoding_map,"{""Don't know"": 0, 'No': -1, 'Yes': 1}"

0,1,2
,encoding_map,"{'No': -1, 'Some of them': 0, 'Yes': 1}"

0,1,2
,encoding_map,"{'No': -1, 'Some of them': 0, 'Yes': 1}"

0,1,2
,encoding_map,"{'Never': 0, 'Often': 3, 'Rarely': 1, 'Sometimes': 2}"

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,encoding_map,"{'Australia': 6, 'Austria': 14, 'Belgium': 11, 'Bosnia and Herzegovina': 16, ...}"

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True


# Model Training and Evaluation

In [16]:
y = y.map({'Yes': 1, 'No': 0})
y.unique()

array([1, 0])

In [17]:
# Splitting the dataset into training and testing sets to prevent data leakage
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Import models to be used
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVC (Linear)": SVC(kernel='linear', probability=True),
    "SVC (RBF)": SVC(kernel='rbf', probability=True),
    "SVC (Poly)": SVC(kernel='poly', degree=2, probability=True),
    "SVC (Sigmoid)": SVC(kernel='sigmoid', probability=True),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Bagging": BaggingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0)
}

In [18]:
# Iterating through each model, fitting it, and evaluating its performance
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,roc_auc_score, confusion_matrix
 
for name,model in models.items():
        # Creating a pipeline for each model
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    # Fitting the model
    pipeline.fit(X_train, y_train)
    
    # Making predictions
    y_pred = pipeline.predict(X_test)
    
    # Evaluating the model
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, ROC AUC: {roc_auc:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("-" * 50)

Model: Logistic Regression
Accuracy: 0.6574, F1 Score: 0.6742, Precision: 0.7008, Recall: 0.6496, ROC AUC: 0.6582
Confusion Matrix:
 [[76 38]
 [48 89]]
--------------------------------------------------
Model: Random Forest
Accuracy: 0.7450, F1 Score: 0.7594, Precision: 0.7829, Recall: 0.7372, ROC AUC: 0.7458
Confusion Matrix:
 [[ 86  28]
 [ 36 101]]
--------------------------------------------------
Model: Decision Tree
Accuracy: 0.6693, F1 Score: 0.6937, Precision: 0.7015, Recall: 0.6861, ROC AUC: 0.6676
Confusion Matrix:
 [[74 40]
 [43 94]]
--------------------------------------------------
Model: SVC (Linear)
Accuracy: 0.6693, F1 Score: 0.6693, Precision: 0.7368, Recall: 0.6131, ROC AUC: 0.6750
Confusion Matrix:
 [[84 30]
 [53 84]]
--------------------------------------------------
Model: SVC (RBF)
Accuracy: 0.7092, F1 Score: 0.7203, Precision: 0.7581, Recall: 0.6861, ROC AUC: 0.7115
Confusion Matrix:
 [[84 30]
 [43 94]]
--------------------------------------------------
Model: SVC

## Selection of Model algorithm:
From above output we can observe that RandomForest best compare to all other models.
> Now, Let's do Randomized Search Cross Validation for hyper parameter tuning.

In [19]:

# Parameters for hyperparameter tuning

params = {
    "Logistic Regression": {
        "model__C": [0.01, 0.1, 1, 10],
        "model__solver": ["liblinear", "lbfgs"],
        "model__penalty": ["l2"]
    },
    "Random Forest": {
        "model__n_estimators": [100, 200],
        "model__max_depth": [None, 10, 20],
        "model__min_samples_split": [2, 5]
    },
    "Decision Tree": {
        "model__max_depth": [None, 10, 20],
        "model__min_samples_split": [2, 5, 10],
        "model__criterion": ["gini", "entropy"]
    },
    "SVC (Linear)": {
        "model__C": [0.1, 1, 10]
    },
    "SVC (RBF)": {
        "model__C": [0.1, 1, 10],
        "model__gamma": ['scale', 0.01, 0.001]
    },
    "SVC (Poly)": {
        "model__C": [0.1, 1],
        "model__degree": [2, 3],
        "model__gamma": ['scale']
    },
    "SVC (Sigmoid)": {
        "model__C": [0.1, 1, 10],
        "model__gamma": ['scale', 0.01]
    },
    "K-Nearest Neighbors": {
        "model__n_neighbors": [3, 5, 7],
        "model__weights": ["uniform", "distance"]
    },
    "Gradient Boosting": {
        "model__n_estimators": [100, 200],
        "model__learning_rate": [0.01, 0.1],
        "model__max_depth": [3, 5]
    },
    "AdaBoost": {
        "model__n_estimators": [50, 100, 200],
        "model__learning_rate": [0.01, 0.1, 1.0]
    },
    "Extra Trees": {
        "model__n_estimators": [100, 200],
        "model__max_depth": [None, 10, 20]
    },
    "Bagging": {
        "model__n_estimators": [10, 50, 100],
        "model__max_samples": [0.5, 1.0],
        "model__max_features": [0.5, 1.0]
    },
    "XGBoost": {
        "model__n_estimators": [100, 200],
        "model__learning_rate": [0.01, 0.1],
        "model__max_depth": [3, 5]
    },
    "LightGBM": {
        "model__n_estimators": [100, 200],
        "model__learning_rate": [0.01, 0.1],
        "model__max_depth": [-1, 10, 20]
    },
    "CatBoost": {
        "model__iterations": [100, 200],
        "model__learning_rate": [0.01, 0.1],
        "model__depth": [3, 6, 9]
    }
}


In [20]:
from sklearn.model_selection import RandomizedSearchCV

results = {}

for name, model in models.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor), 
        ("model", model)
    ])
    
    search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=params[name],
        n_iter=10,
        scoring="accuracy",
        n_jobs=-1,
        random_state=42
    )
    
    search.fit(X, y)
    results[name]=search.best_estimator_

[LightGBM] [Info] Number of positive: 632, number of negative: 620
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000346 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 122
[LightGBM] [Info] Number of data points in the train set: 1252, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504792 -> initscore=0.019170
[LightGBM] [Info] Start training from score 0.019170


In [21]:
# Evaluating the model
for name,model in results.items():
    y_pred_rf = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_rf)
    f1 = f1_score(y_test, y_pred_rf)
    precision = precision_score(y_test, y_pred_rf)
    recall = recall_score(y_test, y_pred_rf)
    roc_auc = roc_auc_score(y_test, y_pred_rf)

    print(f"Model:{name} (Tuned)")
    print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, ROC AUC: {roc_auc:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
    print("-" * 50)

Model:Logistic Regression (Tuned)
Accuracy: 0.6892, F1 Score: 0.7068, Precision: 0.7287, Recall: 0.6861, ROC AUC: 0.6896
Confusion Matrix:
 [[79 35]
 [43 94]]
--------------------------------------------------
Model:Random Forest (Tuned)
Accuracy: 0.8526, F1 Score: 0.8655, Precision: 0.8623, Recall: 0.8686, ROC AUC: 0.8510
Confusion Matrix:
 [[ 95  19]
 [ 18 119]]
--------------------------------------------------
Model:Decision Tree (Tuned)
Accuracy: 0.8287, F1 Score: 0.8401, Precision: 0.8561, Recall: 0.8248, ROC AUC: 0.8291
Confusion Matrix:
 [[ 95  19]
 [ 24 113]]
--------------------------------------------------
Model:SVC (Linear) (Tuned)
Accuracy: 0.7052, F1 Score: 0.7063, Precision: 0.7739, Recall: 0.6496, ROC AUC: 0.7108
Confusion Matrix:
 [[88 26]
 [48 89]]
--------------------------------------------------
Model:SVC (RBF) (Tuned)
Accuracy: 0.7729, F1 Score: 0.7833, Precision: 0.8175, Recall: 0.7518, ROC AUC: 0.7750
Confusion Matrix:
 [[ 91  23]
 [ 34 103]]
------------------

## Observation : 
We can clearly see that LightGBM really showed best performance out of models
So, selecting LghtGBM as final model for predictions 

In [22]:
final_model=results['LightGBM']
final_model

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('binary', ...), ('gender', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,encoding_map,"{'No': 0, 'Yes': 1}"

0,1,2
,encoding_map,"{'Female': 1, 'Male': 2, 'Other': 0}"

0,1,2
,encoding_map,"{""Don't know"": 0, 'No': -1, 'Yes': 1}"

0,1,2
,encoding_map,"{'No': -1, 'Some of them': 0, 'Yes': 1}"

0,1,2
,encoding_map,"{'No': -1, 'Some of them': 0, 'Yes': 1}"

0,1,2
,encoding_map,"{'Never': 0, 'Often': 3, 'Rarely': 1, 'Sometimes': 2}"

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,encoding_map,"{'Australia': 6, 'Austria': 14, 'Belgium': 11, 'Bosnia and Herzegovina': 16, ...}"

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,20
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


# Saving the models on hard disk

In [23]:
# Saving the best models
import joblib

joblib.dump(final_model, 'Light_GBM_Classification.pkl')

['Light_GBM_Classification.pkl']