In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import plotly.express as px

sns.set_style('whitegrid')
pd.set_option('display.max_columns', None)
import warnings
warnings.simplefilter('ignore')

In [2]:
df = pd.read_csv(r'C:\Users\Heena\project\dataset\survey.csv')

In [3]:
df.shape

(1259, 27)

In [4]:
# Convert all columns to lowercase
df.columns = df.columns.str.lower()

In [5]:
round((df.isna().sum()/df.shape[0])*100,2)

timestamp                     0.00
age                           0.00
gender                        0.00
country                       0.00
state                        40.91
self_employed                 1.43
family_history                0.00
treatment                     0.00
work_interfere               20.97
no_employees                  0.00
remote_work                   0.00
tech_company                  0.00
benefits                      0.00
care_options                  0.00
wellness_program              0.00
seek_help                     0.00
anonymity                     0.00
leave                         0.00
mental_health_consequence     0.00
phys_health_consequence       0.00
coworkers                     0.00
supervisor                    0.00
mental_health_interview       0.00
phys_health_interview         0.00
mental_vs_physical            0.00
obs_consequence               0.00
comments                     86.97
dtype: float64

In [6]:
df['country'].value_counts()

United States             751
United Kingdom            185
Canada                     72
Germany                    45
Ireland                    27
Netherlands                27
Australia                  21
France                     13
India                      10
New Zealand                 8
Poland                      7
Switzerland                 7
Sweden                      7
Italy                       7
South Africa                6
Belgium                     6
Brazil                      6
Israel                      5
Singapore                   4
Bulgaria                    4
Austria                     3
Finland                     3
Mexico                      3
Russia                      3
Denmark                     2
Greece                      2
Colombia                    2
Croatia                     2
Portugal                    2
Moldova                     1
Georgia                     1
Bahamas, The                1
China                       1
Thailand  

In [7]:
df['state'].unique()

array(['IL', 'IN', nan, 'TX', 'TN', 'MI', 'OH', 'CA', 'CT', 'MD', 'NY',
       'NC', 'MA', 'IA', 'PA', 'WA', 'WI', 'UT', 'NM', 'OR', 'FL', 'MN',
       'MO', 'AZ', 'CO', 'GA', 'DC', 'NE', 'WV', 'OK', 'KS', 'VA', 'NH',
       'KY', 'AL', 'NV', 'NJ', 'SC', 'VT', 'SD', 'ID', 'MS', 'RI', 'WY',
       'LA', 'ME'], dtype=object)

In [8]:
df = df.drop(['country','state','timestamp','comments'], axis = 1)

In [9]:
round(df.describe(include='all'),2)

Unnamed: 0,age,gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
count,1259.0,1259,1241,1259,1259,995,1259,1259,1259,1259,1259,1259,1259,1259,1259,1259,1259,1259,1259,1259,1259,1259,1259
unique,,49,2,2,2,4,6,2,2,3,3,3,3,3,5,3,3,3,3,3,3,3,2
top,,Male,No,No,Yes,Sometimes,6-25,No,Yes,Yes,No,No,No,Don't know,Don't know,No,No,Some of them,Yes,No,Maybe,Don't know,No
freq,,615,1095,767,637,465,290,883,1031,477,501,842,646,819,563,490,925,774,516,1008,557,576,1075
mean,79428150.0,,,,,,,,,,,,,,,,,,,,,,
std,2818299000.0,,,,,,,,,,,,,,,,,,,,,,
min,-1726.0,,,,,,,,,,,,,,,,,,,,,,
25%,27.0,,,,,,,,,,,,,,,,,,,,,,
50%,31.0,,,,,,,,,,,,,,,,,,,,,,
75%,36.0,,,,,,,,,,,,,,,,,,,,,,


In [10]:
df['gender'].value_counts().reset_index()

Unnamed: 0,index,gender
0,Male,615
1,male,206
2,Female,121
3,M,116
4,female,62
5,F,38
6,m,34
7,f,15
8,Make,4
9,Male,3


In [11]:
df['gender'].replace(['Male ', 'male', 'M', 'm', 'Male', 'Cis Male',
                     'Man', 'cis male', 'Mail', 'Male-ish', 'Male (CIS)',
                      'Cis Man', 'msle', 'Malr', 'Mal', 'maile', 'Make',], 'Male', inplace = True)

df['gender'].replace(['Female ', 'female', 'F', 'f', 'Woman', 'Female',
                     'femail', 'Cis Female', 'cis-female/femme', 'Femake', 'Female (cis)',
                     'woman',], 'Female', inplace = True)

df["gender"].replace(['Female (trans)', 'queer/she/they', 'non-binary',
                     'fluid', 'queer', 'Androgyne', 'Trans-female', 'male leaning androgynous',
                      'Agender', 'A little about you', 'Nah', 'All','ostensibly male, unsure what that really means',
                      'Genderqueer', 'Enby', 'p', 'Neuter', 'something kinda male?',
                      'Guy (-ish) ^_^', 'Trans woman',], 'Other', inplace = True)

In [12]:
df['gender'].value_counts()

Male      991
Female    247
Other      21
Name: gender, dtype: int64

In [13]:
# Let us treat age
df.loc[df.age<12,'age']=15
df.loc[df.age>75,'age']=75

In [14]:
df[df['age'] > 80].head()

Unnamed: 0,age,gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence


In [15]:
# We should not perform eda on test data as it will cloud your decision of model selections and hyperparameters.
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df,test_size=0.15,stratify=df['treatment'],random_state=42 )

In [16]:
print(f'Train_data Dimensions : {train_data.shape}\n Test data dimension : {test_data.shape}')

Train_data Dimensions : (1070, 23)
 Test data dimension : (189, 23)


In [17]:
health = train_data.copy()

In [18]:
health.columns

Index(['age', 'gender', 'self_employed', 'family_history', 'treatment',
       'work_interfere', 'no_employees', 'remote_work', 'tech_company',
       'benefits', 'care_options', 'wellness_program', 'seek_help',
       'anonymity', 'leave', 'mental_health_consequence',
       'phys_health_consequence', 'coworkers', 'supervisor',
       'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence'],
      dtype='object')

In [19]:
#First, we need to treat missing values for the columns work_interfere and self_employed.
#Self employed column contains as low as 2% null values , so it is okay to replace it with mode.
se_mode = train_data['self_employed'].mode().values[0] 
train_data['self_employed'].fillna(se_mode,inplace=True)
# Work_interfere contains almost 20% null values which is significant as we have less data.Let us once see the null values to find any pattern
train_data[train_data['work_interfere'].isna()]['treatment'].value_counts()

No     223
Yes      3
Name: treatment, dtype: int64

In [20]:
train_data['work_interfere'].fillna('Never',inplace = True)

In [21]:
# Let us divide training set into independent and dependent variables
X_train = train_data.drop('treatment',axis=1)
y_train = train_data['treatment'].copy()

In [22]:
train_data.columns

Index(['age', 'gender', 'self_employed', 'family_history', 'treatment',
       'work_interfere', 'no_employees', 'remote_work', 'tech_company',
       'benefits', 'care_options', 'wellness_program', 'seek_help',
       'anonymity', 'leave', 'mental_health_consequence',
       'phys_health_consequence', 'coworkers', 'supervisor',
       'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence'],
      dtype='object')

In [23]:
gender_cols = ['Female','Male','Other']
self_employed_cols = ['No','Yes']
family_history_cols = ['No','Yes']
work_interfere_cols = ['Never','Rarely','Sometimes','Often']
no_employees_cols = ['1-5','6-25','26-100','100-500','500-1000','More than 1000']
remote_work_cols = ['No','Yes']
tech_company_cols = ['No','Yes']
benefits_cols = ['No','Don\'t know','Yes'] 
care_options_cols = ['No','Not sure','Yes']
wellness_program_cols  =['No','Don\'t know','Yes']
seek_help_cols = ['No','Don\'t know','Yes']
anonymity_cols = ['No','Don\'t know','Yes']
leave_cols = [ 'Very easy', 'Somewhat easy',"Don't know" ,'Somewhat difficult','Very difficult']
mental_health_consequence_cols = ['No','Maybe','Yes']
phys_health_consequence_cols = ['No','Maybe','Yes']
coworkers_col = ['No','Some of them','Yes']
supervisor_cols = ['No','Some of them','Yes']
mental_health_interview_cols = ['No','Maybe','Yes']
phys_health_interview_cols = ['No','Maybe','Yes']
mental_vs_physical_cols = ["Don't know",'No','Yes']
obs_consequence_cols = ['No','Yes']

columns_for_encoder = [gender_cols,self_employed_cols,family_history_cols,work_interfere_cols,no_employees_cols,remote_work_cols,
                            tech_company_cols,benefits_cols,care_options_cols,wellness_program_cols,seek_help_cols,anonymity_cols,leave_cols,
                            mental_health_consequence_cols,phys_health_consequence_cols,coworkers_col,supervisor_cols,mental_health_interview_cols,
                            phys_health_interview_cols,mental_vs_physical_cols,obs_consequence_cols]

In [24]:
features = list(X_train.columns)

In [25]:
from sklearn.preprocessing import OrdinalEncoder
ord_encoder = OrdinalEncoder(categories=list(columns_for_encoder))
X_train[features[1:]] = ord_encoder.fit_transform(X_train.iloc[:,1:])

In [26]:
X_train.head()

Unnamed: 0,age,gender,self_employed,family_history,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
676,22,1.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0
88,29,0.0,0.0,0.0,2.0,2.0,0.0,1.0,2.0,2.0,0.0,0.0,1.0,2.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
86,39,1.0,1.0,0.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,4.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
1210,24,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
505,46,1.0,0.0,0.0,2.0,5.0,0.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0


In [27]:
train_data.to_csv('filtered.csv')

In [28]:
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
X_train[features] = std_scaler.fit_transform(X_train)

In [29]:
X_train

Unnamed: 0,age,gender,self_employed,family_history,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
676,-1.363472,0.419109,-0.362047,-0.803837,-1.135675,-0.243796,-0.659027,0.480937,-1.319345,-1.111531,-0.650242,-0.869401,-0.461953,0.267568,0.223297,1.212902,0.040848,1.062155,1.546143,0.357235,-0.963586,-0.406912
88,-0.409844,-1.904448,-0.362047,-0.803837,0.720967,-0.243796,-0.659027,0.480937,1.125156,1.197863,-0.650242,-0.869401,-0.461953,0.267568,0.223297,-0.569797,0.040848,-0.124177,-0.461591,0.357235,0.233902,2.457534
86,0.952483,0.419109,2.762070,-0.803837,1.649288,-0.831323,-0.659027,0.480937,-1.319345,-1.111531,-0.650242,-0.869401,-0.461953,2.108713,0.223297,-0.569797,0.040848,-1.310508,-0.461591,0.357235,0.233902,2.457534
1210,-1.091007,0.419109,-0.362047,1.244033,-1.135675,-1.418850,-0.659027,0.480937,-1.319345,-1.111531,-0.650242,-0.869401,-0.461953,0.267568,0.223297,1.212902,-1.577948,-1.310508,-0.461591,-1.058475,-0.963586,-0.406912
505,1.906111,0.419109,-0.362047,-0.803837,0.720967,1.518785,-0.659027,0.480937,1.125156,1.197863,1.893665,1.700376,-0.461953,0.267568,0.223297,-0.569797,0.040848,-1.310508,-0.461591,-1.058475,1.431390,-0.406912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,-1.227240,0.419109,-0.362047,1.244033,0.720967,-0.243796,-0.659027,0.480937,1.125156,0.043166,1.893665,1.700376,1.396280,-0.653004,-1.082318,-0.569797,-1.577948,-1.310508,-0.461591,0.357235,-0.963586,-0.406912
598,1.497413,0.419109,-0.362047,-0.803837,-1.135675,0.931258,1.517389,0.480937,-0.097095,0.043166,0.621712,0.415487,-0.461953,0.267568,-1.082318,-0.569797,1.659645,1.062155,-0.461591,-1.058475,1.431390,-0.406912
363,-0.001146,0.419109,-0.362047,1.244033,-0.207354,1.518785,-0.659027,-2.079275,1.125156,1.197863,1.893665,1.700376,1.396280,0.267568,-1.082318,-0.569797,0.040848,1.062155,-0.461591,1.772946,1.431390,-0.406912
690,-1.227240,0.419109,-0.362047,-0.803837,0.720967,-0.831323,-0.659027,0.480937,1.125156,-1.111531,-0.650242,-0.869401,-0.461953,-0.653004,0.223297,1.212902,0.040848,-0.124177,-0.461591,-1.058475,-0.963586,-0.406912


In [30]:
from sklearn.preprocessing import LabelEncoder
lb_encoder = LabelEncoder()
y_train = lb_encoder.fit_transform(y_train)

In [31]:
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

In [32]:
def train_evaluate(model,X_train,y_train,name):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_train)
    f1_train = f1_score(y_train,y_pred)

    #Cross validation
    f1_val = cross_val_score(model,X_train,y_train,scoring='f1',cv=10)
    
    # returning the scores
    score = pd.DataFrame({'Name' : name ,'F1_score_trainset' : [f1_train], 'F1_score_validationset' : [f1_val.mean()]})
    return score

In [33]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(penalty='l1',solver='liblinear')
train_evaluate(log_reg,X_train,y_train,'Logistic Regression')

Unnamed: 0,Name,F1_score_trainset,F1_score_validationset
0,Logistic Regression,0.826521,0.817808


In [34]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(max_leaf_nodes=4,random_state=42)
train_evaluate(dt_clf,X_train,y_train,'DecisionTreeClassifier')

Unnamed: 0,Name,F1_score_trainset,F1_score_validationset
0,DecisionTreeClassifier,0.851698,0.851769


In [35]:
#SVM
from sklearn.svm import SVC
svc_clf = SVC()
train_evaluate(svc_clf,X_train,y_train,'Support Vector Classifier')

Unnamed: 0,Name,F1_score_trainset,F1_score_validationset
0,Support Vector Classifier,0.885159,0.834448


In [36]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(random_state=42)
train_evaluate(rnd_clf,X_train,y_train,'RandomForestClassifier')

Unnamed: 0,Name,F1_score_trainset,F1_score_validationset
0,RandomForestClassifier,1.0,0.842216


In [37]:
#AdaBoost
from sklearn.ensemble import AdaBoostClassifier
dt_clf_ada = DecisionTreeClassifier()
Ada_clf = AdaBoostClassifier(base_estimator=dt_clf_ada,random_state=42)

train_evaluate(Ada_clf,X_train,y_train,"ADA BOOST CLASSIFIER")

Unnamed: 0,Name,F1_score_trainset,F1_score_validationset
0,ADA BOOST CLASSIFIER,1.0,0.759314


In [38]:
#GradientBoost
from sklearn.ensemble import GradientBoostingClassifier
gdb_clf = GradientBoostingClassifier(random_state=42,subsample=0.8)

train_evaluate(gdb_clf,X_train,y_train,"GradientBoosting CLASSIFIER")

Unnamed: 0,Name,F1_score_trainset,F1_score_validationset
0,GradientBoosting CLASSIFIER,0.897596,0.850009


In [39]:
#XGBoost
from xgboost import XGBClassifier
xgb_clf = XGBClassifier(verbosity=0)

In [40]:
train_evaluate(xgb_clf,X_train,y_train,"XG Boost CLASSIFIER")

Unnamed: 0,Name,F1_score_trainset,F1_score_validationset
0,XG Boost CLASSIFIER,1.0,0.828707


In [41]:
#Fine Tuning Support Vector classifier
from sklearn.model_selection import GridSearchCV
param_distribs = {
        'kernel': ['linear', 'rbf','polynomial'],
        'C': [0.01,0.01,0.1,0.15,0.2,0.25,0.5,0.75,1,2,10,100],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    }
svm_clf = SVC()
grid_cv = GridSearchCV(svm_clf , param_grid = param_distribs,
                              cv=5,scoring='f1',
                              verbose=1)
grid_cv.fit(X_train,y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.01, 0.01, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 1,
                               2, 10, 100],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear', 'rbf', 'polynomial']},
             scoring='f1', verbose=1)

In [42]:
grid_cv.best_estimator_

SVC(C=0.5, gamma=0.01)

In [43]:
train_evaluate(grid_cv.best_estimator_,X_train,y_train,"SVC Tuned")

Unnamed: 0,Name,F1_score_trainset,F1_score_validationset
0,SVC Tuned,0.848908,0.842824


In [44]:
#Fine Tuning Random Forest classifier
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators':[3,10,30,50,100],'max_features':[2,4,6,8],'max_depth' : [1,2,3,4]}
]



forest_clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(forest_clf, param_grid, cv=5,
                           scoring='f1',
                           return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid=[{'max_depth': [1, 2, 3, 4],
                          'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30, 50, 100]}],
             return_train_score=True, scoring='f1')

In [45]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=3, max_features=6, n_estimators=10,
                       random_state=42)

In [46]:
train_evaluate(grid_search.best_estimator_,X_train,y_train,"RandomForest Tuned")

Unnamed: 0,Name,F1_score_trainset,F1_score_validationset
0,RandomForest Tuned,0.851698,0.850809


In [47]:
#Fine tuning GradientBoost
param_grid = [
    {'n_estimators':[3,10,30,50,100],
    'max_features':[2,4,6,8,10],
    'max_depth' : [1,2,3,4],
    'subsample': [0.25,0.5,0.75]}
]

gdb_clf2 = GradientBoostingClassifier(random_state=42)
grid_search2 = GridSearchCV(gdb_clf2, param_grid, cv=5,
                           scoring='f1',
                           return_train_score=True)
grid_search2.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(random_state=42),
             param_grid=[{'max_depth': [1, 2, 3, 4],
                          'max_features': [2, 4, 6, 8, 10],
                          'n_estimators': [3, 10, 30, 50, 100],
                          'subsample': [0.25, 0.5, 0.75]}],
             return_train_score=True, scoring='f1')

In [48]:
grid_search2.best_estimator_

GradientBoostingClassifier(max_features=6, n_estimators=10, random_state=42,
                           subsample=0.5)

In [49]:
train_evaluate(grid_search2.best_estimator_,X_train,y_train,"GradientBoosting Tuned")

Unnamed: 0,Name,F1_score_trainset,F1_score_validationset
0,GradientBoosting Tuned,0.856664,0.852743


In [50]:
param_grid = [
    {'n_estimators':[3,10,30,50,100],
    'eta' : [0.01,0.025, 0.05, 0.1],
    'max_features':[2,4,6,8],
    'max_depth' : [1,2,3,4],
    'subsample': [0.5,0.75],
    'booster':['gblinear','gbtree']}
]

xgb_clf = XGBClassifier(verbosity = 0)
grid_search3 = GridSearchCV(xgb_clf, param_grid, cv=5,
                           scoring='f1',
                           return_train_score=True)
grid_search3.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                         

In [51]:
grid_search3.best_estimator_

XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eta=0.05, eval_metric=None,
              feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=2,
              max_features=2, max_leaves=None, min_child_weight=None,
              missing=nan, monotone_constraints=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, ...)

In [52]:
train_evaluate(grid_search3.best_estimator_,X_train,y_train,"XGBoost Finetuned")

Unnamed: 0,Name,F1_score_trainset,F1_score_validationset
0,XGBoost Finetuned,0.860051,0.856225


In [53]:
XGBoost_final = grid_search3.best_estimator_

In [54]:
scores = pd.DataFrame({'Train_score' : [0.8489,0.8517,0.8566,0.8601],'Val_score' : [0.8428,0.8508,0.8527,0.8563]} 
                        ,index=['SVM','RandomForest','GDBoost','XGBoost'])
scores

Unnamed: 0,Train_score,Val_score
SVM,0.8489,0.8428
RandomForest,0.8517,0.8508
GDBoost,0.8566,0.8527
XGBoost,0.8601,0.8563


In [55]:
#Repeating the preprocessing steps for the test data
X_test = test_data.drop('treatment',axis=1)
y_test = test_data['treatment'].copy()

In [56]:
X_test['self_employed'].fillna(se_mode,inplace=True)
X_test['work_interfere'].fillna('Never',inplace = True)

In [57]:
from sklearn.preprocessing import OrdinalEncoder
# We should only transform using the learned encoder from the training set
X_test[features[1:]] = ord_encoder.transform(X_test.iloc[:,1:])

In [58]:
X_test[features] = std_scaler.transform(X_test)

In [59]:
# Encoding the target column
y_test = lb_encoder.transform(y_test)

In [60]:
# Evaluating the model on test set with our finalized model
y_test_pred = XGBoost_final.predict(X_test)
print(f'F1_score on Test Set : {f1_score(y_test,y_test_pred)}')

F1_score on Test Set : 0.8269230769230769


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   age                        1259 non-null   int64 
 1   gender                     1259 non-null   object
 2   self_employed              1241 non-null   object
 3   family_history             1259 non-null   object
 4   treatment                  1259 non-null   object
 5   work_interfere             995 non-null    object
 6   no_employees               1259 non-null   object
 7   remote_work                1259 non-null   object
 8   tech_company               1259 non-null   object
 9   benefits                   1259 non-null   object
 10  care_options               1259 non-null   object
 11  wellness_program           1259 non-null   object
 12  seek_help                  1259 non-null   object
 13  anonymity                  1259 non-null   object
 14  leave   

In [62]:
import pickle
pickle.dump(XGBoost_final, open("model.pkl", "wb"))

In [63]:
health.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1070 entries, 676 to 191
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   age                        1070 non-null   int64 
 1   gender                     1070 non-null   object
 2   self_employed              1056 non-null   object
 3   family_history             1070 non-null   object
 4   treatment                  1070 non-null   object
 5   work_interfere             844 non-null    object
 6   no_employees               1070 non-null   object
 7   remote_work                1070 non-null   object
 8   tech_company               1070 non-null   object
 9   benefits                   1070 non-null   object
 10  care_options               1070 non-null   object
 11  wellness_program           1070 non-null   object
 12  seek_help                  1070 non-null   object
 13  anonymity                  1070 non-null   object
 14  leave  

In [64]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1070 entries, 676 to 191
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        1070 non-null   float64
 1   gender                     1070 non-null   float64
 2   self_employed              1070 non-null   float64
 3   family_history             1070 non-null   float64
 4   work_interfere             1070 non-null   float64
 5   no_employees               1070 non-null   float64
 6   remote_work                1070 non-null   float64
 7   tech_company               1070 non-null   float64
 8   benefits                   1070 non-null   float64
 9   care_options               1070 non-null   float64
 10  wellness_program           1070 non-null   float64
 11  seek_help                  1070 non-null   float64
 12  anonymity                  1070 non-null   float64
 13  leave                      1070 non-null   floa

In [65]:
result = XGBoost_final.predict(X_train)
result

array([0, 1, 1, ..., 1, 1, 1])