# AI in India (Improving Mental Health)

## 1. Importing Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

In [None]:
data = pd.read_csv('/kaggle/input/mental-health-in-tech-survey/survey.csv')

In [None]:
data.head()

In [None]:
data.shape

## Describing features:
This dataset contains the following data:

- Timestamp

- Age

- Gender

- Country

- state: If you live in the United States, which state or territory do you live in?

- self_employed: Are you self-employed?

- family_history: Do you have a family history of mental illness?

- treatment: Have you sought treatment for a mental health condition?

- work_interfere: If you have a mental health condition, do you feel that it interferes with your work?

- no_employees: How many employees does your company or organization have?

- remote_work: Do you work remotely (outside of an office) at least 50% of the time?

- tech_company: Is your employer primarily a tech company/organization?

- benefits: Does your employer provide mental health benefits?

- care_options: Do you know the options for mental health care your employer provides?

- wellness_program: Has your employer ever discussed mental health as part of an employee wellness program?

- seek_help: Does your employer provide resources to learn more about mental health issues and how to seek help?

- anonymity: Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources?

- leave: How easy is it for you to take medical leave for a mental health condition?

- mental_health_consequence: Do you think that discussing a mental health issue with your employer would have negative consequences?

- phys_health_consequence: Do you think that discussing a physical health issue with your employer would have negative consequences?

- coworkers: Would you be willing to discuss a mental health issue with your coworkers?

- supervisor: Would you be willing to discuss a mental health issue with your direct supervisor(s)?

- mental_health_interview: Would you bring up a mental health issue with a potential employer in an interview?

- phys_health_interview: Would you bring up a physical health issue with a potential employer in an interview?

- mental_vs_physical: Do you feel that your employer takes mental health as seriously as physical health?

- obs_consequence: Have you heard of or observed negative consequences for coworkers with mental health conditions in your workplace?

- comments: Any additional notes or comments

## 2. Exploratory Data Analysis

In [None]:
data.head()

## NOTE: All variables except Age are categorical

## Data types validation
Checking that every feature has the correct data type.

In [None]:
data.dtypes

In [None]:
## Processes on Datetime
#data['Timestamp'] = pd.to_datetime(data['Timestamp'])
#data['Time_Year'] = data['Timestamp'].dt.year
#data['Time_Month'] = data['Timestamp'].dt.month
#We require only year and month
#data.drop('Timestamp', axis = 1, inplace=True)

In [None]:
# Dropping features that are less relevant to an Indian audience
data.drop(['Timestamp', 'state', 'comments'], axis=1, inplace=True)

In [None]:
# Visualizing features
# One panel per column on a 7x4 grid: a distribution plot for the numeric Age
# column, a count plot for every categorical column.
fig = plt.figure(figsize=(40, 70))
for i, col in enumerate(data.columns):
    ax = fig.add_subplot(7, 4, i + 1)
    if col == 'Age':
        # NOTE(review): distplot is deprecated in recent seaborn releases;
        # switch to histplot(..., kde=True) once old versions need not be supported.
        sns.distplot(data[col], ax=ax)
    elif col != 'Timestamp':
        # Pass the Series as `x` explicitly: seaborn >= 0.12 treats the first
        # positional argument as `data`, not as the variable to count.
        sns.countplot(x=data[col], ax=ax)

In [None]:
fig.savefig('plots.png')

In [None]:
from IPython.display import FileLinks
FileLinks('.')

# Data Preparation

## Gender

In [None]:
# Gender was a free-text field, so the same answer appears under many spellings
data.Gender.value_counts()

In [None]:
data['Gender'] = data['Gender'].str.lower().str.strip()

In [None]:
data['Gender'].value_counts().keys()

In [None]:
# Known spellings of each gender answer (already lower-cased and stripped).
# Every entry needs its own comma: adjacent string literals are silently
# concatenated, which would merge 'guy (-ish) ^_^' and 'male-ish' into one
# bogus entry and push both spellings into the "other" group.
male_words = ['m', 'male', 'make', 'cis male', 'man', 'something kinda male?', 'malr', 'male (cis)', 'maile', 'mal',
              'guy (-ish) ^_^', 'male-ish', 'cis man', 'msle', 'male leaning androgynous',
              'ostensibly male, unsure what that really means', 'mail']
# NOTE(review): 'fluid' is grouped with the female spellings -- confirm this is intended.
female_words = ['f', 'female', 'woman', 'femake', 'fluid', 'cis-female/femme', 'female (cis)', 'cis female', 'femail']

# Every remaining answer present in the data is treated as "other".
other_words = [word for word in data['Gender'].value_counts().keys()
               if word not in male_words and word not in female_words]

In [None]:
other_words

In [None]:
# Build one lookup from every known spelling to its one-letter code.
# Groups are visited in the order male, female, other, so a spelling listed
# in more than one group ends up with the code of the last group.
dict_replace = {
    spelling: code
    for spellings, code in ((male_words, 'm'), (female_words, 'f'), (other_words, 'o'))
    for spelling in spellings
}
dict_replace

In [None]:
data['Gender'] = data['Gender'].replace(dict_replace)

In [None]:
data['Gender'].value_counts()

In [None]:
data.head()

In [None]:
data.isnull().any()

## Country

In [None]:
data['Country'].value_counts()

### No changes required in 'Country' feature

## Self Employed

In [None]:
data.self_employed.value_counts()

In [None]:
data.isnull().sum()

In [None]:
#Replacing null values in self_employed with No

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Missing answers are treated as "No" (0).
data['self_employed'] = data['self_employed'].replace({'Yes':1, 'No':0, np.nan:0})

In [None]:
data.self_employed.value_counts()

## Family History

In [None]:
data['family_history'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
data['family_history'] = data['family_history'].replace({'Yes':1, 'No':0})

## Treatment (Target Variable)

In [None]:
data['treatment'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
data['treatment'] = data['treatment'].replace({'Yes':1, 'No':0})

## Work Interfere

In [None]:
data['work_interfere'].value_counts()

In [None]:
# Does work_interfere depend on treatment?
data.loc[data.treatment==1, 'work_interfere']

In [None]:
#Replacing null values with a placeholder
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Ordinal scale Never < Rarely < Sometimes < Often; -1 marks a missing answer.
data['work_interfere'] = data['work_interfere'].replace({np.nan:-1, 'Never':0, 'Rarely':1, 'Sometimes':2, 'Often':3})

## No. of employees

In [None]:
data.no_employees.value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Each size bracket is replaced by a single representative head-count.
data['no_employees'] = data['no_employees'].replace({'1-5':3, '6-25':15.5, '26-100':63, '100-500':300, '500-1000':750, 'More than 1000':1000})

## Remote Work

In [None]:
data.remote_work.value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
data['remote_work'] = data['remote_work'].replace({'Yes':1, 'No':0})

## Tech Company

In [None]:
data['tech_company'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
data['tech_company'] = data['tech_company'].replace({'Yes':1, 'No':0})

## Benefits

In [None]:
data['benefits'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Three-way answers are encoded as Yes=1, undecided=0, No=-1.
data['benefits'] = data['benefits'].replace({'Yes':1, 'No':-1, 'Don\'t know':0})

## Care Options

In [None]:
data['care_options'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Three-way answers are encoded as Yes=1, undecided=0, No=-1.
data['care_options'] = data['care_options'].replace({'Yes':1, 'No':-1, 'Not sure':0})

## Wellness Program

In [None]:
data['wellness_program'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Three-way answers are encoded as Yes=1, undecided=0, No=-1.
data['wellness_program'] = data['wellness_program'].replace({'Yes':1, 'No':-1, 'Don\'t know':0})

## Seek Help

In [None]:
data['seek_help'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Three-way answers are encoded as Yes=1, undecided=0, No=-1.
data['seek_help'] = data['seek_help'].replace({'Yes':1, 'No':-1, 'Don\'t know':0})

## Anonymity

In [None]:
data['anonymity'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Three-way answers are encoded as Yes=1, undecided=0, No=-1.
data['anonymity'] = data['anonymity'].replace({'Yes':1, 'No':-1, 'Don\'t know':0})

## Leave

In [None]:
data['leave'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Symmetric ordinal scale from "Very difficult" (-2) to "Very easy" (2), with "Don't know" at 0.
data['leave'] = data['leave'].replace({'Very difficult':-2, 'Somewhat difficult':-1, 'Don\'t know':0, 'Somewhat easy':1, 'Very easy':2})

## Mental Health Consequence 

In [None]:
data['mental_health_consequence'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Three-way answers are encoded as Yes=1, undecided=0, No=-1.
data['mental_health_consequence'] = data['mental_health_consequence'].replace({'Yes':1, 'No':-1, 'Maybe':0})

## Physical Health Consequence

In [None]:
data['phys_health_consequence'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Three-way answers are encoded as Yes=1, undecided=0, No=-1.
data['phys_health_consequence'] = data['phys_health_consequence'].replace({'Yes':1, 'No':-1, 'Maybe':0})

## Coworkers

In [None]:
data['coworkers'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Three-way answers are encoded as Yes=1, undecided=0, No=-1.
data['coworkers'] = data['coworkers'].replace({'Yes':1, 'No':-1, 'Some of them':0})

## Supervisor

In [None]:
data['supervisor'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Three-way answers are encoded as Yes=1, undecided=0, No=-1.
data['supervisor'] = data['supervisor'].replace({'Yes':1, 'No':-1, 'Some of them':0})

## Mental Health Interview

In [None]:
data['mental_health_interview'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Three-way answers are encoded as Yes=1, undecided=0, No=-1.
data['mental_health_interview'] = data['mental_health_interview'].replace({'Yes':1, 'No':-1, 'Maybe':0})

## Physical Health Interview

In [None]:
data['phys_health_interview'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Three-way answers are encoded as Yes=1, undecided=0, No=-1.
data['phys_health_interview'] = data['phys_health_interview'].replace({'Yes':1, 'No':-1, 'Maybe':0})

## Mental vs Physical

In [None]:
data['mental_vs_physical'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
# Three-way answers are encoded as Yes=1, undecided=0, No=-1.
data['mental_vs_physical'] = data['mental_vs_physical'].replace({'Yes':1, 'No':-1, 'Don\'t know':0})

## Obs Consequence

In [None]:
data['obs_consequence'].value_counts()

In [None]:
# Assign back: replace(inplace=True) on a column selection is a no-op under pandas Copy-on-Write.
data['obs_consequence'] = data['obs_consequence'].replace({'Yes':1, 'No':0})

# Data Preparation Ends Here

In [None]:
data.head()

In [None]:
# One-hot encoding of Gender and Country (first level of each dropped)
final_data = pd.get_dummies(data=data, columns=['Gender', 'Country'], drop_first=True)

In [None]:
# Extracting the labels (target variables)
# pop() hands back the label column and removes it from the feature frame in one step.
treatment = final_data.pop('treatment')

In [None]:
final_data.head()

# Splitting of Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(final_data, treatment, test_size=0.3, random_state=2)

# Baseline Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression()

In [None]:
lr.fit(X_train, y_train)

In [None]:
lr.score(X_train, y_train)

In [None]:
lr.score(X_test, y_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtr = DecisionTreeClassifier()

In [None]:
dtr.fit(X_train, y_train)

In [None]:
dtr.score(X_train, y_train)

In [None]:
dtr.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier()

In [None]:
rf.fit(X_train, y_train)

In [None]:
rf.score(X_train, y_train)

In [None]:
rf.score(X_test, y_test)

In [None]:
from sklearn.svm import SVC
svc = SVC()

In [None]:
svc.fit(X_train, y_train)
svc.score(X_train, y_train)

In [None]:
svc.score(X_test, y_test)

# Creating Pipeline

In [None]:
class MyTransformer():
    """Clean and numerically encode the raw mental-health survey frame.

    ``fit`` learns the mean of ``Age``. ``transform`` drops unused columns,
    fills missing values, collapses the free-text ``Gender`` answers to
    ``'m'``/``'f'``/``'o'``, maps the answer strings to numbers and one-hot
    encodes ``Gender`` and ``Country``.

    The Age mean is computed with pandas rather than scikit-learn's
    ``Imputer``, which was removed from ``sklearn.preprocessing`` in
    scikit-learn 0.22.
    """

    # Columns discarded before encoding.
    _DROPPED_COLUMNS = ['Timestamp', 'state', 'comments']

    # Known spellings (lower-cased, stripped) of each gender answer.
    _MALE_WORDS = frozenset([
        'm', 'male', 'make', 'cis male', 'man', 'something kinda male?', 'malr',
        'male (cis)', 'maile', 'mal', 'guy (-ish) ^_^', 'male-ish', 'cis man',
        'msle', 'male leaning androgynous',
        'ostensibly male, unsure what that really means', 'mail',
    ])
    # NOTE(review): 'fluid' is grouped with the female spellings -- confirm this is intended.
    _FEMALE_WORDS = frozenset([
        'f', 'female', 'woman', 'femake', 'fluid', 'cis-female/femme',
        'female (cis)', 'cis female', 'femail',
    ])

    # Three-way (Yes / undecided / No) questions, encoded as 1 / 0 / -1.
    _MAYBE_VARS = [
        'benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity',
        'mental_health_consequence', 'phys_health_consequence', 'coworkers',
        'supervisor', 'mental_health_interview', 'phys_health_interview',
        'mental_vs_physical',
    ]

    def __init__(self):
        """Create an unfitted transformer; takes no arguments."""
        # Mean of the Age column, learned by fit().
        self.age_mean_ = None

    def fit(self, X, y=None):
        """Learn the mean of ``X['Age']`` (missing values ignored); return self."""
        self.age_mean_ = X['Age'].mean()
        return self

    def _encode_gender(self, value):
        """Return 'm', 'f' or 'o' for a single raw Gender answer.

        Non-string values (missing answers, or the 0 placeholder written by
        the global fill) are treated as male; unrecognised text becomes 'o'.
        """
        if not isinstance(value, str):
            return 'm'
        word = value.lower().strip()
        if word in self._MALE_WORDS:
            return 'm'
        if word in self._FEMALE_WORDS:
            return 'f'
        return 'o'

    def transform(self, X, y=None):
        """Return an encoded copy of ``X``; ``X`` itself is not modified.

        Raises:
            ValueError: if called before ``fit``.
        """
        if self.age_mean_ is None:
            raise ValueError('MyTransformer must be fitted before calling transform().')

        data = X.copy()

        # Drop unused features; tolerate frames where they are already gone.
        data = data.drop(self._DROPPED_COLUMNS, axis=1, errors='ignore')

        # Missing value treatment. Results are assigned back instead of using
        # inplace=True on a column selection, which does not update the frame
        # under pandas Copy-on-Write.
        ## For Age, use the mean learned in fit()
        data['Age'] = data['Age'].fillna(self.age_mean_).astype(float)
        ## For work_interfere, use -1
        data['work_interfere'] = data['work_interfere'].fillna(-1)
        ## For others, simply put 0
        data = data.fillna(0)

        # Transforming Gender
        ## In future: use NLP/regex to categorize
        data['Gender'] = data['Gender'].map(self._encode_gender)

        # Transforming Yes/No variables
        data = data.replace({'Yes': 1})
        data = data.replace({'Some of them': 0, 'Maybe': 0, "Don't know": 0, 'Not sure': 0})

        # When 3 categories are available (Yes/Maybe/No) use (1/0/-1); for
        # (Yes/No) use (1/0). The three-way columns must be handled first and
        # written back explicitly: data[cols] is a copy, so an in-place
        # replace on it would be lost and 'No' would collapse into 0.
        three_way = [col for col in self._MAYBE_VARS if col in data.columns]
        if three_way:
            data[three_way] = data[three_way].replace({'No': -1})
        data = data.replace({'No': 0})

        # Work interfere variable (missing answers were already set to -1)
        data['work_interfere'] = data['work_interfere'].replace(
            {'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Often': 3})

        # Leave variable
        data['leave'] = data['leave'].replace(
            {'Very difficult': -2, 'Somewhat difficult': -1, "Don't know": 0,
             'Somewhat easy': 1, 'Very easy': 2})

        # No of employees variable
        data['no_employees'] = data['no_employees'].replace(
            {'1-5': 3, '6-25': 15.5, '26-100': 63, '100-500': 300,
             '500-1000': 750, 'More than 1000': 1000})

        # Get dummies
        # NOTE(review): the dummy columns depend on the values present in X,
        # so frames transformed separately can end up with different columns.
        data = pd.get_dummies(data=data, columns=['Gender', 'Country'], drop_first=True)

        return data

# Modelling

In [None]:
data2 = pd.read_csv('/kaggle/input/mental-health-in-tech-survey/survey.csv')

In [None]:
# Splitting
# pop() hands back the label column and removes it from data2 in one step.
target = data2.pop('treatment')

In [None]:
trans = MyTransformer()
trans.fit(data2)
transformed_data = trans.transform(data2)

In [None]:
transformed_data.head()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(transformed_data, target, test_size=0.25, random_state=2)

In [None]:
# Models to test
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
from sklearn.model_selection import GridSearchCV

def train_model(model, X_train, y_train, param_grid, cv=5):
    """Grid-search ``model`` over ``param_grid`` and return the best estimator.

    Builds a ``GridSearchCV`` around ``model`` (``cv`` is forwarded as-is),
    fits it on the training data, prints ``best_params_`` and ``best_score_``
    and returns ``best_estimator_``.
    """
    print("Training model now")
    search = GridSearchCV(model, param_grid=param_grid, cv=cv).fit(X_train, y_train)
    print('Best_parameters: ', search.best_params_)
    print('Best validation score: ', search.best_score_)
    return search.best_estimator_

### Training Logistic Regression

In [None]:
logistic= LogisticRegression(solver='liblinear')
penalty=['l1','l2']
C=np.logspace(0,4,10)
param_lr = {'C':C, 'penalty':penalty}

In [None]:
best_lr = train_model(logistic, X_train, y_train, param_lr)

In [None]:
best_lr.score(X_train, y_train)

In [None]:
best_lr.score(X_test, y_test)