# This Logistic Regression model was made to predict whether or not an individual has a mental health disorder based on various factors (output: yes or no).

In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import sqlite3
import numpy as np

In [2]:
# Load data
# Read the cleaned 2017-2019 survey CSV. pd.read_csv already returns a DataFrame,
# so it is bound to `data` directly (no extra pd.DataFrame(...) wrap, and no
# temporary named `csv`, which would shadow the standard-library module name).
data = pd.read_csv('../../HM/Data_Clean_2017_to_2019.csv')
data.head(50)

Unnamed: 0.1,Unnamed: 0,age,"Gender: F=0, M=1, Non-binary=3","If possibly, what disorder(s) do you believe you have?",Have you ever been diagnosed with a mental health disorder? Y=1 N=0,Have you had a mental health disorder in the past? Y=1 N=0,Do you have a family history of mental illness? Y=1 N=0,Are you self-employed? Y=1 N=0,Is your employer primarily a tech company/organization? Y=1 N=0,How many employees does your company or organization have?,Is your primary role within your company related to tech/IT?,Age Groups
0,0,25.0,1,,0,0,0,0,1,26-100,1,24-30
1,1,51.0,1,"Mood Disorder (Depression, Bipolar Disorder, etc)",0,0,1,0,1,26-100,1,46-55
2,2,27.0,1,,0,0,0,0,1,26-100,1,24-30
3,3,37.0,1,,0,0,1,0,1,100-500,1,31-45
4,4,46.0,1,,0,0,0,0,1,26-100,1,46-55
5,5,36.0,0,,1,1,1,0,1,100-500,0,31-45
6,6,39.0,0,,1,1,1,0,1,26-100,1,31-45
7,7,35.0,1,"Substance Use Disorder, Mood Disorder (Depress...",0,0,1,1,0,,0,31-45
8,8,49.0,2,,0,0,0,0,1,26-100,1,46-55
9,9,45.0,1,"Mood Disorder (Depression, Bipolar Disorder, e...",0,0,0,0,1,05-25,1,31-45


In [3]:
# Give the independent/dependent variable columns short names.
# Keys must match the CSV headers exactly; several headers have two spaces before
# "Y=1 N=0". The past-disorder header is listed in both its one-space and two-space
# spelling so that whichever one is present gets renamed.
# NOTE(review): the gender header says "Non-binary=3", but the preview rows above
# contain the value 2 in that column - confirm the actual encoding.
column_aliases = {
    'Gender: F=0, M=1, Non-binary=3': 'gender',
    'Have you ever been diagnosed with a mental health disorder? Y=1 N=0': 'mentalHealthDisorder',
    'Are you self-employed?  Y=1 N=0': 'selfEmployment',
    'Have you had a mental health disorder in the past? Y=1 N=0': 'pastDisorder',
    'Do you have a family history of mental illness?  Y=1 N=0': 'familyHistory',
    'Is your employer primarily a tech company/organization?  Y=1 N=0': 'primarilyTechOrg',
    'How many employees does your company or organization have?': 'employeeNumber',
    'Is your primary role within your company related to tech/IT?': 'techRole',
    'Have you had a mental health disorder in the past?  Y=1 N=0': 'pastDisorder',
}
renamed_data = data.rename(columns=column_aliases)
renamed_data.head()

Unnamed: 0.1,Unnamed: 0,age,gender,"If possibly, what disorder(s) do you believe you have?",mentalHealthDisorder,pastDisorder,familyHistory,selfEmployment,primarilyTechOrg,employeeNumber,techRole,Age Groups
0,0,25.0,1,,0,0,0,0,1,26-100,1,24-30
1,1,51.0,1,"Mood Disorder (Depression, Bipolar Disorder, etc)",0,0,1,0,1,26-100,1,46-55
2,2,27.0,1,,0,0,0,0,1,26-100,1,24-30
3,3,37.0,1,,0,0,1,0,1,100-500,1,31-45
4,4,46.0,1,,0,0,0,0,1,26-100,1,46-55


In [4]:
# Drop rows with a missing value in the independent-variable column (familyHistory)
## TODO: *BIN RANGES HAVE TO BE CHANGED TO NUMBERS*
df = renamed_data.dropna(subset=['familyHistory'])
# Preview the filtered frame (df), not the unfiltered renamed_data
df.head(10)

Unnamed: 0.1,Unnamed: 0,age,gender,"If possibly, what disorder(s) do you believe you have?",mentalHealthDisorder,pastDisorder,familyHistory,selfEmployment,primarilyTechOrg,employeeNumber,techRole,Age Groups
0,0,25.0,1,,0,0,0,0,1,26-100,1,24-30
1,1,51.0,1,"Mood Disorder (Depression, Bipolar Disorder, etc)",0,0,1,0,1,26-100,1,46-55
2,2,27.0,1,,0,0,0,0,1,26-100,1,24-30
3,3,37.0,1,,0,0,1,0,1,100-500,1,31-45
4,4,46.0,1,,0,0,0,0,1,26-100,1,46-55
5,5,36.0,0,,1,1,1,0,1,100-500,0,31-45
6,6,39.0,0,,1,1,1,0,1,26-100,1,31-45
7,7,35.0,1,"Substance Use Disorder, Mood Disorder (Depress...",0,0,1,1,0,,0,31-45
8,8,49.0,2,,0,0,0,0,1,26-100,1,46-55
9,9,45.0,1,"Mood Disorder (Depression, Bipolar Disorder, e...",0,0,0,0,1,05-25,1,31-45


In [5]:
# Checking data balance
# Count rows per class of the dependent variable:
#   noMHD = rows where mentalHealthDisorder == 0 (no mental health disorder)
#   mhd   = rows where mentalHealthDisorder == 1 (mental health disorder)
# Vectorized comparisons replace the row-by-row iterrows() loop; as before, any
# value other than 0 or 1 (including NaN) is counted in neither bucket.
diagnosis = renamed_data['mentalHealthDisorder']
noMHD = int((diagnosis == 0).sum())
mhd = int((diagnosis == 1).sum())

print(noMHD)
print(mhd)

# Percentage of total data with mentalHealthDisorder=1 (42.4% people in the dataset have been diagnosed)
total = noMHD + mhd
print(mhd/total * 100)

878
647
42.42622950819672


In [6]:
## Potential independent variables:
# Age
# Gender
# Self Employment
# Remote Working Situation

## Dependent variable:
# Whether they have been diagnosed with a mental health disorder by a medical professional

In [7]:
# Independent variable = family history of mental illness (familyHistory, Y=1 N=0).
# reshape(-1, 1) turns the 1-D column into a single-feature 2-D array (one row per respondent).
X = df['familyHistory'].to_numpy().reshape(-1, 1)

In [8]:
# Dependent variable = whether the respondent has been diagnosed with a mental
# health disorder (mentalHealthDisorder), shaped as a single column.
y = df['mentalHealthDisorder'].to_numpy().reshape(-1, 1)

In [9]:
# Split data into training and testing partitions.
# stratify=y keeps the 0/1 class proportions the same in both partitions, and
# random_state=1 makes the split repeatable. No test_size is passed, so
# scikit-learn's default hold-out fraction applies.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

In [10]:
# Create the (still unfitted) logistic regression model
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=1, solver='lbfgs')

In [11]:
## Fit the model on the training partition
# y_train is a column vector (y was built with reshape(-1, 1)); flatten it to 1-D for fit().
classifier.fit(X_train, y_train.reshape(-1))

LogisticRegression(random_state=1)

In [12]:
## Predict outcomes for the test partition and show them next to the actual labels
predictions = classifier.predict(X_test)
comparison = {"Prediction": predictions, "Actual": y_test.reshape(-1)}
pd.DataFrame(comparison)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,1,0
3,0,1
4,0,0
...,...,...
377,1,1
378,0,0
379,1,0
380,0,0


In [13]:
# Validate the model: fraction of test rows whose predicted label equals the actual label
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=predictions)

0.6335078534031413

In [14]:
# Cleaned Data ML Tests:

## Family History*
### Accuracy: 0.6335078534031413 - *Highest so far, but we can test other variables to find a stronger correlation.
### Greater accuracy than gender, but still not at the desired level.

## Gender
### Accuracy: 0.6020942408376964 
### Greater accuracy than age but still not at desired output.

## Age
### Accuracy: 0.5774278215223098 
### Age doesn't seem to have a strong correlation with mental health disorder (MHD) rates in the tech industry.

## Self-Employment
### Accuracy: 0.5759162303664922 
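### Low accuracy - self-employment status does not seem to have a strong correlation with MHD rates in tech.
### Note: 0.576 is about the share of the majority class in the data (878 of 1525 respondents have no diagnosis),
### so the model is probably predicting "no diagnosis" for every row, i.e. no better than a majority-class baseline.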

## Primarily Tech Organization
### Accuracy: 0.5759162303664922
### Low accuracy 

## Primarily Tech Role
### Accuracy: 0.5759162303664922
### Low accuracy

In [15]:
# Description of model, preliminary processing/selection, and data splitting in main README.md

In [16]:
# Step 2 - After selecting effective variable, test by generating new data point. 

In [17]:
# ## Generate new data point 
# import numpy as np
# new_data = np.random.randint(2, size=1)

In [18]:
# # Prediction of the new data point
# predictions = classifier.predict(new_data)
# print("Classes are either 0 (no mental health disorder diagnosis) or 1 (mental health disorder diagnosis)")
# print(f"The new point was classified as: {predictions}")