# Data Processing File

In [47]:
# All imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,precision_score,recall_score,accuracy_score

In [6]:
# Read the original survey export. The CSV sits in the repository root, so a
# relative path is enough once the repo has been cloned.
df = pd.read_csv(filepath_or_buffer="data_original.csv")
df.head(n=5)

Unnamed: 0,I am currently employed at least part-time,I identify as having a mental illness,Education,I have my own computer separate from a smart phone,I have been hospitalized before for my mental illness,How many days were you hospitalized for your mental illness,I am legally disabled,I have my regular access to the internet,I live with my parents,I have a gap in my resume,...,Obsessive thinking,Mood swings,Panic attacks,Compulsive behavior,Tiredness,Age,Gender,Household Income,Region,Device Type
0,0,0,High School or GED,0,0,0.0,0,1,0,1,...,1.0,0.0,1.0,0.0,0.0,30-44,Male,"$25,000-$49,999",Mountain,Android Phone / Tablet
1,1,1,Some Phd,1,0,0.0,0,1,0,0,...,0.0,0.0,1.0,0.0,1.0,18-29,Male,"$50,000-$74,999",East South Central,MacOS Desktop / Laptop
2,1,0,Completed Undergraduate,1,0,0.0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,30-44,Male,"$150,000-$174,999",Pacific,MacOS Desktop / Laptop
3,0,0,Some Undergraduate,1,0,,0,1,1,1,...,0.0,0.0,0.0,0.0,0.0,30-44,Male,"$25,000-$49,999",New England,Windows Desktop / Laptop
4,1,1,Completed Undergraduate,1,1,35.0,1,1,0,1,...,1.0,1.0,1.0,1.0,1.0,30-44,Male,"$25,000-$49,999",East North Central,iOS Phone / Tablet


In [7]:
# Show all column labels of the loaded frame.
df.columns

Index(['I am currently employed at least part-time',
       'I identify as having a mental illness', 'Education',
       'I have my own computer separate from a smart phone',
       'I have been hospitalized before for my mental illness',
       'How many days were you hospitalized for your mental illness',
       'I am legally disabled', 'I have my regular access to the internet',
       'I live with my parents', 'I have a gap in my resume',
       'Total length of any gaps in my resume in months.',
       'Annual income (including any social welfare programs) in USD',
       'I am unemployed', 'I read outside of work and school',
       'Annual income from social welfare programs', 'I receive food stamps',
       'I am on section 8 housing',
       'How many times were you hospitalized for your mental illness',
       'Lack of concentration', 'Anxiety', 'Depression', 'Obsessive thinking',
       'Mood swings', 'Panic attacks', 'Compulsive behavior', 'Tiredness',
       'Age', 'Gender

In [11]:
# Survey columns that are left out of the analysis.
cols = [
    'I have my own computer separate from a smart phone',
    'I have been hospitalized before for my mental illness',
    'How many days were you hospitalized for your mental illness',
    'I have my regular access to the internet',
    'I have a gap in my resume',
    'I am unemployed',
    'Annual income from social welfare programs',
    'I receive food stamps',
    'I am on section 8 housing',
    'How many times were you hospitalized for your mental illness',
    'Lack of concentration',
    'Anxiety',
    'Depression',
    'Obsessive thinking',
    'Mood swings',
    'Panic attacks',
    'Compulsive behavior',
    'Tiredness',
    'Household Income',
    'Device Type',
]

# Build a reduced frame without those columns; `df` itself is not modified.
df1 = df.drop(labels=cols, axis="columns")
df1.head()

Unnamed: 0,I am currently employed at least part-time,I identify as having a mental illness,Education,I am legally disabled,I live with my parents,Total length of any gaps in my resume in months.,Annual income (including any social welfare programs) in USD,I read outside of work and school,Age,Gender,Region
0,0,0,High School or GED,0,0,24,35,1,30-44,Male,Mountain
1,1,1,Some Phd,0,0,1,22,1,18-29,Male,East South Central
2,1,0,Completed Undergraduate,0,0,0,100,1,30-44,Male,Pacific
3,0,0,Some Undergraduate,0,1,11,0,1,30-44,Male,New England
4,1,1,Completed Undergraduate,1,0,33,32,1,30-44,Male,East North Central


In [15]:
# Give the remaining columns short snake_case names. The assignment is
# positional, so this list has to follow df1's current column order.
df1.columns = [
    'employment',
    'mental_illness',
    'education',
    'disability',
    'parents',
    'resume_gaps',
    'income',
    'reading',
    'age',
    'gender',
    'region',
]

# Reorder so the target column comes first, followed by the other columns.
cols = [
    'mental_illness',
    'age',
    'gender',
    'region',
    'education',
    'employment',
    'income',
    'resume_gaps',
    'disability',
    'parents',
    'reading',
]
df2 = df1.loc[:, cols]
df2.head()

Unnamed: 0,mental_illness,age,gender,region,education,employment,income,resume_gaps,disability,parents,reading
0,0,30-44,Male,Mountain,High School or GED,0,35,24,0,0,1
1,1,18-29,Male,East South Central,Some Phd,1,22,1,0,0,1
2,0,30-44,Male,Pacific,Completed Undergraduate,1,100,0,0,0,1
3,0,30-44,Male,New England,Some Undergraduate,0,0,11,0,1,1
4,1,30-44,Male,East North Central,Completed Undergraduate,1,32,33,1,0,1


In [27]:
# Translate the text answers into integer codes. Each list below is ordered so
# that a label's position in the list is the code it receives.
dict_age = {
    label: code
    for code, label in enumerate(["18-29", "30-44", "45-60", "> 60"])
}
dict_gender = {
    label: code
    for code, label in enumerate(["Male", "Female"])
}
dict_region = {
    label: code
    for code, label in enumerate([
        "East North Central",
        "East South Central",
        "Middle Atlantic",
        "Mountain",
        "New England",
        "Pacific",
        "South Atlantic",
        "West North Central",
        "West South Central",
    ])
}
dict_education = {
    label: code
    for code, label in enumerate([
        "Some highschool",
        "High School or GED",
        "Some Undergraduate",
        "Completed Undergraduate",
        # This key contains a non-breaking space (\xa0), presumably to match
        # how the label is spelled in the raw data.
        "Some\xa0Masters",
        "Completed Masters",
        "Some Phd",
        "Completed Phd",
    ])
}

# Apply each mapping to its own column; values without a matching key are
# left as they are.
df3 = df2.replace(
    to_replace={
        "age": dict_age,
        "gender": dict_gender,
        "region": dict_region,
        "education": dict_education,
    }
)
df3.head()

Unnamed: 0,mental_illness,age,gender,region,education,employment,income,resume_gaps,disability,parents,reading
0,0,1,0,3.0,1,0,35,24,0,0,1
1,1,0,0,1.0,6,1,22,1,0,0,1
2,0,1,0,5.0,3,1,100,0,0,0,1
3,0,1,0,4.0,2,0,0,11,0,1,1
4,1,1,0,0.0,3,1,32,33,1,0,1


In [30]:
# Keep only the rows in which every column has a value.
df3 = df3.dropna(axis="index", how="any")

In [31]:
# Sanity check: print the distinct values remaining in each column, one array
# after another, in the frame's column order.
print(
    *(
        df3[name].unique()
        for name in (
            "mental_illness",
            "age",
            "gender",
            "region",
            "education",
            "employment",
            "income",
            "resume_gaps",
            "disability",
            "parents",
            "reading",
        )
    ),
    sep="\n",
)

[0 1]
[1 0 2 3]
[0 1]
[3. 1. 5. 4. 0. 6. 2. 8. 7.]
[1 6 3 2 4 5 7 0]
[0 1]
[ 35  22 100   0  32   1  11  73  12  50  25  59  48  16  55  24  43  38
  75   5  49  45  40  30  20  18   8   7  47  39  27  93  10  46  28   2
  61  90  62  14  78   6  91  44  15  95  33  65  86  92  72  71  80  57
  29  53  97  56  17  19  21   9  31  99  88  69  37  67  70  42  13  96
  60  23   4  36  34  81  41  74  54  82  83  51]
[ 24   1   0  11  33  47  12   6  44  21  32  15  18  36   7  22  43 100
  52  10  23   4  53   9  48   2  14   8  13  16   3  97   5  30  59  20
  28  73  66  45]
[0 1]
[0 1]
[1 0]


In [53]:
# Persist the cleaned frame to disk. The row index is written as the first
# CSV column, which the reload step later in the notebook has to account for.
df3.to_csv(path_or_buf='data_new.csv', index=True)

## Train model

In [40]:
# Target vector: the self-identified mental-illness flag.
y = df3["mental_illness"]
# Feature matrix: every other column of the cleaned frame.
X = df3.drop(labels="mental_illness", axis="columns")

In [42]:
# Rescale every feature column with StandardScaler (zero mean, unit variance)
# and show the resulting array.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split in the following cell, so the test rows influence the scaling
# statistics. Consider splitting first and fitting on the training rows only.
ss = StandardScaler()
X = ss.fit(X).transform(X)
print(X)

[[-0.61436776 -1.06217001 -0.3201417  ... -0.33221664 -0.34874292
   0.35415213]
 [-1.59973054 -1.06217001 -1.10745313 ... -0.33221664 -0.34874292
   0.35415213]
 [-0.61436776 -1.06217001  0.46716974 ... -0.33221664 -0.34874292
   0.35415213]
 ...
 [ 1.3563578   0.94146887  1.25448117 ... -0.33221664 -0.34874292
   0.35415213]
 [-1.59973054  0.94146887  1.64813689 ...  3.01008406  2.86744176
   0.35415213]
 [-1.59973054  0.94146887  0.46716974 ...  3.01008406 -0.34874292
   0.35415213]]


In [45]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
# Fit a logistic-regression classifier with default settings on the training
# split, predict the held-out rows, and display the fraction predicted
# correctly.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8955223880597015

In [50]:
# Confusion matrix for the held-out rows
# (rows = true class, columns = predicted class).
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cf_matrix)

[[56  2]
 [ 5  4]]


## Reload the saved data and retrain (this should run similarly to the section above)

In [61]:
# Reload the cleaned data saved earlier. That file was written with the row
# index as its first column, so read that column back as the index instead of
# slicing it off by position.
df = pd.read_csv("data_new.csv", index_col=0)

# Target vector and feature matrix.
y = df["mental_illness"]
X = df.drop(columns=["mental_illness"])

# Split BEFORE scaling so the held-out rows never contribute to the scaler's
# mean/variance (fitting the scaler on all rows leaks test-set statistics).
# The split itself is unchanged: same rows, same test_size, same random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize data: learn the statistics from the training rows only, then
# apply that same transformation to the test rows.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Fit the classifier; the fitted estimator is the cell's displayed output.
lr = LogisticRegression()
lr.fit(X_train, y_train)
 


LogisticRegression()

In [63]:
# Show the (rows, columns) dimensions of the held-out feature matrix.
X_test.shape

(67, 10)

In [64]:
# One hypothetical respondent, using the same feature names, column order and
# integer codes as the training data.
d = {
    'age': [0],
    'gender': [1],
    'region': [1],
    'education': [1],
    'employment': [1],
    'income': [20],
    'resume_gaps': [2],
    'disability': [0],
    'parents': [1],
    'reading': [1],
}
df = pd.DataFrame(data=d)

# Reuse the scaler `ss` that was already fitted when the model was trained.
# Fitting a fresh StandardScaler on this single row would subtract the row
# from itself, turning every feature into 0, and the model would then return
# the same prediction no matter what values were entered.
X = ss.transform(df)

y_pred = lr.predict(X)

In [65]:
# Display the predicted class label(s) for the sample built above.
y_pred

array([0])