<a href="https://colab.research.google.com/github/Mac1211/Project/blob/main/model_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the dataset
data = pd.read_csv('Mental Health Dataset.csv', parse_dates=['Timestamp'], dtype={'self_employed': str, 'family_history': str, 'Growing_Stress': str, 'Changes_Habits': str, 'Mental_Health_History': str, 'Mood_Swings': str, 'Coping_Struggles': str, 'Work_Interest': str, 'Social_Weakness': str, 'care_options': str})

In [3]:
data.head()

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
0,2014-08-27 11:29:00,Female,United States,Corporate,,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
1,2014-08-27 11:31:00,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
2,2014-08-27 11:32:00,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
3,2014-08-27 11:37:00,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,2014-08-27 11:43:00,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes


In [4]:
data.columns

Index(['Timestamp', 'Gender', 'Country', 'Occupation', 'self_employed',
       'family_history', 'treatment', 'Days_Indoors', 'Growing_Stress',
       'Changes_Habits', 'Mental_Health_History', 'Mood_Swings',
       'Coping_Struggles', 'Work_Interest', 'Social_Weakness',
       'mental_health_interview', 'care_options'],
      dtype='object')

In [5]:
data.describe()

Unnamed: 0,Timestamp
count,292364
mean,2014-09-09 04:08:59.621978368
min,2014-08-27 11:29:00
25%,2014-08-27 14:18:00
50%,2014-08-28 00:43:00
75%,2014-08-28 22:22:00
max,2016-02-01 23:04:00


In [6]:
data.shape

(292364, 17)

In [7]:
# Handle missing values
data = data.dropna()

In [8]:
data.shape

(287162, 17)

In [9]:
# Encode categorical variables
label_encoder = LabelEncoder()
categorical_cols = ['Gender', 'Country', 'Occupation', 'self_employed', 'family_history', 'Growing_Stress', 'Changes_Habits', 'Mental_Health_History', 'Mood_Swings', 'Coping_Struggles', 'Work_Interest', 'Social_Weakness', 'care_options']
data[categorical_cols] = data[categorical_cols].apply(lambda col: label_encoder.fit_transform(col))

In [10]:
# Check Categorical Columns for unexpected string values
for col in categorical_cols:
    print(col, data[col].unique())

Gender [0 1]
Country [34 25  0  4 33 29 30 22 21 15  1 16 11 26  3  6 27 13 31 10 17 18  2 28
 23  7 32  9 19 14 20  5 12  8 24]
Occupation [1 4 0 2 3]
self_employed [0 1]
family_history [1 0]
Growing_Stress [2 1 0]
Changes_Habits [1 2 0]
Mental_Health_History [2 1 0]
Mood_Swings [2 1 0]
Coping_Struggles [0 1]
Work_Interest [1 0 2]
Social_Weakness [2 1 0]
care_options [2 1 0]


In [11]:
# Identify unexpected string values and handle them appropriately
# Example: If 'Yes' and 'No' are unexpected string values, map them to 1 and 0 respectively
data.replace({'Growing_Stress': {'Yes': 1, 'No': 0},'treatment': {'Yes': 1, 'No': 0}, 'Changes_Habits': {'Yes': 1, 'No': 0}, 'Mental_Health_History': {'Yes': 1, 'No': 0}, 'Mood_Swings': {'Yes': 1, 'No': 0}, 'Coping_Struggles': {'Yes': 1, 'No': 0}, 'Work_Interest': {'Yes': 1, 'No': 0}, 'Social_Weakness': {'Yes': 1, 'No': 0}}, inplace=True)

In [12]:
# Now, re-encode categorical variables
data[categorical_cols] = data[categorical_cols].apply(lambda col: label_encoder.fit_transform(col))

In [13]:
data.head()

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
3,2014-08-27 11:37:00,0,34,1,0,1,1,1-14 days,2,1,2,2,0,1,2,Maybe,2
4,2014-08-27 11:43:00,0,34,1,0,1,1,1-14 days,2,1,2,2,0,1,2,No,2
5,2014-08-27 11:49:00,0,25,1,0,0,1,1-14 days,2,1,2,2,0,1,2,Maybe,1
6,2014-08-27 11:51:00,0,0,1,0,1,1,1-14 days,2,1,2,2,0,1,2,No,1
7,2014-08-27 11:52:00,0,34,1,0,0,0,1-14 days,2,1,2,2,0,1,2,No,0


In [14]:
# Split the dataset into features and target variable
X = data.drop(['Timestamp', 'mental_health_interview'], axis=1)
y = data['mental_health_interview']

In [15]:
X.shape

(287162, 15)

In [17]:
# Replace '15-30 days' with 25 in the Days_Indoors column
X.replace({'Days_Indoors': {'15-30 days': 25}}, inplace=True)

# Convert the column to numeric
X['Days_Indoors'] = pd.to_numeric(X['Days_Indoors'], errors='coerce')  # Convert to numeric, coerce errors to NaN

# Drop rows with NaN values from both X and y
X = X.dropna()
y = y[X.index]  # Align y with X after dropping rows

In [18]:
# Encode the target variable
y = label_encoder.fit_transform(y)

In [19]:
X.shape

(52867, 15)

In [20]:
y.shape

(52867,)

In [21]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
X_train.head()

Unnamed: 0,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,care_options
81271,1,34,2,0,1,1,25.0,2,1,1,1,0,2,2,2
153218,1,34,4,0,0,0,25.0,1,0,0,1,1,2,2,0
44111,0,33,1,0,0,0,25.0,2,1,1,2,1,1,0,0
76522,1,4,3,0,1,1,25.0,2,2,0,0,0,1,0,2
20843,0,34,2,0,0,0,25.0,1,1,0,0,0,0,1,2


In [23]:
X_train.shape

(42293, 15)

In [24]:
y_train.shape

(42293,)

In [25]:
# Train a Decision Tree Classifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [27]:
# Evaluate the model on the test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [28]:
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

Accuracy: 0.8185171174579157
              precision    recall  f1-score   support

           0       0.58      0.30      0.40      1868
           1       0.85      0.95      0.90      8438
           2       0.66      0.19      0.30       268

    accuracy                           0.82     10574
   macro avg       0.69      0.48      0.53     10574
weighted avg       0.79      0.82      0.79     10574

