In [2]:
#importing the necesssary python libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

In [3]:
#reading  the dataset
df=pd.read_csv("C:\\Users\\USER\\OneDrive\\Documents\\Mental Health\\Indicators_of_Anxiety_or_Depression_Based_on_Reported_Frequency_of_Symptoms_During_Last_7_Days.csv")
print(df.head())

                         Indicator              Group          State  \
0  Symptoms of Depressive Disorder  National Estimate  United States   
1  Symptoms of Depressive Disorder             By Age  United States   
2  Symptoms of Depressive Disorder             By Age  United States   
3  Symptoms of Depressive Disorder             By Age  United States   
4  Symptoms of Depressive Disorder             By Age  United States   

        Subgroup Phase  Time Period     Time Period Label  \
0  United States     1            1  Apr 23 - May 5, 2020   
1  18 - 29 years     1            1  Apr 23 - May 5, 2020   
2  30 - 39 years     1            1  Apr 23 - May 5, 2020   
3  40 - 49 years     1            1  Apr 23 - May 5, 2020   
4  50 - 59 years     1            1  Apr 23 - May 5, 2020   

  Time Period Start Date Time Period End Date  Value  Low CI  High CI  \
0             04/23/2020           05/05/2020   23.5    22.7     24.3   
1             04/23/2020           05/05/2020   32.7  

In [4]:
#checking the number of rows anc columns in this dataset using the df.info() function
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13905 entries, 0 to 13904
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Indicator               13905 non-null  object 
 1   Group                   13905 non-null  object 
 2   State                   13905 non-null  object 
 3   Subgroup                13905 non-null  object 
 4   Phase                   13905 non-null  object 
 5   Time Period             13905 non-null  int64  
 6   Time Period Label       13905 non-null  object 
 7   Time Period Start Date  13905 non-null  object 
 8   Time Period End Date    13905 non-null  object 
 9   Value                   13284 non-null  float64
 10  Low CI                  13284 non-null  float64
 11  High CI                 13284 non-null  float64
 12  Confidence Interval     13284 non-null  object 
 13  Quartile Range          9180 non-null   object 
dtypes: float64(3), int64(1), object(10)
me

Using the data.info() function,we can see that thee dataset has 14 columns,13905 rows.
Most columns have zero missing values(as shown by the non null column).
The data values in this dataset are of two data types.Object or string data types and float data types(data types that have decimal values).

In [5]:
#descriptive statistics of the dataset.
#this will help in determining how to handle the missing values in the numeric columns
print(df.describe())

        Time Period         Value        Low CI       High CI
count  13905.000000  13284.000000  13284.000000  13284.000000
mean      29.951456     29.687060     26.060020     33.559335
std       18.052128      8.222772      8.000789      8.644455
min        1.000000      6.400000      4.500000      7.900000
25%       14.000000     24.200000     20.700000     27.800000
50%       31.000000     29.100000     25.400000     33.000000
75%       46.000000     34.200000     30.500000     38.300000
max       60.000000     85.200000     79.900000     89.500000


We can fill in the missing values using the median(50%).
It is appropriate to use when dealing with skewed distribution,or when there are outliers present in the dataset

In [5]:
#Filling in the missing values in the 5 columns using the median
df["Value"].fillna(df["Value"].median(),inplace=True)
df["Low CI"].fillna(df["Low CI"].median(),inplace=True)                 
df["High CI"].fillna(df["High CI"].median(),inplace=True)
#Filling in the missing values using the mode

df["Confidence Interval"].fillna(df["Confidence Interval"].mode(),inplace=True)
df["Quartile Range"].fillna(df["Quartile Range"].mode(),inplace=True)

In [6]:
#Converting the date columns from object to date and time columns using to_datetime() function
df["Time Period Start Date"]=pd.to_datetime(df["Time Period Start Date"])
print(df)
df["Time Period End Date"]=pd.to_datetime(df["Time Period End Date"])
print(df)

                                               Indicator              Group  \
0                        Symptoms of Depressive Disorder  National Estimate   
1                        Symptoms of Depressive Disorder             By Age   
2                        Symptoms of Depressive Disorder             By Age   
3                        Symptoms of Depressive Disorder             By Age   
4                        Symptoms of Depressive Disorder             By Age   
...                                                  ...                ...   
13900  Symptoms of Anxiety Disorder or Depressive Dis...           By State   
13901  Symptoms of Anxiety Disorder or Depressive Dis...           By State   
13902  Symptoms of Anxiety Disorder or Depressive Dis...           By State   
13903  Symptoms of Anxiety Disorder or Depressive Dis...           By State   
13904  Symptoms of Anxiety Disorder or Depressive Dis...           By State   

               State       Subgroup Phase  Time Per

In [7]:
#EXPLORATORY DATA ANALYSIS
#Distribution of people with Symptoms of Anxiety Disorder,Symptoms of Depressive Disorder and
# or Symptoms of Anxiety Disorder or Depressive Disorder
import plotly.express as px
symptoms_count=df["Indicator"].value_counts()
fig_symptoms_counts=px.pie(symptoms_count,names=symptoms_count.index,title="distribution of people with symptoms")
fig_symptoms_counts.show()


From this pie chart,people with the different symptoms of disorders have approximately the same percentage in all the three groups.
this means that approximately 4,630 people have Symptoms of Depressive Disorder,approximately 4,630 people have Symptoms of Anxiety Disorder and approximately 4,630 people have Symptoms of Depressive Disorder or Anxiety Disorder.

In [8]:
#distribution of the group column
Groups_counts=df["Group"].value_counts()
fig_group=px.bar(Groups_counts,x=Groups_counts.index,
                 y=Groups_counts.values,
                 title="Group Distribution")
fig_group.show()

## The Respondents

The respondents are divided into 9 groups, namely:

- By state
- By age
- By race
- By education
- By sex
- By gender identity
- By sexual orientation
- By national estimate

## Majority by State

The majority of respondents are grouped by state, as shown by the bar chart above.

## Anxiety and Depressive Symptoms

Most respondents with symptoms of anxiety or depressive disorder come from the United States. There are approximately 9180 cases.

## Age Groups

The respondents are also classified into different age groups:There are approximately 1449 cases.

- 18 - 29 years
- 30 - 39 years
- 40 - 49 years
- 50 - 59 years
- 60 - 69 years
- 70 - 79 years
- 80 years and above

## Hispanic Ethnicity

The respondents are classified into different Hispanic ethnicities, such as:

- Hispanic or Latino
- Non-Hispanic White, single race
- Non-Hispanic Black, single race
- Non-Hispanic Asian, single race
- Non-Hispanic, other races and multiple races

Among them, there are approximately 1035 cases of individuals experiencing symptoms of anxiety or symptoms of depression or both.

## Education Groups

The respondents are also classified by education into different groups:

- Those with less than a high school diploma
- Those with a high school diploma or GED
- Those with some college/Associate's degree
- Those with a Bachelor's degree or higher

Among them,there are approximately 825 cases of individuals experiencing symptoms of anxiety or symptoms of depression or both.

## Gender

They are also classified by sex, that is, female and male.

Among them,there are approximately 414 cases of individuals experiencing symptoms of anxiety or symptoms of depression or both.

They are also classified by gender identity where others identify as:
- Cis-gender male
- Cis-gender female
- Transgender
- Gay or lesbian
- Straight or Bisexual.

There are approximately 279 cases of individuals experiencing symptoms of anxiety or symptoms of depression or both in this group.

They are also classified by disability, that is, those with a disability and those without a disability.

In this group,there are 234 cases of individuals who are experiencing symptoms of anxiety or symptoms of depression or both.

The last two classifications are sexual orientation and national estimate.

There are 279 cases of individuals who are experiencing symptoms of anxiety or symptoms of depression or both in the sex orientation and 207 cases of individuals who are experiencing symptoms of anxiety or symptoms of depression or both in the national estimate group.

The statistics below show the total number of each group.


In [9]:
subgroup_counts = df['Subgroup'].value_counts()
# Get the list of all unique subgroups
unique_subgroups = df['Subgroup'].unique()
# Display the counts and the list of subgroups
print("Counts of each age group:")
print(subgroup_counts) 
print(unique_subgroups)

Counts of each age group:
Subgroup
United States                      207
Hispanic or Latino                 207
18 - 29 years                      207
Bachelor's degree or higher        207
Some college/Associate's degree    207
                                  ... 
Cis-gender female                   93
Transgender                         93
Gay or lesbian                      93
Straight                            93
Bisexual                            93
Name: count, Length: 78, dtype: int64
['United States' '18 - 29 years' '30 - 39 years' '40 - 49 years'
 '50 - 59 years' '60 - 69 years' '70 - 79 years' '80 years and above'
 'Male' 'Female' 'Hispanic or Latino' 'Non-Hispanic White, single race'
 'Non-Hispanic Black, single race' 'Non-Hispanic Asian, single race'
 'Non-Hispanic, other races and multiple races'
 'Less than a high school diploma' 'High school diploma or GED'
 "Some college/Associate's degree" "Bachelor's degree or higher" 'Alabama'
 'Alaska' 'Arizona' 'Arkansas' 'Cal

In [10]:
#Distribution of the state column
State_count=df["State"].value_counts()
fig_state=px.bar(State_count,
                 x=State_count.index,
                 y=State_count.values,
                 title="Distribution of States")
fig_state.update_traces(marker_color=["blue", "green"]) 
fig_state.show()

Most of the respondents were from the United States as shown in the bar chart above.

In [11]:
#checking how many categories there are in the time period label.
category_counts = df["Time Period Label"].value_counts().reset_index()
category_counts.columns = ["Category","Count"]

fig = px.bar(category_counts, x='Category', y='Count', title='Category Count in "time label" Column')
fig.update_xaxes(categoryorder='total ascending')  # Sort x-axis categories
fig.show()

In [11]:
#Distribution of the phase
fig_phase = px.histogram(df, x="Phase", 
                             title="Distribution of every phase")

fig_phase.show()

The phase column is likely to represent the different stages or time intervals this study was conducted.

According to the histogram shown above,each bar represents a different phase in the progression of symptoms of depression or anxiety. 


The height of each bar corresponds to the number of cases or individuals experiencing symptoms during that specific phase.

That is,the phase labeled 1 signifies that there were 2520 individuals who reported symptoms of anxiety and or depression during this phase.It seems to be the phase with the highest number of individuals who reported  symptoms of anxiety and or depression.

Likewise, the phase labeled -1  represents a period when approximately 620 individuals reported symptoms of depression or anxiety.

In [12]:
#Distribution of the start of every time period
# Count the occurrences of when  each time period started
fig_time_start = px.histogram(df, x="Time Period Start Date", 
                             title="Distribution of the start of every time period")

fig_time_start.show()
                      

In [13]:
#Distribution of the end of every time period
# Count the occurrences of when  each time period ended
fig_time_end = px.histogram(df, x="Time Period End Date", 
                             title="Distribution of the end of every time period")

fig_time_end.show()
