## Installing Packages and Importing Libraries

In [1]:
pip install pymongo
pip install sqlalchemy
from pymongo import MongoClient
from bson import ObjectId
from sqlalchemy import create_engine
import pandas as pd
import numpy as np 

Note: you may need to restart the kernel to use updated packages.


## Importing Data from MongoDB

In [171]:
# Connect to the local MongoDB server and open the project database.
server = MongoClient("mongodb://localhost:27017/")
db = server["mental_health"]

# Handles to the two source collections.
anxiety_collection = db["anxietydata"]
reddit_collection = db["reddit_data"]

# find() returns a lazy cursor that can only be iterated once. Materialise the
# documents into lists so the DataFrame-conversion cell can be re-run without
# silently producing empty frames from an exhausted cursor.
anxiety_data = list(anxiety_collection.find())
reddit_data = list(reddit_collection.find())

## Converting into DataFrame

In [63]:
# Build one DataFrame per collection from the documents fetched from MongoDB.
anxiety_df, reddit_df = (
    pd.DataFrame(documents) for documents in (anxiety_data, reddit_data)
)

In [65]:
# Show the first five rows of the anxiety data as plain text.
print(anxiety_df.head(n=5))

                        _id  age gender        bmi          who_bmi  \
0  674f7991b82ea70eb504096e   19   male  33.333333  Class I Obesity   
1  674f7991b82ea70eb504096f   19   male  28.731921       Overweight   
2  674f7991b82ea70eb5040970   18   male  22.790329           Normal   
3  674f7991b82ea70eb5040971   18   male  22.837370           Normal   
4  674f7991b82ea70eb5040972   18   male  19.591837           Normal   

   phq_score depression_severity depressiveness suicidal depression_diagnosis  \
0          9                Mild          False    False                False   
1          9                Mild          False    False                False   
2          6                Mild          False    False                False   
3         10            Moderate           True    False                False   
4          6                Mild          False    False                False   

  depression_treatment anxiety_severity anxiousness anxiety_diagnosis  \
0            

In [67]:
# Show the first five rows of the reddit data as plain text.
print(reddit_df.head(n=5))

                        _id subreddit          author        date  \
0  6750d08a6e2e1cc9b09d1940   anxiety   Watch_Me_Get_  01-01-2018   
1  6750d08a6e2e1cc9b09d1941   anxiety    ABrokenBeing  01-01-2018   
2  6750d08a6e2e1cc9b09d1942   anxiety  Imhereforgames  01-01-2018   
3  6750d08a6e2e1cc9b09d1943   anxiety    cmatrisciano  01-01-2018   
4  6750d08a6e2e1cc9b09d1944   anxiety  relapsemachine  01-01-2018   

                                                post  \
0  Does anyone else like taking long walks while ...   
1  Rant about anxiety meds - I've been self medic...   
2  Meditation is making me anxious I do a couple ...   
3  Need help/opinions on medication So I was prec...   
4  Strange symptom of anxiety? Since my breakdown...   

   automated_readability_index  coleman_liau_index  \
0                     5.007692            8.594266   
1                     5.962747            7.543635   
2                    -0.716269            1.459650   
3                     3.632522  

## Data Cleaning and Processing

In [77]:
# Treat empty or whitespace-only string cells as missing values in both frames.
anxiety_df = anxiety_df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)
reddit_df = reddit_df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

# Per-column count of missing values.
anxiety_nan_count = anxiety_df.isnull().sum()
reddit_nan_count = reddit_df.isnull().sum()

# Report the missing-value counts for each frame, heading first.
print("Count of null values in each column of anxiety_data", anxiety_nan_count, sep="\n")
print("Count of null values in each column of reddit_data", reddit_nan_count, sep="\n")

Count of null values in each column of anxiety_data
_id                      0
age                      0
gender                   0
bmi                      0
who_bmi                  0
phq_score                0
depression_severity      8
depressiveness           6
suicidal                 2
depression_diagnosis     2
depression_treatment     8
anxiety_severity         0
anxiousness             12
anxiety_diagnosis        8
anxiety_treatment        4
epworth_score           16
sleepiness              16
dtype: int64
Count of null values in each column of reddit_data
_id            0
subreddit      0
author         0
date           0
post           0
              ..
tfidf_wors     0
tfidf_would    0
tfidf_wrong    0
tfidf_x200b    0
tfidf_year     0
Length: 351, dtype: int64


In [83]:
# Skewness of the Epworth score distribution; computed once and reused in the
# report line instead of evaluating .skew() twice and discarding the first result.
epworth_skew = anxiety_df['epworth_score'].skew()
print("The skewness of epworth_score:", epworth_skew)

The skewness of epworth_score: 1.2408174555061204


In [85]:
# Median of the Epworth score, used as the fill value for its missing entries.
epworthscore_median = anxiety_df['epworth_score'].median()
# Print the variable defined above (the previous `median_value` name is not
# defined anywhere in this notebook and fails on a fresh kernel).
print("The median of epworth_score:", epworthscore_median)

# Replace NaN values with the median computed above.
anxiety_df['epworth_score'] = anxiety_df['epworth_score'].fillna(epworthscore_median)

The median of epworth_score: 6.0


In [87]:
# Recount missing values per column after filling epworth_score.
anxiety_nan_count = anxiety_df.isnull().sum()
print(anxiety_nan_count)

_id                      0
age                      0
gender                   0
bmi                      0
who_bmi                  0
phq_score                0
depression_severity      8
depressiveness           6
suicidal                 2
depression_diagnosis     2
depression_treatment     8
anxiety_severity         0
anxiousness             12
anxiety_diagnosis        8
anxiety_treatment        4
epworth_score            0
sleepiness              16
dtype: int64


In [133]:
# Rows whose Epworth score is above 9 but whose sleepiness is not the string "True".
score_above_9 = anxiety_df['epworth_score'].gt(9)
not_flagged_true = anxiety_df['sleepiness'].ne("True")
morethan_data = anxiety_df.loc[score_above_9 & not_flagged_true]
print("Count:", len(morethan_data))

# Rows whose Epworth score is 9 or below but whose sleepiness is not the string "False".
# Missing sleepiness values also match here, because NaN compares unequal to any string.
score_at_most_9 = anxiety_df['epworth_score'].le(9)
not_flagged_false = anxiety_df['sleepiness'].ne("False")
lessthan_data = anxiety_df.loc[score_at_most_9 & not_flagged_false]
print("Count:", len(lessthan_data))
print(lessthan_data)

Count: 0
Count: 16
                           _id  age  gender        bmi     who_bmi  phq_score  \
29    674f7991b82ea70eb504098b   18  female  21.604938      Normal          2   
161   674f7991b82ea70eb5040a0f   18    male  28.089888  Overweight          2   
167   674f7991b82ea70eb5040a15   18  female  19.948060      Normal          4   
213   674f7991b82ea70eb5040a43   19  female  22.206331      Normal          8   
381   674f7991b82ea70eb5040aeb   19  female  22.939751      Normal          7   
437   674f7991b82ea70eb5040b23   19    male  26.234568  Overweight         10   
465   674f7991b82ea70eb5040b3f   19    male  26.446281  Overweight          3   
541   674f7991b82ea70eb5040b8b   21  female  23.111111      Normal          5   
811   674f7991b82ea70eb5040c99   18  female  21.604938      Normal          2   
949   674f7991b82ea70eb5040d23   18    male  28.089888  Overweight          2   
950   674f7991b82ea70eb5040d24   18  female  19.948060      Normal          4   
996   674

In [135]:
# Fill missing 'sleepiness' with the string 'False' for rows whose Epworth score
# is 9 or below; every other value is kept as is. Vectorised with Series.mask
# rather than a row-by-row DataFrame.apply, which gives the same result without
# a Python-level loop over the rows.
missing_low_epworth = anxiety_df['sleepiness'].isna() & (anxiety_df['epworth_score'] <= 9)
anxiety_df['sleepiness'] = anxiety_df['sleepiness'].mask(missing_low_epworth, 'False')

# Fill missing 'anxiousness' with the most frequent value (first mode).
anxiousness_mode = anxiety_df['anxiousness'].mode()[0]
anxiety_df['anxiousness'] = anxiety_df['anxiousness'].fillna(anxiousness_mode)

# Recount missing values per column to confirm both columns are now complete.
anxiety_nan_count = anxiety_df.isna().sum()
print(anxiety_nan_count)

_id                     0
age                     0
gender                  0
bmi                     0
who_bmi                 0
phq_score               0
depression_severity     8
depressiveness          6
suicidal                2
depression_diagnosis    2
depression_treatment    8
anxiety_severity        0
anxiousness             0
anxiety_diagnosis       8
anxiety_treatment       4
epworth_score           0
sleepiness              0
dtype: int64


In [151]:
# Keep only the identifier, date and post text columns of the reddit data.
reddit_df = reddit_df.filter(items=['_id', 'date', 'post'], axis='columns')

In [159]:
# Renaming columns.
# An explicit old-name -> new-name mapping is used instead of assigning a list
# to `.columns`: positional assignment would silently mislabel the data if the
# column order ever differed, while rename() matches by name and simply ignores
# names that are absent (so re-running this cell is harmless).
anxiety_df = anxiety_df.rename(columns={
    '_id': 'ID',
    'age': 'Age',
    'gender': 'Gender',
    'bmi': 'BMI',
    'who_bmi': 'BMI Category',
    'phq_score': 'PHQ Score',
    'depression_severity': 'Depression Severity',
    'depressiveness': 'Depressiveness',
    'suicidal': 'Suicidal',
    'depression_diagnosis': 'Depression Diagnosis',
    'depression_treatment': 'Depression Treatment',
    'anxiety_severity': 'Anxiety Severity',
    'anxiousness': 'Anxiousness',
    'anxiety_diagnosis': 'Anxiety Diagnosis',
    'anxiety_treatment': 'Anxiety Treatment',
    'epworth_score': 'Epworth Score',
    'sleepiness': 'Sleepiness',
})

reddit_df = reddit_df.rename(columns={
    '_id': 'ID',
    'date': 'Date',
    'post': 'Reddit Post',
})

In [161]:
# Show the column dtypes of both frames, separated by a 50-character rule.
print(anxiety_df.dtypes, "-" * 50, reddit_df.dtypes, sep="\n")

ID                       object
Age                       int64
Gender                   object
BMI                     float64
BMI Category             object
PHQ Score                 int64
Depression Severity      object
Depressiveness           object
Suicidal                 object
Depression Diagnosis     object
Depression Treatment     object
Anxiety Severity         object
Anxiousness              object
Anxiety Diagnosis        object
Anxiety Treatment        object
Epworth Score           float64
Sleepiness               object
dtype: object
--------------------------------------------------
ID             object
Date           object
Reddit Post    object
dtype: object


In [163]:
# Converting object columns to text before exporting into PostgreSQL.
#
# The pandas nullable 'string' dtype is used instead of plain 'str': casting
# with 'str' turns every missing value into the literal text 'nan', which would
# then be stored in the database as a real string rather than as NULL. Several
# of these columns still contain missing values at this point (see the null
# counts printed above). Non-string values such as the MongoDB ids are
# converted to their text form.
anxiety_text_columns = [
    'ID',
    'Gender',
    'BMI Category',
    'Depression Severity',
    'Depressiveness',
    'Suicidal',  # previously left out of the cast; included so all text columns are typed alike
    'Depression Diagnosis',
    'Depression Treatment',
    'Anxiety Severity',
    'Anxiousness',
    'Anxiety Diagnosis',
    'Anxiety Treatment',
    'Sleepiness',
]
anxiety_df = anxiety_df.astype({column: 'string' for column in anxiety_text_columns})

reddit_text_columns = ['ID', 'Date', 'Reddit Post']
reddit_df = reddit_df.astype({column: 'string' for column in reddit_text_columns})

In [167]:
# Label each respondent with an age band based on a single cut at 29.
# NOTE(review): any age above 29 gets '30 - 39 years' and any age below 18 gets
# '18 - 29 years' - confirm the data only contains ages 18-39.
is_29_or_younger = anxiety_df['Age'].le(29)
anxiety_df['Age Group'] = np.where(is_29_or_younger, '18 - 29 years', '30 - 39 years')

In [169]:
# Print the cleaned anxiety frame as plain text to inspect the final columns
# before export.
print(anxiety_df)

                            ID  Age  Gender        BMI     BMI Category  \
0     674f7991b82ea70eb504096e   19    male  33.333333  Class I Obesity   
1     674f7991b82ea70eb504096f   19    male  28.731921       Overweight   
2     674f7991b82ea70eb5040970   18    male  22.790329           Normal   
3     674f7991b82ea70eb5040971   18    male  22.837370           Normal   
4     674f7991b82ea70eb5040972   18    male  19.591837           Normal   
...                        ...  ...     ...        ...              ...   
1561  674f7992b82ea70eb5040f87   22    male  25.308642       Overweight   
1562  674f7992b82ea70eb5040f88   24  female  21.096191           Normal   
1563  674f7992b82ea70eb5040f89   22  female  23.033168           Normal   
1564  674f7992b82ea70eb5040f8a   22    male  22.720438           Normal   
1565  674f7992b82ea70eb5040f8b   22    male  22.598140           Normal   

      PHQ Score Depression Severity Depressiveness Suicidal  \
0             9                Mild 

## Exporting Data into PostgreSQL

In [189]:
# PostgreSQL connection string (SQLAlchemy URL using the psycopg2 driver).
# NOTE(review): the database password is hardcoded here - consider loading it
# from an environment variable or a secrets store before sharing this notebook.
connection_string = 'postgresql+psycopg2://postgres:testpassword@localhost:5432/apdv_project'

# Create the SQLAlchemy engine from the connection string defined above
# (the previous `database_uri` name is not defined anywhere in this notebook).
db_connect = create_engine(connection_string)

# Write both frames to PostgreSQL, replacing any existing table of the same
# name and leaving out the DataFrame index.
anxiety_df.to_sql('anxiety', db_connect, if_exists='replace', index=False)
reddit_df.to_sql('reddit', db_connect, if_exists='replace', index=False)

print("Data imported into PostgreSQL successfully.")

Data imported into PostgreSQL successfully.
