In [1]:
import pandas as pd
import re

In [2]:
data_dir = "data/a/combined_data_raw.csv"
df = pd.read_csv(data_dir, index_col=0)

In [3]:
df.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53043 entries, 0 to 53042
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  52681 non-null  object
 1   status     53043 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


## Basic cleaning

In [5]:
# Check for missing values
print(f"Number of rows with missing values: {df.isnull().sum()}")

# Check for duplicates
print(f"Number of duplicate rows: {df.duplicated(subset=['statement']).sum()}")

Number of rows with missing values: statement    362
status         0
dtype: int64
Number of duplicate rows: 1969


In [6]:
# Drop rows that contain empty values
df = df.dropna() 

# Drop rows that contain duplicate values in the ‘statement’ column and keep only the first row
df = df.drop_duplicates(subset=['statement'], keep='first')

df.reset_index(drop = True, inplace = True)

In [7]:
# Recheck for missing values
print(f"Number of rows with missing values: {df.isnull().sum()}")

# Recheck for duplicates
print(f"Number of duplicate rows: {df.duplicated(subset=['statement']).sum()}")

Number of rows with missing values: statement    0
status       0
dtype: int64
Number of duplicate rows: 0


## Deep cleaning

In [8]:
# Change the data type of ‘statement’ and ‘status’ columns to string
df = df.astype({"statement":str, "status":str})

In [9]:
emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)

stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [10]:
def clean_text(text):
    text = text.lower()
    # remove stopwords
    for word in stopwords:
        if word[0] == "'":
            text = re.sub(rf"{word}\b", "", text)
        else:
            text = re.sub(rf"\b{word}\b", "", text)

    text = re.sub(r'[!"“’#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\']', ' ', text) # remove punctuation mark
    text = re.sub(emoj, ' ', text) # remove emoji
    text = re.sub(r'\d+', ' ', text) # remove number
    text = re.sub(r'(.)\1+', r'\1', text) # remove repeated character
    text = re.sub(r' [a-z] ', ' ', text) # remove single character
    text = re.sub(r'\s+', ' ', text) # remove multiple spaces
    return text.strip()

In [11]:
# CLEAN!!!
df['statement'] = df['statement'].apply(clean_text)
df = df[df['statement'] != ""]

## Very deep cleaning

In [13]:
# Data distribution analysis of each label
df.status.value_counts()

status
Normal                  15962
Depression              15086
Suicidal                10639
Anxiety                  3617
Bipolar                  2501
Stress                   2293
Personality disorder      895
Name: count, dtype: int64

In [14]:
# Adding word count column for further analysis
df['word_count'] = df['statement'].apply(lambda x: len(x.split()))

In [16]:
# Define bins and labels for word count ranges
bins = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, float('inf')]  # Adjust as needed
labels = ['1-100', '101-200', '201-300', '301-400', '401-500', '501-600', '601-700', '701-800', '801-900', '901-1000', '+1000']

# Add a column to categorize statements into ranges
df['word_count_range'] = pd.cut(df['word_count'], bins=bins, labels=labels, right=True)

In [17]:
# Count the number of statements in each range
df['word_count_range'].value_counts().sort_index()

word_count_range
1-100       42851
101-200      5932
201-300      1422
301-400       450
401-500       179
501-600        72
601-700        40
701-800        19
801-900         8
901-1000        6
+1000          14
Name: count, dtype: int64

In [19]:
# Group by word count range and label, then count occurrences
df.groupby(['word_count_range', 'status']).size().unstack(fill_value=0)

  df.groupby(['word_count_range', 'status']).size().unstack(fill_value=0)


status,Anxiety,Bipolar,Depression,Normal,Personality disorder,Stress,Suicidal
word_count_range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1-100,2777,1756,11225,15959,621,2073,8440
101-200,628,539,2798,3,208,164,1592
201-300,142,150,669,0,44,40,377
301-400,43,37,215,0,15,9,131
401-500,17,11,83,0,5,5,58
501-600,6,4,42,0,1,1,18
601-700,2,1,24,0,0,0,13
701-800,2,2,11,0,0,1,3
801-900,0,0,7,0,0,0,1
901-1000,0,0,4,0,0,0,2


In [38]:
df_export_candidate = df[(df['word_count'] >= 10) & (df['word_count'] <= 500)].reset_index(drop=True)
df_export_candidate.status.value_counts()

status
Depression              14115
Suicidal                 9686
Normal                   4771
Anxiety                  3098
Bipolar                  2463
Stress                   2270
Personality disorder      860
Name: count, dtype: int64

In [39]:
df_export_candidate = df_export_candidate.sort_values(by='word_count', ascending=False)
df_export_candidate = df_export_candidate.groupby('status').head(800)
df_export_candidate.reset_index(drop=True, inplace=True)
df_export_candidate

Unnamed: 0,statement,status,word_count,word_count_range
0,crospost not know start going long not expect ...,Depression,499,401-500
1,not know start going long not expect anyone re...,Suicidal,498,401-500
2,reflecting life lot lately always find going b...,Depression,498,401-500
3,since can remember always sad child inside bla...,Suicidal,497,401-500
4,m alone tough pretend whole person amp b haven...,Depression,496,401-500
...,...,...,...,...
5595,fatherlesnes mesed wasn parents weren maried r...,Personality disorder,19,1-100
5596,someone whistles around mean re judging intern...,Personality disorder,19,1-100
5597,not sure lol everything kinda just hurts sad r...,Personality disorder,19,1-100
5598,people avpd not sad two diferent disorders som...,Personality disorder,19,1-100


In [40]:
df_export_candidate.status.value_counts()

status
Depression              800
Suicidal                800
Anxiety                 800
Stress                  800
Personality disorder    800
Bipolar                 800
Normal                  800
Name: count, dtype: int64

In [42]:
df_export_candidate.drop(['word_count', 'word_count_range'], axis=1, inplace=True)

In [43]:
df_export_candidate.to_csv('data/a/cleaned_data.csv', index=False)