In [1]:
!python -m pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
import pandas as pd
import re
import spacy

In [2]:
!mkdir data
!wget -O "data/combined_data.csv" "https://drive.google.com/uc?export=download&id=1GJn2kEIBgto2OyD7-h2HQOv_NJUriqJh"

--2024-12-09 04:56:01--  https://drive.google.com/uc?export=download&id=1GJn2kEIBgto2OyD7-h2HQOv_NJUriqJh
Resolving drive.google.com (drive.google.com)... 172.217.203.113, 172.217.203.101, 172.217.203.102, ...
Connecting to drive.google.com (drive.google.com)|172.217.203.113|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1GJn2kEIBgto2OyD7-h2HQOv_NJUriqJh&export=download [following]
--2024-12-09 04:56:01--  https://drive.usercontent.google.com/download?id=1GJn2kEIBgto2OyD7-h2HQOv_NJUriqJh&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 173.194.212.132, 2607:f8b0:400c:c11::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|173.194.212.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31469558 (30M) [application/octet-stream]
Saving to: ‘data/combined_data.csv’


2024-12-09 04:56:05 (137 MB/s) - ‘data/combined_d

In [3]:
data_dir = "data/combined_data.csv"
df = pd.read_csv(data_dir, index_col=0)

In [4]:
df.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53043 entries, 0 to 53042
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  52681 non-null  object
 1   status     53043 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


## Basic cleaning

In [6]:
# Check for missing values
print(f"Number of rows with missing values: {df.isnull().sum()}")

# Check for duplicates
print(f"Number of duplicate rows: {df.duplicated(subset=['statement']).sum()}")

Number of rows with missing values: statement    362
status         0
dtype: int64
Number of duplicate rows: 1969


In [7]:
# Drop rows that contain empty values
df = df.dropna()

# Drop rows that contain duplicate values in the ‘statement’ column and keep only the first row
df = df.drop_duplicates(subset=['statement'], keep='first')

df.reset_index(drop = True, inplace = True)

In [8]:
# Recheck for missing values
print(f"Number of rows with missing values: {df.isnull().sum()}")

# Recheck for duplicates
print(f"Number of duplicate rows: {df.duplicated(subset=['statement']).sum()}")

Number of rows with missing values: statement    0
status       0
dtype: int64
Number of duplicate rows: 0


## Deep cleaning

In [9]:
# Change the data type of ‘statement’ and ‘status’ columns to string
df = df.astype({"statement":str, "status":str})

In [10]:
emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)

stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [11]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

In [None]:
!which python

'which' is not recognized as an internal or external command,
operable program or batch file.


In [13]:
def clean_text(text):
    text = text.lower()

    # Hapus angka
    text = re.sub(r'\d+', '', text)

    # Hapus emoji (Unicode Range)
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    doc = nlp(text)
    cleaned_text = []
    for token in doc:
        # remove stopwords
        if token.is_stop:
            continue
        # replace verb with its lemma
        elif token.pos_ == "VERB":
            cleaned_text.append(token.lemma_)
        else:
            cleaned_text.append(token.text)

    text = " ".join(cleaned_text)

    # Hapus links
    text = re.sub(r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)", " ", text)

    # Hapus punctuation marks
    text = re.sub(r'[^\w\s]', '', text)

    # Hapus karakter berulang
    text = re.sub(r'(.)\1+', r'\1', text)

    # Hapus karakter tunggal (misalnya huruf yang berdiri sendiri)
    text = re.sub(r'\b\w\b', '', text)

    # Hapus spasi ganda atau lebih
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [16]:
# CLEAN!!!
df['statement'] = df['statement'].apply(clean_text)
df = df[df['statement'] != ""]

## Very deep cleaning

In [17]:
# Data distribution analysis of each label
df.status.value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
Normal,15817
Depression,15082
Suicidal,10636
Anxiety,3616
Bipolar,2501
Stress,2293
Personality disorder,895


In [18]:
# Adding word count column for further analysis
df['word_count'] = df['statement'].apply(lambda x: len(x.split()))

In [19]:
# Define bins and labels for word count ranges
bins = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, float('inf')]  # Adjust as needed
labels = ['1-100', '101-200', '201-300', '301-400', '401-500', '501-600', '601-700', '701-800', '801-900', '901-1000', '+1000']

# Add a column to categorize statements into ranges
df['word_count_range'] = pd.cut(df['word_count'], bins=bins, labels=labels, right=True)

In [20]:
# Count the number of statements in each range
df['word_count_range'].value_counts().sort_index()

Unnamed: 0_level_0,count
word_count_range,Unnamed: 1_level_1
1-100,45644
101-200,4029
201-300,793
301-400,235
401-500,72
501-600,37
601-700,10
701-800,5
801-900,5
901-1000,5


In [21]:
# Group by word count range and label, then count occurrences
df.groupby(['word_count_range', 'status']).size().unstack(fill_value=0)

  df.groupby(['word_count_range', 'status']).size().unstack(fill_value=0)


status,Anxiety,Bipolar,Depression,Normal,Personality disorder,Stress,Suicidal
word_count_range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1-100,3027,2008,12607,15817,729,2155,9301
101-200,468,388,1908,0,132,109,1024
201-300,87,77,378,0,26,18,207
301-400,23,20,111,0,5,8,68
401-500,8,4,37,0,2,2,19
501-600,3,2,22,0,0,0,10
601-700,0,1,7,0,0,1,1
701-800,0,0,3,0,0,0,2
801-900,0,0,4,0,0,0,1
901-1000,0,0,4,0,0,0,1


In [22]:
df_export_candidate = df[(df['word_count'] >= 10) & (df['word_count'] <= 500)].reset_index(drop=True)
df_export_candidate.status.value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
Depression,13700
Suicidal,9111
Normal,3529
Anxiety,2988
Bipolar,2447
Stress,2227
Personality disorder,841


In [None]:
# Count the number of examples for each label
label_counts = df_export_candidate['status'].value_counts()

# Find the label with the minimum count
min_label = label_counts.idxmin()
min_count = label_counts.min()

print(f"Label with the lowest number of examples: {min_label}")
print(f"Number of examples: {min_count}")

Label with the lowest number of examples: Personality disorder
Number of examples: 841


In [None]:
df_export_candidate = df_export_candidate.sort_values(by='word_count', ascending=False)
df_export_candidate = df_export_candidate.groupby('status').head(min_count)
df_export_candidate.reset_index(drop=True, inplace=True)
df_export_candidate

Unnamed: 0,statement,status,word_count,word_count_range
0,lengthy post sumary life far time line litle j...,Depression,500,401-500
1,know try symptoms hard point argue lose people...,Depression,499,401-500
2,hi know begin describe situation brain bit fo...,Depression,498,401-500
3,strugle lot life moment basicaly fel lost make...,Suicidal,494,401-500
4,sunday night try kil hang choke pas awake viol...,Depression,492,401-500
...,...,...,...,...
5882,eye color know relate avpd curious view polhtp...,Personality disorder,10,1-100
5883,diagnose autism spectrum disorder exhibit symp...,Personality disorder,10,1-100
5884,lie realize lie people rationalize protect fel...,Personality disorder,10,1-100
5885,internet adiction adict internet like rest wor...,Personality disorder,10,1-100


In [23]:
df_export_candidate.status.value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
Depression,13700
Suicidal,9111
Normal,3529
Anxiety,2988
Bipolar,2447
Stress,2227
Personality disorder,841


In [24]:
df_export_candidate.drop(['word_count', 'word_count_range'], axis=1, inplace=True)

In [25]:
df_export_candidate = df_export_candidate.sample(frac=1).reset_index(drop=True)

In [26]:
df_export_candidate.to_csv('data/cleaned_data.csv', index=False)

In [27]:
from google.colab import files

# Unduh file
files.download('data/cleaned_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>